In [1]:
# load libraries
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt


In [2]:
# load saved training/validation/test data
with open('train_val_test_data.pkl', 'rb') as file:
    X_train, X_val, X_test, y_train, y_val, y_test = pickle.load(file)

In [3]:
# loads the data from the pkl file into X_train, X_val, and X_test
full_columns = ['password', 'length', 'length_bin', 'uppercase_count', 'lowercase_count', 'numbers_count', 'special_character_count', 'entropy', 
                'ngram_occurrences', 'character_diversity', 'contains_name', 
                'upper_special', 'length_entropy', 'lower_numbers', 'entropy_special',
                'upper_ratio', 'lower_ratio', 'special_character_ratio', 'numbers_ratio', 'entropy_per_character']
X_train_full_df = pd.DataFrame(X_train, columns=full_columns)
X_val_full_df = pd.DataFrame(X_val, columns=full_columns)
X_test_full_df = pd.DataFrame(X_test, columns=full_columns)

In [4]:
# Model 1: Character-Based Features 
# create X_train, X_val, and X_test for model 1
model1_columns = ['length', 'entropy', 'contains_name']
X_train_model1 = X_train_full_df[model1_columns].to_numpy().astype(np.float64)
X_val_model1 = X_val_full_df[model1_columns].to_numpy().astype(np.float64)
X_test_model1 = X_test_full_df[model1_columns].to_numpy().astype(np.float64)

In [5]:
# Model 2: Semantic Features
model2_columns = ['uppercase_count', 'lowercase_count', 'numbers_count', 'special_character_count', 'contains_name']
X_train_model2 = X_train_full_df[model2_columns].to_numpy().astype(np.float64)
X_val_model2 = X_val_full_df[model2_columns].to_numpy().astype(np.float64)
X_test_model2 = X_test_full_df[model2_columns].to_numpy().astype(np.float64)

In [6]:
# Model 3: Combined Features

model3_columns = ['upper_special', 'length_entropy', 'entropy_special', 'upper_ratio', 'lower_ratio', 
                  'special_character_ratio', 'numbers_ratio', 'entropy_per_character']
X_train_model3 = X_train_full_df[model3_columns].to_numpy().astype(np.float64)
X_val_model3 = X_val_full_df[model3_columns].to_numpy().astype(np.float64)
X_test_model3 = X_test_full_df[model3_columns].to_numpy().astype(np.float64)


In [7]:
# Model 4: Model with all of the features available
full_model_columns = ['length', 'uppercase_count', 'uppercase_count', 'lowercase_count', 'numbers_count', 'special_character_count', 'contains_name',
                      'upper_special', 'length_entropy', 'lower_numbers', 'entropy_special', 'upper_ratio', 'lower_ratio', 
                      'special_character_ratio', 'numbers_ratio', 'entropy_per_character']
X_train_full_model = X_train_full_df[full_model_columns].to_numpy().astype(np.float64)
X_val_full_model = X_val_full_df[full_model_columns].to_numpy().astype(np.float64)
X_test_full_model = X_test_full_df[full_model_columns].to_numpy().astype(np.float64)

In [8]:
# calculate accuracy given the true labels and the predictions
def accuracy(y_truth, y_pred):
    correct_pred = 0
    # iterate through the values and check if the labels are the same, update as required
    for y_t, y_p in zip(y_truth, y_pred):
        if y_t == y_p :
            correct_pred += 1
    # find the proportion by dividing the correct predictions by all the predictions
    return correct_pred / len(y_truth)

In [9]:
# softmax function
def softmax(z):
   exp_z = np.exp(z - np.max(z, axis = 1, keepdims = True))
   return exp_z / np.sum(exp_z, axis = 1, keepdims = True)

In [10]:
# logistic regression function using softmax instead of sigmoid for multinomial classification (gradient descent)
def logistic_regression(X, y, num_classes, iterations, learning_rate):
   # add bias terms
   X = np.hstack((np.ones((X.shape[0], 1)), X))

   # initialize the weights
   w = np.ones((num_classes, X.shape[1]))

   # gradient descent, adjust weights iteratively using the learning rate
   for i in range(iterations):
      # find the predicted 
      class_probabilities = softmax(X.dot(w.T))

      # one hot encoding of labels
      y_one_hot = np.eye(num_classes)[y].reshape(len(y), num_classes)

      # calculate gradient and adjust the weights
      gradient = (class_probabilities - y_one_hot).T.dot(X) / len(y)
      w -= learning_rate * gradient
   return w

In [11]:
# single model predictor -- predict class using softmax and weights
def softmax_prediction(X, w):
   # add bias terms
   X = np.hstack((np.ones((X.shape[0], 1)), X))
   # return the class with the highest probability as the predicted label
   return np.argmax(softmax(X.dot(w.T)), axis = 1)

In [12]:
# makes predictions based on weighted sum
def lr_weighted_sum_voting_predictions(X_models, w_models, weights=[0.33, 0.34, 0.33]):
    # finds the total number of models based on size of X_models input
    num_models = len(X_models)

    # creates array of predictions for each of the models
    predictions = []
    for i in range(num_models):
        curr_model_prediction = softmax_prediction(X_models[i], w_models[i])
        predictions.append(curr_model_prediction)

    # finds expected value for predictions based on given weights
    return (weights[0] * predictions[0] + weights[1] * predictions[1] + weights[2] * predictions[2]).astype(int)


In [28]:
#creates predictions with votes, equal
def lr_hard_voting_predictions(X_models, w_models):

    # create an array of predictions for all three models
    num_models = len(X_models)
    predictions = []

    for i in range(num_models):
        curr_model_prediction = softmax_prediction(X_models[i], w_models[i])
        predictions.append(curr_model_prediction)

    # reshapes the predictions for voting
    predictions = np.stack(predictions, axis=0) 

    _, num_samples = predictions.shape
    num_classes = 5  

    final_preds = []
    for sample_ind in range(num_samples):

        # creates an array to tally up the counts for each of the classes
        vote_counts = np.zeros(num_classes)

        # iterates through the models
        for model_ind in range(num_models):
            # finds the predicted class and adds 1 to the class count
            class_pred = predictions[model_ind, sample_ind]
            vote_counts[class_pred] += 1
        # appends to final prediction
        final_preds.append(np.argmax(vote_counts))

    return np.array(final_preds)

In [33]:
# creates predictions with votes, biased towards more accurate models
def lr_weighted_hard_voting(X_models, w_models, weights):
    # creates predictions
    num_models = len(X_models)
    predictions = []
    for i in range(num_models):
        curr_model_prediction = softmax_prediction(X_models[i], w_models[i])
        predictions.append(curr_model_prediction)

    # reshapes the predictions for voting
    predictions = np.stack(predictions, axis=0) 

    _, num_samples = predictions.shape
    num_classes = 5  

    final_preds = []
    for sample_ind in range(num_samples):

        # creates an array to tally up the counts for each of the classes
        vote_counts = np.zeros(num_classes)

        # iterates through the models
        for model_ind in range(num_models):
            # finds the predicted class and adds weight to the class count
            class_pred = predictions[model_ind, sample_ind]
            vote_counts[class_pred] += weights[i]
        # appends to final prediction
        final_preds.append(np.argmax(vote_counts))

    return np.array(final_preds)

In [84]:
# train log reg model 1
weights_model1 = logistic_regression(X_train_model1, y_train, 5, 100000, 0.025)

In [85]:
# train log reg model 2
weights_model2 = logistic_regression(X_train_model2, y_train, 5, 100000, 0.045)

In [86]:
# train log reg model 3
weights_model3 = logistic_regression(X_train_model3, y_train, 5, 100000, 0.005)

In [87]:
# train log reg model full
weights_full_model = logistic_regression(X_train_full_model, y_train, 5, 100000, 0.03)

In [88]:
# predicts based on weighted sum voting 
train_predictions = lr_weighted_sum_voting_predictions([X_train_model1, X_train_model2, X_train_model3], 
                                           [weights_model1, weights_model2, weights_model3],
                                           [0.2, 0.6, 0.2])
train_accuracy = accuracy(y_train.reshape(1, -1)[0], train_predictions)
print("Train:", train_accuracy)
val_predictions = lr_weighted_sum_voting_predictions([X_val_model1, X_val_model2, X_val_model3], 
                                           [weights_model1, weights_model2, weights_model3],
                                           [0.2, 0.6, 0.2])
val_accuracy = accuracy(y_val.reshape(1, -1)[0], val_predictions)
print("Validation:", val_accuracy)
test_predictions = lr_weighted_sum_voting_predictions([X_test_model1, X_test_model2, X_test_model3], 
                                           [weights_model1, weights_model2, weights_model3],
                                           [0.2, 0.6, 0.2])
test_accuracy = accuracy(y_test.reshape(1, -1)[0], test_predictions)
print("Test:", test_accuracy)

Train: 0.8313
Validation: 0.8307
Test: 0.8293


In [89]:
# predicts based on hard voting, equal 
train_predictions = lr_hard_voting_predictions([X_train_model1, X_train_model2, X_train_model3], 
                                           [weights_model1, weights_model2, weights_model3])
train_accuracy = accuracy(y_train.reshape(1, -1)[0], train_predictions)
print("Train:", train_accuracy)
val_predictions = lr_hard_voting_predictions([X_val_model1, X_val_model2, X_val_model3], 
                                           [weights_model1, weights_model2, weights_model3])
val_accuracy = accuracy(y_val.reshape(1, -1)[0], val_predictions)
print("Validation:", val_accuracy)
train_predictions = lr_hard_voting_predictions([X_train_model1, X_train_model2, X_train_model3], 
                                           [weights_model1, weights_model2, weights_model3])
train_accuracy = accuracy(y_train.reshape(1, -1)[0], train_predictions)
print("Test:", test_accuracy)

Train: 0.9135625
Validation: 0.9149
Test: 0.8293


In [90]:
# predicts based on hard voting, weighted/biased

weights = [1.5, 2, 1.5]
train_predictions = lr_weighted_hard_voting([X_train_model1, X_train_model2, X_train_model3], 
                                           [weights_model1, weights_model2, weights_model3],
                                           weights)
train_accuracy = accuracy(y_train.reshape(1, -1)[0], train_predictions)
print("Train:", train_accuracy)

val_predictions = lr_weighted_hard_voting([X_val_model1, X_val_model2, X_val_model3], 
                                           [weights_model1, weights_model2, weights_model3],
                                           weights)
val_accuracy = accuracy(y_val.reshape(1, -1)[0], val_predictions)
print("Validation:", val_accuracy)
test_predictions = lr_weighted_hard_voting([X_test_model1, X_test_model2, X_test_model3], 
                                           [weights_model1, weights_model2, weights_model3],
                                           weights)
test_accuracy = accuracy(y_test.reshape(1, -1)[0], test_predictions)
print("Test:", test_accuracy)

Train: 0.9135625
Validation: 0.9149
Test: 0.9138


In [None]:
# model 1 - find the accuracy metrics for each set of data using logistic regression weights for a single model
train_predictions = softmax_prediction(X_train_model1, weights_model1)
train_accuracy = accuracy(y_train.reshape(1, -1)[0], train_predictions)
print("Train:", train_accuracy)

val_predictions = softmax_prediction(X_val_model1, weights_model1)
val_accuracy = accuracy(y_val.reshape(1, -1)[0], val_predictions)
print("Validation:", val_accuracy)

test_predictions = softmax_prediction(X_test_model1, weights_model1)
test_accuracy = accuracy(y_test.reshape(1, -1)[0], test_predictions)
print("Test:", test_accuracy)

Train: 0.9174875
Validation: 0.9213
Test: 0.9156


In [None]:
# model 2 - find the accuracy metrics for each set of data using logistic regression weights for a single model
train_predictions = softmax_prediction(X_train_model2, weights_model2)
train_accuracy = accuracy(y_train.reshape(1, -1)[0], train_predictions)
print("Train:", train_accuracy)

val_predictions = softmax_prediction(X_val_model2, weights_model2)
val_accuracy = accuracy(y_val.reshape(1, -1)[0], val_predictions)
print("Validation:", val_accuracy)

test_predictions = softmax_prediction(X_test_model2, weights_model2)
test_accuracy = accuracy(y_test.reshape(1, -1)[0], test_predictions)
print("Test:", test_accuracy)

Train: 0.878225
Validation: 0.8763
Test: 0.8775


In [None]:
# model 3 - find the accuracy metrics for each set of data using logistic regression weights for a single model
train_predictions = softmax_prediction(X_train_model3, weights_model3)
train_accuracy = accuracy(y_train.reshape(1, -1)[0], train_predictions)
print("Train:", train_accuracy)

val_predictions = softmax_prediction(X_val_model3, weights_model3)
val_accuracy = accuracy(y_val.reshape(1, -1)[0], val_predictions)
print("Validation:", val_accuracy)

test_predictions = softmax_prediction(X_test_model3, weights_model3)
test_accuracy = accuracy(y_test.reshape(1, -1)[0], test_predictions)
print("Test:", test_accuracy)

Train: 0.7783625
Validation: 0.7772
Test: 0.7755


In [None]:
# model 4 (full) - find the accuracy metrics for each set of data using logistic regression weights for a single model
train_predictions = softmax_prediction(X_train_full_model, weights_full_model)
train_accuracy = accuracy(y_train.reshape(1, -1)[0], train_predictions)
print("Train:", train_accuracy)

val_predictions = softmax_prediction(X_val_full_model, weights_full_model)
val_accuracy = accuracy(y_val.reshape(1, -1)[0], val_predictions)
print("Validation:", val_accuracy)

test_predictions = softmax_prediction(X_test_full_model, weights_full_model)
test_accuracy = accuracy(y_test.reshape(1, -1)[0], test_predictions)
print("Test :", test_accuracy)

Train: 0.6872875
Validation: 0.6809
Test : 0.6864
