In [36]:
# import libraries
import pandas as pd
from collections import Counter
from nltk.util import ngrams
import math
import re
import numpy as np
from sklearn.model_selection import train_test_split
import heapq
import pickle

In [2]:
# load password dataset
# dtype=str keeps digit-only passwords (e.g. "2509") as text instead of letting
# pandas infer them as integers, which would break the len()/regex feature
# extraction applied to this column later in the notebook.
# NOTE(review): read_csv's default NA handling still converts password strings
# such as "NA", "null" or "nan" into missing values -- confirm the dataset has none.
df_passwords = pd.read_csv('dataset.csv', dtype={'password': str})
df_passwords.head()

Unnamed: 0,password,strength,length,class_strength,entropy,crack_time_sec,crack_time
0,bybee,0.088053,5,Very week,11.60964,1.5625e-06,instant
1,n3m0,0.088889,4,Very week,8.0,1.28e-07,instant
2,2509,0.088889,4,Very week,8.0,1.28e-07,instant
3,4622,0.070443,4,Very week,8.0,1.28e-07,instant
4,shrk,0.088889,4,Very week,8.0,1.28e-07,instant


In [3]:
# ordinal code for every (lower-cased) strength label; the 'week' spelling
# mirrors the labels stored in the dataset's class_strength column
dictEncoding = {
    strength_label: code
    for code, strength_label in enumerate(['very week', 'week', 'average', 'strong', 'very strong'])
}

# normalise label case, then translate each label to its numeric code
df_passwords['numeric_class_strength'] = (
    df_passwords['class_strength']
    .str.lower()
    .map(dictEncoding)
)

# class vector used as the prediction target
labels = df_passwords['numeric_class_strength']
print(labels)

0        0
1        0
2        0
3        0
4        0
        ..
99995    4
99996    4
99997    4
99998    4
99999    4
Name: numeric_class_strength, Length: 100000, dtype: int64


In [4]:
# generate n-gram counts for every order from 1 up to max_n (5 by default)
def generate_ngrams_counts(character_tokens, max_n=5):
    """Count every contiguous n-gram of order 1..max_n in a token sequence.

    Parameters:
        character_tokens: iterable of string tokens (e.g. the characters of a password).
        max_n: largest n-gram order to generate; defaults to 5.

    Returns:
        collections.Counter mapping each joined n-gram string to its number of
        occurrences. Orders longer than the sequence contribute nothing.
    """
    # materialise once so any iterable (list, str, generator) can be sliced
    tokens = list(character_tokens)
    all_ngrams = []
    for order in range(1, max_n + 1):
        # sliding window of width `order`; the range is empty when order > len(tokens)
        all_ngrams.extend(
            ''.join(tokens[start:start + order])
            for start in range(len(tokens) - order + 1)
        )
    return Counter(all_ngrams)

In [5]:
# Shannon entropy (in bits) of the token frequency distribution
def shannon_entropy(character_tokens):
    """Return the Shannon entropy of the tokens' relative frequencies.

    Parameters:
        character_tokens: sized collection of hashable tokens (e.g. a list of characters).

    Returns:
        The negated sum of p * log2(p) over the distinct tokens, where p is a
        token's share of the sequence. An empty sequence yields 0.
    """
    total = len(character_tokens)
    # relative frequency of every distinct token
    probabilities = [count / total for count in Counter(character_tokens).values()]
    return -sum(p * math.log2(p) for p in probabilities)

In [6]:
# number of special characters in the password
def count_special_characters(password):
    """Count characters that are neither ASCII letters, ASCII digits, nor whitespace.

    Parameters:
        password: the password string to inspect.

    Returns:
        Number of matching characters as an int.
    """
    # negated class: anything outside a-z, A-Z, 0-9 and whitespace
    special_pattern = r"[^a-zA-Z0-9\s]"
    return sum(1 for _ in re.finditer(special_pattern, password))

# find counts of numbers in the password
def count_numbers(password):
    """Return how many ASCII digits (0-9) appear in the password.

    Parameters:
        password: the password string to inspect.

    Returns:
        Number of characters in the range 0-9 as an int.
    """
    # explicit 0-9 class: in a str pattern the \d shorthand also matches non-ASCII
    # Unicode decimal digits, which count_special_characters (using 0-9) would
    # additionally count as special characters
    pattern = r"[0-9]"
    return len(re.findall(pattern, password))

# number of uppercase letters in the password
def count_uppercase(password):
    """Count the ASCII uppercase letters (A-Z) in the password.

    Parameters:
        password: the password string to inspect.

    Returns:
        Number of matching characters as an int.
    """
    uppercase_pattern = r"[A-Z]"
    return sum(1 for _ in re.finditer(uppercase_pattern, password))

# number of lowercase letters in the password
def count_lowercase(password):
    """Count the ASCII lowercase letters (a-z) in the password.

    Parameters:
        password: the password string to inspect.

    Returns:
        Number of matching characters as an int.
    """
    lowercase_pattern = r"[a-z]"
    return sum(1 for _ in re.finditer(lowercase_pattern, password))

In [43]:
# sums the difference in code point values between adjacent characters in the password
def character_diversity(password):
    """Return the total absolute code point distance between neighbouring characters.

    Parameters:
        password: the password string to inspect.

    Returns:
        Sum of abs(ord(current) - ord(previous)) over every adjacent pair;
        0 for passwords with fewer than two characters.
    """
    # pair each character with its successor; avoids shadowing the builtin `sum`
    return sum(
        abs(ord(current) - ord(previous))
        for previous, current in zip(password, password[1:])
    )

In [44]:
# additional feature: does the password contain a first name?

# read the table of popular baby names
names_df = pd.read_csv('Popular_Baby_Names.csv')

# lower-case the first-name column and keep the distinct values for membership tests
names_set = set(names_df["Child's First Name"].str.lower().tolist())

In [45]:
# flags passwords that contain any known first name
def contains_name(password):
    """Return 1 if any entry of names_set occurs as a substring of the password, else 0.

    The comparison is case-insensitive on the password side: the password is
    lower-cased before the substring search.

    Parameters:
        password: the password string to inspect.

    Returns:
        1 when at least one name is found, otherwise 0.
    """
    lowered = password.lower()
    for candidate in names_set:
        # stop at the first name found inside the password
        if candidate in lowered:
            return 1
    return 0



In [55]:
# feature binning for length of password
def length_binning(password_length):
    """Map a password length to an ordinal bin.

    Bins (thresholds attributed to NIST guidance by the notebook author):
        0 -> shorter than 8 characters (considered weak)
        1 -> 8 to 10 characters (common)
        2 -> 11 to 14 characters (common but stronger)
        3 -> 15 characters or more (recommended)

    Parameters:
        password_length: number of characters in the password.

    Returns:
        The bin index, an int from 0 to 3.
    """
    # exclusive upper bounds of bins 0, 1 and 2, checked in ascending order
    for bin_index, upper_bound in enumerate((8, 11, 15)):
        if password_length < upper_bound:
            return bin_index
    # anything that cleared every bound falls in the top bin
    return 3

In [53]:
# build the feature table: one row per password, one column per extracted feature
password_inputs = (
    pd.DataFrame()
    # raw passwords from the dataset
    .assign(password=df_passwords['password'])
    # directly extracted features
    .assign(
        # number of characters in each password
        length=lambda frame: frame['password'].apply(len),
        # binned version of the length
        length_bin=lambda frame: frame['length'].apply(length_binning),
        # password split into a list of single characters
        character_tokens=lambda frame: frame['password'].apply(list),
        # per-class character counts
        uppercase_count=lambda frame: frame['password'].apply(count_uppercase),
        lowercase_count=lambda frame: frame['password'].apply(count_lowercase),
        numbers_count=lambda frame: frame['password'].apply(count_numbers),
        special_character_count=lambda frame: frame['password'].apply(count_special_characters),
        # counts of the character n-grams found in each password
        ngram_occurrences=lambda frame: frame['character_tokens'].apply(generate_ngrams_counts),
        # Shannon entropy of each password's character distribution
        entropy=lambda frame: frame['character_tokens'].apply(shannon_entropy),
        # summed code point distance between neighbouring characters
        character_diversity=lambda frame: frame['password'].apply(character_diversity),
        # 1 when the password contains a known first name, else 0
        contains_name=lambda frame: frame['password'].apply(contains_name),
    )
    # interaction features: products that let a model see combinations of characteristics
    .assign(
        # passwords mixing uppercase letters with special characters
        upper_special=lambda frame: frame['uppercase_count'] * frame['special_character_count'],
        # length combined with unpredictability
        length_entropy=lambda frame: frame['length'] * frame['entropy'],
        # alphanumeric mixing (lowercase letters with digits)
        lower_numbers=lambda frame: frame['lowercase_count'] * frame['numbers_count'],
        # unpredictability combined with special characters
        entropy_special=lambda frame: frame['entropy'] * frame['special_character_count'],
    )
    # ratio features: counts normalised by length so they compare across password sizes
    .assign(
        upper_ratio=lambda frame: frame['uppercase_count'] / frame['length'],
        lower_ratio=lambda frame: frame['lowercase_count'] / frame['length'],
        special_character_ratio=lambda frame: frame['special_character_count'] / frame['length'],
        numbers_ratio=lambda frame: frame['numbers_count'] / frame['length'],
        # average entropy contributed per character
        entropy_per_character=lambda frame: frame['entropy'] / frame['length'],
    )
)

password_inputs


Unnamed: 0,password,length,length_bin,character_tokens,uppercase_count,lowercase_count,numbers_count,special_character_count,ngram_occurrences,entropy,character_diversity,contains_name
0,bybee,5,0,"[b, y, b, e, e]",0,5,0,0,"{'b': 2, 'y': 1, 'e': 2, 'by': 1, 'yb': 1, 'be...",1.521928,49,0
1,n3m0,4,0,"[n, 3, m, 0]",0,2,2,0,"{'n': 1, '3': 1, 'm': 1, '0': 1, 'n3': 1, '3m'...",2.000000,178,0
2,2509,4,0,"[2, 5, 0, 9]",0,0,4,0,"{'2': 1, '5': 1, '0': 1, '9': 1, '25': 1, '50'...",2.000000,17,0
3,4622,4,0,"[4, 6, 2, 2]",0,0,4,0,"{'4': 1, '6': 1, '2': 2, '46': 1, '62': 1, '22...",1.500000,6,0
4,shrk,4,0,"[s, h, r, k]",0,4,0,0,"{'s': 1, 'h': 1, 'r': 1, 'k': 1, 'sh': 1, 'hr'...",2.000000,28,0
...,...,...,...,...,...,...,...,...,...,...,...,...
99995,sifelizestasdecirmeloquerras,28,3,"[s, i, f, e, l, i, z, e, s, t, a, s, d, e, c, ...",0,28,0,0,"{'s': 4, 'i': 3, 'f': 1, 'e': 5, 'l': 2, 'z': ...",3.624519,240,1
99996,iwillalwayslovemyboyfriend,26,3,"[i, w, i, l, l, a, l, w, a, y, s, l, o, v, e, ...",0,26,0,0,"{'i': 3, 'w': 2, 'l': 4, 'a': 2, 'y': 3, 's': ...",3.719295,279,1
99997,letsyouupdateyourfunNotesandmore,32,3,"[l, e, t, s, y, o, u, u, p, d, a, t, e, y, o, ...",1,31,0,0,"{'l': 1, 'e': 4, 't': 3, 's': 2, 'y': 2, 'o': ...",3.726410,339,0
99998,chocolatesoeusi912134741,24,3,"[c, h, o, c, o, l, a, t, e, s, o, e, u, s, i, ...",0,15,9,0,"{'c': 2, 'h': 1, 'o': 3, 'l': 1, 'a': 1, 't': ...",3.855389,210,0


In [9]:
# input matrix (one row per password) and class vector (one column)
# NOTE(review): only these six base columns are selected; the other engineered
# columns of password_inputs are not part of X -- confirm this is intended.
X = password_inputs.loc[
    :,
    ['length', 'uppercase_count', 'lowercase_count', 'numbers_count', 'special_character_count', 'entropy'],
].to_numpy()
y = np.reshape(labels.to_numpy(), (-1, 1))

# first hold out 20% of the rows, then halve that hold-out into validation and
# test sets (80% train / 10% validation / 10% test overall)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [37]:
# persist the six split arrays in a single pickle so their dtypes survive the round trip;
# per the original note, other notebooks load this file to run other models
with open('train_val_test_data.pkl', 'wb') as pickle_file:
    pickle.dump(
        (X_train, X_val, X_test, y_train, y_val, y_test),
        pickle_file,
    )