In [99]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/common-password-list-rockyoutxt/rockyou.txt


# Initial Setup

In [100]:
# Install silently
!pip install swifter pyahocorasick zxcvbn passwordmeter > /dev/null 2>&1


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
from nltk.corpus import words, names
import swifter  
import ahocorasick 
import re
import time

# Download NLTK English words corpus if not already downloaded
nltk.download('words')

# Download NLTK names corpus
nltk.download('names')

[nltk_data] Downloading package words to /usr/share/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package names to /usr/share/nltk_data...
[nltk_data]   Package names is already up-to-date!


True

In [101]:
#!ls /kaggle/working
#!rm /kaggle/working/*

# Read the Rockyou.txt and convert it to dataframe

In [102]:
# Read the text file (assuming each line is a password)
txt_file = "/kaggle/input/common-password-list-rockyoutxt/rockyou.txt"  # Change this to your actual file path
column_name = "password"
csv_file = "passwords.csv"

# Read the text file properly
with open(txt_file, encoding="latin-1") as file:  # RockYou uses Latin-1 encoding
    passwords = file.read().splitlines()  # Read each line as a password

# Create DataFrame
df = pd.DataFrame(passwords, columns=["password"])

# Save to CSV
# df.to_csv(csv_file, index=False)

print(f"Converted {txt_file} to {csv_file} successfully! 🚀")

Converted /kaggle/input/common-password-list-rockyoutxt/rockyou.txt to passwords.csv successfully! 🚀


# Extracting the password length, number of uppercase & lowercase character, numeric characters, & special character in each password¶

### Extracting the password length

In [103]:
df['length'] = df['password'].str.len()
df.head()

Unnamed: 0,password,length
0,123456,6
1,12345,5
2,123456789,9
3,password,8
4,iloveyou,8


### Distinct character Count

In [104]:
def count_distinct_characters(password):
    """
    Returns the number of distinct characters in a password.
    """
    return len(set(password))

df['distinct_char_count'] = df['password'].swifter.apply(count_distinct_characters)

df[['password', 'length', 'distinct_char_count']].head()

Pandas Apply:   0%|          | 0/14346469 [00:00<?, ?it/s]

Unnamed: 0,password,length,distinct_char_count
0,123456,6,6
1,12345,5,5
2,123456789,9,9
3,password,8,7
4,iloveyou,8,7


### Extract uppercase

In [105]:
df['uppercase_count'] = df['password'].str.count(r'[A-Z]')
df.head()

Unnamed: 0,password,length,distinct_char_count,uppercase_count
0,123456,6,6,0
1,12345,5,5,0
2,123456789,9,9,0
3,password,8,7,0
4,iloveyou,8,7,0


### Extracting lowercase

In [106]:
df['lowercase_count'] = df['password'].str.count(r'[a-z]')
df.head()

Unnamed: 0,password,length,distinct_char_count,uppercase_count,lowercase_count
0,123456,6,6,0,0
1,12345,5,5,0,0
2,123456789,9,9,0,0
3,password,8,7,0,8
4,iloveyou,8,7,0,8


### Extracting numeric Char

In [107]:
df['numeric_char_count'] = df['password'].str.count(r'\d')
df.head()

Unnamed: 0,password,length,distinct_char_count,uppercase_count,lowercase_count,numeric_char_count
0,123456,6,6,0,0,6
1,12345,5,5,0,0,5
2,123456789,9,9,0,0,9
3,password,8,7,0,8,0
4,iloveyou,8,7,0,8,0


### Extracting Special Char

In [108]:
df['special_char_count'] = df['password'].str.count(r'[^a-zA-Z0-9]')
df.tail()

Unnamed: 0,password,length,distinct_char_count,uppercase_count,lowercase_count,numeric_char_count,special_char_count
14346464,"xCvBnM,",8,8,3,3,0,2
14346465,ie168,6,6,0,2,3,1
14346466,abygurl69,10,10,0,7,2,1
14346467,a6_123,7,7,0,1,4,2
14346468,*7Â¡Vamos!,13,12,1,4,1,7


# Identifying password with patterns like Sequential characters, Repetition characters, Consecutive repetition, Keyboard patterns, Personal Informations, Dictionary word

### IS_L33T_WORD

In [109]:
# Download NLTK words if you haven't already
try:
    english_words = set(words.words())
except LookupError:
    nltk.download('words')
    english_words = set(words.words())

# Comprehensive leet speak substitutions
leet_substitutions_exhaustive = {
    '@': 'a', '$': 's', '0': 'o', '1': 'i', '3': 'e', '!': 'i', '4': 'a',
    '5': 's', '7': 't', '(': 'c', ')': 'c', '[': 'c', ']': 'c', '|': 'l',
    '/': 'l', '\\': 'l', '+': 't', '#': 'h', '%': 'o', '&': 'a', '*': 'a',
    '_': 'u', '`': 'e', '\'': 'i', '"': 'i', ',': 'i', '.': 'i', ';': 'j',
    ':': 'j', '<': 'l', '>': 'l', '?': 's', '9': 'g', '6': 'b', '8': 'b',
    '2': 'z', 'g': 'q', 't': '7', 'b': '8'
}

def un_leetify(text, leet_map):
    un_leet_text = ""
    for char in text:
        un_leet_text += leet_map.get(char, char)
    return un_leet_text

def is_leet_dict_word_only(password, dictionary_set, leet_map):
    lower_password = password.lower()
    if lower_password in dictionary_set and len(lower_password) >= 3:
        return False

    un_leet_password = un_leetify(lower_password, leet_map)
    return un_leet_password in dictionary_set and len(un_leet_password) >= 3
    

def process_chunk(chunk, dictionary_set, leet_map):
    chunk['is_leet_dict_word'] = chunk['password'].swifter.apply(lambda x: is_leet_dict_word_only(x, dictionary_set, leet_map))
    return chunk

# Assuming your DataFrame is named 'df'
chunk_size = 1000000  # Adjust based on your memory
results = []
for i in range(0, len(df), chunk_size):
    chunk = df[i:i + chunk_size].copy()
    processed_chunk = process_chunk(chunk, english_words, leet_substitutions_exhaustive)
    results.append(processed_chunk)

df = pd.concat(results)

leet_words_df = df[df['is_leet_dict_word'] == True]
#print("Leet Dictionary Words:")
print(leet_words_df[['password', 'is_leet_dict_word']].head())

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/346469 [00:00<?, ?it/s]

       password  is_leet_dict_word
1262   passw0rd               True
3569   pa55word               True
3651     pink13               True
4075  princess3               True
7536   pa55w0rd               True


In [110]:
#leet_words_df.info()

In [111]:
#leet_words_df = df[df['is_leet_dict_word'] == True]
#leet_words_df[['password', 'is_leet_dict_word']].iloc[1:60]

### is_dict_padded_word

In [112]:
# Download NLTK words if you haven't already
try:
    english_words = set(words.words())
except LookupError:
    nltk.download('words')
    english_words = set(words.words())

def is_padded_dict_word_only(password, dictionary_set):
    lower_password = password.lower()
    if lower_password in dictionary_set and len(lower_password) >= 3:
        return False

    n = len(lower_password)

    # Check for leading non-alphanumeric padding
    if n > 3 and not lower_password[0].isalpha():
        for i in range(n):
            if lower_password[i].isalpha():
                padded_word = lower_password[i:]
                if len(padded_word) >= 3 and padded_word in dictionary_set:
                    return True
                break

    # Check for trailing non-alphanumeric padding
    if n > 3 and not lower_password[-1].isalpha():
        for i in range(n - 1, -1, -1):
            if lower_password[i].isalpha():
                padded_word = lower_password[:i+1]
                if len(padded_word) >= 3 and padded_word in dictionary_set:
                    return True
                break

    return False

def process_chunk(chunk, dictionary_set):
    chunk['is_padded_dict_word'] = chunk['password'].swifter.apply(lambda x: is_padded_dict_word_only(x, dictionary_set))
    return chunk

# Assuming your DataFrame is named 'df'
chunk_size = 1000000  # Adjust based on your memory
results = []
for i in range(0, len(df), chunk_size):
    chunk = df[i:i + chunk_size].copy()
    processed_chunk = process_chunk(chunk, english_words)
    results.append(processed_chunk)

df = pd.concat(results)

padded_words_df = df[df['is_padded_dict_word'] == True]
print(padded_words_df[['password', 'is_padded_dict_word']].head())

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/346469 [00:00<?, ?it/s]

      password  is_padded_dict_word
27   password1                 True
125  princess1                 True
163   blink182                 True
183     angel1                 True
279    monkey1                 True


In [113]:
#padded_words_df = df[df['is_padded_dict_word'] == True]
#print("\nPadded Dictionary Words (Beginning or End):")
#padded_words_df[['password', 'is_leet_dict_word', 'is_padded_dict_word']].iloc[4000:4020]

### Complete Sequential

In [114]:
import pandas as pd
import swifter

def is_completely_sequential(password):
    password = password.lower()
    n = len(password)
    if n <= 1:
        return False

    # Check for completely sequential increasing alphabetical
    if password[0].isalpha():
        is_increasing_alpha = True
        start_char_ord = ord(password[0])
        for i in range(1, n):
            if not password[i].isalpha() or ord(password[i]) != start_char_ord + i:
                is_increasing_alpha = False
                break
        if is_increasing_alpha:
            return True

        # Check for completely sequential decreasing alphabetical
        is_decreasing_alpha = True
        start_char_ord_dec = ord(password[0])
        for i in range(1, n):
            if not password[i].isalpha() or ord(password[i]) != start_char_ord_dec - i:
                is_decreasing_alpha = False
                break
        if is_decreasing_alpha:
            return True

    # Check for completely sequential increasing numerical
    if password[0].isdigit():
        is_increasing_digit = True
        try:
            start_digit = int(password[0])
            for i in range(1, n):
                if not password[i].isdigit() or int(password[i]) != start_digit + i:
                    is_increasing_digit = False
                    break
            if is_increasing_digit:
                return True

            # Check for completely sequential decreasing numerical
            is_decreasing_digit = True
            start_digit_dec = int(password[0])
            for i in range(1, n):
                if not password[i].isdigit() or int(password[i]) != start_digit_dec - i:
                    is_decreasing_digit = False
                    break
            if is_decreasing_digit:
                return True
        except ValueError:
            pass

    return False

# Assuming your DataFrame 'df' is already loaded
df['is_completely_sequential'] = df['password'].swifter.apply(is_completely_sequential)

# Display the head of the DataFrame with the new column
df.head()

Pandas Apply:   0%|          | 0/14346469 [00:00<?, ?it/s]

Unnamed: 0,password,length,distinct_char_count,uppercase_count,lowercase_count,numeric_char_count,special_char_count,is_leet_dict_word,is_padded_dict_word,is_completely_sequential
0,123456,6,6,0,0,6,0,False,False,True
1,12345,5,5,0,0,5,0,False,False,True
2,123456789,9,9,0,0,9,0,False,False,True
3,password,8,7,0,8,0,0,False,False,False
4,iloveyou,8,7,0,8,0,0,False,False,False


In [115]:
# Assuming your DataFrame 'df' now has the 'is_completely_sequential' column
#sequential_passwords_df = df[df['is_completely_sequential'] == 1]
#sequential_passwords_df.info()

In [116]:
# Display the head of the filtered DataFrame showing only the specified columns
#sequential_passwords_df[['password', 'is_completely_sequential']].head(60)

### Identify Password with Complete Repetition of characters

In [117]:
def is_completely_repeated(password):
    # First, check if the original password length is greater than 1
    if len(password) > 1:
        # If it is, then check if the number of unique characters (length of the set) is 1
        return True if len(set(password)) == 1 else False
    else:
        # If the original password length is not greater than 1 (i.e., 0 or 1), return 0 (False)
        return False


# Apply the concise function to check for completely repeated characters
df['is_completely_repeated'] = df['password'].swifter.apply(is_completely_repeated)



df.head()

Pandas Apply:   0%|          | 0/14346469 [00:00<?, ?it/s]

Unnamed: 0,password,length,distinct_char_count,uppercase_count,lowercase_count,numeric_char_count,special_char_count,is_leet_dict_word,is_padded_dict_word,is_completely_sequential,is_completely_repeated
0,123456,6,6,0,0,6,0,False,False,True,False
1,12345,5,5,0,0,5,0,False,False,True,False
2,123456789,9,9,0,0,9,0,False,False,True,False
3,password,8,7,0,8,0,0,False,False,False,False
4,iloveyou,8,7,0,8,0,0,False,False,False,False


In [118]:
# Assuming your DataFrame 'df' now has the 'is_completely_repeated' column
#completely_repeated_df = df[df['is_completely_repeated'] == 1]

# View only the 'password' and 'is_completely_repeated' columns
#completely_repeated_df[['password', 'is_completely_repeated']].head()

### Identify Password that contain keyboard pattern

In [119]:
# Predefined list of common keyboard patterns (lowercase)
keyboard_patterns = [
    'qwerty', 'ytrewq', '123456', '654321', 'asdfghjkl', 'lkjhgfdsa',
    'zxcvbnm', 'mnbvcxz', '12345678', '87654321', '111111', '222222',
    '333333', '444444', '555555', '666666', '777777', '888888',
    '999999', '000000', 'drowssap', 'uoyevoli',
    'llabnogard', 'llabesab', 'yeknom', 'enihsnus', 'ssercnip',
    'retsam', 'olleh', 'emoclew', 'abc123',
    '321cba', 'qwertyuiop', 'poiuytrewq', 'asdfghjklpoiuytrewq',
    'qwertrewqihgfdsalkj',
    'zxcvbnm,./', ',./mnbvcxz', '1234567890', '0987654321',
    '09876543210',
    '0123456789', '1234567890-=', '=-0987654321', 'qwertyuiop[]',
    '][poiuytrewq',
    '1234', '4321', '2345', '5432', '3456', '6543', '4567', '7654',
    '5678', '8765', '6789', '9876', '7890', '0987', '00', '12', '21',
    '123', '321', '0123', '3210', '01234', '43210', '4321', '1234',
    '54321', '12345', '654321', '123456', '7654321', '87654321', '21',
    '321', '1111', '2222', '3333', '4444', '5555', '6666', '7777',
    '8888', '9999', '0000',
    '111', '222', '333', '444', '555', '666', '777', '888', '999', '000',
    '11', '22', '33', '44', '55', '66', '77', '88', '99',
    '01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
    '20', '30', '40', '50', '60', '70', '80', '90', '100',
    'qw', 'er', 'as', 'df', 'qwe', 'wer', 'asd', 'sdf', 'qwer', 'werty',
    'asdf', 'sdfg',
    '12', '23', '89', '90', '123', '567', '890', '1234', '6789', '0-==',
    '~!','@#', '$ %',  '~!@', '#$ %', '^&*',  '~!@#', '$ %^&', '&*()',
    'q1', 'w2', 'a2', 's3', 'q1w', 'w2e', 'a2s', 's3d', 'q1we', 'w2er',
    'a2sd', 's3df',
    'qa', 'ws', 'ed', 'rf', 'qaz', 'wsx', 'edc', 'rfv', 'qazx', 'wsxc',
    'edcv', 'rfvb',
    'we4r', 'er4t', '!@#$', '@#$%',
    'a1s2d3', 'q2w3e4',
    'qwa', 'aws', 'wse',
    'qasw', 'wsxd', 'derf',
    'qwert', 'asdfg',
    'qazwsx', 'wsxedc',
    'qaz1', 'wsx2', 'edc3',  # Mixed vertical/diagonal
    '1qaz', '2wsx', '3edc',  # Reversed mixed vertical/diagonal
    'q1as', 'w2sd', 'e3df',  # Slightly offset mixed patterns
    'asdf12', 'qwerty12',  # Horizontal with number padding
    '12asdf', '12qwerty',  # Number padding at the beginning
    '!QAZ', '@WSX', '#EDC',  # Uppercase with symbols
    'ZAQ1', 'XSW2', 'CDE3',  # Reversed uppercase with symbols
    'P@ssword',  # Common misspelling with symbol
    'P@$$word1',  # Leet speak with padding
    'pass123',  # Common word with number padding
    'password1234',  # Longer word with number sequence
    '123password',  # Number padding at the beginning
    'qwertyui',  # Partial row
    'asdfgh',    # Partial row
    'zxcvbn',    # Partial row
    'yuiop',     # Partial row (other side)
    'hjkl',      # Partial row (other side)
    'bnm,./',    # Partial row (other side)
    '!@#$%^',    # Partial symbol sequence
    '^&*()_',    # Partial symbol sequence
    'QWERTY1',   # Uppercase with number
    'QWERTY!',   # Uppercase with symbol
    '1QWERTY',   # Number at the beginning
    '!QWERTY',   # Symbol at the beginning
    'qwert1234', # Lowercase with number sequence
    '1234qwert', # Number sequence at the beginning
    'qwert!@#$', # Lowercase with symbol sequence
    '!@#$qwert', # Symbol sequence at the beginning
    'asdfg1234', # Lowercase with number sequence
    '1234asdfg', # Number sequence at the beginning
    'asdfg!@#$', # Lowercase with symbol sequence
    '!@#$asdfg', # Symbol sequence at the beginning
    'zxcvbn1234', # Lowercase with number sequence
    '1234zxcvbn', # Number sequence at the beginning
    'zxcvbn!@#$', # Lowercase with symbol sequence
    '!@#$zxcvbn', # Symbol sequence at the beginning
    '1qaz2wsx',   # Complex mixed pattern
    '2wsx3edc',   # Complex mixed pattern
    'qazwsxedc',  # Complex mixed pattern
    'wsxedcrfv',  # Complex mixed pattern
    'edcrfvtgb',  # Complex mixed pattern
    'rfvtgbyhn',  # Complex mixed pattern
]


def contains_keyboard_pattern(password):
    # Convert to lowercase for case-insensitive comparison
    password = password.lower()
    
    # Check if password contains any keyboard pattern as a substring
    return True if any(pattern in password for pattern in keyboard_patterns) else False

# Apply the function
df['contains_keyboard_pattern'] = df['password'].swifter.apply(contains_keyboard_pattern)

# Save to csv
#df.to_csv('password_features.csv', index=False)
df.head()

Pandas Apply:   0%|          | 0/14346469 [00:00<?, ?it/s]

Unnamed: 0,password,length,distinct_char_count,uppercase_count,lowercase_count,numeric_char_count,special_char_count,is_leet_dict_word,is_padded_dict_word,is_completely_sequential,is_completely_repeated,contains_keyboard_pattern
0,123456,6,6,0,0,6,0,False,False,True,False,True
1,12345,5,5,0,0,5,0,False,False,True,False,True
2,123456789,9,9,0,0,9,0,False,False,True,False,True
3,password,8,7,0,8,0,0,False,False,False,False,True
4,iloveyou,8,7,0,8,0,0,False,False,False,False,False


In [120]:
#pd.read_csv('/kaggle/working/password_features.csv').tail(10)

In [121]:
#pd.read_csv('/kaggle/working/password_features.csv').iloc[1000000:1000020]

### Identify Password that is a full dictionary word

In [122]:
import nltk
from nltk.corpus import words, names

# Get English words (convert to lowercase for case-insensitive comparison)
english_words = {word.lower() for word in words.words()}
english_names = {name.lower() for name in names.words()}  # Get lowercase names

def contains_dictionary_word(password):
    password_lower = password.lower()

    # 1. Full password is in English name: return False
    if password_lower in english_names:
        return False

    # 2. Full password is in English word: return True
    if password_lower in english_words:
        return True

    # 3. Full password ends with 's', check the password without the 's' in English word, if match, return True
    if password_lower.endswith('s') and password_lower[:-1] in english_words:
        return True
    
    return False

# Apply the function
# Assuming you have a DataFrame named 'df' with a 'password' column
df['is_dictionary_word'] = df['password'].swifter.apply(contains_dictionary_word)

df.head()

Pandas Apply:   0%|          | 0/14346469 [00:00<?, ?it/s]

Unnamed: 0,password,length,distinct_char_count,uppercase_count,lowercase_count,numeric_char_count,special_char_count,is_leet_dict_word,is_padded_dict_word,is_completely_sequential,is_completely_repeated,contains_keyboard_pattern,is_dictionary_word
0,123456,6,6,0,0,6,0,False,False,True,False,True,False
1,12345,5,5,0,0,5,0,False,False,True,False,True,False
2,123456789,9,9,0,0,9,0,False,False,True,False,True,False
3,password,8,7,0,8,0,0,False,False,False,False,True,True
4,iloveyou,8,7,0,8,0,0,False,False,False,False,False,False


In [123]:
is_dict_df = df[df['is_dictionary_word'] == True]
is_dict_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 76428 entries, 3 to 11479682
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   password                   76428 non-null  object
 1   length                     76428 non-null  int64 
 2   distinct_char_count        76428 non-null  int64 
 3   uppercase_count            76428 non-null  int64 
 4   lowercase_count            76428 non-null  int64 
 5   numeric_char_count         76428 non-null  int64 
 6   special_char_count         76428 non-null  int64 
 7   is_leet_dict_word          76428 non-null  bool  
 8   is_padded_dict_word        76428 non-null  bool  
 9   is_completely_sequential   76428 non-null  bool  
 10  is_completely_repeated     76428 non-null  bool  
 11  contains_keyboard_pattern  76428 non-null  bool  
 12  is_dictionary_word         76428 non-null  bool  
dtypes: bool(6), int64(6), object(1)
memory usage: 5.1+ MB


In [124]:
is_dict_df[['password', 'is_dictionary_word']].iloc[0:5]

Unnamed: 0,password,is_dictionary_word
3,password,True
5,princess,True
13,monkey,True
14,lovely,True
24,tigger,True


### Identify Password that contain dictionary word

In [125]:
import nltk
from nltk.corpus import words, names
import ahocorasick
import pandas as pd
import swifter
import time
import re

# Ensure the word and names lists are downloaded (we only need to do this once)
try:
    english_words = set(word.lower() for word in words.words())
    male_names = set(name.lower() for name in names.words('male.txt'))
    female_names = set(name.lower() for name in names.words('female.txt'))
except LookupError as e:
    print("Error loading NLTK resource. Downloading now...")
    nltk.download('words')
    nltk.download('names')
    english_words = set(word.lower() for word in words.words())
    male_names = set(name.lower() for name in names.words('male.txt'))
    female_names = set(name.lower() for name in all_names_set)

# Prepare the Dictionary and Names for Aho-Corasick
english_words_set = set(word.lower() for word in words.words() if len(word) >= 3)
all_names_set = male_names.union(female_names)
all_names_set_lower = set(name.lower() for name in all_names_set)

# Build the Aho-Corasick automaton for Dictionary Words
aho_dict = ahocorasick.Automaton()
for word in english_words_set:
    aho_dict.add_word(word, word)
aho_dict.make_automaton()

# Build the Aho-Corasick automaton for Names
aho_names = ahocorasick.Automaton()
for name in all_names_set_lower:
    aho_names.add_word(name, name)
aho_names.make_automaton()

# Function to Check for Substrings (Dictionary Words and Names)
def check_substring_ahocorasick(password):
    password_lower = password.lower()

    # First, check if the *entire* password is a name
    is_full_name = password_lower in all_names_set_lower

    if is_full_name:
        return False  # The entire password is a name

    # Optimization: Use regex to quickly check if the password ends with 's'
    if password_lower.endswith('s'):
        # If it ends with 's', check if the full plural word is in english word
        is_full_plural_word = password_lower in english_words_set
        if is_full_plural_word:
            return False

        # If it's not the full plural word, remove the last 's' and check
        # if it is in english word
        singular_password = password_lower[:-1]
        is_full_singular_word = singular_password in english_words_set
        if is_full_singular_word:
            return False

    else:
        # If it doesn't end with 's', check if the full word is in english word
        is_full_singular_word = password_lower in english_words_set
        if is_full_singular_word:
            return False

    # Check for dictionary word/name with number patterns
    if re.match(r'^(?:\w+|\w+\d+)$', password_lower):
        base_word = re.sub(r'\d+$', '', password_lower)  # Remove trailing numbers
        if base_word in english_words_set or base_word in all_names_set_lower:
            return False

    # If it's not a name or a dictionary word (or plural), check for dictionary word substrings
    for end_index, word in aho_dict.iter(password_lower):
        return True  # Found a dictionary word as a substring
    return False  # No dictionary word substring found

# Process DataFrame in Chunks (with swifter)
def process_chunk_ahocorasick(df_chunk):
    mask_chunk = df_chunk['is_dictionary_word'] == 0
    # Apply our Aho-Corasick checking function in parallel
    df_chunk.loc[mask_chunk, 'has_phase_word'] = df_chunk.loc[mask_chunk, 'password'].swifter.apply(check_substring_ahocorasick)
    df_chunk['has_phase_word'] = df_chunk['has_phase_word'].fillna(0).astype('bool')
    return df_chunk

def process_dataframe_in_chunks_ahocorasick(df, chunk_size=1000000):
    results = []
    num_chunks = (len(df) + chunk_size - 1) // chunk_size
    print(f"Processing DataFrame in {num_chunks} chunks using Aho-Corasick...")
    for i in range(0, len(df), chunk_size):
        start_time = time.time()
        df_chunk = df.iloc[i:i + chunk_size].copy()
        processed_chunk = process_chunk_ahocorasick(df_chunk)
        results.append(processed_chunk)
        end_time = time.time()
        print(f"Processed chunk {i // chunk_size + 1}/{num_chunks} in {end_time - start_time:.2f} seconds.")

    final_df = pd.concat(results)
    return final_df

# Process the DataFrame using the Aho-Corasick method
chunk_size = 1000000
# Assuming your DataFrame is named 'df'
df = process_dataframe_in_chunks_ahocorasick(df.copy(), chunk_size=chunk_size)

# Save to CSV (optional)
# df.to_csv('password_features.csv', index=False)
df.head()

Processing DataFrame in 15 chunks using Aho-Corasick...


Pandas Apply:   0%|          | 0/967131 [00:00<?, ?it/s]

Processed chunk 1/15 in 5.18 seconds.


Pandas Apply:   0%|          | 0/991011 [00:00<?, ?it/s]

Processed chunk 2/15 in 5.44 seconds.


Pandas Apply:   0%|          | 0/994463 [00:00<?, ?it/s]

Processed chunk 3/15 in 5.27 seconds.


Pandas Apply:   0%|          | 0/996983 [00:00<?, ?it/s]

Processed chunk 4/15 in 5.29 seconds.


Pandas Apply:   0%|          | 0/997066 [00:00<?, ?it/s]

Processed chunk 5/15 in 5.35 seconds.


Pandas Apply:   0%|          | 0/998529 [00:00<?, ?it/s]

Processed chunk 6/15 in 5.40 seconds.


Pandas Apply:   0%|          | 0/999010 [00:00<?, ?it/s]

Processed chunk 7/15 in 5.36 seconds.


Pandas Apply:   0%|          | 0/998064 [00:00<?, ?it/s]

Processed chunk 8/15 in 5.45 seconds.


Pandas Apply:   0%|          | 0/997211 [00:00<?, ?it/s]

Processed chunk 9/15 in 5.28 seconds.


Pandas Apply:   0%|          | 0/997490 [00:00<?, ?it/s]

Processed chunk 10/15 in 5.22 seconds.


Pandas Apply:   0%|          | 0/992445 [00:00<?, ?it/s]

Processed chunk 11/15 in 5.34 seconds.


Pandas Apply:   0%|          | 0/994169 [00:00<?, ?it/s]

Processed chunk 12/15 in 5.32 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 13/15 in 5.43 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 14/15 in 5.52 seconds.


Pandas Apply:   0%|          | 0/346469 [00:00<?, ?it/s]

Processed chunk 15/15 in 1.80 seconds.


Unnamed: 0,password,length,distinct_char_count,uppercase_count,lowercase_count,numeric_char_count,special_char_count,is_leet_dict_word,is_padded_dict_word,is_completely_sequential,is_completely_repeated,contains_keyboard_pattern,is_dictionary_word,has_phase_word
0,123456,6,6,0,0,6,0,False,False,True,False,True,False,False
1,12345,5,5,0,0,5,0,False,False,True,False,True,False,False
2,123456789,9,9,0,0,9,0,False,False,True,False,True,False,False
3,password,8,7,0,8,0,0,False,False,False,False,True,True,False
4,iloveyou,8,7,0,8,0,0,False,False,False,False,False,False,True


In [126]:
contain_dict_df = df[df['has_phase_word'] == True]
contain_dict_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8249405 entries, 4 to 14346468
Data columns (total 14 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   password                   object
 1   length                     int64 
 2   distinct_char_count        int64 
 3   uppercase_count            int64 
 4   lowercase_count            int64 
 5   numeric_char_count         int64 
 6   special_char_count         int64 
 7   is_leet_dict_word          bool  
 8   is_padded_dict_word        bool  
 9   is_completely_sequential   bool  
 10  is_completely_repeated     bool  
 11  contains_keyboard_pattern  bool  
 12  is_dictionary_word         bool  
 13  has_phase_word             bool  
dtypes: bool(7), int64(6), object(1)
memory usage: 558.6+ MB


In [127]:
contain_dict_df[['password', 'has_phase_word']].iloc[0:5]

Unnamed: 0,password,has_phase_word
4,iloveyou,True
7,rockyou,True
12,babygirl,True
19,qwerty,True
21,iloveu,True


## Identify password with Personal Information

### Contain name

In [128]:
import nltk
from nltk.corpus import names
import ahocorasick
import pandas as pd
import swifter
import time
import re

# Ensure the names list is downloaded (we only need to do this once)
try:
    names.words('male.txt')
    names.words('female.txt')
except LookupError:
    print("Downloading NLTK names...")
    nltk.download('names')

# Prepare English Names for Aho-Corasick
english_names = set(names.words())
english_names_lower = {name.lower() for name in english_names}

# Build the Aho-Corasick automaton for names
name_aho = ahocorasick.Automaton()
for name in english_names_lower:
    name_aho.add_word(name, name)
name_aho.make_automaton()

def contains_name(password):
    """Checks if a password contains an English name (case-insensitive) using Aho-Corasick and regular expressions."""
    password_lower = password.lower()

    # 1. Check if the entire password is a name
    if password_lower in name_aho:
        return True

    # 2. Check for names followed by numbers (e.g., johnny1, kristenanne22)
    if re.match(r"^(?:\w+)(\d+)$", password_lower):
        base_name = re.sub(r"(\d+)$", "", password_lower)
        if base_name in name_aho:
            return True

    # 3. Check for concatenated names (e.g., jennyeva, kristenanne)
    #   - Split the password and check for combinations
    parts = re.split(r'(\w+)', password_lower)
    name_parts = [p for p in parts if p.isalpha() and len(p) > 1]  # Extract potential name parts

    for i in range(len(name_parts)):
        combined_name = "".join(name_parts[i:])
        if combined_name in name_aho:
            return True
        for j in range(i + 1, len(name_parts) + 1):
            combined_name = "".join(name_parts[i:j])
            if combined_name in name_aho:
                return True

    # 4. Check for first, middle, and last name combinations (e.g., kelseylovesbarry)
    #   - This is a more aggressive check and might have more false positives
    #   - You might want to adjust the length thresholds and matching criteria
    if len(password_lower) > 8:  # Adjust this length threshold as needed
        for end_index, name in name_aho.iter(password_lower):
            if len(name) > 3:  # Adjust this minimum name length as needed
                return True

    # If none of the above conditions are met, it's likely not a name
    return False

# Process DataFrame in Chunks with swifter for names
def process_chunk_names(df_chunk):
    df_chunk['contains_name'] = df_chunk['password'].swifter.apply(contains_name)
    return df_chunk

def process_dataframe_in_chunks_names(df, chunk_size=100000):
    results = []
    num_chunks = (len(df) + chunk_size - 1) // chunk_size
    print(f"Processing DataFrame in {num_chunks} chunks for names...")
    for i in range(0, len(df), chunk_size):
        start_time = time.time()
        df_chunk = df.iloc[i:i + chunk_size].copy()
        processed_chunk = process_chunk_names(df_chunk)
        results.append(processed_chunk)
        end_time = time.time()
        print(f"Processed chunk {i // chunk_size + 1}/{num_chunks} in {end_time - start_time:.2f} seconds.")

    final_df = pd.concat(results)
    return final_df

# Process the DataFrame for names
chunk_size_names = 1000000
df = process_dataframe_in_chunks_names(df.copy(), chunk_size=chunk_size_names)

# Save to CSV (optional)
# df_names.to_csv('password_features_names.csv', index=False)

Processing DataFrame in 15 chunks for names...


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 1/15 in 6.11 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 2/15 in 6.28 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 3/15 in 6.29 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 4/15 in 6.65 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 5/15 in 6.50 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 6/15 in 6.54 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 7/15 in 6.37 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 8/15 in 6.45 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 9/15 in 6.61 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 10/15 in 6.55 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 11/15 in 6.58 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 12/15 in 6.38 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 13/15 in 5.87 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 14/15 in 6.28 seconds.


Pandas Apply:   0%|          | 0/346469 [00:00<?, ?it/s]

Processed chunk 15/15 in 2.16 seconds.


In [129]:
# Filter the DataFrame where 'contains_name' is True (or 1)
names_found_df = df[df['contains_name'] == True]

names_found_df.info()



<class 'pandas.core.frame.DataFrame'>
Index: 3088820 entries, 10 to 14346468
Data columns (total 15 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   password                   object
 1   length                     int64 
 2   distinct_char_count        int64 
 3   uppercase_count            int64 
 4   lowercase_count            int64 
 5   numeric_char_count         int64 
 6   special_char_count         int64 
 7   is_leet_dict_word          bool  
 8   is_padded_dict_word        bool  
 9   is_completely_sequential   bool  
 10  is_completely_repeated     bool  
 11  contains_keyboard_pattern  bool  
 12  is_dictionary_word         bool  
 13  has_phase_word             bool  
 14  contains_name              bool  
dtypes: bool(8), int64(6), object(1)
memory usage: 212.1+ MB


In [130]:
# View only the 'password' and 'contains_name' columns
names_found_df[['password', 'contains_name']].iloc[0:5]

Unnamed: 0,password,contains_name
10,nicole,True
11,daniel,True
15,jessica,True
17,michael,True
18,ashley,True


### Contain Other Personal Info (Phone number, Date of birth, Email address)

In [131]:
import re
import pandas as pd
import swifter
import time

def contains_other_personal_info(password):
    """Checks if a password contains dates, potential years of birth, phone numbers, or email addresses."""
    password_lower = password.lower()

    # Check for dates (YYYY-MM-DD or MM-DD-YYYY)
    if re.search(r'\b\d{4}-\d{2}-\d{2}\b|\b\d{2}-\d{2}-\d{4}\b', password_lower):
        return True

    # Check for potential year of birth (YYYY format within a reasonable range)
    if re.search(r'\b(19\d{2}|20[0-5]\d)\b', password_lower):  # Adjust range as needed
        return True

    # Check for phone numbers (XXX-XXX-XXXX or (XXX) XXX-XXXX)
    if re.search(r'\b\d{3}-\d{3}-\d{4}\b|\b\(\d{3}\) \d{3}-\d{3}-\d{4}\b', password_lower):
        return True

    # Check for email addresses
    if re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', password_lower):
        return True

    return False

# Process DataFrame in Chunks with swifter for other personal info
def process_chunk_other_info(df_chunk):
    df_chunk['contains_other_personal_info'] = df_chunk['password'].swifter.apply(contains_other_personal_info)
    return df_chunk

def process_dataframe_in_chunks_other_info(df, chunk_size=100000):
    results = []
    num_chunks = (len(df) + chunk_size - 1) // chunk_size
    print(f"Processing DataFrame in {num_chunks} chunks for other personal info...")
    for i in range(0, len(df), chunk_size):
        start_time = time.time()
        df_chunk = df.iloc[i:i + chunk_size].copy()
        processed_chunk = process_chunk_other_info(df_chunk)
        results.append(processed_chunk)
        end_time = time.time()
        print(f"Processed chunk {i // chunk_size + 1}/{num_chunks} in {end_time - start_time:.2f} seconds.")

    final_df = pd.concat(results)
    return final_df

# Process the DataFrame for other personal info
chunk_size_other_info = 1000000
df = process_dataframe_in_chunks_other_info(df.copy(), chunk_size=chunk_size_other_info)

# Save to CSV (optional)
# df_other_info.to_csv('password_features_other_info.csv', index=False)

Processing DataFrame in 15 chunks for other personal info...


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 1/15 in 5.06 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 2/15 in 5.07 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 3/15 in 5.11 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 4/15 in 5.26 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 5/15 in 5.26 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 6/15 in 5.27 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 7/15 in 5.20 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 8/15 in 5.18 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 9/15 in 5.23 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 10/15 in 5.20 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 11/15 in 5.22 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 12/15 in 5.24 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 13/15 in 5.15 seconds.


Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

Processed chunk 14/15 in 5.39 seconds.


Pandas Apply:   0%|          | 0/346469 [00:00<?, ?it/s]

Processed chunk 15/15 in 1.88 seconds.


In [132]:
# Filter the DataFrame where 'contains_other_personal_info' is True (or 1)
other_info_found_df = df[df['contains_other_personal_info'] == 1]

other_info_found_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44791 entries, 21794 to 14346451
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   password                      44791 non-null  object
 1   length                        44791 non-null  int64 
 2   distinct_char_count           44791 non-null  int64 
 3   uppercase_count               44791 non-null  int64 
 4   lowercase_count               44791 non-null  int64 
 5   numeric_char_count            44791 non-null  int64 
 6   special_char_count            44791 non-null  int64 
 7   is_leet_dict_word             44791 non-null  bool  
 8   is_padded_dict_word           44791 non-null  bool  
 9   is_completely_sequential      44791 non-null  bool  
 10  is_completely_repeated        44791 non-null  bool  
 11  contains_keyboard_pattern     44791 non-null  bool  
 12  is_dictionary_word            44791 non-null  bool  
 13  has_phase_word

In [133]:
# View only the 'password' and 'contains_other_personal_info' columns
other_info_found_df[['password', 'contains_other_personal_info']].iloc[0:5]

Unnamed: 0,password,contains_other_personal_info
21794,1989,True
22124,1990,True
22460,1987,True
23471,2005,True
27077,1991,True


In [134]:
columns_ = df.columns
columns_

Index(['password', 'length', 'distinct_char_count', 'uppercase_count',
       'lowercase_count', 'numeric_char_count', 'special_char_count',
       'is_leet_dict_word', 'is_padded_dict_word', 'is_completely_sequential',
       'is_completely_repeated', 'contains_keyboard_pattern',
       'is_dictionary_word', 'has_phase_word', 'contains_name',
       'contains_other_personal_info'],
      dtype='object')

### Entropy, Complexity Score

In [135]:
import math
import pandas as pd
import numpy as np
import swifter  # Import swifter

def calculate_shannon_entropy(password):
    """Calculates the Shannon entropy of a password."""
    entropy = 0
    length = len(password)
    if length == 0:
        return 0  # Handle empty passwords

    char_counts = {}
    for char in password:
        char_counts[char] = char_counts.get(char, 0) + 1

    for count in char_counts.values():
        probability = count / length
        entropy -= probability * math.log2(probability)

    return entropy

def calculate_base_score(length, distinct_char_count, uppercase_count, lowercase_count, numeric_char_count, special_char_count):
    """Calculates a base complexity score based on password characteristics."""
    base_score = 0

    # Length (most important factor)
    base_score += length * 0.7  # Give length a high weight

    # Character diversity
    base_score += distinct_char_count * 0.5
    base_score += uppercase_count * 0.2
    base_score += numeric_char_count * 0.1
    base_score += special_char_count * 0.2

    return base_score

def calculate_penalties(df):
    """Calculates the penalty score based on the presence of common password patterns."""
    df['penalty_score'] = 0
    penalty_features = [
        'is_leet_dict_word', 'is_padded_dict_word', 'is_completely_sequential',
        'is_completely_repeated', 'contains_keyboard_pattern', 'is_dictionary_word',
        'has_phase_word', 'contains_name', 'contains_other_personal_info'
    ]
    penalty_values = [
        5, 5, 10, 10, 8, 7, 6, 9, 8
    ]  # Corresponding penalties

    for feature, value in zip(penalty_features, penalty_values):
        df['penalty_score'] += df[feature] * value
    return df

def calculate_complexity_score(df):
    """Calculates the final password complexity score for the entire DataFrame, normalized between 0 and 1."""

    # 1. Calculate Shannon Entropy (using swifter)
    print('Calculating Entropy........................')
    df['entropy_score'] = df['password'].swifter.apply(calculate_shannon_entropy)

    # 2. Calculate Base Score (vectorized)
    print('Calculating Base Score........................')
    df['base_score'] = calculate_base_score(
        df['length'], df['distinct_char_count'], df['uppercase_count'],
        df['lowercase_count'], df['numeric_char_count'], df['special_char_count']
    )

    # 3. Calculate Penalties (vectorized)
    print('Calculating Penality Score........................')
    df = calculate_penalties(df)

    # 4. Calculate Penalized Score
    df['penalized_score'] = (df['base_score'] - df['penalty_score']).clip(0)

    # 5. Combine Entropy and Penalized Score
    print('Calculating Complexity Score........................')
    raw_complexity_score = (df['entropy_score'] * 0.6) + (df['penalized_score'] * 0.4)

    # 6. Normalize the complexity score
    min_score = raw_complexity_score.min()
    max_score = raw_complexity_score.max()

    # Avoid division by zero if all scores are the same
    if min_score == max_score:
        df['complexity_score'] = pd.Series(0.5, index=raw_complexity_score.index)
    else:
        df['complexity_score'] = (raw_complexity_score - min_score) / (max_score - min_score)

    return df


df = calculate_complexity_score(df)
df.head()

Calculating Entropy........................


Pandas Apply:   0%|          | 0/14346469 [00:00<?, ?it/s]

Calculating Base Score........................
Calculating Penality Score........................
Calculating Complexity Score........................


Unnamed: 0,password,length,distinct_char_count,uppercase_count,lowercase_count,numeric_char_count,special_char_count,is_leet_dict_word,is_padded_dict_word,is_completely_sequential,...,contains_keyboard_pattern,is_dictionary_word,has_phase_word,contains_name,contains_other_personal_info,entropy_score,base_score,penalty_score,penalized_score,complexity_score
0,123456,6,6,0,0,6,0,False,False,True,...,True,False,False,False,False,2.584963,7.8,18,0.0,0.016698
1,12345,5,5,0,0,5,0,False,False,True,...,True,False,False,False,False,2.321928,6.5,18,0.0,0.014998
2,123456789,9,9,0,0,9,0,False,False,True,...,True,False,False,False,False,3.169925,11.7,18,0.0,0.020476
3,password,8,7,0,8,0,0,False,False,False,...,True,True,False,False,False,2.75,9.1,15,0.0,0.017764
4,iloveyou,8,7,0,8,0,0,False,False,False,...,False,False,True,False,False,2.75,9.1,6,3.1,0.031113


In [136]:
df[['password','length','distinct_char_count','entropy_score','base_score','penalty_score','penalized_score','complexity_score']].iloc[60:120]

Unnamed: 0,password,length,distinct_char_count,entropy_score,base_score,penalty_score,penalized_score,complexity_score
60,elizabeth,9,8,2.947703,10.3,9,1.3,0.024639
61,hottie,6,5,2.251629,6.7,6,0.7,0.017559
62,tinkerbell,10,8,2.921928,11.0,23,0.0,0.018874
63,charlie,7,7,2.807355,8.4,9,0.0,0.018134
64,samantha,8,6,2.405639,8.6,9,0.0,0.015539
65,barbie,6,5,2.251629,6.7,9,0.0,0.014544
66,chelsea,7,6,2.521641,7.9,9,0.0,0.016288
67,lovers,6,6,2.584963,7.2,15,0.0,0.016698
68,teamo,5,5,2.321928,6.0,6,0.0,0.014998
69,jasmine,7,7,2.807355,8.4,17,0.0,0.018134


In [137]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14346469 entries, 0 to 14346468
Data columns (total 21 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   password                      object 
 1   length                        int64  
 2   distinct_char_count           int64  
 3   uppercase_count               int64  
 4   lowercase_count               int64  
 5   numeric_char_count            int64  
 6   special_char_count            int64  
 7   is_leet_dict_word             bool   
 8   is_padded_dict_word           bool   
 9   is_completely_sequential      bool   
 10  is_completely_repeated        bool   
 11  contains_keyboard_pattern     bool   
 12  is_dictionary_word            bool   
 13  has_phase_word                bool   
 14  contains_name                 bool   
 15  contains_other_personal_info  bool   
 16  entropy_score                 float64
 17  base_score                    float64
 18  penalty_score       

In [138]:
columns = df.columns
columns

Index(['password', 'length', 'distinct_char_count', 'uppercase_count',
       'lowercase_count', 'numeric_char_count', 'special_char_count',
       'is_leet_dict_word', 'is_padded_dict_word', 'is_completely_sequential',
       'is_completely_repeated', 'contains_keyboard_pattern',
       'is_dictionary_word', 'has_phase_word', 'contains_name',
       'contains_other_personal_info', 'entropy_score', 'base_score',
       'penalty_score', 'penalized_score', 'complexity_score'],
      dtype='object')

In [139]:
# Convert to int16 (for columns with larger integers)
df['length'] = df['length'].astype('int16')
df['distinct_char_count'] = df['distinct_char_count'].astype('int16')
df['uppercase_count'] = df['uppercase_count'].astype('int16')
df['lowercase_count'] = df['lowercase_count'].astype('int16')
df['numeric_char_count'] = df['numeric_char_count'].astype('int16')
df['special_char_count'] = df['special_char_count'].astype('int16')
df['entropy_score'] = df['entropy_score'].astype('float16')
df['base_score'] = df['base_score'].astype('float16')
df['penalty_score'] = df['penalty_score'].astype('float16')
df['penalized_score'] = df['penalized_score'].astype('float16')
df['complexity_score'] = df['complexity_score'].astype('float16')


In [140]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14346469 entries, 0 to 14346468
Data columns (total 21 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   password                      object 
 1   length                        int16  
 2   distinct_char_count           int16  
 3   uppercase_count               int16  
 4   lowercase_count               int16  
 5   numeric_char_count            int16  
 6   special_char_count            int16  
 7   is_leet_dict_word             bool   
 8   is_padded_dict_word           bool   
 9   is_completely_sequential      bool   
 10  is_completely_repeated        bool   
 11  contains_keyboard_pattern     bool   
 12  is_dictionary_word            bool   
 13  has_phase_word                bool   
 14  contains_name                 bool   
 15  contains_other_personal_info  bool   
 16  entropy_score                 float16
 17  base_score                    float16
 18  penalty_score       

In [141]:
df.head()

Unnamed: 0,password,length,distinct_char_count,uppercase_count,lowercase_count,numeric_char_count,special_char_count,is_leet_dict_word,is_padded_dict_word,is_completely_sequential,...,contains_keyboard_pattern,is_dictionary_word,has_phase_word,contains_name,contains_other_personal_info,entropy_score,base_score,penalty_score,penalized_score,complexity_score
0,123456,6,6,0,0,6,0,False,False,True,...,True,False,False,False,False,2.585938,7.800781,18.0,0.0,0.016693
1,12345,5,5,0,0,5,0,False,False,True,...,True,False,False,False,False,2.322266,6.5,18.0,0.0,0.014999
2,123456789,9,9,0,0,9,0,False,False,True,...,True,False,False,False,False,3.169922,11.703125,18.0,0.0,0.020477
3,password,8,7,0,8,0,0,False,False,False,...,True,True,False,False,False,2.75,9.101562,15.0,0.0,0.017761
4,iloveyou,8,7,0,8,0,0,False,False,False,...,False,False,True,False,False,2.75,9.101562,6.0,3.099609,0.031113


In [142]:
df.to_csv('password_features.csv', index=False)

---

```
def has_repeated_chars(password):
    # Check if any character is repeated
    return len(password) != len(set(password))


# Apply the function to check for repeated characters
df['has_repeated_chars'] = df['password'].swifter.apply(has_repeated_chars).astype(int)


# Save to CSV
# df.to_csv('password_features.csv', index=False)
df.head()

---

```
def has_sequential_chars(password):
    # Convert password to lowercase for case-insensitive comparison
    password = password.lower()

    # Check for alphabetical sequences (e.g., 'abc', 'bcd', etc.)
    for i in range(len(password) - 2):
        if password[i].isalpha() and password[i+1].isalpha() and password[i+2].isalpha():
            if ord(password[i]) == ord(password[i+1]) - 1 == ord(password[i+2]) - 2:
                return True

    # Check for numerical sequences (e.g., '123', '234', etc.)
    for i in range(len(password) - 2):
        if password[i].isdigit() and password[i+1].isdigit() and password[i+2].isdigit():
            try:
                if int(password[i]) + 1 == int(password[i+1]) and int(password[i+1]) + 1 == int(password[i+2]):
                    return True
            except ValueError:
                continue

    # If no sequential characters are found, return 0
    return False


# Check for Sequential characters using swifter
df['has_sequential_chars'] = df['password'].swifter.apply(has_sequential_chars)

# Save to CSV (optional)
# df.to_csv('password_features.csv', index=False)
df.head()

---

```!pip install wordsegment tqdm > /dev/null 2>&1
from wordsegment import load, segment

# Load the word frequency data (this needs to be done once)
load()

# Examples
print(segment('iloveyou'))
print(segment('ihateyou'))
print(segment('letmein'))
print(segment('iloveu'))

---

```import nltk
from nltk.corpus import words
import pandas as pd
import swifter
from wordsegment import load, segment

# Ensure the word list is downloaded (we only need to do this once)
try:
    english_words = set(word.lower() for word in words.words())
except LookupError as e:
    print(f"Error loading NLTK resource: {e}. Downloading now...")
    nltk.download('words')
    english_words = set(word.lower() for word in words.words())

# Load the wordsegment model
load()

def is_phrase_word_optimized_cleaned(text):
    """
    Checks if a text is a phrase word using the wordsegment library.
    """
    text = text.lower()
    segmented_words = segment(text)
    return len(segmented_words) > 1

# Assuming your DataFrame 'df' is already loaded
# Initialize the 'is_phrase_word' column to 0 (False)
df['is_phrase_word'] = 0

# Filter the DataFrame to only include rows where 'contain_dictionary_word' is 1
# AND the length is less than or equal to 20
df_to_check = df[(df['contain_dictionary_word'] == 1) & (df['length'] <= 258)].copy()

# Apply the is_phrase_word_optimized_cleaned function to this filtered DataFrame
df_to_check['is_phrase_word'] = df_to_check['password'].swifter.apply(
    is_phrase_word_optimized_cleaned
)

# Update the original DataFrame with the results for the filtered rows
df.update(df_to_check)

# Now you can see the results!
phrase_words_df = df[df['is_phrase_word'] == 1]
phrase_words_df[['password', 'is_phrase_word']].head()

---

```
from zxcvbn import zxcvbn

def get_zxcvbn_data(password):
    """
    Uses the zxcvbn library to get password strength information for passwords
    72 characters or shorter. Returns an empty dictionary for longer passwords.
    """
    max_len_zxcvbn_recommended = 32
    if len(password) > max_len_zxcvbn_recommended:
        return {}  # Skip long passwords and return an empty dictionary
    else:
        try:
            result = zxcvbn(password, max_length=max_len_zxcvbn_recommended)
            return result
        except ValueError as e:
            print(f"Warning: Password '{password[:20]}...' caused an error: {e}")
            return {}  # Return an empty dictionary if there's still an error

# Assuming your DataFrame is named 'df'
df['zxcvbn_info'] = df['password'].swifter.apply(get_zxcvbn_data)

# Now, let's expand the 'zxcvbn_info' column into separate columns
df = pd.concat([df.drop(['zxcvbn_info'], axis=1), df['zxcvbn_info'].apply(pd.Series)], axis=1)

print(df.head())
```

---

```
from passwordmeter import Meter

def get_passwordmeter_data(password):
    """
    Uses the passwordmeter library to get password strength information.
    """
    meter = Meter()
    strength_tuple = meter.test(password)
    return strength_tuple

# Assuming your DataFrame is named 'df'
df['passwordmeter_info'] = df['password'].swifter.apply(get_passwordmeter_data)

# Now, let's expand the 'passwordmeter_info' column into separate columns
df[['passwordmeter_strength', 'passwordmeter_improvements']] = df['passwordmeter_info'].apply(pd.Series)

# You might want to drop the original tuple column now
df = df.drop(['passwordmeter_info'], axis=1)

print(df.head())
```