In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Install silently
!pip install swifter pyahocorasick zxcvbn passwordmeter > /dev/null 2>&1


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
from nltk.corpus import words, names
import swifter  
import ahocorasick 
import re
import time

# Download NLTK English words corpus if not already downloaded
nltk.download('words')

# Download NLTK names corpus
nltk.download('names')


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

[nltk_data] Downloading package words to /usr/share/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package names to /usr/share/nltk_data...
[nltk_data]   Package names is already up-to-date!
/kaggle/input/common-password-list-rockyoutxt/rockyou.txt


In [2]:
#!ls /kaggle/working
#!rm /kaggle/working/password_features_5row.csv

# Read the Rockyou.txt and convert it to dataframe

In [3]:
# Read the text file (assuming each line is a password)
txt_file = "/kaggle/input/common-password-list-rockyoutxt/rockyou.txt"  # Change this to your actual file path
column_name = "password"
csv_file = "passwords.csv"

# Read the text file properly
with open(txt_file, encoding="latin-1") as file:  # RockYou uses Latin-1 encoding
    passwords = file.read().splitlines()  # Read each line as a password

# Create DataFrame
df = pd.DataFrame(passwords, columns=["password"])

# Save to CSV
# df.to_csv(csv_file, index=False)

print(f"Converted {txt_file} to {csv_file} successfully! 🚀")

Converted /kaggle/input/common-password-list-rockyoutxt/rockyou.txt to passwords.csv successfully! 🚀


In [4]:
df.head()

Unnamed: 0,password
0,123456
1,12345
2,123456789
3,password
4,iloveyou


# Extracting the password length, number of uppercase & lowercase character, numeric characters, & special character in each password¶

### Extracting the password length

In [5]:
df['length'] = df['password'].str.len()
df.head()

Unnamed: 0,password,length
0,123456,6
1,12345,5
2,123456789,9
3,password,8
4,iloveyou,8


### Extract uppercase

In [6]:
df['uppercase_count'] = df['password'].str.count(r'[A-Z]')
df.head()

Unnamed: 0,password,length,uppercase_count
0,123456,6,0
1,12345,5,0
2,123456789,9,0
3,password,8,0
4,iloveyou,8,0


### Extracting lowercase

In [7]:
df['lowercase_count'] = df['password'].str.count(r'[a-z]')
df.head()

Unnamed: 0,password,length,uppercase_count,lowercase_count
0,123456,6,0,0
1,12345,5,0,0
2,123456789,9,0,0
3,password,8,0,8
4,iloveyou,8,0,8


### Extracting numeric Char

In [8]:
df['numeric_char_count'] = df['password'].str.count(r'\d')
df.head()

Unnamed: 0,password,length,uppercase_count,lowercase_count,numeric_char_count
0,123456,6,0,0,6
1,12345,5,0,0,5
2,123456789,9,0,0,9
3,password,8,0,8,0
4,iloveyou,8,0,8,0


### Extracting Special Char

In [9]:
df['special_char_count'] = df['password'].str.count(r'[^a-zA-Z0-9]')
df.tail()

Unnamed: 0,password,length,uppercase_count,lowercase_count,numeric_char_count,special_char_count
14346464,"xCvBnM,",8,3,3,0,2
14346465,ie168,6,0,2,3,1
14346466,abygurl69,10,0,7,2,1
14346467,a6_123,7,0,1,4,2
14346468,*7Â¡Vamos!,13,1,4,1,7


# Identifying password with patterns like Sequential characters, Repetition characters, Consecutive repetition, Keyboard patterns, Personal Informations, Dictionary word 

### Identifying Password with Sequential pattern

In [10]:
def has_sequential_chars(password):
    # Convert password to lowercase for case-insensitive comparison
    password = password.lower()

    # Check for alphabetical sequences (e.g., 'abc', 'bcd', etc.)
    for i in range(len(password) - 2):
        if password[i].isalpha() and password[i+1].isalpha() and password[i+2].isalpha():
            if ord(password[i]) == ord(password[i+1]) - 1 == ord(password[i+2]) - 2:
                return 1

    # Check for numerical sequences (e.g., '123', '234', etc.)
    for i in range(len(password) - 2):
        if password[i].isdigit() and password[i+1].isdigit() and password[i+2].isdigit():
            try:
                if int(password[i]) + 1 == int(password[i+1]) and int(password[i+1]) + 1 == int(password[i+2]):
                    return 1
            except ValueError:
                continue

    # If no sequential characters are found, return 0
    return 0


# Check for Sequential characters using swifter
df['has_sequential_chars'] = df['password'].swifter.apply(has_sequential_chars)

# Save to CSV (optional)
# df.to_csv('password_features.csv', index=False)
df.head()

Pandas Apply:   0%|          | 0/14346469 [00:00<?, ?it/s]

Unnamed: 0,password,length,uppercase_count,lowercase_count,numeric_char_count,special_char_count,has_sequential_chars
0,123456,6,0,0,6,0,1
1,12345,5,0,0,5,0,1
2,123456789,9,0,0,9,0,1
3,password,8,0,8,0,0,0
4,iloveyou,8,0,8,0,0,0


In [11]:
# df = pd.read_csv('/kaggle/working/password_features.csv')

# df.head()

### Identifying Password with Character Repetition

In [12]:
def has_repeated_chars(password):
    # Check if any character is repeated
    return len(password) != len(set(password))


# Apply the function to check for repeated characters
df['has_repeated_chars'] = df['password'].swifter.apply(has_repeated_chars).astype(int)


# Save to CSV
# df.to_csv('password_features.csv', index=False)
df.head()

Pandas Apply:   0%|          | 0/14346469 [00:00<?, ?it/s]

Unnamed: 0,password,length,uppercase_count,lowercase_count,numeric_char_count,special_char_count,has_sequential_chars,has_repeated_chars
0,123456,6,0,0,6,0,1,0
1,12345,5,0,0,5,0,1,0
2,123456789,9,0,0,9,0,1,0
3,password,8,0,8,0,0,0,1
4,iloveyou,8,0,8,0,0,0,1


### Identify Password with Consecutive Repetition of characters

In [13]:
def has_consecutive_repeats(password):
    # Check for consecutive repeated characters
    for i in range(len(password) - 1):
        if password[i] == password[i+1]:
            return 1
    return 0


# Apply the function to check for consecutive repeats
df['has_consecutive_repeats'] = df['password'].swifter.apply(has_consecutive_repeats)

# Save to CSV
#df.to_csv('password_features.csv', index=False)
df.head()

Pandas Apply:   0%|          | 0/14346469 [00:00<?, ?it/s]

Unnamed: 0,password,length,uppercase_count,lowercase_count,numeric_char_count,special_char_count,has_sequential_chars,has_repeated_chars,has_consecutive_repeats
0,123456,6,0,0,6,0,1,0,0
1,12345,5,0,0,5,0,1,0,0
2,123456789,9,0,0,9,0,1,0,0
3,password,8,0,8,0,0,0,1,1
4,iloveyou,8,0,8,0,0,0,1,0


### Identify Password that contain keyboard pattern

In [14]:
# Predefined list of common keyboard patterns (lowercase)
keyboard_patterns = [
    'qwerty', 'ytrewq', '123456', '654321', 'asdfghjkl', 'lkjhgfdsa',
    'zxcvbnm', 'mnbvcxz', '12345678', '87654321', '111111', '222222',
    '333333', '444444', '555555', '666666', '777777', '888888',
    '999999', '000000', 'password', 'drowssap', 'iloveyou', 'uoyevoli',
    'dragonball', 'llabnogard', 'baseball', 'llabesab', 'letmein', 'nimelet',
    'monkey', 'yeknom', 'sunshine', 'enihsnus', 'princess', 'ssercnip',
    'master', 'retsam', 'hello', 'olleh', 'welcome', 'emoclew', 'abc123',
    '321cba', 'qwertyuiop', 'poiuytrewq', 'asdfghjklpoiuytrewq', 'qwertrewqihgfdsalkj',
    'zxcvbnm,./', ',./mnbvcxz', '1234567890', '0987654321', '9876543210',
    '0123456789', '1234567890-=', '=-0987654321', 'qwertyuiop[]', '][poiuytrewq',
    '1234', '4321', '2345', '5432', '3456', '6543', '4567', '7654',
    '5678', '8765', '6789', '9876', '7890', '0987', '00', '12', '21',
    '123', '321', '0123', '3210', '01234', '43210', '4321', '1234',
    '54321', '12345', '654321', '123456', '7654321', '87654321', '21',
    '321','1111','2222','3333','4444','5555','6666','7777','8888','9999','0000',
    '111','222','333','444','555','666','777','888','999','000',
    '11','22','33','44','55','66','77','88','99'
    '01','02','03','04','05','06','07','08','09','10',
    '20','30','40','50','60','70','80','90','100'
]


def contains_keyboard_pattern(password):
    # Convert to lowercase for case-insensitive comparison
    password = password.lower()
    
    # Check if password contains any keyboard pattern as a substring
    return 1 if any(pattern in password for pattern in keyboard_patterns) else 0

# Apply the function
df['contains_keyboard_pattern'] = df['password'].swifter.apply(contains_keyboard_pattern)

# Save to csv
#df.to_csv('password_features.csv', index=False)
df.head()

Pandas Apply:   0%|          | 0/14346469 [00:00<?, ?it/s]

Unnamed: 0,password,length,uppercase_count,lowercase_count,numeric_char_count,special_char_count,has_sequential_chars,has_repeated_chars,has_consecutive_repeats,contains_keyboard_pattern
0,123456,6,0,0,6,0,1,0,0,1
1,12345,5,0,0,5,0,1,0,0,1
2,123456789,9,0,0,9,0,1,0,0,1
3,password,8,0,8,0,0,0,1,1,1
4,iloveyou,8,0,8,0,0,0,1,0,1


In [15]:
#pd.read_csv('/kaggle/working/password_features.csv').tail(10)

In [16]:
#pd.read_csv('/kaggle/working/password_features.csv').iloc[1000000:1000020]

### Identify Password that is a full dictionary word

In [17]:
# Get English words (convert to lowercase for case-insensitive comparison)
english_words = {word.lower() for word in words.words()}

def contains_dictionary_word(password):
    # Convert to lowercase
    password = password.lower()
    
    # Check if password is in dictionary (whole-word match)
    return 1 if password in english_words else 0

# Apply the function
df['is_dictionary_word'] = df['password'].swifter.apply(contains_dictionary_word)


df.head()


Pandas Apply:   0%|          | 0/14346469 [00:00<?, ?it/s]

Unnamed: 0,password,length,uppercase_count,lowercase_count,numeric_char_count,special_char_count,has_sequential_chars,has_repeated_chars,has_consecutive_repeats,contains_keyboard_pattern,is_dictionary_word
0,123456,6,0,0,6,0,1,0,0,1,0
1,12345,5,0,0,5,0,1,0,0,1,0
2,123456789,9,0,0,9,0,1,0,0,1,0
3,password,8,0,8,0,0,0,1,1,1,1
4,iloveyou,8,0,8,0,0,0,1,0,1,0


In [18]:
# Get counts for 'is_dictionary_word'
counts = df.groupby('is_dictionary_word').size()
print(counts)

is_dictionary_word
0    14282037
1       64432
dtype: int64


### Identify Password that contain dictionary word

In [19]:
# Prepare the Dictionary for Aho-Corasick
english_words_set = set(word.lower() for word in words.words() if len(word) >= 3)

# Build the Aho-Corasick automaton
aho = ahocorasick.Automaton()
for word in english_words_set:
    aho.add_word(word, word)             # We add each dictionary word as a pattern to search for
aho.make_automaton()                     # This finalizes the automaton, making it ready for fast searching


# Function to Check for Substrings Using Aho-Corasick 
def check_substring_ahocorasick(password):
    password_lower = password.lower()
    for end_index, word in aho.iter(password_lower):
        return 1        # If we find any dictionary word as a substring,
    return 0           # If we go through the whole password and don't find any
    

# Process DataFrame in Chunks (with swifter for good measure)
def process_chunk_ahocorasick(df_chunk):
    mask_chunk = df_chunk['is_dictionary_word'] == 0
    # Apply our Aho-Corasick checking function in parallel (or vectorized if swifter deems it better)
    df_chunk.loc[mask_chunk, 'contain_dictionary_word'] = df_chunk.loc[mask_chunk, 'password'].swifter.apply(check_substring_ahocorasick)
    df_chunk['contain_dictionary_word'] = df_chunk['contain_dictionary_word'].fillna(0).astype(int)
    return df_chunk

def process_dataframe_in_chunks_ahocorasick(df, chunk_size=100000):
    results = []
    num_chunks = (len(df) + chunk_size - 1) // chunk_size
    print(f"Processing DataFrame in {num_chunks} chunks using Aho-Corasick...")
    for i in range(0, len(df), chunk_size):
        start_time = time.time()
        df_chunk = df.iloc[i:i + chunk_size].copy()
        processed_chunk = process_chunk_ahocorasick(df_chunk)
        results.append(processed_chunk)
        end_time = time.time()
        print(f"Processed chunk {i // chunk_size + 1}/{num_chunks} in {end_time - start_time:.2f} seconds.")

    final_df = pd.concat(results)
    return final_df


# Process the DataFrame using the Aho-Corasick method 
chunk_size = 100000 
df = process_dataframe_in_chunks_ahocorasick(df.copy(), chunk_size=chunk_size)

# Save to csv
#df.to_csv('password_features.csv', index=False)
df.head()


Processing DataFrame in 144 chunks using Aho-Corasick...


Pandas Apply:   0%|          | 0/89953 [00:00<?, ?it/s]

Processed chunk 1/144 in 0.29 seconds.


Pandas Apply:   0%|          | 0/95654 [00:00<?, ?it/s]

Processed chunk 2/144 in 0.31 seconds.


Pandas Apply:   0%|          | 0/96647 [00:00<?, ?it/s]

Processed chunk 3/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/97330 [00:00<?, ?it/s]

Processed chunk 4/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/97891 [00:00<?, ?it/s]

Processed chunk 5/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/97895 [00:00<?, ?it/s]

Processed chunk 6/144 in 0.29 seconds.


Pandas Apply:   0%|          | 0/98590 [00:00<?, ?it/s]

Processed chunk 7/144 in 0.29 seconds.


Pandas Apply:   0%|          | 0/98213 [00:00<?, ?it/s]

Processed chunk 8/144 in 0.30 seconds.


Pandas Apply:   0%|          | 0/99076 [00:00<?, ?it/s]

Processed chunk 9/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/98966 [00:00<?, ?it/s]

Processed chunk 10/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/97940 [00:00<?, ?it/s]

Processed chunk 11/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99614 [00:00<?, ?it/s]

Processed chunk 12/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/99247 [00:00<?, ?it/s]

Processed chunk 13/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99259 [00:00<?, ?it/s]

Processed chunk 14/144 in 0.25 seconds.


Pandas Apply:   0%|          | 0/99514 [00:00<?, ?it/s]

Processed chunk 15/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99615 [00:00<?, ?it/s]

Processed chunk 16/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99817 [00:00<?, ?it/s]

Processed chunk 17/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99357 [00:00<?, ?it/s]

Processed chunk 18/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99335 [00:00<?, ?it/s]

Processed chunk 19/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/99377 [00:00<?, ?it/s]

Processed chunk 20/144 in 0.29 seconds.


Pandas Apply:   0%|          | 0/98610 [00:00<?, ?it/s]

Processed chunk 21/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/97583 [00:00<?, ?it/s]

Processed chunk 22/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 23/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 24/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/99965 [00:00<?, ?it/s]

Processed chunk 25/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99922 [00:00<?, ?it/s]

Processed chunk 26/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99936 [00:00<?, ?it/s]

Processed chunk 27/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99805 [00:00<?, ?it/s]

Processed chunk 28/144 in 0.25 seconds.


Pandas Apply:   0%|          | 0/99802 [00:00<?, ?it/s]

Processed chunk 29/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99586 [00:00<?, ?it/s]

Processed chunk 30/144 in 0.30 seconds.


Pandas Apply:   0%|          | 0/99764 [00:00<?, ?it/s]

Processed chunk 31/144 in 0.30 seconds.


Pandas Apply:   0%|          | 0/99858 [00:00<?, ?it/s]

Processed chunk 32/144 in 0.29 seconds.


Pandas Apply:   0%|          | 0/99800 [00:00<?, ?it/s]

Processed chunk 33/144 in 0.33 seconds.


Pandas Apply:   0%|          | 0/99843 [00:00<?, ?it/s]

Processed chunk 34/144 in 0.25 seconds.


Pandas Apply:   0%|          | 0/99767 [00:00<?, ?it/s]

Processed chunk 35/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/99625 [00:00<?, ?it/s]

Processed chunk 36/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99775 [00:00<?, ?it/s]

Processed chunk 37/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99773 [00:00<?, ?it/s]

Processed chunk 38/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99900 [00:00<?, ?it/s]

Processed chunk 39/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99731 [00:00<?, ?it/s]

Processed chunk 40/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99821 [00:00<?, ?it/s]

Processed chunk 41/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99895 [00:00<?, ?it/s]

Processed chunk 42/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99807 [00:00<?, ?it/s]

Processed chunk 43/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99767 [00:00<?, ?it/s]

Processed chunk 44/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99783 [00:00<?, ?it/s]

Processed chunk 45/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/99721 [00:00<?, ?it/s]

Processed chunk 46/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99764 [00:00<?, ?it/s]

Processed chunk 47/144 in 0.25 seconds.


Pandas Apply:   0%|          | 0/99749 [00:00<?, ?it/s]

Processed chunk 48/144 in 0.25 seconds.


Pandas Apply:   0%|          | 0/99680 [00:00<?, ?it/s]

Processed chunk 49/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99856 [00:00<?, ?it/s]

Processed chunk 50/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/99888 [00:00<?, ?it/s]

Processed chunk 51/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99884 [00:00<?, ?it/s]

Processed chunk 52/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99924 [00:00<?, ?it/s]

Processed chunk 53/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99834 [00:00<?, ?it/s]

Processed chunk 54/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99896 [00:00<?, ?it/s]

Processed chunk 55/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99934 [00:00<?, ?it/s]

Processed chunk 56/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99867 [00:00<?, ?it/s]

Processed chunk 57/144 in 0.25 seconds.


Pandas Apply:   0%|          | 0/99916 [00:00<?, ?it/s]

Processed chunk 58/144 in 0.25 seconds.


Pandas Apply:   0%|          | 0/99878 [00:00<?, ?it/s]

Processed chunk 59/144 in 0.25 seconds.


Pandas Apply:   0%|          | 0/99878 [00:00<?, ?it/s]

Processed chunk 60/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99936 [00:00<?, ?it/s]

Processed chunk 61/144 in 0.25 seconds.


Pandas Apply:   0%|          | 0/99894 [00:00<?, ?it/s]

Processed chunk 62/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99887 [00:00<?, ?it/s]

Processed chunk 63/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99876 [00:00<?, ?it/s]

Processed chunk 64/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99918 [00:00<?, ?it/s]

Processed chunk 65/144 in 0.29 seconds.


Pandas Apply:   0%|          | 0/99940 [00:00<?, ?it/s]

Processed chunk 66/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99950 [00:00<?, ?it/s]

Processed chunk 67/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99948 [00:00<?, ?it/s]

Processed chunk 68/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99953 [00:00<?, ?it/s]

Processed chunk 69/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/99974 [00:00<?, ?it/s]

Processed chunk 70/144 in 0.33 seconds.


Pandas Apply:   0%|          | 0/99961 [00:00<?, ?it/s]

Processed chunk 71/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99967 [00:00<?, ?it/s]

Processed chunk 72/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99933 [00:00<?, ?it/s]

Processed chunk 73/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99686 [00:00<?, ?it/s]

Processed chunk 74/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99904 [00:00<?, ?it/s]

Processed chunk 75/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99845 [00:00<?, ?it/s]

Processed chunk 76/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99834 [00:00<?, ?it/s]

Processed chunk 77/144 in 0.25 seconds.


Pandas Apply:   0%|          | 0/99766 [00:00<?, ?it/s]

Processed chunk 78/144 in 0.25 seconds.


Pandas Apply:   0%|          | 0/99844 [00:00<?, ?it/s]

Processed chunk 79/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99808 [00:00<?, ?it/s]

Processed chunk 80/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99760 [00:00<?, ?it/s]

Processed chunk 81/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99808 [00:00<?, ?it/s]

Processed chunk 82/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/99763 [00:00<?, ?it/s]

Processed chunk 83/144 in 0.29 seconds.


Pandas Apply:   0%|          | 0/99878 [00:00<?, ?it/s]

Processed chunk 84/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/99853 [00:00<?, ?it/s]

Processed chunk 85/144 in 0.29 seconds.


Pandas Apply:   0%|          | 0/99861 [00:00<?, ?it/s]

Processed chunk 86/144 in 0.29 seconds.


Pandas Apply:   0%|          | 0/99715 [00:00<?, ?it/s]

Processed chunk 87/144 in 0.31 seconds.


Pandas Apply:   0%|          | 0/99864 [00:00<?, ?it/s]

Processed chunk 88/144 in 0.30 seconds.


Pandas Apply:   0%|          | 0/99849 [00:00<?, ?it/s]

Processed chunk 89/144 in 0.29 seconds.


Pandas Apply:   0%|          | 0/99584 [00:00<?, ?it/s]

Processed chunk 90/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99783 [00:00<?, ?it/s]

Processed chunk 91/144 in 0.30 seconds.


Pandas Apply:   0%|          | 0/99833 [00:00<?, ?it/s]

Processed chunk 92/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99828 [00:00<?, ?it/s]

Processed chunk 93/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99727 [00:00<?, ?it/s]

Processed chunk 94/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99838 [00:00<?, ?it/s]

Processed chunk 95/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99825 [00:00<?, ?it/s]

Processed chunk 96/144 in 0.25 seconds.


Pandas Apply:   0%|          | 0/99869 [00:00<?, ?it/s]

Processed chunk 97/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99817 [00:00<?, ?it/s]

Processed chunk 98/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99834 [00:00<?, ?it/s]

Processed chunk 99/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99798 [00:00<?, ?it/s]

Processed chunk 100/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99788 [00:00<?, ?it/s]

Processed chunk 101/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99848 [00:00<?, ?it/s]

Processed chunk 102/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/99901 [00:00<?, ?it/s]

Processed chunk 103/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99804 [00:00<?, ?it/s]

Processed chunk 104/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/99679 [00:00<?, ?it/s]

Processed chunk 105/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/98598 [00:00<?, ?it/s]

Processed chunk 106/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/98711 [00:00<?, ?it/s]

Processed chunk 107/144 in 0.31 seconds.


Pandas Apply:   0%|          | 0/98709 [00:00<?, ?it/s]

Processed chunk 108/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/99191 [00:00<?, ?it/s]

Processed chunk 109/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/99442 [00:00<?, ?it/s]

Processed chunk 110/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/99527 [00:00<?, ?it/s]

Processed chunk 111/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/98631 [00:00<?, ?it/s]

Processed chunk 112/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/98813 [00:00<?, ?it/s]

Processed chunk 113/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/98721 [00:00<?, ?it/s]

Processed chunk 114/144 in 0.25 seconds.


Pandas Apply:   0%|          | 0/99217 [00:00<?, ?it/s]

Processed chunk 115/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 116/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 117/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 118/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 119/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 120/144 in 0.29 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 121/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 122/144 in 0.83 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 123/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 124/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 125/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 126/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 127/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 128/144 in 0.27 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 129/144 in 0.29 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 130/144 in 0.29 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 131/144 in 0.29 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 132/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 133/144 in 0.30 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 134/144 in 0.29 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 135/144 in 0.29 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 136/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 137/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 138/144 in 0.29 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 139/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 140/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 141/144 in 0.32 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 142/144 in 0.29 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 143/144 in 0.31 seconds.


Pandas Apply:   0%|          | 0/46469 [00:00<?, ?it/s]

Processed chunk 144/144 in 0.17 seconds.


Unnamed: 0,password,length,uppercase_count,lowercase_count,numeric_char_count,special_char_count,has_sequential_chars,has_repeated_chars,has_consecutive_repeats,contains_keyboard_pattern,is_dictionary_word,contain_dictionary_word
0,123456,6,0,0,6,0,1,0,0,1,0,0
1,12345,5,0,0,5,0,1,0,0,1,0,0
2,123456789,9,0,0,9,0,1,0,0,1,0,0
3,password,8,0,8,0,0,0,1,1,1,1,0
4,iloveyou,8,0,8,0,0,0,1,0,1,0,1


### Identify password with Personal Information

In [20]:
# Prepare English Names for Aho-Corasick
english_names = set(names.words())
english_names_lower = {name.lower() for name in english_names}

# Build the Aho-Corasick automaton for names
name_aho = ahocorasick.Automaton()
for name in english_names_lower:
    name_aho.add_word(name, name)
name_aho.make_automaton()

def contains_personal_info_optimized(password):
    password_lower = password.lower()

    # Check for names using Aho-Corasick
    for end_index, name in name_aho.iter(password_lower):
        return 1

    # Check for dates (YYYY-MM-DD or MM-DD-YYYY)
    if re.search(r'\b\d{4}-\d{2}-\d{2}\b|\b\d{2}-\d{2}-\d{4}\b', password_lower):
        return 1

    # Check for potential year of birth (YYYY format within a reasonable range)
    if re.search(r'\b(19\d{2}|200)\b', password_lower): # Matches years from 1900 to 2050
        return 1

    # Check for phone numbers (XXX-XXX-XXXX or (XXX) XXX-XXXX)
    if re.search(r'\b\d{3}-\d{3}-\d{4}\b|\b\(\d{3}\) \d{3}-\d{4}\b', password_lower):
        return 1

    return 0

# Process DataFrame in Chunks with swifter 
def process_chunk_personal_info(df_chunk):
    df_chunk['contains_personal_info'] = df_chunk['password'].swifter.apply(contains_personal_info_optimized)
    return df_chunk

def process_dataframe_in_chunks_personal_info(df, chunk_size=100000):
    results = []
    num_chunks = (len(df) + chunk_size - 1) // chunk_size
    print(f"Processing DataFrame in {num_chunks} chunks for personal info...")
    for i in range(0, len(df), chunk_size):
        start_time = time.time()
        df_chunk = df.iloc[i:i + chunk_size].copy()
        processed_chunk = process_chunk_personal_info(df_chunk)
        results.append(processed_chunk)
        end_time = time.time()
        print(f"Processed chunk {i // chunk_size + 1}/{num_chunks} in {end_time - start_time:.2f} seconds.")

    final_df = pd.concat(results)
    return final_df


# Process the DataFrame 
chunk_size = 100000                    
df = process_dataframe_in_chunks_personal_info(df.copy(), chunk_size=chunk_size)

# Save to CSV 
# df.to_csv('password_features_personal_info.csv', index=False)

Processing DataFrame in 144 chunks for personal info...


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 1/144 in 0.37 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 2/144 in 0.35 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 3/144 in 0.35 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 4/144 in 0.35 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 5/144 in 0.36 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 6/144 in 0.37 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 7/144 in 0.35 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 8/144 in 0.41 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 9/144 in 0.32 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 10/144 in 0.32 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 11/144 in 0.37 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 12/144 in 0.40 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 13/144 in 0.34 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 14/144 in 0.37 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 15/144 in 0.37 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 16/144 in 0.31 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 17/144 in 0.32 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 18/144 in 0.35 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 19/144 in 0.34 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 20/144 in 0.32 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 21/144 in 0.32 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 22/144 in 0.41 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 23/144 in 0.49 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 24/144 in 0.50 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 25/144 in 0.49 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 26/144 in 0.38 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 27/144 in 0.39 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 28/144 in 0.34 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 29/144 in 0.35 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 30/144 in 0.35 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 31/144 in 0.36 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 32/144 in 0.38 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 33/144 in 0.34 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 34/144 in 0.34 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 35/144 in 0.38 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 36/144 in 0.35 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 37/144 in 0.37 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 38/144 in 0.30 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 39/144 in 0.34 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 40/144 in 0.35 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 41/144 in 0.38 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 42/144 in 0.35 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 43/144 in 0.33 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 44/144 in 0.31 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 45/144 in 0.38 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 46/144 in 0.39 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 47/144 in 0.41 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 48/144 in 0.32 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 49/144 in 0.36 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 50/144 in 0.39 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 51/144 in 0.37 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 52/144 in 0.35 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 53/144 in 0.35 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 54/144 in 0.39 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 55/144 in 0.41 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 56/144 in 0.32 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 57/144 in 0.34 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 58/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 59/144 in 0.32 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 60/144 in 0.37 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 61/144 in 0.30 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 62/144 in 0.32 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 63/144 in 0.29 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 64/144 in 0.26 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 65/144 in 0.39 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 66/144 in 0.30 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 67/144 in 0.32 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 68/144 in 0.33 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 69/144 in 0.32 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 70/144 in 0.32 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 71/144 in 0.34 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 72/144 in 0.33 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 73/144 in 0.37 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 74/144 in 0.34 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 75/144 in 0.33 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 76/144 in 0.38 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 77/144 in 0.34 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 78/144 in 0.42 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 79/144 in 0.39 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 80/144 in 0.34 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 81/144 in 0.38 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 82/144 in 0.34 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 83/144 in 0.32 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 84/144 in 0.30 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 85/144 in 0.36 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 86/144 in 0.34 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 87/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 88/144 in 0.31 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 89/144 in 0.35 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 90/144 in 0.35 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 91/144 in 0.35 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 92/144 in 0.30 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 93/144 in 0.33 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 94/144 in 0.35 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 95/144 in 0.31 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 96/144 in 0.28 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 97/144 in 0.34 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 98/144 in 0.32 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 99/144 in 0.34 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 100/144 in 0.40 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 101/144 in 0.33 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 102/144 in 0.32 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 103/144 in 0.25 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 104/144 in 0.34 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 105/144 in 0.43 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 106/144 in 0.36 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 107/144 in 0.35 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 108/144 in 0.38 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 109/144 in 0.35 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 110/144 in 0.32 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 111/144 in 0.35 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 112/144 in 0.38 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 113/144 in 0.34 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 114/144 in 0.33 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 115/144 in 0.37 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 116/144 in 0.52 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 117/144 in 0.52 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 118/144 in 0.52 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 119/144 in 0.50 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 120/144 in 0.53 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 121/144 in 0.51 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 122/144 in 0.51 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 123/144 in 0.49 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 124/144 in 0.55 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 125/144 in 0.51 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 126/144 in 0.52 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 127/144 in 0.48 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 128/144 in 0.50 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 129/144 in 0.48 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 130/144 in 0.48 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 131/144 in 0.43 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 132/144 in 0.52 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 133/144 in 0.52 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 134/144 in 0.47 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 135/144 in 0.49 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 136/144 in 0.55 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 137/144 in 0.56 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 138/144 in 0.56 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 139/144 in 0.54 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 140/144 in 0.51 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 141/144 in 0.53 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 142/144 in 0.53 seconds.


Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

Processed chunk 143/144 in 0.52 seconds.


Pandas Apply:   0%|          | 0/46469 [00:00<?, ?it/s]

Processed chunk 144/144 in 0.21 seconds.


### Compute Randomness Score

In [21]:
def calculate_randomness_score(row):
    """
    Calculates a randomness score for a password (0-10) considering length,
    character variety, and the presence of non-random patterns.
    """
    score = 0

    # this use length to score a base score for length (we can adjust the multiplier)
    score += row['length'] * 0.2  # Example: give Up to 2 points for a length of 10. the multiplier can be adjust

    # this Add points for character variety
    if row['uppercase_count'] > 0:
        score += 1.5
    if row['lowercase_count'] > 0:
        score += 1.5
    if row['numeric_char_count'] > 0:
        score += 2
    if row['special_char_count'] > 0:
        score += 3                    # Special characters often add more to perceived randomness

    # this Subtract points for non-random patterns
    if row['has_sequential_chars']:
        score -= 2
    if row['has_repeated_chars']:
        score -= 1.5
    if row['has_consecutive_repeats']:
        score -= 2
    if row['contains_keyboard_pattern']:
        score -= 3
    if row['is_dictionary_word']:
        score -= 4
    if row['contain_dictionary_word']:
        score -= 2.5
    if row['contains_personal_info']:
        score -= 5

    # this Ensure the score stays within the 0-10 range
    return max(0, min(10, round(score)))

# apply to dataframe
df['randomness_score'] = df.swifter.apply(calculate_randomness_score, axis=1)

print(df[['password', 'randomness_score']].head())

Pandas Apply:   0%|          | 0/14346469 [00:00<?, ?it/s]

    password  randomness_score
0     123456                 0
1      12345                 0
2  123456789                 0
3   password                 0
4   iloveyou                 0


### Calculate the password Entropy with Shannon Entropy

In [22]:
import math

def calculate_entropy(row):
    """
    Calculates the estimated entropy (in bits) for a password based on its features.
    """
    length = row['length']
    has_upper = row['uppercase_count'] > 0
    has_lower = row['lowercase_count'] > 0
    has_numeric = row['numeric_char_count'] > 0
    has_special = row['special_char_count'] > 0

    character_set_size = 0
    if has_upper:
        character_set_size += 26
    if has_lower:
        character_set_size += 26
    if has_numeric:
        character_set_size += 10
    if has_special:
        character_set_size += 32 # Approximate number of special characters

    if character_set_size > 0 and length > 0:
        entropy = length * math.log2(character_set_size)
        return entropy
    else:
        return 0

# Assuming your DataFrame is named 'df'
df['entropy'] = df.swifter.apply(calculate_entropy, axis=1)

print(df[['password', 'entropy']].head())

Pandas Apply:   0%|          | 0/14346469 [00:00<?, ?it/s]

    password    entropy
0     123456  19.931569
1      12345  16.609640
2  123456789  29.897353
3   password  37.603518
4   iloveyou  37.603518


### Calculate entropy by taking into account the bias pattern (sequential, Repetition, etc.)

In [23]:
def calculate_entropy_with_patterns(row):
    """
    Calculates the estimated entropy (in bits) for a password, factoring in identified patterns.
    """
    length = row['length']
    has_upper = row['uppercase_count'] > 0
    has_lower = row['lowercase_count'] > 0
    has_numeric = row['numeric_char_count'] > 0
    has_special = row['special_char_count'] > 0

    character_set_size = 0
    if has_upper:
        character_set_size += 26
    if has_lower:
        character_set_size += 26
    if has_numeric:
        character_set_size += 10
    if has_special:
        character_set_size += 32

    if character_set_size > 0 and length > 0:
        initial_entropy = length * math.log2(character_set_size) # calcualate the shannon entropy
        adjusted_entropy = initial_entropy

        # Reduce entropy for identified patterns (these values are heuristic and can be tweaked)
        if row['has_sequential_chars']:
            adjusted_entropy -= 5  # Sequential chars make it easier to guess
        if row['has_repeated_chars']:
            adjusted_entropy -= 3  # Repeated chars reduce the effective character space
        if row['has_consecutive_repeats']:
            adjusted_entropy -= 2  # Similar to repeated chars
        if row['contains_keyboard_pattern']:
            adjusted_entropy -= 7  # Keyboard patterns are very common
        if row['is_dictionary_word']:
            adjusted_entropy -= 10 # Dictionary words are highly predictable
        if row['contain_dictionary_word']:
            adjusted_entropy -= 8  # Containing a dictionary word is also a weakness
        if row['contains_personal_info']:
            adjusted_entropy -= 12 # Personal info is very guessable

        return max(0, adjusted_entropy) # Ensure entropy doesn't go negative
    else:
        return 0

# Assuming your DataFrame is named 'df'
df['entropy_with_patterns'] = df.swifter.apply(calculate_entropy_with_patterns, axis=1)

print(df[['password', 'entropy', 'entropy_with_patterns']].head())

Pandas Apply:   0%|          | 0/14346469 [00:00<?, ?it/s]

    password    entropy  entropy_with_patterns
0     123456  19.931569               7.931569
1      12345  16.609640               4.609640
2  123456789  29.897353              17.897353
3   password  37.603518              15.603518
4   iloveyou  37.603518               7.603518


### Calculate entropy by taking into account the bias pattern (sequential, Repetition, etc.)

In [24]:
def calculate_complexity(row):
    length = row['length']
    has_upper = row['uppercase_count']
    has_lower = row['lowercase_count']
    has_numeric = row['numeric_char_count']
    has_special = row['special_char_count']
    has_sequential = row['has_sequential_chars']
    has_repeated = row['has_repeated_chars']
    has_consecutive = row['has_consecutive_repeats']
    has_keyboard = row['contains_keyboard_pattern']
    is_dict = row['is_dictionary_word']
    contains_dict_word = row['contain_dictionary_word']
    contains_personal = row['contains_personal_info']

    score = length * 2 # Base score based on length

    if length > 8:
        score += 1
    if has_upper:
        score += 2
    if has_lower:
        score += 2
    if has_numeric:
        score += 3
    if has_special:
        score += 4

    # Penalty for being only one type of character (and longer than a few chars)
    if length > 5 and ((has_upper or has_lower) and not has_numeric and not has_special):
        score -= 2
    elif length > 5 and (has_numeric and not has_upper and not has_lower and not has_special):
        score -= 2

    if has_sequential:
        score -= 5
    if has_repeated:
        score -= 3
    if has_consecutive:
        score -= 2
    if has_keyboard:
        score -= 7
    if is_dict or contains_dict_word:
        score -= 10
    if contains_personal:
        score -= 15

    # Normalize the score to the range 0-1
    max_possible_score = 60 # This is our educated guess for a very strong password
    normalized_score = max(0, min(1, score / max_possible_score))
    return normalized_score


# Assuming your DataFrame is named 'df'
df['complexity_score'] = df.swifter.apply(calculate_complexity, axis=1)

print(df[['password', 'complexity_score']].head())

Pandas Apply:   0%|          | 0/14346469 [00:00<?, ?it/s]

    password  complexity_score
0     123456          0.016667
1      12345          0.016667
2  123456789          0.133333
3   password          0.000000
4   iloveyou          0.000000


In [25]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14346469 entries, 0 to 14346468
Data columns (total 17 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   password                   object 
 1   length                     int64  
 2   uppercase_count            int64  
 3   lowercase_count            int64  
 4   numeric_char_count         int64  
 5   special_char_count         int64  
 6   has_sequential_chars       int64  
 7   has_repeated_chars         int64  
 8   has_consecutive_repeats    int64  
 9   contains_keyboard_pattern  int64  
 10  is_dictionary_word         int64  
 11  contain_dictionary_word    int64  
 12  contains_personal_info     int64  
 13  randomness_score           int64  
 14  entropy                    float64
 15  entropy_with_patterns      float64
 16  complexity_score           float64
dtypes: float64(3), int64(13), object(1)
memory usage: 2.6 GB


In [26]:
columns = df.columns
columns

Index(['password', 'length', 'uppercase_count', 'lowercase_count',
       'numeric_char_count', 'special_char_count', 'has_sequential_chars',
       'has_repeated_chars', 'has_consecutive_repeats',
       'contains_keyboard_pattern', 'is_dictionary_word',
       'contain_dictionary_word', 'contains_personal_info', 'randomness_score',
       'entropy', 'entropy_with_patterns', 'complexity_score'],
      dtype='object')

In [27]:
# Convert to int16 (for columns with larger integers)
df['length'] = df['length'].astype('int16')
df['uppercase_count'] = df['uppercase_count'].astype('int16')
df['lowercase_count'] = df['lowercase_count'].astype('int16')
df['numeric_char_count'] = df['numeric_char_count'].astype('int16')
df['special_char_count'] = df['special_char_count'].astype('int16')
df['randomness_score'] = df['randomness_score'].astype('int16')
df['entropy'] = df['entropy'].astype('int16')
df['entropy_with_patterns'] = df['entropy_with_patterns'].astype('int16')
df['complexity_score'] = df['complexity_score'].astype('int16')

# Convert to bool (for binary columns)
df['has_sequential_chars'] = df['has_sequential_chars'].astype('bool')
df['has_repeated_chars'] = df['has_repeated_chars'].astype('bool')
df['has_consecutive_repeats'] = df['has_consecutive_repeats'].astype('bool')
df['contains_keyboard_pattern'] = df['contains_keyboard_pattern'].astype('bool')
df['is_dictionary_word'] = df['is_dictionary_word'].astype('bool')
df['contain_dictionary_word'] = df['contain_dictionary_word'].astype('bool')
df['contains_personal_info'] = df['contains_personal_info'].astype('bool')

In [28]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14346469 entries, 0 to 14346468
Data columns (total 17 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   password                   object
 1   length                     int16 
 2   uppercase_count            int16 
 3   lowercase_count            int16 
 4   numeric_char_count         int16 
 5   special_char_count         int16 
 6   has_sequential_chars       bool  
 7   has_repeated_chars         bool  
 8   has_consecutive_repeats    bool  
 9   contains_keyboard_pattern  bool  
 10  is_dictionary_word         bool  
 11  contain_dictionary_word    bool  
 12  contains_personal_info     bool  
 13  randomness_score           int16 
 14  entropy                    int16 
 15  entropy_with_patterns      int16 
 16  complexity_score           int16 
dtypes: bool(7), int16(9), object(1)
memory usage: 1.2 GB


In [29]:
# df.to_csv('password_features_personal_info.csv', index=False)

```
def calculate_complexity_v2(row):
    """
    Calculates a complexity score for a password (1-10, float allowed) based on its features.
    """
    score = 1.0  # Start with a base score

    # Factor in length (longer is generally more complex)
    length_factor = row['length'] / 20.0  # Normalize length to roughly contribute up to 5 points (adjust 20 as needed)
    score += min(5.0, length_factor * 5) # Cap the length contribution

    # Factor in character variety
    variety_bonus = 0
    if row['uppercase_count'] > 0:
        variety_bonus += 1
    if row['lowercase_count'] > 0:
        variety_bonus += 1
    if row['numeric_char_count'] > 0:
        variety_bonus += 1.5  # Numbers often add more complexity
    if row['special_char_count'] > 0:
        variety_bonus += 2.5  # Special chars are great for complexity
    score += variety_bonus

    # Subtract points for patterns (these will reduce complexity)
    if row['has_sequential_chars']:
        score -= 2
    if row['has_repeated_chars']:
        score -= 1.5
    if row['has_consecutive_repeats']:
        score -= 1
    if row['contains_keyboard_pattern']:
        score -= 3
    if row['is_dictionary_word']:
        score -= 4
    if row['contain_dictionary_word']:
        score -= 2
    if row['contains_personal_info']:
        score -= 5

    # Ensure the score stays within the 1-10 range
    return max(1.0, min(10.0, score))

# Assuming your DataFrame is named 'df'
df['complexity_score'] = df.swifter.apply(calculate_complexity_v2, axis=1)

print(df[['password', 'complexity_score']].head())
``

```
from zxcvbn import zxcvbn

def get_zxcvbn_data(password):
    """
    Uses the zxcvbn library to get password strength information for passwords
    72 characters or shorter. Returns an empty dictionary for longer passwords.
    """
    max_len_zxcvbn_recommended = 32
    if len(password) > max_len_zxcvbn_recommended:
        return {}  # Skip long passwords and return an empty dictionary
    else:
        try:
            result = zxcvbn(password, max_length=max_len_zxcvbn_recommended)
            return result
        except ValueError as e:
            print(f"Warning: Password '{password[:20]}...' caused an error: {e}")
            return {}  # Return an empty dictionary if there's still an error

# Assuming your DataFrame is named 'df'
df['zxcvbn_info'] = df['password'].swifter.apply(get_zxcvbn_data)

# Now, let's expand the 'zxcvbn_info' column into separate columns
df = pd.concat([df.drop(['zxcvbn_info'], axis=1), df['zxcvbn_info'].apply(pd.Series)], axis=1)

print(df.head())
```

```
from passwordmeter import Meter

password = "YourReallyLongAndComplexPasswordHere!"
meter = Meter()
strength = meter.test(password)
print(f"Password: {password}, Strength Score: {strength[0]}") # The strength is the first element of the tuple
print(f"Strength Category: {strength[0]}") # We only printed the score before, let's stick to that for now
print(strength)
```

```
from passwordmeter import Meter

def get_passwordmeter_data(password):
    """
    Uses the passwordmeter library to get password strength information.
    """
    meter = Meter()
    strength_tuple = meter.test(password)
    return strength_tuple

# Assuming your DataFrame is named 'df'
df['passwordmeter_info'] = df['password'].swifter.apply(get_passwordmeter_data)

# Now, let's expand the 'passwordmeter_info' column into separate columns
df[['passwordmeter_strength', 'passwordmeter_improvements']] = df['passwordmeter_info'].apply(pd.Series)

# You might want to drop the original tuple column now
df = df.drop(['passwordmeter_info'], axis=1)

print(df.head())
```

```
from zxcvbn import zxcvbn

def get_zxcvbn_data(password):
    """
    Uses the zxcvbn library to get password strength information,
    allowing for passwords up to a specified length.
    Returns an empty dictionary if an error occurs.
    """
    max_len = 260  # Set a max length that accommodates your longest passwords (e.g., 260)
    try:
        result = zxcvbn(password, max_length=max_len)
        return result
    except ValueError as e:
        print(f"Warning: Password '{password[:20]}...' exceeded maximum length and was skipped. Error: {e}")
        return {}  # Return an empty dictionary instead of None


# Assuming your DataFrame is named 'df'
df['zxcvbn_info'] = df['password'].swifter.apply(get_zxcvbn_data)

# Now, let's expand the 'zxcvbn_info' column into separate columns
df = pd.concat([df.drop(['zxcvbn_info'], axis=1), df['zxcvbn_info'].apply(pd.Series)], axis=1)

print(df.head())
```