In [None]:
import pandas as pd
import numpy as np
import re
from collections import Counter

# I. Password Preprocessing

# Step 1: Load the dataset

In [None]:
# Location of the password list that the loading cell opens as latin-1 text.
rockyou_path = "rockyou.txt"  # Update this if your dataset is in a different location

In [None]:
# Read the dataset, one password per line.
#
# The file is iterated instead of calling f.read().splitlines() because
# str.splitlines() treats \x0b, \x0c, \x1c-\x1e and \x85 as line boundaries too.
# Under latin-1 each of those byte values decodes to exactly that character, so
# any password containing one of them would be split into several bogus entries.
# File iteration only breaks on real newlines, and it also avoids holding a
# second full-size copy of the file contents in memory.
with open(rockyou_path, encoding="latin-1") as f:
    passwords = [line.rstrip("\n") for line in f]

print(f"Total passwords loaded: {len(passwords)}")

# Step 2: Clean the dataset

In [None]:
# Remove duplicates
passwords = list(set(passwords))
print(f"Total passwords after removing duplicates: {len(passwords)}")

In [None]:
# Remove passwords shorter than 4 characters
passwords = [pwd for pwd in passwords if len(pwd) >= 4]
print(f"Total passwords after removing short ones: {len(passwords)}")

# Step 3: Label passwords based on frequency

In [None]:
# Count password occurrences
password_counts = Counter(passwords)

In [None]:
# Order (password, count) pairs from most to least frequent.
# The sort is stable, so ties keep the order they have in password_counts.
sorted_passwords = list(password_counts.items())
sorted_passwords.sort(key=lambda pair: pair[1], reverse=True)

In [None]:
# Rank cut-offs used to bucket passwords into Weak / Moderate / Strong.
# Ranks below weak_threshold are "Weak"; ranks from there up to (but excluding)
# moderate_threshold are "Moderate"; everything after is "Strong".
weak_threshold = 100_000        # Top 100K most common passwords
moderate_threshold = 1_000_000  # Next 900K passwords

In [None]:
# Assign a strength label to each password from its rank in sorted_passwords.
labeled_passwords = []
for i, (password, count) in enumerate(sorted_passwords):
    # Rank-based bucketing: the earlier the rank, the weaker the label.
    label = (
        "Weak" if i < weak_threshold
        else "Moderate" if i < moderate_threshold
        else "Strong"
    )
    labeled_passwords.append((password, label))


In [None]:
# Turn the (password, label) pairs into a two-column table and preview it.
column_names = ["Password", "Strength"]
df = pd.DataFrame(labeled_passwords, columns=column_names)
print(df.head())


In [None]:
# Persist the labelled table to disk (without the index column).
df.to_csv(path_or_buf="preprocessed_passwords.csv", index=False)
print("Preprocessed dataset saved as 'preprocessed_passwords.csv'")


# II. Data Preprocessing for LSTM Training

In [None]:
#!pip install tensorflow[and-cuda]
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load preprocessed dataset.
# na_filter=False turns off pandas' NA detection. Without it, passwords whose
# text happens to be an NA marker ("null", "NULL", "None", "#N/A", ...) are read
# back as NaN and then silently discarded by the dropna step that follows.
df = pd.read_csv(
    "preprocessed_passwords.csv",
    dtype=str,
    low_memory=False,
    na_filter=False,
)

In [None]:
# Discard rows whose Password is missing (NaN entries cause tokenizer errors).
required_columns = ["Password"]
df = df.dropna(subset=required_columns)

In [None]:
# Replace the textual strength labels with integer class ids.
label_encoder = LabelEncoder()
encoded_strength = label_encoder.fit_transform(df["Strength"])
df["Strength"] = encoded_strength

In [None]:
# Coerce every entry of the Password column to str.
password_column = df["Password"]
df["Password"] = password_column.astype(str)

In [None]:
# Character-level tokenizer with a capped vocabulary size.
# lower=False is essential here: the Tokenizer default (lower=True) lowercases
# every text, which would make "Password" and "password" tokenize identically
# and throw away letter case as a feature.
# num_words stays at 10000 to match the Embedding layer's input_dim.
tokenizer = Tokenizer(char_level=True, num_words=10000, lower=False)
tokenizer.fit_on_texts(df["Password"])

In [None]:
# Encode each password as a list of integer character ids.
password_texts = df["Password"]
sequences = tokenizer.texts_to_sequences(password_texts)

In [None]:
# Use the 95th-percentile sequence length as the fixed width so that a few
# extremely long passwords do not force heavy padding on everything else.
sequence_lengths = list(map(len, sequences))
max_length = int(np.percentile(sequence_lengths, 95))

# Zero-pad at the end of each sequence up to max_length.
X = pad_sequences(sequences, padding='post', maxlen=max_length)

In [None]:
# Materialise the encoded Strength column as a numpy array of targets.
strength_column = df["Strength"]
y = np.array(strength_column)

In [None]:
# Hold out 20% of the samples for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")


In [None]:
# Write each split to its own .npy file.
for filename, array in (
    ("X_train.npy", X_train),
    ("X_test.npy", X_test),
    ("y_train.npy", y_train),
    ("y_test.npy", y_test),
):
    np.save(filename, array)
print("Preprocessed data saved for LSTM training.")

# III. Building & Training the LSTM Model

In [None]:
# Build the LSTM model: embedding -> two stacked LSTMs -> small dense head.
keras_layers = tf.keras.layers
lstm_layers = [
    keras_layers.Embedding(input_dim=10000, output_dim=128),
    # First LSTM emits the full sequence so the second LSTM can consume it.
    keras_layers.LSTM(64, return_sequences=True),
    keras_layers.LSTM(32),
    keras_layers.Dense(16, activation='relu'),
    # 3 output classes: Weak, Moderate, Strong
    keras_layers.Dense(3, activation='softmax'),
]
model = tf.keras.Sequential(lstm_layers)

In [None]:
# Configure training: integer class targets, Adam optimizer, accuracy metric.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit for 3 epochs, reporting metrics on the held-out split after each epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    batch_size=64,
    epochs=3,
)

In [None]:
# Write the trained network to an HDF5 file.
model.save(filepath="password_strength_lstm.h5")
print("LSTM model trained and saved successfully.")

# IV. Password Strength UI