In [None]:
import pandas as pd
import numpy as np
import re
from collections import Counter

# I. Password Preprocessing

# Step 1: Load the dataset

In [None]:
# Path of the password wordlist; the loading cell treats every line of this file as one password.
rockyou_path = "rockyou.txt"  # Update this if your dataset is in a different location

In [None]:
# Read the dataset, one password per line.
# Iterate over the file instead of calling `f.read().splitlines()`: once the
# bytes are decoded as latin-1, `str.splitlines()` also breaks on \x0b, \x0c,
# \x1c-\x1e and \x85, which would cut any password containing one of those
# bytes into fragments. File iteration splits on newlines only, and it avoids
# holding the entire file in memory as a single string first.
with open(rockyou_path, encoding="latin-1") as f:
    passwords = [line.rstrip("\n") for line in f]

print(f"Total passwords loaded: {len(passwords)}")

# Step 2: Clean the dataset

In [None]:
# Remove duplicates, keeping the first occurrence of each password at its
# original position in the file. `set()` would hand the passwords back in an
# arbitrary, run-dependent order, and the labelling step further down assigns
# Weak/Moderate/Strong purely from each password's position in the list, so
# the order has to be stable and meaningful.
passwords = list(dict.fromkeys(passwords))
print(f"Total passwords after removing duplicates: {len(passwords)}")

In [None]:
# Keep only the passwords that are at least 4 characters long
passwords = list(filter(lambda candidate: len(candidate) >= 4, passwords))
print(f"Total passwords after removing short ones: {len(passwords)}")

# Step 3: Label passwords based on frequency

In [None]:
# Tally how often each password occurs.
# NOTE: `passwords` was de-duplicated in Step 2, so every count here is 1.
password_counts = Counter()
password_counts.update(passwords)

In [None]:
# Order the (password, count) pairs from most to least frequent.
# The sort is stable, so passwords with equal counts keep their insertion order.
sorted_passwords = password_counts.most_common()

In [None]:
# Rank cut-offs that separate Weak, Moderate, and Strong
weak_threshold = 100_000  # ranks below this (the first 100K) are labelled Weak
moderate_threshold = 1_000_000  # ranks from 100K up to 1M (the next 900K) are labelled Moderate

In [None]:
# Label every password from its rank in `sorted_passwords`:
# rank < weak_threshold -> "Weak", rank < moderate_threshold -> "Moderate",
# everything after that -> "Strong". The count itself is not used.
labeled_passwords = [
    (
        password,
        "Weak" if rank < weak_threshold
        else "Moderate" if rank < moderate_threshold
        else "Strong",
    )
    for rank, (password, _count) in enumerate(sorted_passwords)
]


In [None]:
# Turn the (password, label) pairs into a two-column table and preview it
df = pd.DataFrame(data=labeled_passwords, columns=["Password", "Strength"])
print(df.head(n=5))


In [None]:
# Write the labelled passwords to CSV (without the DataFrame index column)
df.to_csv(path_or_buf="preprocessed_passwords.csv", index=False)
print("Preprocessed dataset saved as 'preprocessed_passwords.csv'")


# II. Data Preprocessing for LSTM Training

In [None]:
#!pip install tensorflow[and-cuda]
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load preprocessed dataset.
# keep_default_na=False stops pandas from turning literal passwords such as
# "null", "nan", "NA", "None" or "N/A" into missing values; dtype=str alone does
# not prevent that, and those rows would then be discarded by the NaN-dropping
# step that follows.
df = pd.read_csv(
    "preprocessed_passwords.csv",
    dtype=str,
    low_memory=False,
    keep_default_na=False,
)

In [None]:
# Keep only rows whose password is present (missing values cause tokenizer errors)
df = df[df["Password"].notna()]

In [None]:
# Replace the textual strength labels with integer class ids.
# LabelEncoder numbers classes in sorted label order, so with all three labels
# present the mapping is Moderate=0, Strong=1, Weak=2.
label_encoder = LabelEncoder()
label_encoder.fit(df["Strength"])
df["Strength"] = label_encoder.transform(df["Strength"])

In [None]:
# Guarantee that every entry of the Password column is a string
df = df.assign(Password=df["Password"].astype(str))

In [None]:
# Character-level tokenizer.
# lower=False: the Tokenizer lower-cases its input by default, which would make
# "Password" and "password" identical to the model and throw away the
# upper/lower-case mix of each password.
# num_words=10000 is kept equal to the Embedding layer's input_dim used when the
# model is built.
tokenizer = Tokenizer(char_level=True, num_words=10000, lower=False)
tokenizer.fit_on_texts(df["Password"])

In [None]:
# Encode each password as a list of integer character ids
password_texts = df["Password"]
sequences = tokenizer.texts_to_sequences(password_texts)

In [None]:
# Choose the padded length from the data: the 95th percentile of the sequence
# lengths, so a handful of extremely long passwords do not force long padding.
# Shorter sequences are zero-padded at the end (padding='post').
# NOTE(review): the inference cells hard-code max_length = 50 instead of reusing
# this computed value — confirm the two agree.
max_length = int(np.percentile(list(map(len, sequences)), 95))
X = pad_sequences(sequences, maxlen=max_length, padding='post')

In [None]:
# Extract the encoded labels as a standalone numpy array
y = df["Strength"].to_numpy(copy=True)

In [None]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")


In [None]:
# Write each split to its own .npy file (X_train.npy, X_test.npy, y_train.npy, y_test.npy)
for stem, array in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    np.save(f"{stem}.npy", array)
print("Preprocessed data saved for LSTM training.")

# III. Building & Training the LSTM Model

In [None]:
# Assemble the LSTM classifier one layer at a time
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=10000, output_dim=128))  # character id -> 128-d vector
model.add(tf.keras.layers.LSTM(64, return_sequences=True))  # emits the full sequence for the next LSTM
model.add(tf.keras.layers.LSTM(32))  # emits only its final output
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(3, activation='softmax'))  # one probability per strength class

In [None]:
# Configure the optimizer, loss and metric
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',  # labels are integer class ids, not one-hot vectors
    metrics=['accuracy'],
)

# Fit for 3 epochs; the held-out test split doubles as the validation set
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=3,
    validation_data=(X_test, y_test),
)

In [None]:
# Save the trained model
model.save("password_strength_lstm.h5")

# Also persist the fitted tokenizer. The serving cells open "tokenizer.pkl", and
# inference has to use exactly the character -> id mapping learned during
# training; without this step that file does not exist on a fresh run.
import pickle

with open("tokenizer.pkl", "wb") as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("LSTM model trained and saved successfully.")

# IV. Password Strength UI

In [None]:
import streamlit as st
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle

In [3]:
!pip install "fastapi[all]"

Collecting fastapi[all]
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi[all])
  Downloading starlette-0.46.1-py3-none-any.whl.metadata (6.2 kB)
Collecting fastapi-cli>=0.0.5 (from fastapi-cli[standard]>=0.0.5; extra == "all"->fastapi[all])
  Downloading fastapi_cli-0.0.7-py3-none-any.whl.metadata (6.2 kB)
Collecting jinja2>=3.1.5 (from fastapi[all])
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting python-multipart>=0.0.18 (from fastapi[all])
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting orjson>=3.2.1 (from fastapi[all])
  Downloading orjson-3.10.15-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (41 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.8/41.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting email-validator>=2.0.0 (from fastapi[all])
  Downloading email_validator-2.2.0-py3-none-

In [4]:
from fastapi import FastAPI
import tensorflow as tf
import numpy as np
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences

# FastAPI application object that the route decorators below attach to
app = FastAPI()

# Restore the trained network and the fitted tokenizer from disk.
# NOTE: pickle.load can execute code embedded in the file — only load a
# tokenizer.pkl that you produced yourself.
model = tf.keras.models.load_model(filepath="password_strength_lstm.h5")
with open("tokenizer.pkl", "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# Padded sequence length used at inference time.
# NOTE(review): training derives max_length from the 95th-percentile password
# length rather than a fixed 50 — confirm the two agree.
max_length = 50  # Must match training

def predict_strength(password):
    """Classify a password with the trained LSTM and report the model's confidence.

    Args:
        password: The plain-text password to score.

    Returns:
        dict: ``{"strength": <label>, "confidence": <float>}`` where the label is
        "Weak", "Moderate" or "Strong" and the confidence is the largest softmax
        probability produced by the model.
    """
    sequence = tokenizer.texts_to_sequences([password])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post')
    # A single password is passed in, so only the first row of the output is used.
    prediction = model.predict(padded_sequence)[0]
    # The training labels were encoded with sklearn's LabelEncoder, which numbers
    # classes in sorted (alphabetical) order: Moderate=0, Strong=1, Weak=2. The
    # list must follow that order, otherwise every prediction is reported under
    # the wrong name.
    strength_labels = ["Moderate", "Strong", "Weak"]
    best = int(np.argmax(prediction))
    return {"strength": strength_labels[best], "confidence": float(prediction[best])}

# Root route: fixed payload confirming that the API process is up.
@app.get("/")
def home():
    status = {"message": "FastAPI is running successfully!"}
    return status
    
# Prediction route: scores the value of the `password` query parameter.
# NOTE(review): the password travels in the URL of a GET request — confirm that
# this is acceptable for the deployment (URLs are commonly written to logs).
@app.get("/predict/")
def get_strength(password: str):
    result = predict_strength(password)
    return result

2025-03-22 07:52:22.229851: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742626342.302546  178134 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742626342.323278  178134 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-22 07:52:22.484121: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-03-22 07:52:25.461313: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL

In [None]:
# Recompile the model (if needed).
# The labels are integer class ids (not one-hot vectors), so the loss has to be
# the sparse variant — the same loss the model was originally trained with.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Restore the trained network from the HDF5 file written after training
model = tf.keras.models.load_model(filepath="password_strength_lstm.h5")


In [None]:
# Persist the tokenizer to disk (this cell WRITES tokenizer.pkl; it does not read it)
with open("tokenizer.pkl", "wb") as tokenizer_file:
    tokenizer_file.write(pickle.dumps(tokenizer, protocol=pickle.HIGHEST_PROTOCOL))


In [None]:
# Define max_length (must match training configuration)
# NOTE(review): the training cell computes max_length from the 95th-percentile
# password length instead of using a fixed 50 — confirm the two agree.
max_length = 50

In [None]:
# Function to predict password strength
def predict_strength(password):
    """Classify a password with the trained LSTM.

    Reads `model`, `tokenizer` and `max_length` from module scope.

    Args:
        password: The plain-text password to score.

    Returns:
        tuple: ``(label, confidence)`` where the label is "Weak", "Moderate" or
        "Strong" and the confidence is the largest softmax probability as a float.
    """
    st.write("Debug: Checking if model and tokenizer are loaded.")
    sequence = tokenizer.texts_to_sequences([password])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post')
    # A single password is passed in, so only the first row of the output is used.
    prediction = model.predict(padded_sequence)[0]
    # The training labels were encoded with sklearn's LabelEncoder, which numbers
    # classes in sorted (alphabetical) order: Moderate=0, Strong=1, Weak=2. The
    # list must follow that order, otherwise every prediction is reported under
    # the wrong name.
    strength_labels = ["Moderate", "Strong", "Weak"]
    best = int(np.argmax(prediction))
    return strength_labels[best], float(prediction[best])


In [1]:
# Streamlit UI
def main():
    """Render the Streamlit page: load the model and tokenizer, then score user input.

    The loaded objects are published as module-level globals because
    `predict_strength` reads `model` and `tokenizer` from module scope. If either
    artifact cannot be loaded, the error is shown and the page stops there.
    """
    # Without this declaration the assignments below would create locals that
    # predict_strength() never sees.
    global model, tokenizer

    # Debugging logs
    st.set_page_config(layout="wide")
    st.write("🔍 Checking model and tokenizer loading...")

    try:
        model = tf.keras.models.load_model("password_strength_lstm.h5")
        st.write("✅ Model loaded successfully.")
    except Exception as e:
        st.error(f"❌ Model loading failed: {e}")
        return  # no prediction is possible without the model

    try:
        # NOTE: pickle.load can execute code embedded in the file — only load a
        # tokenizer.pkl that you produced yourself.
        with open("tokenizer.pkl", "rb") as handle:
            tokenizer = pickle.load(handle)
        st.write("✅ Tokenizer loaded successfully.")
    except Exception as e:
        st.error(f"❌ Tokenizer loading failed: {e}")
        return  # no prediction is possible without the tokenizer

    st.title("🔐 AI-Based Password Strength Analyzer")
    st.write("Enter a password to check its strength using an AI model.")

    password = st.text_input("Enter Password:", type="password")
    if st.button("Check Strength"):
        if password:
            strength, confidence = predict_strength(password)
            st.success(f"Password Strength: {strength} (Confidence: {confidence:.2f})")
        else:
            st.warning("Please enter a password.")

# Call main() only when this code runs as the top-level module (__name__ is "__main__").
if __name__ == "__main__":
    main()


NameError: name 'st' is not defined