In [1]:
import itertools
import time
import math
from difflib import SequenceMatcher

# Brute-force password cracking simulation
def brute_force(target_password, max_length=None):
    """
    Simulate brute-force cracking by enumerating candidate strings.

    Candidates are built from lowercase letters, uppercase letters and digits,
    shortest first, until one equals ``target_password`` or every candidate of
    up to ``max_length`` characters has been tried. Every 10000th attempt is
    printed as a progress indicator.

    Args:
        target_password: The password to search for.
        max_length: Longest candidate length to try. Defaults to
            ``len(target_password)``.

    Returns:
        A ``(guess, attempts, elapsed_seconds)`` tuple. ``guess`` is the
        recovered password, or ``None`` when the target is not in the search
        space; ``attempts`` is then the size of that whole space.
    """
    chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
    if max_length is None:
        max_length = len(target_password)
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    # A target that is empty, longer than max_length, or that uses a character
    # outside the alphabet can never be generated. Enumerating the full space
    # just to fail would take len(chars)**max_length iterations, so the number
    # of attempts such a search would make is computed directly instead
    # (no progress lines are printed in this case).
    alphabet = set(chars)
    unreachable = (
        not target_password
        or len(target_password) > max_length
        or any(ch not in alphabet for ch in target_password)
    )
    if unreachable:
        attempts = 0
        length = 1
        while length <= max_length:
            attempts += len(chars) ** length
            length += 1
        return None, attempts, time.perf_counter() - start_time

    attempts = 0
    length = 1
    while length <= max_length:
        for attempt in itertools.product(chars, repeat=length):
            guess = ''.join(attempt)
            attempts += 1
            # For demonstration, we print only every 10000th attempt to avoid flooding the output
            if attempts % 10000 == 0:
                print(f"Brute-force attempt {attempts}: {guess}")
            if guess == target_password:
                return guess, attempts, time.perf_counter() - start_time
        length += 1
    # Defensive fallback; a reachable target is always found above.
    return None, attempts, time.perf_counter() - start_time

# Simulated Hashcat analysis based on entropy calculation
def hashcat_simulation(target_password):
    """
    Estimate cracking time with a simple entropy-based model.

    The character pool is inferred from the classes of characters present in
    the password (lowercase, uppercase, digits, symbols). The size of the
    search space is ``pool ** len(target_password)`` and the time estimate
    divides that by a hypothetical guess rate of 1e11 guesses per second.

    Args:
        target_password: The password to analyse.

    Returns:
        A ``(entropy_bits, possibilities, seconds)`` tuple of floats.
        ``possibilities`` and ``seconds`` are ``inf`` when the search space is
        too large to represent as a float.
    """
    # Define the character pool based on the password's composition
    pool = 0
    if any(c.islower() for c in target_password):
        pool += 26
    if any(c.isupper() for c in target_password):
        pool += 26
    if any(c.isdigit() for c in target_password):
        pool += 10
    if any(not c.isalnum() for c in target_password):
        pool += 32  # common symbols estimate

    if pool == 0:
        pool = 1  # log2(0) is undefined; a pool of 1 yields zero entropy

    # Estimate entropy (bits)
    entropy = len(target_password) * math.log2(pool)

    # Count the possibilities with exact integer arithmetic rather than
    # 2 ** entropy: that avoids rounding through log2 and, more importantly,
    # the OverflowError that float exponentiation raises for long passwords.
    try:
        possibilities = float(pool ** len(target_password))
    except OverflowError:
        possibilities = math.inf

    hashcat_rate = 1e11  # e.g., 100 billion guesses per second
    time_estimate = possibilities / hashcat_rate
    return entropy, possibilities, time_estimate

# XAI explanation based on the results from brute-force and hashcat simulation
def generate_xai_explanation(target_password, brute_info, hashcat_info):
    """
    Build a human-readable report on the strength of a password.

    Combines the brute-force result and the simulated hashcat estimate, then
    adds a verdict based on the estimated cracking time and a similarity check
    against the common password "password".

    Args:
        target_password: The password that was analysed.
        brute_info: ``(guess, attempts, elapsed_seconds)``; ``guess`` is falsy
            when the brute-force run did not recover the password.
        hashcat_info: ``(entropy_bits, possibilities, seconds)``.

    Returns:
        The report as a newline-terminated, multi-line string.
    """
    guessed, attempts, brute_time = brute_info
    entropy, possibilities, hashcat_time = hashcat_info

    lines = [f"Password: {target_password}"]
    if guessed:
        lines.append(f"Brute-force simulation: {attempts} attempts in {brute_time:.4f} seconds.")
    else:
        # Make a failed search explicit instead of reporting it like a success.
        lines.append(
            f"Brute-force simulation: password not found after {attempts} attempts "
            f"in {brute_time:.4f} seconds."
        )
    lines.append(f"Estimated entropy: {entropy:.2f} bits")
    lines.append(f"Total possibilities: {possibilities:.2e}")
    lines.append(f"Simulated Hashcat cracking time: {hashcat_time:.4f} seconds.")
    lines.append("")  # blank line between the figures and the verdict

    # Explain vulnerability based on simulated cracking times
    if hashcat_time < 1:
        lines.append("Analysis: The password can be cracked almost instantly with advanced tools. It is highly vulnerable.")
    elif hashcat_time < 60:
        lines.append("Analysis: The password is weak and could be compromised within a minute by dedicated attackers.")
    else:
        lines.append("Analysis: The password has higher complexity, making it more resistant to brute-force attacks.")

    # Similarity check with a common password (e.g., 'password').
    # Compare case-insensitively: "PASSWORD" or "PassWord1" are just as
    # guessable as the lowercase form, but a case-sensitive match misses them.
    common_password = "password"
    similarity = SequenceMatcher(None, target_password.casefold(), common_password).ratio()
    if similarity > 0.5:
        lines.append("Warning: Your password bears similarity to common password patterns, which increases vulnerability.")
    else:
        lines.append("Your password does not closely resemble common patterns.")

    return "\n".join(lines) + "\n"

# Main function: Get user input and run the analysis
def main():
    """Prompt for a password, run both simulations and print the combined report."""
    target = input("Enter the password for analysis: ").strip()

    # Stage 1: exhaustive search, capped at the password's own length.
    print("\nStarting brute-force simulation...")
    brute_info = brute_force(target, max_length=len(target))
    cracked = brute_info[0]
    if not cracked:
        print("\nBrute-force simulation did not crack the password within the max length limit.")
    else:
        attempt_count, seconds = brute_info[1], brute_info[2]
        print(f"\nBrute-force cracked password: {cracked}")
        print(f"Attempts: {attempt_count}, Time: {seconds:.4f} seconds")

    # Stage 2: entropy-based estimate of a high-speed cracking rig.
    hashcat_info = hashcat_simulation(target)
    entropy, _, hashcat_time = hashcat_info
    print(f"\nSimulated Hashcat cracking time: {hashcat_time:.4f} seconds (Entropy: {entropy:.2f} bits)")

    # Stage 3: combine both results into the textual explanation.
    explanation = generate_xai_explanation(target, brute_info, hashcat_info)
    print("\nXAI Explanation:\n", explanation, sep="\n")

# Run the interactive analysis only when this code is executed as the main
# module, not when it is imported.
if __name__ == "__main__":
    main()



Starting brute-force simulation...
Brute-force attempt 10000: bKr
Brute-force attempt 20000: elJ
Brute-force attempt 30000: gW1
Brute-force attempt 40000: jyj
Brute-force attempt 50000: l9B
Brute-force attempt 60000: oKT
Brute-force attempt 70000: rmb
Brute-force attempt 80000: tXt
Brute-force attempt 90000: wyL
Brute-force attempt 100000: y93
Brute-force attempt 110000: BLl
Brute-force attempt 120000: EmD
Brute-force attempt 130000: GXV
Brute-force attempt 140000: Jzd
Brute-force attempt 150000: Mav
Brute-force attempt 160000: OLN
Brute-force attempt 170000: Rm5
Brute-force attempt 180000: TYn
Brute-force attempt 190000: WzF
Brute-force attempt 200000: ZaX
Brute-force attempt 210000: 1Mf
Brute-force attempt 220000: 4nx
Brute-force attempt 230000: 6YP
Brute-force attempt 240000: 9z7
Brute-force attempt 250000: acbp
Brute-force attempt 260000: aeMH
Brute-force attempt 270000: ahnZ
Brute-force attempt 280000: ajZh
Brute-force attempt 290000: amAz
Brute-force attempt 300000: apbR
Brute-f


Starting brute-force simulation...
Brute-force attempt 10000: bKr
Brute-force attempt 20000: elJ
Brute-force attempt 30000: gW1
Brute-force attempt 40000: jyj
Brute-force attempt 50000: l9B
Brute-force attempt 60000: oKT
Brute-force attempt 70000: rmb
Brute-force attempt 80000: tXt
Brute-force attempt 90000: wyL
Brute-force attempt 100000: y93
Brute-force attempt 110000: BLl
Brute-force attempt 120000: EmD
Brute-force attempt 130000: GXV
Brute-force attempt 140000: Jzd
Brute-force attempt 150000: Mav
Brute-force attempt 160000: OLN
Brute-force attempt 170000: Rm5
Brute-force attempt 180000: TYn
Brute-force attempt 190000: WzF
Brute-force attempt 200000: ZaX
Brute-force attempt 210000: 1Mf
Brute-force attempt 220000: 4nx
Brute-force attempt 230000: 6YP
Brute-force attempt 240000: 9z7
Brute-force attempt 250000: acbp
Brute-force attempt 260000: aeMH
Brute-force attempt 270000: ahnZ
Brute-force attempt 280000: ajZh
Brute-force attempt 290000: amAz
Brute-force attempt 300000: apbR
Brute-f

KeyboardInterrupt: 

In [4]:
# download_and_process.py

import requests
import tarfile
import os
import math
import csv

def download_rockyou(url="https://github.com/danielmiessler/SecLists/raw/master/Passwords/Leaked-Databases/rockyou.txt.tar.gz",
                     local_tar="rockyou.txt.tar.gz", timeout=30):
    """
    Download the Rockyou archive and store it on disk.

    Args:
        url: Location of the ``.tar.gz`` archive.
        local_tar: Path the downloaded archive is written to.
        timeout: Seconds to wait for the connection and for each read before
            giving up, so a stalled server cannot hang the call forever.

    Raises:
        Exception: If the server answers with a status code other than 200.
        requests.exceptions.RequestException: On connection errors or timeouts.
    """
    print("Downloading Rockyou dataset...")
    # Using the response as a context manager releases the streamed
    # connection even when the status check or the file write fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            raise Exception(f"Failed to download dataset. Status code: {response.status_code}")
        with open(local_tar, "wb") as f:
            # 1 MiB chunks: far fewer write calls than 1 KiB chunks for a
            # large archive, while still keeping memory use bounded.
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)
    print("Download complete.")

def _is_within_directory(base_dir, candidate):
    """Return True when *candidate* resolves to a path inside *base_dir*."""
    base_real = os.path.realpath(base_dir)
    candidate_real = os.path.realpath(candidate)
    try:
        return os.path.commonpath([base_real, candidate_real]) == base_real
    except ValueError:
        # Raised e.g. for paths on different drives: definitely not inside.
        return False


def extract_tar(tar_path="rockyou.txt.tar.gz", extract_path="."):
    """
    Extract a gzip-compressed tar archive into ``extract_path``.

    The archive comes from the network, so members are not trusted: entries
    that would be written outside ``extract_path`` are refused instead of
    being extracted.

    Args:
        tar_path: Path to the ``.tar.gz`` archive.
        extract_path: Directory the members are extracted into.

    Raises:
        Exception: If ``tar_path`` is not a tar archive, or (on Python
            versions without extraction filters) if a member is unsafe.
        tarfile.TarError: If the archive cannot be read or a member is
            rejected by the extraction filter.
    """
    print("Extracting dataset...")
    if not tarfile.is_tarfile(tar_path):
        raise Exception(f"{tar_path} is not a valid tar.gz file.")
    with tarfile.open(tar_path, "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # The "data" filter rejects absolute paths, ".." traversal,
            # links pointing outside the destination and special files.
            tar.extractall(path=extract_path, filter="data")
        else:
            # Older Python without extraction filters: validate manually and
            # only allow regular files and directories inside extract_path.
            for member in tar.getmembers():
                destination = os.path.join(extract_path, member.name)
                if not (member.isfile() or member.isdir()) or not _is_within_directory(extract_path, destination):
                    raise Exception(f"Refusing to extract unsafe archive member: {member.name}")
            tar.extractall(path=extract_path)
    print("Extraction complete.")

def shannon_entropy(password):
    """Return the per-character Shannon entropy (in bits) of *password*.

    An empty password yields 0.
    """
    if not password:
        return 0

    total = len(password)
    # dict.fromkeys keeps first-occurrence order, so the terms below are
    # accumulated in the same order as the characters first appear.
    occurrences = {symbol: password.count(symbol) for symbol in dict.fromkeys(password)}

    bits = 0
    for hits in occurrences.values():
        probability = hits / total
        bits -= probability * math.log2(probability)
    return bits

def process_dataset(input_file="rockyou.txt", output_file="rockyou_with_entropy.csv", entropy_threshold=3.0):
    """
    Read a password list, compute each password's entropy and write a CSV.

    Each line of ``input_file`` is one password. The output has a header row
    ``password,entropy`` followed by one row per password whose entropy is
    greater than or equal to ``entropy_threshold``. Empty lines are skipped.

    Args:
        input_file: Path to the newline-separated password list.
        output_file: Path of the CSV file to create (overwritten if present).
        entropy_threshold: Minimum entropy a password needs to be written.

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"{input_file} not found. Please check that the dataset was extracted correctly.")

    # latin-1 maps every byte to a character, so decoding can never fail and
    # no error handler is needed.
    with open(input_file, "r", encoding="latin1") as f_in, \
         open(output_file, "w", newline="", encoding="utf-8") as f_out:

        writer = csv.writer(f_out)
        writer.writerow(["password", "entropy"])

        for line in f_in:
            # Remove only the line terminator. A bare strip() would also eat
            # leading/trailing spaces and characters such as "\xa0" or "\x85"
            # that are part of the password itself, silently altering it.
            pwd = line.rstrip("\r\n")
            if not pwd:
                continue
            ent = shannon_entropy(pwd)
            if ent >= entropy_threshold:
                writer.writerow([pwd, ent])
    print(f"Processed dataset saved to {output_file}")

# --- Main execution block ---
# This will run when you execute the cell in the notebook.
# Download and extract the dataset only when rockyou.txt is not already
# present, then build the entropy CSV. Any failure is reported rather than
# raised so the notebook cell finishes with a readable message.
try:
    if not os.path.exists("rockyou.txt"):
        download_rockyou()
        extract_tar()
    else:
        print("rockyou.txt already exists. Skipping download and extraction.")

    process_dataset()
    # A real newline escape: the doubled backslash used to print the two
    # characters "\n" literally in front of the message.
    print("\nData preparation complete!")
except Exception as e:
    print(f"\nAn error occurred: {e}")

Downloading Rockyou dataset...
Download complete.
Extracting dataset...
Download complete.
Extracting dataset...
Extraction complete.
Extraction complete.
Processed dataset saved to rockyou_with_entropy.csv
\nData preparation complete!
Processed dataset saved to rockyou_with_entropy.csv
\nData preparation complete!


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import json

# --- 1. Load the GOLD STANDARD High-Entropy Dataset ---
print("Loading the gold standard password dataset...")
try:
    # Read the passwords verbatim. Forcing the column to str keeps all-digit
    # passwords intact (type inference would drop leading zeros), and turning
    # off the default NA markers stops literal passwords such as "null" or
    # "NA" from being parsed as missing values and dropped.
    df_strong = pd.read_csv(
        'gold_standard_passwords.csv',
        dtype={'password': str},
        keep_default_na=False,
    )

    high_entropy_passwords = df_strong['password'].dropna().astype(str).values
    # Report the number of passwords actually kept for training.
    print(f"--> Training on exactly {len(high_entropy_passwords)} high-entropy passwords.")
except FileNotFoundError:
    print("Error: 'gold_standard_passwords.csv' not found. Please run the previous cell first.")
    raise

# --- 2. Prepare Data for the LSTM Model ---
# Character vocabulary: every distinct character seen in the training
# passwords, mapped to a stable integer index (sorted order) and back.
chars = sorted(set(''.join(high_entropy_passwords)))
char_to_int = {ch: idx for idx, ch in enumerate(chars)}
int_to_char = {idx: ch for idx, ch in enumerate(chars)}
vocab_size = len(chars)
print(f"Vocabulary size for strong passwords: {vocab_size}")

seq_length = 10
# Number of leading patterns used for training.
# For a full run, increase subset_size or set it to None to use everything.
subset_size = 100000

# Slide a window of seq_length characters over each password; the window is
# the model input and the character right after it is the prediction target.
# Passwords of seq_length characters or fewer produce no windows.
X = []
y = []
print("Creating input/output sequences for the model...")
for password in high_entropy_passwords:
    for i in range(len(password) - seq_length):
        seq_in = password[i:i + seq_length]
        seq_out = password[i + seq_length]
        # Every character comes from the vocabulary built above, so a direct
        # lookup is safe and cannot silently alias an unknown character to 0.
        X.append([char_to_int[char] for char in seq_in])
        y.append(char_to_int[seq_out])

num_patterns = len(X)
print(f"Total training patterns: {num_patterns}")
if num_patterns == 0:
    raise ValueError(
        f"No training patterns were created: no password is longer than {seq_length} characters."
    )

# Shape (patterns, timesteps, features) with indices scaled into [0, 1).
X_reshaped = np.reshape(X, (num_patterns, seq_length, 1))
X_reshaped = X_reshaped / float(vocab_size)
# One-hot encode only the labels that are trained on. Encoding every pattern
# allocates a dense (num_patterns x vocab_size) array, which is several
# gigabytes for millions of patterns, even though only the first subset_size
# rows are ever used. y_categorical therefore holds subset_size rows at most.
y_categorical = to_categorical(y[:subset_size], num_classes=vocab_size)

# --- 3. Build the SIMPLER LSTM Model ---
print("\nBuilding a simpler, more robust LSTM model...")
model = Sequential([
    Input(shape=(X_reshaped.shape[1], X_reshaped.shape[2])),
    LSTM(256, return_sequences=True),
    Dropout(0.3),
    LSTM(256),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.3),
    # One output unit per vocabulary character: a next-character distribution.
    Dense(vocab_size, activation='softmax')
])

optimizer = Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.summary()

# --- 4. Train the Model ---
print("\nStarting model training... (This will take some time)")
# Inputs are sliced to the same subset the labels were encoded for.
model.fit(X_reshaped[:subset_size], y_categorical, epochs=20, batch_size=128)

# --- 5. Save the New Model AND the Vocabulary ---
# The vocabulary is stored next to the model so the character-to-index
# mapping used for training can be restored together with the weights.
new_model_filename = 'strong_password_generator.h5'
vocab_filename = 'char_to_int.json'

model.save(new_model_filename)
with open(vocab_filename, 'w') as vocab_file:
    vocab_file.write(json.dumps(char_to_int))

print(f"\nTraining complete. New model saved as '{new_model_filename}'")
print(f"Vocabulary saved as '{vocab_filename}'")

Loading the high-entropy password dataset...
Loaded 3378549 high-entropy passwords for training.
Loaded 3378549 high-entropy passwords for training.
Vocabulary size for strong passwords: 206
Creating input/output sequences for the model...
Vocabulary size for strong passwords: 206
Creating input/output sequences for the model...
Total training patterns: 5384202
Total training patterns: 5384202

Building the advanced LSTM model...

Building the advanced LSTM model...


Loading the high-entropy password dataset...
Loaded 3378549 high-entropy passwords for training.
Loaded 3378549 high-entropy passwords for training.
Vocabulary size for strong passwords: 206
Creating input/output sequences for the model...
Vocabulary size for strong passwords: 206
Creating input/output sequences for the model...
Total training patterns: 5384202
Total training patterns: 5384202

Building the advanced LSTM model...

Building the advanced LSTM model...


Loading the high-entropy password dataset...
Loaded 3378549 high-entropy passwords for training.
Loaded 3378549 high-entropy passwords for training.
Vocabulary size for strong passwords: 206
Creating input/output sequences for the model...
Vocabulary size for strong passwords: 206
Creating input/output sequences for the model...
Total training patterns: 5384202
Total training patterns: 5384202

Building the advanced LSTM model...

Building the advanced LSTM model...


Loading the high-entropy password dataset...
Loaded 3378549 high-entropy passwords for training.
Loaded 3378549 high-entropy passwords for training.
Vocabulary size for strong passwords: 206
Creating input/output sequences for the model...
Vocabulary size for strong passwords: 206
Creating input/output sequences for the model...
Total training patterns: 5384202
Total training patterns: 5384202

Building the advanced LSTM model...

Building the advanced LSTM model...



Starting model training... (This will take some time)
Epoch 1/20
Epoch 1/20
[1m 12/782[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5:57[0m 464ms/step - accuracy: 0.0367 - loss: 4.9577

Loading the high-entropy password dataset...
Loaded 3378549 high-entropy passwords for training.
Loaded 3378549 high-entropy passwords for training.
Vocabulary size for strong passwords: 206
Creating input/output sequences for the model...
Vocabulary size for strong passwords: 206
Creating input/output sequences for the model...
Total training patterns: 5384202
Total training patterns: 5384202

Building the advanced LSTM model...

Building the advanced LSTM model...



Starting model training... (This will take some time)
Epoch 1/20
Epoch 1/20
[1m 12/782[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5:57[0m 464ms/step - accuracy: 0.0367 - loss: 4.9577

KeyboardInterrupt: 