In [1]:
import pandas as pd
import hashlib
import requests
import json
import string
import numpy as np
import lightgbm as lgb


In [2]:
def dict_atk_ext_src(plaintext_password):
    hashed_password = hashlib.sha1(plaintext_password.encode()).hexdigest().upper()
    first_five_chars = hashed_password[:5]
    rest_of_hash = hashed_password[5:]
    api_url = f"https://api.pwnedpasswords.com/range/{first_five_chars}"
    response = requests.get(api_url)
    if response.status_code == 200:
        hashes = (line.split(':') for line in response.text.splitlines())
        for h, count in hashes:
            if h == rest_of_hash:
                return 0  # Breached
        return 1  # Not breached
    return 0  # Default to breached if API call fails


# def ml_password_classifer_score(password):
#     # Placeholder for ML model prediction
#     # Returns a hardcoded value for now. Replace with actual model prediction.
#     return 1  # Weak

def standard_checks_is_pass(password):
    # Simplified standard checks. Returns 1 if password passes, 0 otherwise
    if len(password) >= 8 and any(char.isdigit() for char in password) and any(char.isupper() for char in password):
        return 1
    return 0



In [3]:
# Load substitutions dictionary from JSON file
substitutions_dict_path = "substitutions_dict.json"  # Update this path if necessary
with open(substitutions_dict_path, "r") as file:
    substitutions_dict = json.load(file)

# Predefined dictionary file
dictionary_file = "common_passwords.txt"

In [4]:
substitutions_dict

{'@': 'a',
 '4': 'a',
 '3': 'e',
 '1': 'i',
 '!': 'i',
 '0': 'o',
 '$': 's',
 '5': 's'}

In [5]:

def generate_substitutions(plaintext):
    """
    Generate possible substitutions for the given plaintext, replacing common substituted characters
    with their alphabetical representation.
    """
    chars = list(plaintext)
    modified = False
    for i, char in enumerate(chars):
        if char in substitutions_dict:
            chars[i] = substitutions_dict[char]
            modified = True
    return "".join(chars), modified


def remove_numeric_sequences(password):
    """
    Remove sequences of three or more sequential numbers from the password.
    Returns the modified password and a flag indicating if any sequence was removed.
    """

    def is_sequential(s):
        ascending = all(
            ord(next_char) == ord(current_char) + 1
            for current_char, next_char in zip(s, s[1:])
        )
        descending = all(
            ord(next_char) == ord(current_char) - 1
            for current_char, next_char in zip(s, s[1:])
        )
        return ascending or descending

    new_password = ""
    numeric_sequence = ""
    modified = False
    for char in password:
        if char.isdigit():
            numeric_sequence += char
        else:
            if is_sequential(numeric_sequence) and len(numeric_sequence) >= 3:
                modified = True  # Indicate removal without keeping the first digit
                numeric_sequence = ""  # Reset sequence
            new_password += numeric_sequence + char
            numeric_sequence = ""
    if is_sequential(numeric_sequence) and len(numeric_sequence) >= 3:
        modified = True
    else:
        new_password += (
            numeric_sequence  # Add any remaining sequence not identified as sequential
        )
    return new_password, modified


def dict_atk_int_src(plaintext_password):
    sequence_removed_password, sequence_removed = remove_numeric_sequences(
        plaintext_password
    )
    final_password, substitution_made = generate_substitutions(
        sequence_removed_password
    )

    solution = hashlib.md5(final_password.encode()).hexdigest()

    sol = "No solution found"
    methods_used = []

    if sequence_removed:
        methods_used.append("numeric sequence removal")
    if substitution_made:
        methods_used.append("character substitution")
    methods_used.append("dictionary")  # Ensure dictionary check is always mentioned

    method_str = ", ".join(methods_used) if methods_used else "direct check"

    try:
        with open(dictionary_file, "r") as filename:
            for line in filename:
                line = line.strip()
                if hashlib.md5(line.encode()).hexdigest() == solution:
                    sol = line
                    break  # Found a match
    except FileNotFoundError:
        print(f"Error: The file '{dictionary_file}' was not found.")
        return "Error: Dictionary file not found.", 0

    if sol == "No solution found":
        return 1 #non found
    else:
        return 0

In [6]:
def is_sequential(s):
    if len(s) < 3:
        return False
    ascending = all(ord(next_char) == ord(current_char) + 1 for current_char, next_char in zip(s, s[1:]))
    descending = all(ord(next_char) == ord(current_char) - 1 for current_char, next_char in zip(s, s[1:]))
    return ascending or descending

def num_seq_is_found(password):
    numeric_sequence = ""
    for char in password:
        if char.isdigit():
            numeric_sequence += char
            if is_sequential(numeric_sequence) and len(numeric_sequence) >= 3:
                return 1  # Found a sequential numeric sequence
        else:
            if is_sequential(numeric_sequence) and len(numeric_sequence) >= 3:
                return 1  # Found a sequential numeric sequence
            numeric_sequence = ""  # Reset sequence because current char is not a digit
    # Check the last sequence if the password ends with a numeric sequence
    if is_sequential(numeric_sequence) and len(numeric_sequence) >= 3:
        return 1
    return 0  # No sequential numeric sequence found


In [7]:
# def substitution_is_found(password):
#     """
#     Checks if any character in the password can be substituted based on a predefined substitutions dictionary.
#     Returns 1 if a substitution can be made, 0 otherwise.
#     """
#     # Assuming substitutions_dict is available in the scope. If not, it should be loaded or passed to this function.
#     for char in password:
#         if char in substitutions_dict:
#             return 1  # A substitution can be made for this character
#     return 0  # No substitutions can be made


def substitution_is_found(plaintext_password):
    # First, remove numeric sequences from the password
    sequence_removed_password, sequence_removed = remove_numeric_sequences(plaintext_password)
    # Then, try to generate substitutions
    final_password, substitution_made = generate_substitutions(sequence_removed_password)

    # Calculate the MD5 hash of the final processed password
    solution = hashlib.md5(final_password.encode()).hexdigest()
    # print(final_password)
    # Attempt to find the password in the dictionary
    try:
        with open(dictionary_file, "r") as filename:
            for line in filename:
                line = line.strip()
                if hashlib.md5(line.encode()).hexdigest() == solution:
                    # Password found in dictionary after modifications
                    return 1
    except FileNotFoundError:
        print(f"Error: The file '{dictionary_file}' was not found.")
        # If the dictionary file is missing, treat it as if the password couldn't be cracked
        return 0

    # If the password was modified but not found in the dictionary, return 0
    if substitution_made and not sequence_removed:
        return 0

    # If no modifications were made or the password wasn't found in the dictionary, also return 0
    return 0


In [8]:
# def ml_password_classifer_score(password):
#     # Placeholder for ML model prediction
#     # Returns a hardcoded value for now. Replace with actual model prediction.
#     return 1  # Weak


def ml_password_classifier_score(password):
    """
    Machine Learning Based Password Strength Classifier.
    Accepts a single password string as input.
    """

    # Feature engineering setup
    punctuation = list(string.punctuation)

    # Wrap the input password in a DataFrame
    X_new = pd.DataFrame([password], columns=["password"])

    # Apply feature engineering
    X_new["length"] = X_new["password"].apply(len)
    X_new["has_num"] = X_new["password"].apply(
        lambda x: any(char.isdigit() for char in x)
    )
    X_new["num_cnt"] = X_new["password"].apply(lambda x: sum(c.isdigit() for c in x))
    X_new["has_lower"] = X_new["password"].apply(
        lambda x: any(char.islower() for char in x)
    )
    X_new["lower_cnt"] = X_new["password"].apply(lambda x: sum(c.islower() for c in x))
    X_new["has_upper"] = X_new["password"].apply(
        lambda x: any(char.isupper() for char in x)
    )
    X_new["upper_cnt"] = X_new["password"].apply(lambda x: sum(c.isupper() for c in x))
    X_new["has_special"] = X_new["password"].apply(
        lambda x: any(char in punctuation for char in x)
    )
    X_new["special_cnt"] = X_new["password"].apply(
        lambda x: sum(char in punctuation for char in x)
    )

    # Prepare features for prediction
    features = X_new.drop(columns=["password"])

    # Load the model
    bst = lgb.Booster(model_file="lgb_model_password_classifier.txt")

    # Predict
    y_pred = bst.predict(features)
    return np.argmax(y_pred)

In [9]:
from password_strength import PasswordStats

def char_seq_strength(plaintext_password):
    stats = PasswordStats(plaintext_password)
    sequence = stats.sequences_length
    strength = stats.strength(weak_bits = 30)
    weakness_factor = stats.weakness_factor
    password_strength = (1 - weakness_factor) * strength

    return password_strength

In [10]:
def estimate_brute_force_time(password):
    """
    Estimates the time required for a brute force attack to crack the given password, returning the time in hours.
    """
    password_length = len(password)
    attempts_per_second = 1e11  # Assumption based on computational capability
    possible_characters = 26 + 26 + 10 + 32  # Uppercase, lowercase, digits, and 32 symbols
    total_combinations = possible_characters ** password_length
    estimated_seconds = total_combinations / attempts_per_second

    # Convert estimated time from seconds to hours
    estimated_hours = estimated_seconds / 3600
    return estimated_hours

In [11]:
import pandas as pd
from tqdm import tqdm

def process_csv(file_path, test_mode=False):
    df = pd.read_csv(file_path)
    # Initialize columns for features
    features = ['dict_atk_ext_src', 'dict_atk_int_src', 'num_seq_is_found', 'substitution_is_found', 'char_seq_strength','ml_password_classifer_score', 'standard_checks_is_pass','estimate_brute_force_time_hours']
    for feature in features:
        df[feature] = 0

    # Limit rows for testing if test_mode is True
    if test_mode:
        rows_to_process = df.iloc[:3]
    else:
        rows_to_process = df

    for index, row in tqdm(rows_to_process.iterrows(), total=rows_to_process.shape[0], desc="Processing"):
        try:
            password = str(row['password'])  # Ensure password is treated as a string
            # Replace placeholders with actual function calls
            df.at[index, 'dict_atk_ext_src'] = dict_atk_ext_src(password)
            df.at[index, 'dict_atk_int_src'] = dict_atk_int_src(password)  # Assuming you've implemented this function
            df.at[index, 'num_seq_is_found'] = num_seq_is_found(password)
            df.at[index, 'substitution_is_found'] = substitution_is_found(password)
            df.at[index, 'char_seq_strength'] = char_seq_strength(password)
            df.at[index, 'ml_password_classifer_score'] = ml_password_classifier_score(password)
            df.at[index, 'standard_checks_is_pass'] = standard_checks_is_pass(password)
            df.at[index, 'estimate_brute_force_time_hours'] = estimate_brute_force_time(password)
        except AttributeError as e:
            print(f"Error processing password: {e}")
            # Set all features to 0 for this row if there's an AttributeError
            for feature in features:
                df.at[index, feature] = 0

    return df

# Example usage for testing with only the first 3 rows
file_path = './passwords.csv'
processed_df = process_csv(file_path, test_mode=False)

file_path = './my_dataframe.csv'

# Save the DataFrame to CSV
processed_df.to_csv(file_path, index=False)



Processing:   0%|          | 0/507 [00:00<?, ?it/s]

Processing: 100%|██████████| 507/507 [01:01<00:00,  8.28it/s]


In [12]:
file_path = './my_dataframe.csv'

# Load the DataFrame back from the CSV
loaded_df = pd.read_csv(file_path)

# Display the loaded DataFrame
loaded_df

Unnamed: 0,rank,password,category,value,time_unit,offline_crack_sec,rank_alt,strength,font_size,dict_atk_ext_src,dict_atk_int_src,num_seq_is_found,substitution_is_found,char_seq_strength,ml_password_classifer_score,standard_checks_is_pass,estimate_brute_force_time_hours
0,1.0,password,password-related,6.91,years,2.170000e+00,1.0,8.0,11.0,0,0,0,1,0.093578,1,0,1.693247e+01
1,2.0,123456,simple-alphanumeric,18.52,minutes,1.110000e-05,2.0,4.0,8.0,0,1,1,0,0.000000,0,0,1.916305e-03
2,3.0,12345678,simple-alphanumeric,1.29,days,1.110000e-03,3.0,4.0,8.0,0,1,1,0,0.000000,1,0,1.693247e+01
3,4.0,1234,simple-alphanumeric,11.11,seconds,1.110000e-07,4.0,4.0,8.0,0,1,1,0,0.000000,0,0,2.168747e-07
4,5.0,qwerty,simple-alphanumeric,3.72,days,3.210000e-03,5.0,8.0,11.0,0,0,0,1,0.000000,0,0,1.916305e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502,,,,,,,,,,0,1,0,0,0.033333,0,0,2.307178e-09
503,,,,,,,,,,0,1,0,0,0.033333,0,0,2.307178e-09
504,,,,,,,,,,0,1,0,0,0.033333,0,0,2.307178e-09
505,,,,,,,,,,0,1,0,0,0.033333,0,0,2.307178e-09


In [13]:
loaded_df = loaded_df.dropna()

In [14]:
columns_to_drop = ['category', 'value', 'time_unit', 'offline_crack_sec', 'rank_alt', 'font_size']
loaded_df = loaded_df.drop(columns=columns_to_drop)


In [15]:
loaded_df.columns

Index(['rank', 'password', 'strength', 'dict_atk_ext_src', 'dict_atk_int_src',
       'num_seq_is_found', 'substitution_is_found', 'char_seq_strength',
       'ml_password_classifer_score', 'standard_checks_is_pass',
       'estimate_brute_force_time_hours'],
      dtype='object')

In [16]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, roc_auc_score
import numpy as np

def tune_and_evaluate_model(X, y):
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Initialize the model
    model = RandomForestClassifier(random_state=42)
    
    # Hyperparameter tuning setup
    param_distributions = {
        'n_estimators': [100, 200, 300, 400, 500],
        'max_depth': [None, 10, 20, 30, 50],
        'min_samples_split': [2, 5, 10, 20],
        'min_samples_leaf': [1, 2, 4, 8]
    }
    
    # Randomized search for hyperparameter tuning
    random_cv = RandomizedSearchCV(estimator=model, param_distributions=param_distributions, n_iter=10, cv=3, random_state=42, n_jobs=-1)
    random_cv.fit(X_train, y_train)
    
    # Best model after tuning
    best_model = random_cv.best_estimator_
    
    # Evaluate the model
    predictions = best_model.predict(X_test)
    proba_predictions = best_model.predict_proba(X_test)[:, 1]  # Get probabilities for the positive class
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy: {accuracy}")
    mse = mean_squared_error(y_test, predictions)
    print(f"Mean Squared Error: {mse}")
    # Feature importance
    feature_importances = best_model.feature_importances_
    
    # Output feature importance
    for feature, importance in zip(X.columns, feature_importances):
        print(f"Feature: {feature}, Importance: {importance}")
        
    return best_model

# Define features and label as per your specification
features = ['dict_atk_ext_src', 'dict_atk_int_src', 'num_seq_is_found','char_seq_strength', 'substitution_is_found', 'ml_password_classifer_score', 'standard_checks_is_pass','estimate_brute_force_time_hours']
label = 'strength'

# Extract features and label from the DataFrame
X = loaded_df[features]
y = loaded_df[label]

# Call the function to tune the model and print feature importances
best_model = tune_and_evaluate_model(X, y)




Accuracy: 0.82
Mean Squared Error: 20.66
Feature: dict_atk_ext_src, Importance: 0.0
Feature: dict_atk_int_src, Importance: 0.021092902251884996
Feature: num_seq_is_found, Importance: 0.016637123228684977
Feature: char_seq_strength, Importance: 0.7943375726159542
Feature: substitution_is_found, Importance: 0.020241385443001296
Feature: ml_password_classifer_score, Importance: 0.023716047515165057
Feature: standard_checks_is_pass, Importance: 0.0
Feature: estimate_brute_force_time_hours, Importance: 0.1239749689453096


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

def tune_and_evaluate_model(X, y):
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Initialize the model with Lasso Regression
    # You might want to adjust the alpha parameter based on your dataset characteristics
    model = Lasso(alpha=0.01, random_state=42)
    
    # Fit the model
    model.fit(X_train, y_train)
    
    # Predictions
    predictions = model.predict(X_test)
    
    # Calculate metrics
    mse = mean_squared_error(y_test, predictions)
    print(f"Mean Squared Error: {mse}")
    r_squared = r2_score(y_test, predictions)
    print(f"R-squared: {r_squared}")
    
    # Normalize the coefficients to get feature importances in the range of 0-1
    abs_coefficients = np.abs(model.coef_)
    normalized_importances = abs_coefficients / np.sum(abs_coefficients)
    
    # Output normalized feature importance
    for feature, importance in zip(X.columns, normalized_importances):
        print(f"Feature: {feature}, Importance: {importance:.5f}")
        
    return model

# Define features and label as per your specification
# Assuming `loaded_df` is a predefined DataFrame with your data
# features = ['dict_atk_ext_src', 'dict_atk_int_src', 'num_seq_is_found', 'char_seq_strength','substitution_is_found', 'ml_password_classifer_score', 'standard_checks_is_pass']
features = ['dict_atk_ext_src', 'dict_atk_int_src', 'num_seq_is_found','char_seq_strength', 'substitution_is_found', 'ml_password_classifer_score', 'standard_checks_is_pass','estimate_brute_force_time_hours']
label = 'strength'  # If 'strength' is a single column, it should be a string, not a list

# Extract features and label from the DataFrame
X = loaded_df[features]
y = loaded_df[label]

# Call the function to evaluate the model and print feature importances
best_model = tune_and_evaluate_model(X, y)


Mean Squared Error: 23.961656825564415
R-squared: 0.4130842585786546
Feature: dict_atk_ext_src, Importance: 0.00000
Feature: dict_atk_int_src, Importance: 0.14292
Feature: num_seq_is_found, Importance: 0.00680
Feature: char_seq_strength, Importance: 0.77292
Feature: substitution_is_found, Importance: 0.07307
Feature: ml_password_classifer_score, Importance: 0.00419
Feature: standard_checks_is_pass, Importance: 0.00000
Feature: estimate_brute_force_time_hours, Importance: 0.00010
