In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
from warnings import filterwarnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import pickle

# Suppress warnings
# NOTE: this silences every warning for the whole process, including
# deprecation notices emitted by pandas and scikit-learn.
filterwarnings("ignore")

# Data Collection and Cleaning
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines="warn"` is its replacement: malformed rows are skipped and
# reported instead of aborting the read.
data = pd.read_csv("C:/Users/User/OneDrive/Desktop/Test2/data.csv", on_bad_lines="warn")
# Drop exact duplicate rows, then rows with any missing value. Both operate
# in place and leave gaps in the index (it is not reset here).
data.drop_duplicates(inplace=True)
data.dropna(inplace=True)

# Feature Engineering
def find_semantics(password):
    """Flag whether a password contains any ASCII punctuation character.

    Args:
        password: The password string, or a missing value.

    Returns:
        ``None`` when ``pd.isna(password)`` is true, ``1`` when at least one
        character is in ``string.punctuation``, otherwise ``0``.
    """
    if pd.isna(password):
        return None
    contains_punctuation = any(symbol in string.punctuation for symbol in password)
    return int(contains_punctuation)

def _char_fraction(password, predicate):
    """Return the fraction of characters in *password* for which *predicate* is true (0 if empty)."""
    if not password:
        return 0
    return sum(1 for c in password if predicate(c)) / len(password)


# The punctuation flag used to be written to "special_char_freq" and was then
# overwritten by the ratio computed below, so it was lost. It now gets its
# own column.
data["has_special_char"] = data["password"].apply(find_semantics)
data["length"] = data["password"].str.len()

# Share of each character class per password, rounded to 3 decimals.
data["lowercase_freq"] = np.round(data["password"].apply(_char_fraction, args=(str.islower,)), 3)
data["uppercase_freq"] = np.round(data["password"].apply(_char_fraction, args=(str.isupper,)), 3)
data["digit_freq"] = np.round(data["password"].apply(_char_fraction, args=(str.isdigit,)), 3)
# "Special" means neither alphabetic nor a digit. The ratio is already
# normalised by the password length, so it must not be divided by
# data["length"] a second time (the previous code did).
data["special_char_freq"] = np.round(
    data["password"].apply(_char_fraction, args=(lambda c: not (c.isalpha() or c.isdigit()),)), 3
)

# Data Analysis
# For every engineered feature, print its name followed by the min / max /
# mean / median of that feature within each strength class.
cols = ['length', 'lowercase_freq', 'uppercase_freq', 'digit_freq', 'special_char_freq']
for col in cols:
    by_strength = data[[col, 'strength']].groupby(['strength'])
    summary = by_strength.agg(["min", "max", "mean", "median"])
    # One call, newline-separated: name, table, then the same blank spacer.
    print(col, summary, '\n', sep='\n')

# TF-IDF on data
# Character-level TF-IDF: one column per distinct character in the vocabulary.
vectorizer = TfidfVectorizer(analyzer="char")
X = vectorizer.fit_transform(data["password"].fillna(''))
df2 = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# df2 has a fresh 0..n-1 index, whereas `data` keeps index gaps from
# drop_duplicates()/dropna(). Assigning a Series aligns on index labels, which
# pairs rows with the wrong passwords and inserts NaN. Assign the raw values
# positionally so row i of df2 matches row i of `data`.
df2["length"] = data["length"].to_numpy()
df2["lowercase_freq"] = data["lowercase_freq"].to_numpy()

# Model Building
# Replace any NaN feature values with the column mean.
# NOTE(review): the imputer is fitted on all rows before the train/test split,
# so test rows contribute to the means; consider fitting on the training split only.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(df2)
# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_train_imputed, data["strength"], test_size=0.20, random_state=42)
# `multi_class="multinomial"` is deprecated in scikit-learn 1.5+ and scheduled
# for removal. With the default lbfgs solver a multiclass target is already
# fitted as a multinomial model, so the argument is dropped.
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Model Evaluation
# Print each metric on the held-out split, in the order: accuracy,
# confusion matrix, per-class classification report.
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(heading, metric(y_test, y_pred))

# Save the model
# Only the classifier is pickled here.
# NOTE(review): predict() also needs the fitted `vectorizer`; confirm that
# whatever loads this file has access to it.
model_path = 'C:/Users/user/OneDrive/Desktop/Test2/static/classification_model.pkl'
with open(model_path, 'wb') as out:
    out.write(pickle.dumps(clf))

# Prediction Function
def predict(password=None):
    """Classify the strength of a password with the trained classifier.

    The feature vector mirrors the training layout: character TF-IDF columns
    from the fitted ``vectorizer``, then the password length, then the
    fraction of lowercase characters.

    Args:
        password: Password to classify. When ``None`` (the default) it is read
            from standard input, which is what a bare ``predict()`` call does.

    Returns:
        ``"Password is weak"`` for class 0, ``"Password is normal"`` for
        class 1, and ``"Password is strong"`` for any other predicted class.
    """
    if password is None:
        password = input("")

    tfidf_features = vectorizer.transform([password]).toarray()
    length_pass = len(password)
    lowercase_count = sum(1 for char in password if char.islower())
    # Training uses 0 for an empty password and rounds the ratio to 3 decimals;
    # do the same here instead of raising ZeroDivisionError on empty input.
    lowercase_freq = np.round(lowercase_count / length_pass, 3) if length_pass else 0

    features = np.append(tfidf_features, [length_pass, lowercase_freq]).reshape(1, -1)
    # clf.predict returns a one-element array; compare the scalar label.
    label = clf.predict(features)[0]
    if label == 0:
        return "Password is weak"
    if label == 1:
        return "Password is normal"
    return "Password is strong"

# Example Prediction
# predict() is called with no arguments, so it waits for a password on
# standard input; the returned strength message is then printed.
prediction_result = predict()
print(prediction_result)


Skipping line 2810: expected 2 fields, saw 5
Skipping line 4641: expected 2 fields, saw 5
Skipping line 7171: expected 2 fields, saw 5
Skipping line 11220: expected 2 fields, saw 5
Skipping line 13809: expected 2 fields, saw 5
Skipping line 14132: expected 2 fields, saw 5
Skipping line 14293: expected 2 fields, saw 5
Skipping line 14865: expected 2 fields, saw 5
Skipping line 17419: expected 2 fields, saw 5
Skipping line 22801: expected 2 fields, saw 5
Skipping line 25001: expected 2 fields, saw 5
Skipping line 26603: expected 2 fields, saw 5
Skipping line 26742: expected 2 fields, saw 5
Skipping line 29702: expected 2 fields, saw 5
Skipping line 32767: expected 2 fields, saw 5
Skipping line 32878: expected 2 fields, saw 5
Skipping line 35643: expected 2 fields, saw 5
Skipping line 36550: expected 2 fields, saw 5
Skipping line 38732: expected 2 fields, saw 5
Skipping line 40567: expected 2 fields, saw 5
Skipping line 40576: expected 2 fields, saw 5
Skipping line 41864: expected 2 field

length
         length                       
            min  max       mean median
strength                              
0             1    7   6.549604    7.0
1             8   13   9.618964    9.0
2            14  220  15.932497   16.0


lowercase_freq
         lowercase_freq                        
                    min    max      mean median
strength                                       
0                   0.0  1.000  0.708594  0.714
1                   0.0  0.923  0.629739  0.667
2                   0.0  0.933  0.422641  0.400


uppercase_freq
         uppercase_freq                        
                    min    max      mean median
strength                                       
0                   0.0  1.000  0.012335  0.000
1                   0.0  0.923  0.008456  0.000
2                   0.0  0.905  0.367294  0.429


digit_freq
         digit_freq                        
                min    max      mean median
strength                                   
0   