In [57]:
import pandas as pd
import numpy as np
import warnings
import sqlite3
import random
warnings.filterwarnings('ignore')

import pickle

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix , accuracy_score , classification_report

In [31]:
con = sqlite3.connect('password_data.sqlite')
data = pd.read_sql_query('select * from Users' , con)

In [32]:
data.head()

Unnamed: 0,index,password,strength
0,0,zxe870819,1
1,1,xw46454nr23l,1
2,2,soporte13,1
3,3,accounts6000webhost.com,2
4,4,c443balg,1


In [33]:
data.drop(['index'] , axis=1 , inplace=True)

In [34]:
data.head()

Unnamed: 0,password,strength
0,zxe870819,1
1,xw46454nr23l,1
2,soporte13,1
3,accounts6000webhost.com,2
4,c443balg,1


In [35]:
data.strength.unique()

array([1, 2, 0])

In [36]:
data.isna().sum()

password    0
strength    0
dtype: int64

In [37]:
password_tuple = np.array(data)
password_tuple

array([['zxe870819', 1],
       ['xw46454nr23l', 1],
       ['soporte13', 1],
       ...,
       ['czvrbun38', 1],
       ['mymyxe430', 1],
       ['glqjhkxb467', 1]], dtype=object)

In [38]:
random.shuffle(password_tuple)

In [39]:
x = [labels[0] for labels in password_tuple]
y = [labels[1] for labels in password_tuple]

In [40]:
def word_divide_char(inputs):
    characrter = []
    for i in inputs:
        characrter.append(i)
    return characrter

In [41]:
vectprizer = TfidfVectorizer(tokenizer=word_divide_char)

In [42]:
x = vectprizer.fit_transform(x)

In [43]:
x.shape

(100000, 80)

In [44]:
vectprizer.get_feature_names_out()

array(['\x11', ' ', '!', '#', '$', '%', '&', '(', ')', '*', '+', '-', '.',
       '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ';', '<',
       '=', '>', '?', '@', '[', '\\', ']', '^', '_', 'a', 'b', 'c', 'd',
       'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
       'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~',
       '°', '±', '³', '´', 'µ', 'ß', 'ä', 'æ', 'ç', 'õ', 'ö', 'ú', 'ý',
       'þ', '›'], dtype=object)

In [45]:
first_document_vector = x[0]

In [46]:
first_document_vector.T.todense()

matrix([[0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.27235712],
        [0.20955885],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.31463444],
        [0.60700324],
        [0.29127013],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.23486904],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0

In [47]:
df = pd.DataFrame(first_document_vector.T.todense() , index=vectprizer.get_feature_names_out() ,  columns=['TF-IDF'])

In [48]:
df.sort_values(by=['TF-IDF'] , ascending=False)

Unnamed: 0,TF-IDF
8,0.607003
x,0.380828
z,0.359760
7,0.314634
9,0.291270
...,...
ö,0.000000
ú,0.000000
ý,0.000000
þ,0.000000


In [49]:
x_train , x_test , y_train , y_test = train_test_split(x , y , test_size=0.2)

In [50]:
x_train.shape

(80000, 80)

In [51]:
clf = DecisionTreeClassifier(random_state=1441)

In [52]:
clf.fit(x_train , y_train)

In [53]:
y_pred = clf.predict(x_test)

In [54]:
cm = confusion_matrix(y_test , y_pred)
print(cm)
print(accuracy_score(y_test , y_pred))

[[ 2542   224    20]
 [  207 14388   141]
 [   30   126  2322]]
0.9626


In [55]:
print(classification_report(y_test , y_pred))

              precision    recall  f1-score   support

           0       0.91      0.91      0.91      2786
           1       0.98      0.98      0.98     14736
           2       0.94      0.94      0.94      2478

    accuracy                           0.96     20000
   macro avg       0.94      0.94      0.94     20000
weighted avg       0.96      0.96      0.96     20000



In [59]:
with open('model.h5' , 'wb') as f:
    pickle.dump(clf , f)