In [10]:
import numpy as np
import pandas as pd

In [11]:
dataset = pd.read_csv('cleanPasswords.csv')

In [12]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,password,strength
0,0,kzde5577,1
1,1,kino3434,1
2,2,visi7k1yr,1
3,3,megzy123,1
4,4,lamborghin1,1


In [13]:
dataset.drop(columns = 'Unnamed: 0', inplace = True)

In [14]:
dataset.isnull().sum()

password    1
strength    0
dtype: int64

In [15]:
dataset['password'].fillna('0', inplace=True)

In [16]:
X = dataset.iloc[:,0]
y = dataset.iloc[:,1]

In [17]:
X

0             kzde5577
1             kino3434
2            visi7k1yr
3             megzy123
4          lamborghin1
              ...     
669635      10redtux10
669636       infrared1
669637    184520socram
669638       marken22a
669639        fxx4pw4g
Name: password, Length: 669640, dtype: object

In [18]:
y

0         1
1         1
2         1
3         1
4         1
         ..
669635    1
669636    1
669637    1
669638    1
669639    1
Name: strength, Length: 669640, dtype: int64

In [19]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [20]:
def makeTokens(f):
  tokens = []
  for i in f:
    tokens.append(i)
  return tokens

In [21]:
vectorizer = TfidfVectorizer(tokenizer=makeTokens)

In [22]:
X = vectorizer.fit_transform(X)

In [23]:
#feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean = False)
X = sc.fit_transform(X)

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [16]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=1.623776739188721, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score
cm_lr = confusion_matrix(y_pred_lr,y_test)
acc_lr = accuracy_score(y_pred_lr,y_test)*100

In [17]:
cm_lr

array([[  6927,   5146,     29],
       [ 15534, 115729,   6234],
       [    20,   3347,  14444]], dtype=int64)

In [18]:
acc_lr

81.89474941759751

In [2]:
import pickle

In [30]:
with open("Logistic_Regression.pickle","wb") as f:
    pickle.dump(lr,f)

In [3]:
pickle_in = open("Logistic_Regression.pickle","rb")
l = pickle.load(pickle_in)

In [26]:
x = vectorizer.transform(['hosting'])
print(l.predict(x))

[0]


In [19]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 10, random_state = 0, criterion = 'entropy')
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score
cm_rf = confusion_matrix(y_pred_rf, y_test)
acc_rf = accuracy_score(y_pred_rf, y_test)*100

In [20]:
cm_rf

array([[ 18012,   1621,     50],
       [  4460, 121792,   2657],
       [     9,    809,  18000]], dtype=int64)

In [21]:
acc_rf

94.26199151783048

In [42]:
with open("Random_Forest.pickle","wb") as f:
    pickle.dump(rf,f)

In [27]:
pickle_in2 = open("Random_Forest.pickle","rb")
r = pickle.load(pickle_in2)

In [48]:
r.predict(x)

array([1], dtype=int64)

In [22]:
#Decision tree
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state = 0,criterion="entropy")
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)

cm_dt = confusion_matrix(y_test,y_pred_dt)

acc_dt = accuracy_score(y_test,y_pred_dt)*100

In [23]:
cm_dt

array([[ 18578,   3695,    208],
       [  3440, 118388,   2394],
       [   349,   2664,  17694]], dtype=int64)

In [24]:
acc_dt

92.38396750492802

In [49]:
with open("Decision_Tree.pickle","wb") as f:
    pickle.dump(dt,f)

In [28]:
pickle_in3 = open("Decision_Tree.pickle","rb")
d = pickle.load(pickle_in3)

In [29]:
d.predict(x)

array([1], dtype=int64)

In [25]:
#Guassian naive bayes
from sklearn.naive_bayes import GaussianNB
gna = GaussianNB()
gna.fit(X_train.toarray(),y_train)

y_pred_gna = gna.predict(X_test.toarray())

cm_gna = confusion_matrix(y_test,y_pred_gna)

acc_gna = accuracy_score(y_test,y_pred_gna)*100

In [26]:
cm_gna

array([[    27,      2,  22452],
       [    33,      8, 124181],
       [     6,      3,  20698]], dtype=int64)

In [27]:
acc_gna

12.384564840809988

In [50]:
with open("model4.pickle","wb") as f:
    pickle.dump(gna,f)

In [57]:
pickle_in4 = open("model4.pickle","rb")
g = pickle.load(pickle_in4)

In [59]:
g.predict(x.toarray())

array([2], dtype=int64)

In [None]:
#KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 5,metric = "minkowski", p = 2) #minkowski is used to specify euclinean distance formula, p indicates no. of points
knn.fit(X_train, y_train)

y_pred_knn = knn.predict(X_test)

cm_knn = confusion_matrix(y_test,y_pred_knn)

acc_knn = accuracy_score(y_test,y_pred_knn)*100

In [None]:
cm_knn

In [None]:
acc_knn

In [None]:
#SVM
from sklearn.svm import SVC
svm = SVC(kernel = 'linear')
svm.fit(X_train, y_train)

y_pred_svm = svm.predict(X_test)

cm_svm = confusion_matrix(y_test,y_pred_svm)

acc_svm = accuracy_score(y_test,y_pred_svm)*100

In [None]:
cm_svm

In [None]:
acc_svm

In [None]:
#KSVM
from sklearn.svm import SVC
ksvm = SVC(kernel = 'rbf')
ksvm.fit(X_train, y_train)

y_pred_ksvm = ksvm.predict(X_test)

cm_ksvm = confusion_matrix(y_test,y_pred_ksvm)

acc_ksvm = accuracy_score(y_test,y_pred_ksvm)*100

In [None]:
cm_ksvm

In [None]:
acc_ksvm