In [13]:
import datetime
import json
import pickle

import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from keras.layers import Dense
from keras.models import Sequential
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
# Both lines below bind `to_categorical`; their relative order is preserved so
# the same binding wins as before.
from tensorflow.keras.utils import to_categorical
from keras.utils.np_utils import to_categorical

In [14]:
# Load the per-page signal table and make sure it carries a fresh 0..n-1 index.
signals_df = pd.read_csv("csvs/signals.csv").reset_index(drop=True)

In [15]:
# Preview the first rows to sanity-check the loaded columns.
signals_df.head()

Unnamed: 0,url,has_login_pattern_on_form,has_register_pattern_on_form,has_newsletter_pattern_on_form,has_forgot_pattern_on_form,has_forgot_password_pattern_in_text_content,has_forgot_password_pattern_in_attrs,has_forgot_password_pattern_in_text_content_on_form,has_forgot_password_pattern_in_attrs_on_form,has_forgot_pattern_in_attrs,...,has_already_pattern,has_already_pattern_on_form,not_have_pattern,not_have_pattern_on_form,has_newsletter_pattern.2,has_reset_pattern_on_url,has_newsletter_pattern_on_url,page_type,Sum,type
0,https://100things2do.ca/organized/,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,n,0,False
1,https://10bet.co.tz/,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,n,2,False
2,https://126dbs.com/2021/04/,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,n,0,False
3,https://1cloudfile.com/register,True,True,False,False,False,False,False,False,False,...,True,False,True,False,False,False,False,r,14,True
4,https://1cloudfile.com/account/login,True,False,False,False,True,True,True,True,False,...,False,False,True,False,False,False,False,l,11,True


In [16]:
def get_multi_class_type(type):
    """Translate a raw page-type code into the numeric class label.

    Mapping:
        'n' (not login/signup) or 'e' (newsletter)       -> 0
        'l' (login) or 'b' (both login and signup)       -> 1
        'r' (register / signup)                          -> 2

    Any other value matches no branch and yields None.
    """
    # NOTE: the parameter name shadows the builtin `type`; it is kept unchanged
    # so existing callers are unaffected.
    if type in ('n', 'e'):
        return 0
    if type in ('l', 'b'):
        return 1
    if type == 'r':
        return 2
    return None

In [17]:
# Replace the `type` column with the numeric class label derived from the
# page_type letter code of each row.
page_class_labels = signals_df['page_type'].map(get_multi_class_type)
signals_df['type'] = page_class_labels

In [18]:
# Tally how many pages carry each class label (1 = login, 2 = signup,
# 0 = neither) and report the three counts.
class_labels = signals_df['type']
n_login = int((class_labels == 1).sum())
n_signup = int((class_labels == 2).sum())
n_other = int((class_labels == 0).sum())
print("Num of login pages: %s,\nNum of signup pages: %s,\nNum of non-login-signup pages: %s" % (n_login, n_signup, n_other))

Num of login pages: 1299,
Num of signup pages: 973,
Num of non-login-signup pages: 2453


In [19]:
# Keep every non-login/signup page (type 0). Keep login (type 1) and signup
# (type 2) pages only when at least MIN_SIGNALS signals were recorded in `Sum`.
# NOTE: the previous condition was `Sum > 3`, which also discarded pages with
# exactly 3 signals and contradicted the stated "minimum 3 signals" rule.
# NOTE(review): if `Sum > 3` was in fact the intended cut-off, set
# MIN_SIGNALS = 4 instead.
# Rows whose `type` is missing (page_type codes the mapping does not know)
# satisfy neither condition and are therefore dropped here.
MIN_SIGNALS = 3
is_negative_page = signals_df['type'] == 0
is_login_or_signup = signals_df['type'].isin([1, 2])
has_enough_signals = signals_df['Sum'] >= MIN_SIGNALS
only_features_detected = signals_df[is_negative_page | (is_login_or_signup & has_enough_signals)]
only_features_detected.head()

Unnamed: 0,url,has_login_pattern_on_form,has_register_pattern_on_form,has_newsletter_pattern_on_form,has_forgot_pattern_on_form,has_forgot_password_pattern_in_text_content,has_forgot_password_pattern_in_attrs,has_forgot_password_pattern_in_text_content_on_form,has_forgot_password_pattern_in_attrs_on_form,has_forgot_pattern_in_attrs,...,has_already_pattern,has_already_pattern_on_form,not_have_pattern,not_have_pattern_on_form,has_newsletter_pattern.2,has_reset_pattern_on_url,has_newsletter_pattern_on_url,page_type,Sum,type
0,https://100things2do.ca/organized/,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,n,0,0
1,https://10bet.co.tz/,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,n,2,0
2,https://126dbs.com/2021/04/,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,n,0,0
3,https://1cloudfile.com/register,True,True,False,False,False,False,False,False,False,...,True,False,True,False,False,False,False,r,14,2
4,https://1cloudfile.com/account/login,True,False,False,False,True,True,True,True,False,...,False,False,True,False,False,False,False,l,11,1


In [20]:
# Per-class tally on the filtered frame (1 = login, 2 = signup, 0 = neither).
kept_labels = only_features_detected['type']
kept_login = int((kept_labels == 1).sum())
kept_signup = int((kept_labels == 2).sum())
kept_other = int((kept_labels == 0).sum())
print("After filtering by minumum 3 signals received pages: \nNum of login pages: %s,\nNum of signup pages: %s,\nNum of non-login-signup pages: %s" % (kept_login, kept_signup, kept_other))

After filtering by minumum 3 signals received pages: 
Num of login pages: 1299,
Num of signup pages: 973,
Num of non-login-signup pages: 2453


In [22]:
# Feature matrix: the columns at positions 1..88 of the filtered frame
# (88 columns, the same width the network declares as its input dimension).
# NOTE(review): this positional slice depends on the CSV column order — confirm
# that the range holds only signal columns (e.g. that `url` is not inside it).
predictors = only_features_detected.iloc[:, 1:89]
target = only_features_detected['type']

# Encode the class labels as consecutive integers, then one-hot encode them so
# they match the categorical cross-entropy loss used for training.
encoder = LabelEncoder()
encoder.fit(target)
Y = encoder.transform(target)
Y = to_categorical(Y)

# Fixed seed: without `random_state` the split is reshuffled on every run, so
# the reported accuracy / confusion matrix could not be reproduced.
SPLIT_SEED = 42
X_train, X_test, Y_train, Y_test = train_test_split(
    predictors, Y, test_size=0.33, random_state=SPLIT_SEED
)

In [23]:
# Feed-forward classifier: 88 input features -> 32 -> 16 -> 8 -> 3 classes.
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=88))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
# The three classes are mutually exclusive and the targets are one-hot encoded,
# so the output layer must produce a probability distribution: softmax, not
# independent per-class sigmoids, is the activation that pairs with
# categorical cross-entropy.
model.add(Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train on the training split for 200 epochs and keep the per-epoch history.
history = model.fit(
    x=X_train,
    y=Y_train,
    epochs=200,
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200

In [None]:
# Evaluate on the held-out split. Entry 1 of the result is the metric compiled
# after the loss (the model was compiled with metrics=['accuracy']); it is
# printed as a percentage.
scores = model.evaluate(x=X_test, y=Y_test)
metric_name = model.metrics_names[1]
metric_percent = scores[1] * 100
print("\n%s: %.2f%%" % (metric_name, metric_percent))

In [None]:
# Show the dimensions of the held-out feature matrix as (rows, columns).
X_test.shape

In [None]:
# Run the trained network on the held-out features; one row of class scores
# per test sample.
Y_pred_nn = model.predict(x=X_test)

In [None]:
# Recover the integer class id of each test sample from its one-hot row.
# (numpy is imported once in the notebook's import cell rather than here.)
y_actual = np.argmax(Y_test, axis=1)

In [None]:
# Predicted class id = position of the highest score in each prediction row.
# (The duplicated in-cell numpy import was removed; numpy comes from the
# notebook's import cell.)
y = np.argmax(Y_pred_nn, axis=1)

In [None]:
# Compare predicted class ids against the true ones: per-class
# precision/recall/F1 first, then the raw confusion matrix.
report_text = classification_report(y_actual, y)
confusion = confusion_matrix(y_actual, y)
print("classification Report:\n", report_text)
print('Confusion Matrix \n', confusion)

In [None]:
# Persist the trained network under the relative path 'multiclass_classifier'.
model.save('multiclass_classifier')