# Machine Learning Model for URL Classification

In this notebook, we ran ML algorithms ( Decision tree, Kernal SVM and Random Forest ) on our URL features for classification.  The results from each of these algorithms are:

Decision Tree: 0.867

KNN          : 0.8655

Random Forest: 0.879

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
feature_dataset = pd.read_csv("../Data/FeaturesDataset/features.csv")
label_dataset = pd.read_csv("../Data/CleanedDataset/merged_whois_verified_urls.csv")

In [3]:
feature_dataset.head()

Unnamed: 0,whois_regDate,whois_expDate,whois_updatedDate,dot_count,url_len,digit_count,special_count,hyphen_count,double_slash,single_slash,at_the_rate,protocol,protocol_count
0,403,326,23,6,225,58,12,4,0,10,0,0,0
1,2727,194,168,7,177,47,0,1,0,11,0,0,0
2,5431,46,317,6,60,0,0,0,0,2,0,0,0
3,3643,374,5,1,116,21,1,1,1,10,0,0,0
4,-1,-1,-1,3,36,0,0,0,0,1,0,0,0


In [4]:
label_dataset.head()

Unnamed: 0,url,label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,1
1,serviciosbys.com/paypal.cgi.bin.get-into.herf....,1
2,mail.printakid.com/www.online.americanexpress....,1
3,thewhiskeydregs.com/wp-content/themes/widescre...,1
4,smilesvoegol.servebbs.org/voegol.php,1


In [5]:
X = feature_dataset.iloc[:,[0,1,2,3,4,5,6,7,8,9,10,12] ].values #not including protocol feature
y = label_dataset.iloc[:, [1]].values

Protocol feature which labels 1 for HTTP ( Malicious ) and 0 for HTTPS ( Benign ). Here, we dropped the because we noticed that majority of benign URLs are neither starting from HTTP nor HTTPS. For example, WWW.EXAMPLE.COM

In [6]:
print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')

X shape: (10000, 12)
y shape: (10000, 1)


In [7]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [8]:
print('X_train', X_train.shape)
print('X_test', X_test.shape)
print('y_train', y_train.shape)
print('y_test', y_test.shape)

X_train (8000, 12)
X_test (2000, 12)
y_train (8000, 1)
y_test (2000, 1)


In [9]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



In [10]:
def get_dt_results():
    from sklearn.tree import DecisionTreeClassifier
    classifier = DecisionTreeClassifier(criterion='entropy',random_state=0)
    classifier.fit(X_train,y_train)
    y_pred = classifier.predict(X_test)
    from sklearn.metrics import confusion_matrix,accuracy_score
    cm = confusion_matrix(y_test,y_pred)
    accuracy = accuracy_score(y_test,y_pred)
    result_dict = {"accuracy":accuracy,"cm":cm}
    return result_dict

In [18]:
def get_knn_results():
    #Fitting into KNN
    from sklearn.neighbors import KNeighborsClassifier
    classifier = KNeighborsClassifier(n_neighbors=5,metric='minkowski',p=2) #To select which method to use to calculate 
    #distance, we need to define metric first and then p value 1 for manhattan distance, 2 for euclidian distance
    classifier.fit(X_train,y_train)
    y_pred = classifier.predict(X_test)
    from sklearn.metrics import confusion_matrix,accuracy_score
    cm = confusion_matrix(y_test,y_pred)
    accuracy = accuracy_score(y_test,y_pred)
    result_dict = {"accuracy":accuracy,"cm":cm}
    return result_dict
    

In [11]:
def get_kernel_SVM_results():
    from sklearn.svm import SVC
    classifier = SVC(kernel="rbf",random_state=0)
    classifier.fit(X_train,y_train)
    y_pred = classifier.predict(X_test)
    from sklearn.metrics import confusion_matrix,accuracy_score
    cm = confusion_matrix(y_test,y_pred)
    accuracy = accuracy_score(y_test,y_pred)
    result_dict = {"accuracy":accuracy,"cm":cm}
    return result_dict

In [19]:
def get_logistic_reg_results():
    from sklearn.linear_model import LogisticRegression
    classifier = LogisticRegression(random_state=0)
    classifier.fit(X_train,y_train)
    y_pred = classifier.predict(X_test)
    from sklearn.metrics import confusion_matrix,accuracy_score
    cm = confusion_matrix(y_test,y_pred)
    accuracy = accuracy_score(y_test,y_pred)
    result_dict = {"accuracy":accuracy,"cm":cm}
    return result_dict

In [20]:
def get_naive_bayes_results():
    from sklearn.naive_bayes import GaussianNB
    classifier = GaussianNB()
    classifier.fit(X_train,y_train)
    y_pred = classifier.predict(X_test)
    from sklearn.metrics import confusion_matrix,accuracy_score
    cm = confusion_matrix(y_test,y_pred)
    accuracy = accuracy_score(y_test,y_pred)
    result_dict = {"accuracy":accuracy,"cm":cm}
    return result_dict

In [12]:
def get_random_forest_results():
    from sklearn.ensemble import RandomForestClassifier
    classifier = RandomForestClassifier(n_estimators=20,criterion='entropy',random_state=0)
    classifier.fit(X_train,y_train)
    y_pred = classifier.predict(X_test)
    from sklearn.metrics import confusion_matrix,accuracy_score
    cm = confusion_matrix(y_test,y_pred)
    accuracy = accuracy_score(y_test,y_pred)
    result_dict = {"accuracy":accuracy,"cm":cm}
    return result_dict

In [33]:
def get_svm_results():
    #Fitting SVM to Training set
    from sklearn.svm import SVC
    classifier = SVC(kernel='linear',random_state=0)
    classifier.fit(X_train,y_train)
    y_pred = classifier.predict(X_test)
    from sklearn.metrics import confusion_matrix,accuracy_score
    cm = confusion_matrix(y_test,y_pred)
    accuracy = accuracy_score(y_test,y_pred)
    result_dict = {"accuracy":accuracy,"cm":cm}
    return result_dict

In [34]:
def get_classification_results():
    results_dict = {}
    dt = get_dt_results()
    knn = get_knn_results()
    kernelsvm = get_kernel_SVM_results()
    logreg = get_logistic_reg_results()
    nb = get_naive_bayes_results()
    rf = get_random_forest_results()
    svm = get_svm_results()
    results_dict = {"Decision Tree":dt,"KNN":knn,"Kernel SVM":kernelsvm,"Log Regression":logreg,"Naive Bayes":nb,"Random Forest":rf,"SVM":svm}
    #results_dict = {"Decision Tree":dt,"Kernel SVM":kernelsvm,"Random Forest":rf}
    return results_dict


In [35]:
classification_results = get_classification_results()

  
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  after removing the cwd from sys.path.
  y = column_or_1d(y, warn=True)


In [36]:
for k,v in classification_results.items():
    print(f"{k}: {v['accuracy'],v['cm']}")

Decision Tree: (0.868, array([[883, 129],
       [135, 853]]))
KNN: (0.8655, array([[875, 137],
       [132, 856]]))
Kernel SVM: (0.845, array([[896, 116],
       [194, 794]]))
Log Regression: (0.754, array([[735, 277],
       [215, 773]]))
Naive Bayes: (0.69, array([[1003,    9],
       [ 611,  377]]))
Random Forest: (0.8785, array([[891, 121],
       [122, 866]]))
SVM: (0.7765, array([[730, 282],
       [165, 823]]))


***