In [None]:
%%bash
# Install the packages listed in requirements.txt with pip before running the notebook.
pip install -r requirements.txt

In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.decomposition import PCA
import pickle

In [None]:
def class_labels_malware(class_name):
    """Encode a class name as a binary label.

    Returns 0 when ``class_name`` equals the string "malware" (exact,
    case-sensitive match) and 1 for any other value.
    """
    if class_name == "malware":
        return 0
    return 1

In [None]:
def dataset_scaling(dataset):
    """Min-max scale every feature column and re-attach the label column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the feature columns plus a "URL_Type_obf_Type"
        label column.

    Returns
    -------
    tuple
        ``(values, scaler)`` where ``values`` is the array produced by
        concatenating the scaled features with the untouched label column
        (label last), and ``scaler`` is the MinMaxScaler fitted on the
        feature columns.
    """
    label_name = "URL_Type_obf_Type"

    # Everything except the label is treated as a feature.
    is_feature = dataset.columns != label_name
    features = dataset.loc[:, is_feature]
    labels = dataset[[label_name]]

    # Rescale each feature to the [0, 1] range; the label is left as-is.
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_features = scaler.fit_transform(features)

    combined = np.concatenate((scaled_features, labels), axis=1)
    return combined, scaler

In [49]:
# Function for collecting the column indices whose correlation with the last column is close to 0
def columns_lowcorr(corr, columns, threshold=0.1):
    """Return positions of columns weakly correlated with the last column.

    The last row of ``corr`` is scanned; a column is reported when its
    correlation value lies strictly between ``-threshold`` and ``threshold``
    or is missing (NaN). Each reported pair is also printed.

    Parameters
    ----------
    corr : pandas.DataFrame
        Correlation matrix; its last row is the one inspected.
    columns : sequence
        Column names aligned with ``corr`` (used only for the printout).
    threshold : float, optional
        Absolute correlation below which a column counts as weakly
        correlated. Defaults to 0.1.

    Returns
    -------
    list of int
        Positional indices of the weakly correlated columns, in ascending
        order.
    """
    index_lowcorr = []
    for i in range(corr.shape[1]):
        # Purely positional lookup: chaining ``.iloc[-1][i]`` would fall back to
        # integer-as-position Series indexing, which pandas has deprecated.
        value = corr.iloc[-1, i]
        if pd.isnull(value) or (-threshold < value < threshold):
            print(columns[-1], "and", columns[i], "with Correlation", round(value, 2))
            index_lowcorr.append(i)
    return index_lowcorr
# Function for collecting the features that are highly correlated with a later feature
def columns_highcorr(corr, columns, threshold=0.8):
    """Return positions of columns highly correlated with another column.

    The upper triangle of ``corr`` (diagonal included) is scanned. For every
    pair ``(i, j)`` with ``threshold <= corr[i, j] < 1`` the pair is printed
    and ``i`` (the earlier column of the pair) is recorded once. The strict
    ``< 1`` upper bound means entries equal to exactly 1 -- such as the
    diagonal -- are never reported.

    Parameters
    ----------
    corr : pandas.DataFrame
        Correlation matrix.
    columns : sequence
        Column names aligned with ``corr`` (used only for the printout).
    threshold : float, optional
        Lower bound (inclusive) for a correlation to count as high.
        Defaults to 0.8.

    Returns
    -------
    list of int
        Positional indices of the flagged columns, in order of first
        appearance.
    """
    index_highcorr = []
    for i in range(corr.shape[0]):
        for j in range(i, corr.shape[1]):
            # Single positional lookup; chained ``.iloc[i][j]`` relies on
            # deprecated integer-as-position Series indexing.
            value = corr.iloc[i, j]
            if threshold <= value < 1:
                print(columns[i], "and", columns[j], "with Correlation", round(value, 2))
                if i not in index_highcorr:
                    index_highcorr.append(i)

    return index_highcorr
# Function to merge the low- and high-correlation column indices into one sorted, duplicate-free list
def final_index(index_lowcorr, index_highcorr):
    """Combine two index lists into a single sorted list without duplicates.

    Parameters
    ----------
    index_lowcorr, index_highcorr : iterable
        Column positions flagged by the low- and high-correlation checks.

    Returns
    -------
    list
        Every distinct value from both inputs, sorted ascending.
    """
    merged = []
    for source in (index_lowcorr, index_highcorr):
        for position in source:
            if position in merged:
                continue
            merged.append(position)
    merged.sort()
    return merged

In [32]:
# Load the Malware, Spam, Phishing and Defacement datasets.
# Each dataset also contains benign URLs.
# (The Malware path previously contained an accidental double slash; it now
# matches the form used for the other three files.)
malware=pd.read_csv("./datasets/multi-class-datasets/Malware.csv")
spam=pd.read_csv("./datasets/multi-class-datasets/Spam.csv")
phishing=pd.read_csv("./datasets/multi-class-datasets/Phishing.csv")
defacement=pd.read_csv("./datasets/multi-class-datasets/Defacement.csv")

In [46]:
# Data cleaning, step 1: strip leading/trailing whitespace from every column name
# so the columns can be addressed by their clean names in the cells below.
malware = malware.rename(columns=str.strip)
spam = spam.rename(columns=str.strip)
phishing = phishing.rename(columns=str.strip)
defacement = defacement.rename(columns=str.strip)

In [44]:
# In the Malware dataset nearly 40% of the values in the NumberRate_Extension column are NULL
# (figure quoted from the original analysis), so that column is dropped together with
# Entropy_DirectoryName before the remaining rows containing NA values are removed.
malware1 = malware.drop(columns=["NumberRate_Extension", "Entropy_DirectoryName"]).dropna()
# Seed the shuffle so the row order -- and every downstream result -- is reproducible.
malware1 = shuffle(malware1, random_state=0).reset_index(drop=True)

In [35]:
# In the Spam dataset nearly 34% of the values in the NumberRate_Extension column are NULL
# (figure quoted from the original analysis), so that column is dropped together with
# Entropy_DirectoryName before the remaining rows containing NA values are removed.
spam1 = spam.drop(columns=["NumberRate_Extension", "Entropy_DirectoryName"]).dropna()
# Seed the shuffle so the row order -- and every downstream result -- is reproducible.
spam1 = shuffle(spam1, random_state=0).reset_index(drop=True)

In [36]:
# In the Phishing dataset nearly 48% of the values in the NumberRate_Extension column are NULL
# (figure quoted from the original analysis), so that column is dropped together with
# Entropy_DirectoryName before the remaining rows containing NA values are removed.
phishing1 = phishing.drop(columns=["NumberRate_Extension", "Entropy_DirectoryName"]).dropna()
# Seed the shuffle so the row order -- and every downstream result -- is reproducible.
phishing1 = shuffle(phishing1, random_state=0).reset_index(drop=True)

In [37]:
# In the Defacement dataset nearly 39% of the values in Entropy_DirectoryName and 32% of the
# values in NumberRate_Extension are NULL (figures quoted from the original analysis), so both
# columns are dropped before the remaining rows containing NA values are removed.
defacement1 = defacement.drop(columns=["NumberRate_Extension", "Entropy_DirectoryName"]).dropna()
# Seed the shuffle so the row order -- and every downstream result -- is reproducible.
defacement1 = shuffle(defacement1, random_state=0).reset_index(drop=True)

In [45]:
# Stack the four cleaned DataFrames along the rows. According to the original author the
# result has 58972 rows, 77 features and 1 output column.
# pd.concat keeps each column's dtype, whereas np.concatenate on frames that mix numeric and
# string columns collapses everything to object dtype. As before, columns are matched by
# position and named after malware1's columns.
all_files = pd.concat(
    [
        frame.set_axis(malware1.columns, axis=1)
        for frame in (malware1, spam1, phishing1, defacement1)
    ],
    ignore_index=True,
)

In [47]:
# According to the author's exploratory analysis (not visible in this part of the notebook),
# the 45 features listed below have no impact on the output variable, so they are removed.
names_columns_worst=['Querylength', 'path_token_count', 'avgdomaintokenlen', 'longdomaintokenlen', 
                     'avgpathtokenlen', 'charcompvowels', 'charcompace', 'ldl_url', 'ldl_path', 
                     'ldl_filename', 'ldl_getArg', 'dld_url', 'dld_domain', 'dld_path', 
                     'dld_filename', 'urlLen', 'domainlength', 'pathLength', 'subDirLen', 
                     'this.fileExtLen', 'ArgLen', 'pathurlRatio', 'ArgUrlRatio', 'argDomanRatio', 
                     'argPathRatio', 'executable', 'isPortEighty', 'ISIpAddressInDomainName', 
                     'LongestVariableValue', 'URL_DigitCount', 'host_DigitCount', 'Directory_DigitCount', 
                     'Extension_DigitCount', 'Query_LetterCount', 'Path_LongestWordLength', 'URL_sensitiveWord', 
                     'URLQueries_variable', 'spcharUrl', 'delimeter_Count', 'NumberRate_Domain', 'NumberRate_DirectoryName', 
                     'NumberRate_AfterPath', 'SymbolCount_URL', 'SymbolCount_FileName', 'SymbolCount_Extension'] 


# Drop the listed columns; drop() returns a new frame, so all_files itself is left unchanged.
all_files1=all_files.drop(columns=names_columns_worst) 

_______________________________________

In [48]:
# Persist the reduced dataset and immediately reload it from disk; reading it back through
# read_csv lets pandas re-infer the dtype of every column.
all_files_csv_path = "./datasets/multi-class-datasets/All Files.csv"
all_files1.to_csv(all_files_csv_path, index=False)
all_files1 = pd.read_csv(all_files_csv_path)

In [50]:
# High correlation test
# Restrict the correlation matrix to numeric/boolean columns explicitly: the label column holds
# strings, and DataFrame.corr() no longer silently skips non-numeric columns in pandas >= 2.0.
numeric_features = all_files1.select_dtypes(include=["number", "bool"])
corr = numeric_features.corr()
# columns_highcorr returns positions within corr; translate them back to column names and
# drop those columns from the full frame.
index_allfiles_highcorr = columns_highcorr(corr, corr.columns)
all_files2 = all_files1.drop(columns=corr.columns[index_allfiles_highcorr])

domain_token_count and SymbolCount_Domain with Correlation 1.0
tld and SymbolCount_Domain with Correlation 1.0
pathDomainRatio and URL_Letter_Count with Correlation 0.91
pathDomainRatio and Extension_LetterCount with Correlation 0.89
pathDomainRatio and LongestPathTokenLength with Correlation 0.92
Query_DigitCount and URL_Letter_Count with Correlation 0.86
Query_DigitCount and Extension_LetterCount with Correlation 0.87
Query_DigitCount and LongestPathTokenLength with Correlation 0.89
URL_Letter_Count and Extension_LetterCount with Correlation 0.95
URL_Letter_Count and LongestPathTokenLength with Correlation 0.97
Extension_LetterCount and LongestPathTokenLength with Correlation 0.97


In [51]:
# Encode the string class labels as integers on a copy of the frame, then min-max scale the
# feature columns and rebuild a DataFrame with the original column names.
label_codes = {"Defacement":0, "benign":1, "malware":2, "phishing":3, "spam":4 }
all_files3 = all_files2.assign(
    URL_Type_obf_Type=all_files2["URL_Type_obf_Type"].map(label_codes)
)
scaled_values, scaler = dataset_scaling(all_files3)
all_files4 = pd.DataFrame(scaled_values, columns=all_files3.columns)

In [52]:
# Separate the feature matrix from the encoded label column.
y = all_files4["URL_Type_obf_Type"]
X = all_files4.drop(columns="URL_Type_obf_Type")

In [53]:
# PCA: project the scaled features onto 10 principal components.
# random_state is fixed because PCA's default "auto" solver may choose the randomized SVD
# solver depending on the data shape and scikit-learn version, which would otherwise make
# the components vary between runs.
pca = PCA(n_components=10, random_state=0)
pca.fit(X)
X_pca = pca.transform(X)

In [54]:
# Hold out 25% of the PCA-projected samples for testing (fixed seed), fit a 5-nearest-neighbour
# classifier on the rest, and predict on both partitions.
train_x, test_x, train_y, test_y = train_test_split(
    X_pca, y, test_size=0.25, random_state=0
)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(train_x, train_y)
predicted_test = knn.predict(test_x)
predicted_train = knn.predict(train_x)

In [55]:
# Accuracy on the held-out test partition, expressed as a percentage.
accuracy_score(y_true=test_y, y_pred=predicted_test) * 100

95.70643695313031

In [56]:
# Accuracy on the training partition, expressed as a percentage.
accuracy_score(y_true=train_y, y_pred=predicted_train) * 100

97.14214655542743

In [57]:
# Serialize the fitted KNN classifier to disk. The context manager guarantees the file handle
# is flushed and closed even if pickling fails.
with open('multi_knn_pca.pkl', 'wb') as model_file:
    pickle.dump(knn, model_file)