In [1]:
import re
import os
import random
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn import metrics
from tensorflow import keras
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras import backend as K
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.utils import shuffle
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint,ReduceLROnPlateau
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.ml.classification import LinearSVC
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import NGram,HashingTF, IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from pyspark.ml.feature import StandardScaler
from pyspark.sql.functions import lit
from pyspark.mllib.feature import StandardScaler, StandardScalerModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.util import MLUtils
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml import Pipeline
from pyspark.sql import Row
from sklearn.feature_extraction.text import TfidfVectorizer
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from sklearn.linear_model import LogisticRegression



Init Plugin
Init Graph Optimizer
Init Kernel


In [2]:
# Notebook-wide configuration constants.
# NOTE(review): SIZE, BATCH_SIZE and EPOCHS are not referenced by any cell
# visible here — presumably intended for a Keras model; confirm or remove.
SIZE = 100
BATCH_SIZE = 16
EPOCHS = 100
# Seed passed to the random number generators in the seeding cell.
SEED = 0

In [3]:
# Seed every source of randomness the notebook relies on.
# NOTE: PYTHONHASHSEED is read when the interpreter starts, so setting it here
# only influences child processes, not string hashing in the running kernel.
os.environ['PYTHONHASHSEED']=str(SEED)

random.seed(SEED)

np.random.seed(SEED)

# TensorFlow keeps its own global generator that the calls above do not touch.
tf.random.set_seed(SEED)

In [4]:
# Directory that holds the CSV files. The default is the original absolute
# location; set the MAL_URL_DATA_DIR environment variable to point elsewhere
# when running the notebook on a different machine.
DATA_DIR = os.environ.get(
    "MAL_URL_DATA_DIR",
    "/Users/abhinavshinow/Documents/GitHub/Mal_URL/Data",
)

df = pd.read_csv(os.path.join(DATA_DIR, "mal_2.csv"))
df2 = pd.read_csv(os.path.join(DATA_DIR, "mal_3.csv"))

In [5]:
# Normalise the first dataset to the columns used downstream.
# Written without inplace=True and with errors='ignore' so that re-running the
# cell on an already-cleaned frame is a no-op instead of a KeyError on 'label'.
unnamed_cols = df.columns[df.columns.str.contains('unnamed', case=False)]
df = (
    df.drop(columns=unnamed_cols)                 # any column whose name contains "unnamed"
      .drop(columns='label', errors='ignore')     # textual label; 'result' is kept as the target
      .rename(columns={'result': 'type'})
)

In [6]:
# Second dataset: call the target column 'type' and encode it numerically
# ('bad' -> 1, 'good' -> 0); any other value is left as it is.
df2 = (
    df2.rename(columns={'label': 'type'})
       .assign(type=lambda frame: frame['type'].replace({'bad': 1, 'good': 0}))
)

In [7]:
# Preview the first rows of the first dataset after column clean-up.
df.head()

Unnamed: 0,url,type
0,https://www.google.com,0
1,https://www.youtube.com,0
2,https://www.facebook.com,0
3,https://www.baidu.com,0
4,https://www.wikipedia.org,0


In [8]:
# Preview the first rows of the second dataset after label encoding.
df2.head()

Unnamed: 0,url,type
0,diaryofagameaddict.com,1
1,espdesign.com.au,1
2,iamagameaddict.com,1
3,kalantzis.net,1
4,slightlyoffcenter.net,1


In [8]:
def getTokens(input):
    """Split a URL into a de-duplicated list of tokens.

    The URL is cut on '/', each segment is cut on '-', and each of those
    pieces is additionally cut on '.'. Both the dash-level pieces and the
    dot-level pieces are kept, so 'www.google.com' contributes itself plus
    'www' and 'google'. The literal token 'com' is discarded.

    Args:
        input: The URL. ``bytes`` are decoded as UTF-8; any other object is
            converted with ``str()``.

    Returns:
        list[str]: Unique tokens in order of first appearance. Empty strings
        can occur (for example between the two slashes of 'http://').
    """
    # Work on real text. str(value.encode('utf-8')) would produce the literal
    # "b'...'" representation in Python 3 and pollute the first/last token.
    if isinstance(input, bytes):
        text = input.decode('utf-8', errors='replace')
    else:
        text = str(input)

    all_tokens = []
    for segment in text.split('/'):
        dash_tokens = segment.split('-')
        dot_tokens = [piece for token in dash_tokens for piece in token.split('.')]
        all_tokens.extend(dash_tokens)
        all_tokens.extend(dot_tokens)

    # dict.fromkeys de-duplicates while keeping a deterministic order
    # (a plain set would reorder tokens from run to run).
    return [token for token in dict.fromkeys(all_tokens) if token != 'com']

In [9]:
#Model--1
# Materialise the frame as an array; per the preview above, column 0 is the
# URL and column 1 is the label.
data1 = np.array(df)

url1 = [row[0] for row in data1]
y1 = [row[1] for row in data1]

# TF-IDF features over the raw URL strings (default tokenisation).
# NOTE(review): the vectoriser is fitted on all rows before the split, so its
# vocabulary and IDF weights also see the test URLs.
vectorised_url1 = TfidfVectorizer()
x1 = vectorised_url1.fit_transform(url1)

# shuffle must be a real boolean (the string 'True' only worked because any
# non-empty string is truthy); random_state pins the split so re-running the
# cell yields the same partition.
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    x1, y1, test_size=0.2, shuffle=True, random_state=SEED, stratify=y1
)

In [10]:
#Model--2
# Materialise the frame as an array; per the preview above, column 0 is the
# URL and column 1 is the label.
data2 = np.array(df2)

url2 = [row[0] for row in data2]
y2 = [row[1] for row in data2]

# TF-IDF features over the raw URL strings (default tokenisation).
# NOTE(review): the vectoriser is fitted on all rows before the split, so its
# vocabulary and IDF weights also see the test URLs.
vectorised_url2 = TfidfVectorizer()
x2 = vectorised_url2.fit_transform(url2)

# shuffle must be a real boolean (the string 'True' only worked because any
# non-empty string is truthy); random_state pins the split so re-running the
# cell yields the same partition.
x_train2, x_test2, y_train2, y_test2 = train_test_split(
    x2, y2, test_size=0.2, shuffle=True, random_state=SEED, stratify=y2
)

In [11]:
# One estimator per (algorithm, dataset) pair; suffix 1/2 matches the dataset.

# Logistic Regression (scikit-learn's class: the later sklearn import shadows
# the pyspark class of the same name)
model_lg1 = LogisticRegression(solver='lbfgs', max_iter=10000)
model_lg2 = LogisticRegression(solver='lbfgs', max_iter=10000)

# XGBoost
model_xg1 = xgb.XGBClassifier(n_jobs = 8, random_state=SEED)
model_xg2 = xgb.XGBClassifier(n_jobs = 8, random_state=SEED)

# Random Forest — seeded so that re-fitting reproduces the same trees
model_rf1 = RandomForestClassifier(n_estimators=100, random_state=SEED)
model_rf2 = RandomForestClassifier(n_estimators=100, random_state=SEED)

In [12]:
# Train on dataset 1 and report test accuracy (fit() returns the estimator,
# so the score call can be chained).
model_lg1.fit(x_train1, y_train1).score(x_test1, y_test1)

0.9957572526544938

In [13]:
# Train on dataset 2 and report test accuracy (fit() returns the estimator,
# so the score call can be chained).
model_lg2.fit(x_train2, y_train2).score(x_test2, y_test2)

0.9647176340480182

In [14]:
# Train XGBoost on dataset 1 and report test accuracy (fit() returns the
# estimator, so the score call can be chained).
model_xg1.fit(x_train1, y_train1).score(x_test1, y_test1)





0.9980341196854592

In [15]:
# Train XGBoost on dataset 2 and report test accuracy (fit() returns the
# estimator, so the score call can be chained).
model_xg2.fit(x_train2, y_train2).score(x_test2, y_test2)





0.9387939543124874

In [16]:
# Train the random forest on dataset 1 and report test accuracy (fit()
# returns the estimator, so the score call can be chained).
model_rf1.fit(x_train1, y_train1).score(x_test1, y_test1)

In [None]:
# Train the random forest on dataset 2 and report test accuracy (fit()
# returns the estimator, so the score call can be chained).
model_rf2.fit(x_train2, y_train2).score(x_test2, y_test2)

In [None]:
# Test-set predictions for every fitted model, one line per model family
# (left: dataset 1, right: dataset 2).
pred_lg1, pred_lg2 = model_lg1.predict(x_test1), model_lg2.predict(x_test2)
pred_xg1, pred_xg2 = model_xg1.predict(x_test1), model_xg2.predict(x_test2)
pred_rf1, pred_rf2 = model_rf1.predict(x_test1), model_rf2.predict(x_test2)


In [None]:
# model_xg2 was already trained in an earlier cell; fitting it again here
# only repeated the training after pred_xg2 had been computed.
model_xg2.score(x_test2, y_test2)

In [None]:
# model_rf1 was already trained in an earlier cell; re-fitting here repeated
# an expensive step and left pred_rf1 describing the previous fit.
model_rf1.score(x_test1, y_test1)

In [None]:
# model_rf2 was already trained in an earlier cell; re-fitting here repeated
# an expensive step and left pred_rf2 describing the previous fit.
model_rf2.score(x_test2, y_test2)

In [None]:
# Per-class precision / recall / F1: logistic regression, dataset 1
print(classification_report(y_true=y_test1, y_pred=pred_lg1))

In [None]:
# Per-class precision / recall / F1: logistic regression, dataset 2
print(classification_report(y_true=y_test2, y_pred=pred_lg2))

In [None]:
# Per-class precision / recall / F1: XGBoost, dataset 1
print(classification_report(y_true=y_test1, y_pred=pred_xg1))

In [None]:
# Per-class precision / recall / F1: XGBoost, dataset 2
print(classification_report(y_true=y_test2, y_pred=pred_xg2))

In [None]:
# Per-class precision / recall / F1: random forest, dataset 1
print(classification_report(y_true=y_test1, y_pred=pred_rf1))

In [None]:
# Per-class precision / recall / F1: random forest, dataset 2.
# pred_rf2 comes from x_test2, so it must be compared with y_test2
# (the cell previously passed y_test1, the labels of the other dataset).
print(classification_report(y_true=y_test2, y_pred=pred_rf2))