In [None]:
%%bash
pip install -r requirements.txt

In [12]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import math
import random
from collections import Counter
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pickle

In [1]:
#Tokenizer
def getTokens(input):
	tokensBySlash = str(input.encode('utf-8')).split('/')	#get tokens after splitting by slash
	allTokens = []
	for i in tokensBySlash:
		tokens = str(i).split('-')	#get tokens after splitting by dash
		tokensByDot = []
		for token in tokens:
			tempTokens = str(token).split('.')
			tokensByDot = tokensByDot + tempTokens
		allTokens = allTokens + tokens + tokensByDot
	allTokens = list(set(allTokens))	#remove redundant tokens
	if 'com' in allTokens:
		allTokens.remove('com')	#removing .com since it occurs a lot of times and it should not be included in our features
	return allTokens

In [3]:
#Entropy when talked about in information theory relates to the randomness in data. Another way to think about entropy is that it is the unpredictability of the data.
def entropy(s):
	p, lns = Counter(s), float(len(s))
	return -sum( count/lns * math.log(count/lns, 2) for count in p.values())

In [4]:
# Function to label the classes in a DataFrame
def class_labels_malware(class_name):
    return 0 if (class_name=="malware") else 1

In [7]:
# Function for Data Scaling
def dataset_scaling(dataset):

    # Scaling dataset
    scaler = MinMaxScaler(feature_range=(0, 1)) # Scaling object for features 

    df_X = dataset.loc[:, dataset.columns!="URL_Type_obf_Type"]
    df_Y = dataset[["URL_Type_obf_Type"]]

    df_X = scaler.fit_transform(df_X)
    dataset1=np.concatenate((df_X, df_Y), axis=1)

    return dataset1, scaler 

In [8]:
# This dataset created from MISP Threat Text Data
allurls = './datasets/data.csv'	#path to our all urls file
allurlscsv = pd.read_csv(allurls,',',error_bad_lines=False)	#reading file
allurlsdata = pd.DataFrame(allurlscsv)	#converting to a dataframe

In [None]:
allurlsdata = np.array(allurlsdata)	#converting it into an array
random.shuffle(allurlsdata)	#shuffling

y = [d[1] for d in allurlsdata]	#all labels 
corpus = [d[0] for d in allurlsdata]	#all urls corresponding to a label (either good or bad)

#By using a vectorized implementation in an optimization algorithm we can make the process of computation much faster compared to Unvectorized Implementation
#Term Frequency Inverse Document Frequency for Text Data Vectorization 
vectorizer = TfidfVectorizer(tokenizer=getTokens)	#get a vector for each url but use our customized tokenizer
X = vectorizer.fit_transform(corpus)	#get the X vector

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)	#split into training and testing set 80/20 ratio

lgs = LogisticRegression()	#using logistic regression
lgs.fit(X_train, y_train)

In [13]:
y_pred = lgs.predict(X_test) #Make predictions from test data
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         bad       0.99      0.96      0.98     28340
        good       0.98      1.00      0.99     55753

    accuracy                           0.98     84093
   macro avg       0.99      0.98      0.98     84093
weighted avg       0.98      0.98      0.98     84093



In [14]:
print(confusion_matrix(y_test, y_pred))

[[27220  1120]
 [  217 55536]]


In [15]:
pickle.dump(lgs, open('binary_LR.pkl', 'wb'))