# T4 - Final Project

Semester 2221, CSEC 520/620, Team 4\
Final Project - URL Classification\
Due by December 14, 2022 11:59 PM EST.\
Accounts for 18% of total grade.

## Preliminary Requirements

This section ensures that the `python-whois` package is installed.
We also download our raw datasets which are stored in a Git repository hosted on GitHub.

**Make sure you are running this notebook in an isolated directory, as it will be turned into a Git working directory.**

In [2]:
# Ensure the WHOIS package is installed
print(f'{"":#^{36}}\n{"## Installing Packages ":#<{36}}\n{"":#^{36}}')
!pip install python-whois

# Download our repo, which contains the RAW datasets
print(f'\n{"":#^{36}}\n{"## Updating Repository ":#<{36}}\n{"":#^{36}}')
!git init
!git remote add origin https://github.com/aisgbnok/T4-Project.git
!git pull origin main --allow-unrelated-histories

####################################
## Installing Packages #############
####################################

####################################
## Updating Repository #############
####################################
Reinitialized existing Git repository in C:/Users/Anthony/Documents/T4-Project/.git/


error: remote origin already exists.


Already up to date.


From https://github.com/aisgbnok/T4-Project
 * branch            main       -> FETCH_HEAD


In [3]:
import os
import pandas as pd
import whois
from sklearn.svm import SVC

In [4]:
def load_data(seed=75, output=False, save=False):
  """
  Loads our separate dataframes and merges them together.
  Ensures the columns are equal, normalizes the labels,
  and finally shuffles the columns using the seed.

  :param seed: Integer value that ensures reproduction of resulting dataframe.
  :param output: Whether to print dataframes or not. True or False.
  :param save: Whether to save the pandas dataframe. Can be False (default), 'CSV', or 'PICKLE'.
  :return: A pandas dataframe that contains a url and label column.
           The label column is 0 for benign and 1 for malicious.
  """
  # Get all of our datasets
  df_aj = pd.read_csv(os.path.join('datasets', 'raw', 'urls-antonyj.csv'))
  df_ms = pd.read_csv(os.path.join('datasets', 'raw', 'urls-manu-siddhartha.csv'))

  if output:
    print(f'{"":#^{36}}\n{"## Original ":#<{36}}\n{"":#^{36}}')
    print('## urls-antonyj.csv')
    display(df_aj)
    print('## urls-manu-siddhartha.csv')
    display(df_ms)

  # Ensure Columns Match
  df_ms.columns = df_aj.columns

  # Normalize Data, 1 is malicious, 0 is benign
  df_aj['label'] = (df_aj['label'] == 'bad').astype(int)
  df_ms['label'] = (df_ms['label'] != 'benign').astype(int)

  # Merge dataframes
  df = pd.merge(df_aj, df_ms, how='outer')

  # Keep first exact matches
  df = df.drop_duplicates()

  # Drop all duplicate urls with conflicting labels
  # Prevents some data poisoning, and promotes data integrity
  df = df.drop_duplicates(subset='url', keep=False)

  # Shuffle using seed value
  df = df.sample(frac=1, random_state=seed)

  # Reset Index
  df = df.reset_index(drop=True)

  if output:
    print(f'{"":#^{36}}\n{"## Resulting ":#<{36}}\n{"":#^{36}}')
    print('## urls-antonyj.csv')
    display(df_aj)
    print('## urls-manu-siddhartha.csv')
    display(df_ms)
    print('## Final')
    display(df)

  if save == 'PICKLE':
    df.to_pickle('datasets/t4-urls.zip')
  elif save == 'CSV':
    df.to_csv('datasets/t4-urls.csv')

  return df

In [6]:
dataset = load_data()

In [5]:
datasetsize = len(dataset.index)
mals = dataset[dataset["label"] == 1]
malsize = len(mals.index)
bensize = datasetsize - malsize
print(datasetsize, malsize, bensize)

707514 66447 641067


In [None]:
import threading


#multithreading the data set
def multithreading():
  perc = 0.0
  threads = []
  max_threads = 10
  thread_num = 1
  while perc < 1:
    if len(threads) >= max_threads:
      for thread in threads:
        thread.join()
        print("Thread "+str(thread_num)+" done")
        thread_num+=1
    print("perc "+str(perc))
    thread = threading.Thread(target=checker, args=(perc*100, dataset.iloc[int(perc*len(dataset)):int((perc+0.01)*len(dataset))]))
    thread.start()
    threads.append(thread)
    perc = perc + 0.01

In [None]:
#numsection -> the percent into the data that it is
#data -> the data of the current section
def checker(numsection, data):
  rows = []
  for index, row in data.iterrows():
    w = None
    try:
      w = whois.whois(data.iloc[index]["url"])
    except:
      continue
    if w.domain_name != "null" and w.domain_name is not None:
      rowdict = {}
      rowdict["originalurl"] = data.iloc[index]["url"]
      rowdict.update(dict(w))
      rows.append(rowdict)
  whoisframe = pd.DataFrame(rows)
  whoisframe.to_csv('csv/whoisdata' + str(numsection) + '.csv')
  print("csv made")

In [6]:
# Import the drive module from the Google Colab library
from google.colab import drive

# Mount your personal Google Drive
drive.mount('/content/drive/')

# Immediately change the current directory to the shared drive.
# This will reduce the chance that your personal drive will be modified erroneously.
os.chdir('/content/drive/Shareddrives/CSEC 620 Group 4/Final Project')

Mounted at /content/drive/


In [None]:
#multithreading()

In [7]:
os.chdir('/content/drive/Shareddrives/CSEC 620 Group 4/Final Project/csv')
filelist = os.listdir('.')
newframe = pd.DataFrame()
for f in filelist:
  if f.find('whoisdata') == 0:
    print(f)
    currentframe = pd.read_csv(f)
    newframe = pd.concat([newframe, currentframe], ignore_index=True)

whoisdata5.0.csv
whoisdata2.0.csv
whoisdata3.0.csv
whoisdata1.0.csv
whoisdata7.000000000000001.csv
whoisdata8.0.csv
whoisdata4.0.csv
whoisdata9.0.csv
whoisdata6.000000000000001.csv
whoisdata10.0.csv
whoisdata10.999999999999998.csv
whoisdata11.999999999999998.csv
whoisdata12.999999999999998.csv
whoisdata13.999999999999998.csv
whoisdata15.0.csv
whoisdata16.0.csv
whoisdata17.0.csv
whoisdata18.000000000000004.csv
whoisdata19.000000000000004.csv
whoisdata20.000000000000004.csv
whoisdata21.000000000000004.csv
whoisdata22.000000000000007.csv
whoisdata23.000000000000007.csv
whoisdata24.000000000000007.csv
whoisdata25.000000000000007.csv
whoisdata26.000000000000007.csv
whoisdata27.000000000000007.csv
whoisdata28.000000000000007.csv
whoisdata29.00000000000001.csv
whoisdata30.00000000000001.csv
whoisdata31.00000000000001.csv
whoisdata32.000000000000014.csv
whoisdata33.000000000000014.csv
whoisdata34.000000000000014.csv
whoisdata35.000000000000014.csv
whoisdata36.000000000000014.csv
whoisdata37.00

  exec(code_obj, self.user_global_ns, self.user_ns)


In [8]:
benmal = []
nummal = 0
numbenign = 0
for index, row in newframe.iterrows():
  thisurl = row["originalurl"]
  mergedrow = dataset.loc[dataset["url"] == thisurl]
  benigncheck = mergedrow.iloc[0]["label"]
  benmal.append(benigncheck)
  if benigncheck == 0:
    numbenign = numbenign + 1
  else:
    nummal = nummal + 1
print(nummal)
print(numbenign)

350
5198


# Splitting the data set

In [None]:
splitVal = int(len(newframe)*.2)
testing_set1 = newframe.iloc[:splitVal,]
testing_set2 = newframe.iloc[splitVal:(splitVal*2),]
training_set = newframe.iloc[(splitVal*2):,]
benmaltraining = benmal[(splitVal*2):]
benmaltesting = benmal[:splitVal]
benmaltesting2 = benmal[splitVal:(splitVal*2)]

In [None]:
countries = {}
for index, row in training_set.iterrows():
  rowcountry = row["country"]
  if rowcountry in countries.keys():
    countries[rowcountry] = countries[rowcountry] + 1
  else:
    countries[rowcountry] = 1

for c in countries:
  print(c, countries[c])

US 1330
nan 1346
UK 5
CA 94
JP 18
BR 57
AU 10
GB 64
RU 12
IN 29
IS 81
DE 21
RO 15
NZ 2
Austria 4
SE 4
PA 13
REDACTED FOR PRIVACY 11
AT 4
LU 3
SI 1
CN 51
China 3
NL 18
CY 5
CZ 4
SN 2
FR 17
IT 7
CR 2
PE 2
TH 3
SK 1
KR 3
MX 1
TR 3
MY 2
IE 1
UA 2
CH 6
United Kingdom of Great Britain and Northern Ireland (the) 1
my 1
BE 3
ES 11
KN 1
SG 3
ID 6
Malaysia 3
PL 3
BG 5
PH 2
FI 1
SC 4
NO 2
cn 1
PK 2
BD 1
EC 1
CL 1
VN 1
EE 1
HR 1
KY 2
DK 4
BS 2
MH 1
HK 1
HU 3
IL 1
GREECE 1
ET 1
KH 1
ZA 1


In [None]:
regs = {}
for index, row in training_set.iterrows():
  rowreg = row["registrar"]
  if rowreg in regs.keys():
    regs[rowreg] = regs[rowreg] + 1
  else:
    regs[rowreg] = 1

for r in regs:
  print(r, regs[r])

Cloudflare, Inc. 17
ALIBABA.COM SINGAPORE E-COMMERCE PRIVATE LIMITED 3
Total Registrations 1
nan 293
MarkMonitor, Inc. 310
Dynadot5 LLC 2
GoDaddy.com, LLC 507
DYNADOT, LLC 29
123-Reg Limited 9
Regional Network Information Center, JSC dba RU-CENTER 8
Internet Domain Service BS Corp 12
Network Solutions, LLC 224
Aruba s.p.a. 11
Alibaba Cloud Computing (Beijing) Co., Ltd. 11
Namescout.com 6
PSI-USA, Inc. dba Domain Robot 12
MarkMonitor Inc. 168
Go Daddy Domains Canada, Inc 3
DIAMATRIX C.C. 1
DNC Holdings, Inc 12
GoDaddy Corporate Domains, LLC 26
GMO INTERNET, INC. 14
InterNetX GmbH 3
GANDI SAS 13
Amazon Registrar, Inc. 52
IONOS SE 11
Fasthosts Internet Ltd [Tag = LIVEDOMAINS] 6
CSC CORPORATE DOMAINS, INC. 167
RegistrarSafe, LLC 49
Synergy Wholesale 2
Google LLC 47
TUCOWS, INC. 21
TurnCommerce, Inc. DBA NameBright.com 20
ENOM, INC. 81
Key-Systems GmbH 46
NAMECHEAP INC 99
NIC.PE 1
Digital Registra 5
REGTIME-RU 1
Freemium Kft. 1
SNAPNAMES 35, LLC 1
ICI - Registrar 2
GIP RENATER 1
Iconicnames

In [None]:
dns = {}
for index, row in training_set.iterrows():
  rowdns = row["dnssec"]
  if rowdns in dns.keys():
    dns[rowdns] = dns[rowdns] + 1
  elif 'unsigned' in str(rowdns).lower() and (rowdns != 'unsigned'):
    dns["unsigned"] = dns["unsigned"] + 1
  else:
    dns[rowdns] = 1

for r in dns:
  print(r, dns[r])

signedDelegation 72
unsigned 2672
nan 527
Inactive 12
no 19
Signed delegation 7
yes 13
signed delegation 7
['signedDelegation', 'signed'] 1


In [None]:
from numpy.lib.function_base import percentile
percentages = {}
percentagerow = []

for index, row in training_set.iterrows():
  rowdomain = row["domain_name"]
  num = 0
  lett = 0
  for char in rowdomain:
    if char.isalpha():
      lett += 1
    elif char.isnumeric():
      num += 1
  perc = round(num/(num+lett), 1)
  percentagerow.append(perc)
  if perc in percentages.keys():
    percentages[perc] = percentages[perc] + 1
  else:
    percentages[perc] =  1

for p in percentages:
  print(p, percentages[p])

0.0 3194
0.1 57
0.2 48
0.3 18
0.4 6
0.5 4
0.7 2
0.6 1


In [None]:
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
le1 = preprocessing.LabelEncoder()
url1 = le1.fit_transform(training_set["originalurl"])
countries = le1.fit_transform(training_set["country"])
features1 = [[url1[i], countries[i]] for i in range(0, len(url1))]
label1 = le1.fit_transform(benmaltraining)

model1 = GaussianNB()
model1.fit(features1, label1)

GaussianNB()

In [None]:
le2 = preprocessing.LabelEncoder()
url2 = le2.fit_transform(training_set["originalurl"])
regs = le2.fit_transform(training_set["registrar"])
features2 = [[url2[i], regs[i]] for i in range(0, len(url2))]
label2 = le2.fit_transform(benmaltraining)
model2 = GaussianNB()
model2.fit(features2, label2)

GaussianNB()

In [None]:
le3 = preprocessing.LabelEncoder()
url3 = le3.fit_transform(training_set["originalurl"])
dnses = le3.fit_transform(training_set["dnssec"])
features3 = [[url3[i], dnses[i]] for i in range(0, len(url3))]
label3 = le3.fit_transform(benmaltraining)
model3 = GaussianNB()
model3.fit(features3, label3)

GaussianNB()

In [None]:
le4 = preprocessing.LabelEncoder()
url4 = le4.fit_transform(training_set["originalurl"])
percents = le4.fit_transform(percentagerow)
features4 = [[url4[i], percents[i]] for i in range(0, len(url4))]
label4 = le4.fit_transform(benmaltraining)
model4 = GaussianNB()
model4.fit(features4, label4)

GaussianNB()

In [None]:
test1url = le1.fit_transform(testing_set1["originalurl"])
test1country = le1.fit_transform(testing_set1["country"])
test1feats = [[test1url[i], test1country[i]] for i in range(0, len(test1url))]
test1final = model1.predict(test1feats)
test1final

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
test2url = le2.fit_transform(testing_set1["originalurl"])
test2reg = le2.fit_transform(testing_set1["registrar"])
test2feats = [[test2url[i], test2reg[i]] for i in range(0, len(test2url))]
test2final = model2.predict(test2feats)
test2final

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
test3url = le3.fit_transform(testing_set1["originalurl"])
test3dns = le3.fit_transform(testing_set1["dnssec"])
test3feats = [[test3url[i], test3dns[i]] for i in range(0, len(test3url))]
test3final = model3.predict(test3feats)
test3final

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
percenttest = []

for index, row in testing_set1.iterrows():
  rowdomain = row["domain_name"]
  num = 0
  lett = 0
  for char in rowdomain:
    if char.isalpha():
      lett += 1
    elif char.isnumeric():
      num += 1
  perc = round(num/(num+lett), 1)
  percenttest.append(perc)

test4url = le4.fit_transform(testing_set1["originalurl"])
test4percent = le4.fit_transform(percenttest)
test4feats = [[test4url[i], test4percent[i]] for i in range(0, len(test4url))]
test4final = model4.predict(test4feats)
test4final

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# this is a function to actually test naive bayes and get results back for any given testing set. used with the svm
def nb(testingset):
  test1url = le1.fit_transform(testingset["originalurl"])
  test1country = le1.fit_transform(testingset["country"])
  test1feats = [[test1url[i], test1country[i]] for i in range(0, len(test1url))]
  test1finalf = model1.predict(test1feats)
  
  test2url = le2.fit_transform(testingset["originalurl"])
  test2reg = le2.fit_transform(testingset["registrar"])
  test2feats = [[test2url[i], test2reg[i]] for i in range(0, len(test2url))]
  test2finalf = model2.predict(test2feats)

  test3url = le3.fit_transform(testingset["originalurl"])
  test3dns = le3.fit_transform(testingset["dnssec"])
  test3feats = [[test3url[i], test3dns[i]] for i in range(0, len(test3url))]
  test3finalf = model3.predict(test3feats)

  percenttest = []

  for index, row in testingset.iterrows():
    rowdomain = row["domain_name"]
    num = 0
    lett = 0
    for char in rowdomain:
      if char.isalpha():
        lett += 1
      elif char.isnumeric():
        num += 1
    perc = round(num/(num+lett), 1)
    percenttest.append(perc)

  test4url = le4.fit_transform(testingset["originalurl"])
  test4percent = le4.fit_transform(percenttest)
  test4feats = [[test4url[i], test4percent[i]] for i in range(0, len(test4url))]
  test4finalf = model4.predict(test4feats)

  return [test1finalf, test2finalf, test3finalf, test4finalf]

# SVM Aggregation

In [None]:
# this is the actual svm function. the trainingdata being passed in is a 2d matrix where each row is one entry and testing data is the same
# maybe try changing the kernel since the data is so bad atm
from sklearn.metrics import classification_report, confusion_matrix
def svm_aggregate(trainingdata, testingdata):
  y_train = benmaltesting
  y_test = benmaltesting2
  x_train = trainingdata
  x_test = testingdata
  svmclassifier = SVC(kernel='rbf')
  svmclassifier.fit(x_train, y_train)
  y_predictions = svmclassifier.predict(x_test)
  print(classification_report(y_test,y_predictions))

In [None]:
# this is a helper function just to make transforming the data easier. it takes the array of each of the test results and makes the entries row-by-row instead of in each column if im not mistaken in how it is
def aggregator(data):
  outputdata = []
  i = 0
  row = []
  while i < len(data[0]):
    for d in data:
      row.append(d[i])
    outputdata.append(row)
    row = []
    i+=1
  return outputdata

In [None]:
# this is the handler function for svm, it manipulates the data into a useable form for the svm then calls the svm_aggregate function
# d2aggregate is the training data that needs to be aggregated. this comes from naive bayes' first testing set
def svm_handler(d2aggregate):
  trainingdata = aggregator(d2aggregate)
  testingdata = aggregator(nb(testing_set2))
  svm_aggregate(trainingdata, testingdata)

svm_handler([test1final, test2final, test3final, test4final])

              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1040
           1       0.00      0.00      0.00        69

    accuracy                           0.94      1109
   macro avg       0.47      0.50      0.48      1109
weighted avg       0.88      0.94      0.91      1109



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
