# <font color='red'> Sample Pre-processing of Web Content for Dataset Preparation </font>

## Initialisation Code

In [1]:
# Basic Libraries to be installed before moving ahead
!pip install pysafebrowsing
!pip install tld
!pip install whois
!pip install geoip2

Collecting pysafebrowsing
  Downloading pysafebrowsing-0.1.1-py3-none-any.whl (5.7 kB)
Collecting configparser
  Downloading configparser-5.2.0-py3-none-any.whl (19 kB)
Installing collected packages: configparser, pysafebrowsing
Successfully installed configparser-5.2.0 pysafebrowsing-0.1.1
Collecting tld
  Downloading tld-0.12.6-py38-none-any.whl (412 kB)
Installing collected packages: tld
Successfully installed tld-0.12.6
Collecting geoip2
  Downloading geoip2-4.5.0-py2.py3-none-any.whl (26 kB)
Collecting maxminddb<3.0.0,>=2.2.0
  Downloading maxminddb-2.2.0.tar.gz (330 kB)
Collecting aiohttp<4.0.0,>=3.6.2
  Downloading aiohttp-3.8.1-cp38-cp38-win_amd64.whl (555 kB)
Collecting async-timeout<5.0,>=4.0.0a3
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting multidict<7.0,>=4.5
  Downloading multidict-5.2.0-cp38-cp38-win_amd64.whl (45 kB)
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.7.2-cp38-cp38-win_amd64.whl (122 kB)
Collecting frozenlist>=1.1.1
  Downloading froz

In [2]:
# Basic Initialisation
import time
import pandas as pd
import numpy as np
pd.set_option('mode.chained_assignment', None) #Switch off warning

In [37]:
#Verifying pathname of dataset before loading
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename));
        print(os.listdir("../input"))

/kaggle/input/geoipdatabase/GeoLite2-Country.mmdb
['geoipdatabase', 'preprocessingsampledata']
/kaggle/input/preprocessingsampledata/PreprocessingSampleData.csv
['geoipdatabase', 'preprocessingsampledata']


## Loading the Sample Web Content Crawled & Collected by MalCrawler

In [29]:
# Loading Dataset containing Raw Web Content, URL and IP Address (Output of MalCrawler)
def loadDataset():
    df = pd.read_csv("/kaggle/input/preprocessingsampledata/PreprocessingSampleData.csv")
    return df

df = loadDataset()
df = df[['url','ip_add', 'content']] # The three Columns of the initial data
df

Unnamed: 0,url,ip_add,content
0,http://www.dutchthewiz.com/freeware/,175.67.214.68,"Decay suggest in 1315.. Current constitution, ..."
1,http://www.collectiblejewels.com,188.120.171.121,breast addict nudger whash ky darkie catholics...
2,http://www.deadlinedata.com,193.51.170.1,Nato's military stoic philosophy says to accep...
3,http://www.mil.fi/maavoimat/kalustoesittely/00...,13.237.35.44,Night being newton. according to the formation...
4,http://www.avclub.com/content/node/24539,220.193.62.89,34 per two children. if we exercise simple pra...
...,...,...,...
95,http://www.ipl.org/div/potus/gwashington.html,22.63.103.109,"In high-energy sixth congress, geneva. Desert ..."
96,http://sharellmartin.biz,102.44.184.56,Forth. designation or headquarters to chicago'...
97,http://www.oatlands.org/,206.161.206.80,"Ancient chinese gather sticks"". toponymist geo..."
98,http://www.threshold21.com,169.23.27.160,Internet itself flagellated eukaryota. their c...


In [53]:
#Adding new blank columns to the dataframe df
df['geo_loc']=""
df['url_len']=""
df['js_len']=""
df['js_obf_len']=""
df['tld']=""
df['who_is']=""
df['https']=""
df['label']=""
df = df[['url','ip_add','geo_loc','url_len','js_len','js_obf_len','tld','who_is','https','content','label']]
#df

## Computing the 'geo_loc' Attribute from IP Address

In [50]:
# Filling the 'geo_loc' column of dataframe 
import os
import geoip2.database
import socket
import time

reader = geoip2.database.Reader('/kaggle/input/geoipdatabase/GeoLite2-Country.mmdb')

for x in df.index:
    try:
        ip_add = str(df['ip_add'][x])
        response = reader.country(ip_add)
        df['geo_loc'][x] = response.country.name
        #print(x, "Finished,value is:",response.country.name)   
    except Exception as msg:
        df['geo_loc'][x] = ""
        #print(x," Finished with Error Msg:",msg)

reader.close()
#df

## Computing 'url_len

In [55]:
#Generating 'url_len' from 'url'
df['url_len'] = df['url'].str.len()
#df

## Computing 'js_len'

In [56]:
import re       #importing regex for string selection and parsing

def get_js_len_inKB(content): #Function for computing 'js_len from Web Content
    js=re.findall(r'<script>(.*?)</script>',content)
    complete_js=''.join(js)
    js_len = len(content.encode('utf-8'))/1000
    return js_len
for x in df.index: #Computing and Putting 'js_len' in Pandas Dataframe
    df['js_len'][x] = get_js_len_inKB(df['content'][x])

#df

## Computing 'js_obf_len'

In [58]:
# Computed using Selenium Emulator, thus will have to be run separately and then added
# Code given in https://github.com/lucianogiuseppe/JS-Auto-DeObfuscator/blob/master/jsado.py

## Computing 'tld' Attribute

In [60]:
#Filling up TLD column
from tld import get_tld

for x in df.index:       
    try:
        u = df.url[x]
        s = get_tld(str(u), fix_protocol=True)
        df['tld'][x] = s
    except:
        pass
#df

## Computing 'who_is' Attribute

In [66]:
#Whois processing
import whois
start_time = time.time()

for x in df.index:  
    try:    
        domain = whois.query(df['url'][x])
        #print(domain.registrar)
        if len(str(domain.registrar)) >1 :
            df['who_is'][x]= 'complete'
        else:
            df['who_is'][x]= 'incomplete'
    except Exception as msg:
        #print(x,", Error: ",msg)
        df['who_is'][x]= 'incomplete'
    #print(x,df['who_is'][x])

print("***Total Time taken --- %s seconds ---***" % (time.time() - start_time))
#df

***Total Time taken --- 0.10448932647705078 seconds ---***


In [70]:
# Alternate Code for Computing using WHOIS API
from urllib.request  import  urlopen       # Importing url library
import  json                               # Importing the JSON Module

url =  'https://www.bits-pilani.ac.in'  #A sample URL
apiKey = 'at_YC7W9LM2w1lQOCMmN0KUe3OU7B8Jc'
url = 'https://www.whoisxmlapi.com/whoisserver/WhoisService?'\
    + 'domainName=' + url + '&apiKey=' + apiKey + "&outputFormat=JSON"

whois_data= urlopen(url).read().decode('utf8') #WHO IS info returned by API
data=json.loads(whois_data) # Converting it from JSON to a Python Dict Object 
#if data['registrarName']=="":
    #who_is = 'incomplete'
#else:
    #who_is = 'complete'
  
# Sample of one URL is shown here
# Similarly, who_is data is checked for all URLs in the dataset


## Computing the 'https' Attribute

In [76]:
# Filling the column https_status
import http.client

start_time = time.time()

for x in df.index:
    https_status= False
    try:
        conn = http.client.HTTPSConnection(df['url'][x])
        conn.request("HEAD", "/")
        res = conn.getresponse()
        if res.status == 200 or res.status==301 or res.status==302:
            https_status= True   
        #print(x,res.status,res.reason,https_status)
    except Exception as msg:
        df['https'][x]= 'no'
        #print(x,"Error: ",msg)
    finally:
        df['https'][x]= https_status
        #conn.close

print("***Total Time taken --- %s seconds ---***" % (time.time() - start_time))
#df

***Total Time taken --- 0.027629613876342773 seconds ---***


## Allocation of Class Label 

In [78]:
# Filling the label of training set from Google Safe Browising API
from pysafebrowsing import SafeBrowsing
KEY= "AIzaSyABO6DPGmHpCs8U5ii1Efkp1dUPJHQfGpo"

start_time = time.time()
s = SafeBrowsing(KEY)

for x in df.index:
    
    try:
        url = df['url'][x]
        r = s.lookup_urls([url])
        label=r[url]['malicious']    
        df['label']=label
        #print(x, label)
    except Exception as msg:
        df['label']=""
        #print(x,"Error: ",msg)

print("***Total Time taken --- %s seconds ---***" % (time.time() - start_time))

#df

***Total Time taken --- 2.215198516845703 seconds ---***


## Saving of Processed Data

In [79]:
# Saving the file
#df.to_csv("Datasets/processed_webdata_sample.csv")