In [1]:
import pandas as pd
import re
import warnings
warnings.simplefilter(action='ignore', category=Warning)
import pickle
import tldextract
import whois
from datetime import datetime
from urllib.parse import urlparse,urlencode
from tqdm import tqdm
current = datetime.now()

# Nouvelle section

In [2]:
#import data
df = pd.read_csv("spam2.csv", encoding='latin-1',nrows = 10000)
df.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [3]:
#data shape
df.shape

(10000, 2)

# Nouvelle section

In [4]:
#to simplify the problem, we gonna transform all malicious urls' type to one type which is phishing
def cleaning(x):
    if (x == "malware") | (x == "defacement") | (x== "phishing"):
        return 1
    else:
        return 0

In [5]:
#apply cleaning function on the urls
df['type'] = df['type'].apply(cleaning)

In [6]:
#now we have only two types of urls, benign --> good and phishing --> bad
df['type'].value_counts()

0    7325
1    2675
Name: type, dtype: int64

In [7]:
def is_shortened(domain_name):
    list = ["bit.ly","tinyurl","ow.ly","goo.gl","tiny.cc","shorte.st","BudURL.com"]
    for i in list:
        if i in domain_name :
            return 1
        else:
            return 0

In [8]:
def is_redirecting(url):
    position = url.rfind("//")
    if position > 6: # case of http 
        if position > 7: # case of https
            return 1
        else : 
            return 0
    else:
        return 0

In [9]:
#now we will add the features that we need to train our models
df['domaine_name'] = df['url'].apply(lambda x : tldextract.extract(x).registered_domain)
df['contains_dash'] = df['domaine_name'].apply(lambda x : 1 if '-' in x else 0 )
df['contains_at'] = df['url'].apply(lambda x : 1 if '@' in x else 0 )
df['contains_percent'] = df['url'].apply(lambda x : 1 if '%' in x else 0 )
df['url_len'] = df['url'].apply(lambda x : len(x))
df['url_depth'] = df['url'].apply(lambda x : len(x.split("/")))
df['use_https'] = df['url'].apply(lambda x : 1 if "https" in x else 0)
df['digits'] = df['url'].apply(lambda x : len("".join(_ for _ in x if _.isdigit())))
df['redirection']=df['url'].apply(lambda x :is_redirecting(x))
df['contains_ip']= df['domaine_name'].apply(lambda x :1 if x=='' else 0 )
df['shortening']=df['domaine_name'].apply(lambda x :is_shortened(x))
df.head()

Unnamed: 0,url,type,domaine_name,contains_dash,contains_at,contains_percent,url_len,url_depth,use_https,digits,redirection,contains_ip,shortening
0,br-icloud.com.br,1,br-icloud.com.br,1,0,0,16,1,0,0,0,0,0
1,mp3raid.com/music/krizz_kaliko.html,0,mp3raid.com,0,0,0,35,3,0,1,0,0,0
2,bopsecrets.org/rexroth/cr/1.htm,0,bopsecrets.org,0,0,0,31,4,0,1,0,0,0
3,http://www.garage-pirenne.be/index.php?option=...,1,garage-pirenne.be,1,0,0,88,4,0,7,0,0,0
4,http://adventure-nicaragua.net/index.php?optio...,1,adventure-nicaragua.net,1,0,0,235,4,0,22,0,0,0


In [10]:
def age_domaine(domain_name):
  creation_date = domain_name.creation_date
  expiration_date = domain_name.expiration_date
  if (isinstance(creation_date,str) or isinstance(expiration_date,str)):
    try:
      creation_date = datetime.strptime(creation_date,'%Y-%m-%d')
      expiration_date = datetime.strptime(expiration_date,"%Y-%m-%d")
    except:
      return 1
  if ((expiration_date is None) or (creation_date is None)):
      return 1
  elif ((type(expiration_date) is list) or (type(creation_date) is list)):
      return 1
  else:
    ageofdomain = abs((expiration_date - creation_date).days)
    if ((ageofdomain/30) < 6):
      age = 1
    else:
      age = 0
  return age

In [11]:
def regestered_age(url):
  t = []
  dns = 0
  try:
    domain_name = whois.whois(urlparse(url).netloc)
  except:
    dns = 1
  t.append(dns)
  t.append(1 if dns == 1 else age_domaine(domain_name))
  return t

In [12]:
for i in tqdm(range(df.shape[0])):
    is_registred,age = regestered_age(df.loc[i,"url"])
    df.loc[i,"is_regetered"] = is_registred
    df.loc[i,"age"] = age
    

  1%|▏         | 125/10000 [00:10<10:07, 16.25it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


  4%|▎         | 353/10000 [00:41<10:02, 16.01it/s]  

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


  4%|▍         | 440/10000 [00:49<08:28, 18.81it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


  5%|▍         | 475/10000 [00:52<13:54, 11.42it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


  9%|▊         | 856/10000 [01:57<10:06, 15.08it/s]  

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 12%|█▏        | 1152/10000 [02:26<15:32,  9.49it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 13%|█▎        | 1347/10000 [02:52<31:38,  4.56it/s]  

Error trying to connect to socket: closing socket - [WinError 10054] Une connexion existante a dû être fermée par l’hôte distant


 14%|█▎        | 1362/10000 [02:53<11:39, 12.35it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 15%|█▍        | 1490/10000 [03:04<11:38, 12.18it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 15%|█▍        | 1495/10000 [03:04<11:36, 12.21it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 15%|█▌        | 1536/10000 [03:14<32:48,  4.30it/s]  

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 16%|█▌        | 1568/10000 [03:16<09:41, 14.49it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 16%|█▌        | 1607/10000 [03:19<11:40, 11.97it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 17%|█▋        | 1709/10000 [03:28<08:30, 16.24it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 19%|█▉        | 1888/10000 [03:45<09:32, 14.17it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 21%|██        | 2076/10000 [04:03<09:35, 13.76it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 22%|██▏       | 2218/10000 [04:16<12:31, 10.35it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 27%|██▋       | 2707/10000 [05:01<10:33, 11.51it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 27%|██▋       | 2730/10000 [05:03<07:37, 15.88it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 28%|██▊       | 2754/10000 [05:05<09:59, 12.09it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 29%|██▊       | 2854/10000 [05:14<08:00, 14.86it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 29%|██▉       | 2888/10000 [05:16<07:37, 15.53it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 30%|███       | 3028/10000 [05:25<06:33, 17.70it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 31%|███       | 3117/10000 [05:34<10:11, 11.25it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 32%|███▏      | 3155/10000 [05:36<05:23, 21.13it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 32%|███▏      | 3228/10000 [05:46<06:47, 16.61it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 34%|███▎      | 3364/10000 [05:57<06:54, 16.03it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 35%|███▍      | 3465/10000 [06:24<1:06:05,  1.65it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 37%|███▋      | 3680/10000 [06:47<20:44,  5.08it/s]  

Error trying to connect to socket: closing socket - [WinError 10054] Une connexion existante a dû être fermée par l’hôte distant


 39%|███▉      | 3900/10000 [07:14<23:11,  4.38it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 44%|████▍     | 4416/10000 [08:09<05:29, 16.95it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 46%|████▌     | 4608/10000 [08:28<05:24, 16.63it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 47%|████▋     | 4672/10000 [08:31<02:33, 34.74it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 47%|████▋     | 4693/10000 [08:35<11:10,  7.92it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 48%|████▊     | 4774/10000 [08:42<09:34,  9.09it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 49%|████▉     | 4932/10000 [08:58<06:53, 12.26it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 51%|█████     | 5053/10000 [09:09<03:33, 23.13it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed
Error trying to connect to socket: closing socket - [WinError 10054] Une connexion existante a dû être fermée par l’hôte distant


 51%|█████     | 5071/10000 [09:11<08:06, 10.14it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 53%|█████▎    | 5263/10000 [09:34<09:50,  8.02it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 53%|█████▎    | 5271/10000 [09:35<08:54,  8.84it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 54%|█████▍    | 5430/10000 [10:09<07:02, 10.83it/s]  

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 62%|██████▏   | 6156/10000 [11:42<21:24,  2.99it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 66%|██████▌   | 6577/10000 [12:35<04:56, 11.55it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 67%|██████▋   | 6710/10000 [12:48<08:23,  6.54it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 70%|███████   | 7023/10000 [13:32<02:48, 17.68it/s]

Error trying to connect to socket: closing socket - [WinError 10054] Une connexion existante a dû être fermée par l’hôte distant


 71%|███████▏  | 7146/10000 [13:56<43:18,  1.10it/s]

Error trying to connect to socket: closing socket - timed out


 72%|███████▏  | 7221/10000 [14:04<05:47,  8.00it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 74%|███████▍  | 7449/10000 [14:33<05:30,  7.71it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 82%|████████▏ | 8181/10000 [15:43<01:57, 15.52it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 82%|████████▏ | 8217/10000 [15:46<02:38, 11.23it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 86%|████████▌ | 8563/10000 [16:28<01:21, 17.73it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 87%|████████▋ | 8665/10000 [16:44<02:16,  9.81it/s]

Error trying to connect to socket: closing socket - [WinError 10054] Une connexion existante a dû être fermée par l’hôte distant


 88%|████████▊ | 8752/10000 [16:48<00:40, 30.86it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 89%|████████▊ | 8866/10000 [17:03<03:03,  6.19it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 89%|████████▉ | 8912/10000 [17:07<01:23, 13.02it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 94%|█████████▎| 9367/10000 [17:56<00:40, 15.57it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 98%|█████████▊| 9820/10000 [18:44<00:10, 16.71it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


 99%|█████████▉| 9890/10000 [19:01<00:14,  7.68it/s]

Error trying to connect to socket: closing socket - [WinError 10054] Une connexion existante a dû être fermée par l’hôte distant


100%|█████████▉| 9989/10000 [19:23<00:03,  3.61it/s]

Error trying to connect to socket: closing socket - [Errno 11001] getaddrinfo failed


100%|██████████| 10000/10000 [19:24<00:00,  8.59it/s]


In [13]:
#df['is_regetered']=df['url'].apply(lambda x :regestered_age(x)[0])

In [14]:
#df['age']=df['url'].apply(lambda x :regestered_age(x)[1])

In [15]:
#here is our final dataset after features extraction
df.head()

Unnamed: 0,url,type,domaine_name,contains_dash,contains_at,contains_percent,url_len,url_depth,use_https,digits,redirection,contains_ip,shortening,is_regetered,age
0,br-icloud.com.br,1,br-icloud.com.br,1,0,0,16,1,0,0,0,0,0,0.0,1.0
1,mp3raid.com/music/krizz_kaliko.html,0,mp3raid.com,0,0,0,35,3,0,1,0,0,0,0.0,1.0
2,bopsecrets.org/rexroth/cr/1.htm,0,bopsecrets.org,0,0,0,31,4,0,1,0,0,0,0.0,1.0
3,http://www.garage-pirenne.be/index.php?option=...,1,garage-pirenne.be,1,0,0,88,4,0,7,0,0,0,0.0,1.0
4,http://adventure-nicaragua.net/index.php?optio...,1,adventure-nicaragua.net,1,0,0,235,4,0,22,0,0,0,1.0,1.0


# Tf-IDF Vectorizer

In [16]:
#import the TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
#the tokens needed by tf-idf vectorizer to compute their tf-idf weight 
#will be created by this function
def Tokenize(f):
        
    tokens_1 = str(f.encode('utf-8')).split('/') # make tokens after splitting by slash
    total_Tokens = []
    
    

    for token in tokens_1:

        tokens =  str(token).split('-') # make tokens after splitting by dash (-)

         
    
    tokens_dot = []
   

    for token in range(0,len(tokens)):
                tokens_2 = str(tokens[token]).split('.') # make tokens after splitting by dot (.)
                tokens_dot = tokens_dot + tokens_2
                total_Tokens = total_Tokens + tokens + tokens_dot
                total_Tokens = list(set(total_Tokens))  #remove redundant tokens

    if 'com' in total_Tokens:
            total_Tokens.remove('com') # removing .com since it occurs a lot of times and it should not be included in our features
    
    return total_Tokens

In [18]:
#create the vectorizer
vectorizer = TfidfVectorizer(tokenizer=Tokenize)

In [19]:
#apply the vectorizer on links
vectorized_links = vectorizer.fit_transform(df['url'])

In [20]:
from scipy.sparse import csr_matrix

In [21]:
#vectorized_links is a sparse matrix, so to concatenat other features with the 
#vectorized links, we need to transform their structure from dataframe to sparse matrix
#generate a sparse matrix from the other features except urls 
OtherFeatures = csr_matrix(df.drop(['url','domaine_name','type'],axis=1))

In [22]:
from scipy.sparse import hstack

In [23]:
#concatenat the two sparse matrixs (urls and the stas genertisticated from urls)
X = hstack((vectorized_links, OtherFeatures))
# X represents our independantes features
X


<10000x17692 sparse matrix of type '<class 'numpy.float64'>'
	with 66194 stored elements in COOrdinate format>

In [24]:
# we have urls and 1188 features

In [25]:
#y is the dependant feature which we are trying to predict
y = df['type']

Now we are finished from the part of features engineering in which we have extracted multiple information from the urls. Also we have used the tf-idf vectorizer to use the url's content in the process of prediction

# Machine Learning 

in this part we are going to test multiple classification models and we try to take the best one which gives us the best precision both on the train and test data

In [26]:
#first we devide our dataset into two parts, one for training and the other for testing
from sklearn.model_selection import train_test_split

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state=42)

In [28]:
#these are the models which are going to use
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

In [29]:
# because of our dataset is unbalanced and we are most interested in the false postive rate, that means 
#we don't want that our model predicts that the message is spam and actually is not.
#because of this error the user can miss a very important mail which classified as a spam by the model.
from sklearn.metrics import precision_score

In [30]:
#Create the models
RandomForestClassifier = RandomForestClassifier()
AdaBoostClassifier = AdaBoostClassifier()
GradientBoostingClassifier = GradientBoostingClassifier()
SVC = SVC()
LogisticRegression = LogisticRegression()
KNN = KNeighborsClassifier()
xgb_cl = xgb.XGBClassifier()

In [31]:
models = {"LogisticRegression" : LogisticRegression,
"RandomForestClassifier" : RandomForestClassifier,
"AdaBoostClassifier": AdaBoostClassifier,
"GradientBoostingClassifier" : GradientBoostingClassifier,
"SVC" : SVC,
"KNN" : KNN,
"XGB":xgb_cl}

In [32]:
# a dictionary which will store the precision of each model both on train and test set
results = {"model":[],"train_precision":[],"test_precision":[]}

In [33]:
#train and test the performance of each model
for name,model in models.items():
    print(name)
    results['model'].append(name)
    #train the model
    model.fit(X_train,y_train)
    #make predictions
    y_train_1 = model.predict(X_train)
    y_test_1 = model.predict(X_test)
    #compute the precision of the model
    train_precision = precision_score(y_train,y_train_1)
    test_precision = precision_score(y_test,y_test_1)
    #store the results
    results["train_precision"].append(train_precision)
    results["test_precision"].append(test_precision)

LogisticRegression
RandomForestClassifier
AdaBoostClassifier
GradientBoostingClassifier
SVC
KNN
XGB


In [34]:
results_df = pd.DataFrame(results)

In [35]:
results_df.sort_values(['train_precision','test_precision'],ascending=False)

Unnamed: 0,model,train_precision,test_precision
1,RandomForestClassifier,0.999442,0.981308
6,XGB,0.982891,0.940048
3,GradientBoostingClassifier,0.952412,0.952941
2,AdaBoostClassifier,0.918165,0.901599
5,KNN,0.868043,0.813713
0,LogisticRegression,0.844286,0.826516
4,SVC,0.744275,0.688889


it's obvious from the results_df that the best model is xgboost, so we are going to save the xgboost model
to make predictions in the futur