In [1]:
import pandas as pd

In [2]:
# Load the labelled URL table; the first CSV column is used as the row index.
df = pd.read_csv('./data/test.csv', index_col=0)

In [3]:
df.head()

Unnamed: 0,url,label
0,http://minsotc.alania.gov.ru,0
1,http://www.freejavaguide.com,0
2,http://yeneliswa.co.za/moods/bankofamerica/7dd...,1
3,https://victordahdalehfoundation.com/programme...,0
4,http://camphhsi.com/product/list_947.html,0


In [4]:
from feature_engineering import get_protocol
# Derive the protocol feature for every URL by handing the helper straight to apply.
df['https'] = df['url'].apply(get_protocol)

In [5]:
df.head()

Unnamed: 0,url,label,https
0,http://minsotc.alania.gov.ru,0,False
1,http://www.freejavaguide.com,0,False
2,http://yeneliswa.co.za/moods/bankofamerica/7dd...,1,False
3,https://victordahdalehfoundation.com/programme...,0,True
4,http://camphhsi.com/product/list_947.html,0,False


In [6]:
from evaluation import evaluate_features
# NOTE(review): the third argument differs between the evaluate_features calls in
# this notebook (0.4, 0.6, 0.7, 1) -- confirm what it controls before comparing
# the accuracy figures across cells.
protocol_results = evaluate_features(df, 'label', 0.4)

In [7]:
protocol_results

Unnamed: 0,Feature Importance (RF),Coefficient (LR),Accuracy (RF),Accuracy (LR)
https,1.0,-0.505409,0.562598,0.562598


In [8]:
# Character count of each URL string, computed with the vectorised .str accessor.
df['len_url'] = df['url'].str.len()

In [9]:
df.head()

Unnamed: 0,url,label,https,len_url
0,http://minsotc.alania.gov.ru,0,False,28
1,http://www.freejavaguide.com,0,False,28
2,http://yeneliswa.co.za/moods/bankofamerica/7dd...,1,False,180
3,https://victordahdalehfoundation.com/programme...,0,True,68
4,http://camphhsi.com/product/list_947.html,0,False,41


In [10]:
evaluate_features(df, 'label', 0.6)

Unnamed: 0,Feature Importance (RF),Coefficient (LR),Accuracy (RF),Accuracy (LR)
https,0.070871,-0.617548,0.699592,0.695966
len_url,0.929129,0.030171,0.699592,0.695966


In [11]:
def have_host_name(url: str) -> int:
    """Report whether ``url`` parses to a URL with a non-empty host name.

    Args:
        url: Raw URL string to inspect.

    Returns:
        1 when ``urllib.parse.urlparse`` finds a host name, otherwise 0.
        URLs that ``urlparse`` rejects with ``ValueError`` (for example an
        unbalanced ``[`` in the authority part) also yield 0.
    """
    from urllib.parse import urlparse

    try:
        hostname = urlparse(url).hostname
    except ValueError:
        # A URL that cannot be parsed has no usable host; report that instead
        # of aborting the caller (e.g. a whole Series.apply pass).
        return 0
    return 1 if hostname else 0




In [12]:
# Flag, per URL, whether a host name could be parsed out of it.
df['have_host_name'] = df['url'].apply(have_host_name)

In [13]:
evaluate_features(df, 'label', 0.6)

Unnamed: 0,Feature Importance (RF),Coefficient (LR),Accuracy (RF),Accuracy (LR)
https,0.075545,-0.629227,0.692321,0.689008
len_url,0.922984,0.029372,0.692321,0.689008
have_host_name,0.001471,2.181722,0.692321,0.689008


In [14]:

def has_ip_address(url):
    """Report whether ``url`` contains a dotted-quad IPv4 address.

    Args:
        url: Raw URL string to scan (the whole string is searched, not only
            the host part).

    Returns:
        1 if some run of four dot-separated numbers, each in the range 0-255,
        appears in ``url``; otherwise 0.
    """
    import re

    # Four groups of 1-3 digits separated by dots, delimited by word boundaries.
    ip_pattern = r'\b(?:\d{1,3}\.){3}\d{1,3}\b'

    for candidate in re.findall(ip_pattern, url):
        # The pattern alone also accepts impossible quads such as
        # 999.999.999.999, so every octet is range-checked.
        if all(int(octet) <= 255 for octet in candidate.split('.')):
            return 1
    return 0

In [15]:
# Flag, per URL, whether it embeds a dotted-quad IP address.
df['have_ip'] = df['url'].apply(has_ip_address)

In [16]:
df[df['have_ip'] == 1][:5]

Unnamed: 0,url,label,https,len_url,have_host_name,have_ip
33,http://101.99.90.147/banks/Desjardins/1f99bbd5...,1,False,70,1,1
206,http://110.4.45.230/~med3112/System/Log/Safety...,1,False,224,1,1
250,http://69.167.151.209/files/556af810c9e769e651...,1,False,72,1,1
295,http://118.27.28.16/pc/,1,False,23,1,1
638,http://67.229.48.206/ap/signin?key=a@b.c,1,False,40,1,1


In [17]:
evaluate_features(df, 'label', 0.6)

Unnamed: 0,Feature Importance (RF),Coefficient (LR),Accuracy (RF),Accuracy (LR)
https,0.066526,-0.574021,0.703196,0.698904
len_url,0.893491,0.028368,0.703196,0.698904
have_host_name,0.000895,1.636618,0.703196,0.698904
have_ip,0.039088,3.840891,0.703196,0.698904


In [18]:
def spec_car_count(url: str) -> int:
    """Count occurrences of a fixed set of special tokens in ``url``.

    Each token is tallied with ``str.count`` (non-overlapping matches) and the
    tallies are added up. ``'//'`` is a two-character token, so a scheme
    separator such as ``http://`` contributes one to the total.
    """
    special_tokens = ('@', '?', '-', '=', '.', '#', '%', '+', '$', '!', '*', ',', '//')
    total = 0
    for token in special_tokens:
        total += url.count(token)
    return total

In [19]:
# Per-URL tally of the special tokens counted by spec_car_count.
df['special_chars'] = df['url'].apply(spec_car_count)

In [20]:
df.head()

Unnamed: 0,url,label,https,len_url,have_host_name,have_ip,special_chars
0,http://minsotc.alania.gov.ru,0,False,28,1,0,4
1,http://www.freejavaguide.com,0,False,28,1,0,3
2,http://yeneliswa.co.za/moods/bankofamerica/7dd...,1,False,180,1,0,9
3,https://victordahdalehfoundation.com/programme...,0,True,68,1,0,4
4,http://camphhsi.com/product/list_947.html,0,False,41,1,0,3


In [21]:
evaluate_features(df, 'label', 0.6)

Unnamed: 0,Feature Importance (RF),Coefficient (LR),Accuracy (RF),Accuracy (LR)
https,0.043888,-0.607918,0.74376,0.717009
len_url,0.724607,0.05028,0.74376,0.717009
have_host_name,0.000449,-0.629279,0.74376,0.717009
have_ip,0.027443,4.409952,0.74376,0.717009
special_chars,0.203614,-0.237329,0.74376,0.717009


In [22]:
def digit_count(URL: str) -> int:
    """Return how many characters of ``URL`` satisfy ``str.isnumeric``."""
    return sum(1 for ch in URL if ch.isnumeric())

def letters_count(URL: str) -> int:
    """Return how many characters of ``URL`` satisfy ``str.isalpha``."""
    alphabetic = [ch for ch in URL if ch.isalpha()]
    return len(alphabetic)
# Per-URL counts of numeric and alphabetic characters.
df['digits'] = df['url'].apply(digit_count)
df['letters'] = df['url'].apply(letters_count)

In [23]:
evaluate_features(df, 'label', 0.7)

Unnamed: 0,Feature Importance (RF),Coefficient (LR),Accuracy (RF),Accuracy (LR)
https,0.026211,-0.507896,0.860781,0.719615
len_url,0.316761,0.002819,0.860781,0.719615
have_host_name,0.000265,-0.36687,0.860781,0.719615
have_ip,0.010636,3.964348,0.860781,0.719615
special_chars,0.139596,-0.204213,0.860781,0.719615
digits,0.255085,0.117401,0.860781,0.719615
letters,0.251446,0.0375,0.860781,0.719615


In [24]:
# Ratio of letters to digits; adding 1 to both counts keeps the ratio defined
# for URLs that contain no digits.
df['letters_to_digits'] = (df['letters'] + 1) / (df['digits'] + 1)

In [25]:
evaluate_features(df, 'label', 1)

Unnamed: 0,Feature Importance (RF),Coefficient (LR),Accuracy (RF),Accuracy (LR)
https,0.030884,-0.483274,0.850934,0.719755
len_url,0.330472,0.016454,0.850934,0.719755
have_host_name,0.000324,-0.275332,0.850934,0.719755
have_ip,0.008664,4.13439,0.850934,0.719755
special_chars,0.151345,-0.198754,0.850934,0.719755
digits,0.168853,0.078952,0.850934,0.719755
letters,0.140015,0.024016,0.850934,0.719755
letters_to_digits,0.169442,-0.009228,0.850934,0.719755


In [26]:
print(df[df['label'] == 1].head(10)['url'])

2     http://yeneliswa.co.za/moods/bankofamerica/7dd...
8     http://zonaderegistrosenlineabnweb1.com/BNWeb/...
9                 http://froxhositng.000webhostapp.com/
10    https://emailupdate.azurewebsites.net/apple/eb...
12    https://home-onlineingservice-7c2773.ingress-c...
13    https://chase-authoririze.serveusers.com/../my...
15                     https://app-dispositivoauth.com/
21    http://facebook.com-review-notifyfvrwiamhxmm.v...
23            https://constreetyj.com/square/index2.php
24    http://nettewalter.com/znb/adobelatest/zxsguwj...
Name: url, dtype: object


In [38]:
def get_feats(url: str) -> pd.DataFrame:
    """Build the single-row feature frame for one URL.

    The columns and their order mirror the feature columns built on ``df``
    earlier in this notebook, so the result can be passed to a model fitted
    on those columns.

    Args:
        url: Raw URL string to featurise.

    Returns:
        A one-row DataFrame with the columns ``https``, ``len_url``,
        ``have_host_name``, ``have_ip``, ``special_chars``, ``digits``,
        ``letters`` and ``letters_to_digits``.
    """
    # The training column is the value get_protocol returns (displayed as
    # True/False in df.head()). Comparing that boolean with the string
    # 'https' is always False, so the helper's result is used directly.
    protocol = get_protocol(url)
    digits = digit_count(url)
    letters = letters_count(url)

    return pd.DataFrame({
        'https': [protocol],
        'len_url': [len(url)],
        'have_host_name': [have_host_name(url)],
        'have_ip': [has_ip_address(url)],
        'special_chars': [spec_car_count(url)],
        'digits': [digits],
        'letters': [letters],
        # Same add-one ratio as the training column.
        'letters_to_digits': [(letters + 1) / (digits + 1)],
    })

    




In [39]:
fts = get_feats('http://minsotc.alania.gov.ru')

In [40]:
fts

Unnamed: 0,https,len_url,have_host_name,have_ip,special_chars,digits,letters,letters_to_digits
0,False,28,1,0,4,0,22,23.0


In [41]:
# from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report




In [42]:
# Drop every row that has a missing value in any column.
df = df.dropna()

In [43]:
# Features are every column except the raw URL text and the target; the target is 'label'.
y = df['label']
X = df.drop(columns=['url', 'label'])

In [44]:
X.head()

Unnamed: 0,https,len_url,have_host_name,have_ip,special_chars,digits,letters,letters_to_digits
0,False,28,1,0,4,0,22,23.0
1,False,28,1,0,3,0,23,24.0
2,False,180,1,0,9,50,112,2.215686
3,True,68,1,0,4,0,59,60.0
4,False,41,1,0,3,3,30,7.75


In [45]:
# Fix the seed so the fitted forest, and every metric derived from it, is
# reproducible across kernel restarts.
rf1 = RandomForestClassifier(n_jobs=-1, random_state=42)
rf1.fit(X, y)

In [46]:
# NOTE(review): rf1 is scored here on the same X, y it was fitted on, so this
# report describes training fit, not performance on unseen URLs.
print(classification_report(y, rf1.predict(X)))

              precision    recall  f1-score   support

           0       0.79      0.91      0.85     79998
           1       0.90      0.76      0.83     79998

    accuracy                           0.84    159996
   macro avg       0.85      0.84      0.84    159996
weighted avg       0.85      0.84      0.84    159996



In [47]:
def predict_url(url: str) -> str:
    """Classify ``url`` with the fitted ``rf1`` model.

    Returns:
        ``'safe'`` when the predicted label equals 0, otherwise ``'PHISHING'``.
    """
    features = get_feats(url)
    predicted_label = rf1.predict(features).item()
    if predicted_label == 0:
        return 'safe'
    return 'PHISHING'


In [52]:
print(predict_url('https://app-dispositivoauth.com/'))

safe


In [None]:
from urllib.parse import urlparse

In [None]:
urlparse('https://home-onlineingservice-7c2773.ingress-c/')

ParseResult(scheme='https', netloc='home-onlineingservice-7c2773.ingress-c', path='/', params='', query='', fragment='')

### notes
1. try netloc length
2. try totally removing length
3. try manually adding results from google searches and other lengthy urls
4. try google index
5. try other models
6. try deep learning (leave for last)
7. try the ratio of special characters to url length
8. try path lengths
