# Offline Model Creation

In [19]:
import numpy as np
import pandas as pd
import mlflow

In [3]:
from xgboost import XGBClassifier
from func import URLFeatureGenerator

  from pandas import MultiIndex, Int64Index


## Read in Data

In [4]:
df = pd.read_csv('../data/processed/features.csv')

In [5]:
df.head()

Unnamed: 0,url,https,url_length,url_path_length,url_host_length,url_host_is_ip,url_has_port_in_string,number_of_digits,number_of_parameters,number_of_fragments,is_encoded,num_encoded_char,url_string_entropy,number_of_subdirectories,number_of_periods,tld,geo_loc,js_len,who_is,label
0,http://www.physics.smu.edu/web/,True,31,5,19,False,False,0,0,0,False,0,3.80872,3,3,edu,Germany,50.0,True,0
1,http://www.gershlaw.com,True,23,0,16,False,False,0,0,0,False,0,3.82791,1,2,com,Romania,84.0,False,0
2,http://www.defenselink.mil/mtom/,True,32,6,19,False,False,0,0,0,False,0,3.905639,3,2,mil,South Korea,0.0,True,0
3,http://www.superetrader.co.uk/,True,30,1,22,False,False,0,0,0,False,0,3.82258,2,3,co.uk,Japan,88.0,True,0
4,http://salonequipmentintl.com/,True,30,1,22,False,False,0,0,0,False,0,3.923231,2,1,com,Japan,94.5,True,0


In [6]:
X = df.drop(['label', 'url', 'geo_loc', 'tld'], axis=1)
y = df['label']

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## Creating Pipeline

### Preprocessing

Create numerical pipeline:

In [8]:
num_cols = X.select_dtypes(include=np.number).columns.tolist()

In [9]:
from imblearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="mean")),
        ('std_scaler', StandardScaler())
    ])

Create categorical pipeline:

In [10]:
cat_cols = X.select_dtypes(include=object).columns.tolist()

In [11]:
from sklearn.preprocessing import OneHotEncoder


cat_pipeline = Pipeline([
        ('enc', OneHotEncoder())
])

Combine for full preprocessing pipeline:

In [12]:
from sklearn.compose import ColumnTransformer

pre_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols),
    ], remainder = 'passthrough')

### Modelling

In [13]:
from imblearn.under_sampling import RandomUnderSampler
sampler = RandomUnderSampler(sampling_strategy = 0.3)

In [14]:
xgb = XGBClassifier(use_label_encoder = False, eval_metric='mlogloss')

pipe = Pipeline([
    ('pre', pre_pipeline),
    ('sampler', sampler),
    ('xgb', xgb)
])

In [15]:
param_grid = {
    'xgb__n_estimators': [5, 10, 50], 
    'xgb__gamma' : [0, 0.2, 0.5]
}

In [46]:
from sklearn.model_selection import GridSearchCV

search = GridSearchCV(pipe, param_grid, scoring = 'f1', cv = 3)

In [47]:
search.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('pre',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('std_scaler',
                                                                                          StandardScaler())]),
                                                                         ['url_length',
                                                                          'url_path_length',
                                                                          'url_host_length',
                                                                         

In [48]:
pd.DataFrame(search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_xgb__gamma,param_xgb__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.027691,0.000471,0.011344,0.000472,0.0,5,"{'xgb__gamma': 0, 'xgb__n_estimators': 5}",0.956044,0.925532,0.961749,0.947775,0.0159,3
1,0.031696,0.000472,0.011344,0.000944,0.0,10,"{'xgb__gamma': 0, 'xgb__n_estimators': 10}",0.952555,0.943942,0.953069,0.949855,0.004186,2
2,0.054049,0.001416,0.012011,0.0,0.0,50,"{'xgb__gamma': 0, 'xgb__n_estimators': 50}",0.947559,0.935484,0.944345,0.942462,0.005106,8
3,0.030694,0.001701,0.012011,0.000817,0.2,5,"{'xgb__gamma': 0.2, 'xgb__n_estimators': 5}",0.957798,0.930481,0.953069,0.947116,0.01192,4
4,0.031695,0.000472,0.011344,0.000472,0.2,10,"{'xgb__gamma': 0.2, 'xgb__n_estimators': 10}",0.957798,0.937163,0.944544,0.946502,0.008537,6
5,0.056718,0.001249,0.012011,0.0,0.2,50,"{'xgb__gamma': 0.2, 'xgb__n_estimators': 50}",0.943119,0.93381,0.906897,0.927942,0.015359,9
6,0.028359,0.000472,0.010677,0.000472,0.5,5,"{'xgb__gamma': 0.5, 'xgb__n_estimators': 5}",0.957798,0.938849,0.961749,0.952799,0.009995,1
7,0.034031,0.002162,0.011677,0.000472,0.5,10,"{'xgb__gamma': 0.5, 'xgb__n_estimators': 10}",0.953789,0.938849,0.942857,0.945165,0.006314,7
8,0.058386,0.001701,0.011344,0.000472,0.5,50,"{'xgb__gamma': 0.5, 'xgb__n_estimators': 50}",0.952206,0.935484,0.953069,0.946919,0.008094,5


In [49]:
search.best_params_

{'xgb__gamma': 0.5, 'xgb__n_estimators': 5}

## Testing on Unseen Data

In [50]:
y_pred = search.predict(X_test)

In [51]:
from sklearn.metrics import f1_score

f1_score(y_pred, y_test)

0.9630642954856362

In [52]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_pred, y_test)

array([[14621,     5],
       [   22,   352]], dtype=int64)

## Saving Model

In [None]:
import joblib

In [None]:
joblib.dump(search, '../models/search')