## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import pickle
import joblib

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
import warnings
warnings.filterwarnings('ignore')

## Data Reading

In [3]:
test_df = pd.read_csv('UNSW_NB15_testing-set.csv')
test_df.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,1.1e-05,udp,-,INT,2,0,496,0,90909.0902,...,1,2,0,0,0,1,2,0,Normal,0
1,2,8e-06,udp,-,INT,2,0,1762,0,125000.0003,...,1,2,0,0,0,1,2,0,Normal,0
2,3,5e-06,udp,-,INT,2,0,1068,0,200000.0051,...,1,3,0,0,0,1,3,0,Normal,0
3,4,6e-06,udp,-,INT,2,0,900,0,166666.6608,...,1,3,0,0,0,2,3,0,Normal,0
4,5,1e-05,udp,-,INT,2,0,2126,0,100000.0025,...,1,3,0,0,0,2,3,0,Normal,0


In [5]:
# Keep only the raw inputs the pipeline steps below consume: four numeric features
# plus 'proto', which is target-encoded and then dropped before scaling.
columns_to_include = ['dpkts', 'sttl', 'smean', 'ct_srv_src', 'proto']
test_df = test_df[columns_to_include]
test_df

Unnamed: 0,dpkts,sttl,smean,ct_srv_src,proto
0,0,254,248,2,udp
1,0,254,881,2,udp
2,0,254,534,3,udp
3,0,254,450,3,udp
4,0,254,1063,3,udp
...,...,...,...,...,...
82327,0,254,52,1,udp
82328,8,254,903,1,tcp
82329,0,0,46,1,arp
82330,0,0,46,1,arp


## Creating Classes for Pipeline

### 1. Proto Smoothing

In [6]:
class ProtoEncoding(BaseEstimator, TransformerMixin):
    """Add a ``proto_target_encoded`` column built from pre-computed encoding statistics.

    Nothing is learned from the data passed to ``fit``; the encoding is read from a
    pickle file instead.

    Parameters
    ----------
    stats_file : str, default 'encoding_stats.pkl'
        Path to a pickled mapping whose ``'proto'`` entry unpacks into
        ``(proto_stats, proto_overall_mean)``.
    """

    def __init__(self, stats_file='encoding_stats.pkl'):
        self.stats_file = stats_file
        # Both are filled in by fit(); presumably a protocol -> encoded-value mapping
        # and a scalar fallback — confirm against the code that wrote the stats file.
        self.proto_stats = None
        self.proto_overall_mean = None

    def fit(self, X, y=None):
        """Load the saved encoding statistics; ``X`` and ``y`` are not used.

        Returns
        -------
        self
        """
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # stats_file at a trusted artifact.
        with open(self.stats_file, 'rb') as f:
            encoding_stats = pickle.load(f)

        self.proto_stats, self.proto_overall_mean = encoding_stats['proto']
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``proto_target_encoded`` column added.

        Values of ``proto`` that are missing from the loaded statistics are filled
        with ``proto_overall_mean``.
        """
        X = X.copy()
        # Assign the filled Series back rather than calling fillna(inplace=True) on
        # X['proto_target_encoded']: that chained in-place call acts on a temporary
        # and leaves X unchanged under pandas Copy-on-Write, letting NaN through.
        X['proto_target_encoded'] = (
            X['proto'].map(self.proto_stats).fillna(self.proto_overall_mean)
        )
        return X

### 3. Column Dropper

In [10]:
class ColumnDropper(BaseEstimator, TransformerMixin):
    """Stateless step that removes the raw ``proto`` column from the frame."""

    def fit(self, X, y = None):
        """Nothing to learn; returns the transformer unchanged."""
        return self

    def transform(self, X):
        """Return a new frame without the ``proto`` column (``X`` is left as is)."""
        return X.drop(columns=['proto'])

### 5. Scaling

In [7]:
class FeatureScaler(BaseEstimator, TransformerMixin):
    """Scale the model's feature columns with a scaler unpickled from disk.

    The scaler is read from ``robust_scaler_model.pkl`` on every ``transform`` call;
    nothing is fitted here.
    """

    def fit(self, X, y = None):
        """No-op; the scaler is loaded from disk in ``transform``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the feature columns replaced by scaled values.

        The caller's frame is not modified.
        """
        # NOTE(review): pickle.load can execute arbitrary code; only load a trusted file.
        with open('robust_scaler_model.pkl', 'rb') as f:
            loaded_scaler = pickle.load(f)
        columns_to_scale = ['dpkts', 'sttl', 'smean', 'ct_srv_src', 'proto_target_encoded']
        # Work on a copy so the scaled values are not written into the caller's frame
        # (consistent with ProtoEncoding.transform, which also copies its input).
        X = X.copy()
        X[columns_to_scale] = loaded_scaler.transform(X[columns_to_scale])
        return X

### 7. Prediction

In [8]:
class Predictor(BaseEstimator, TransformerMixin):
    """Final step: returns predictions from a model loaded from disk.

    Written as a transformer so that ``fit_transform`` on the pipeline yields the
    predictions directly.
    """

    def fit(self, X, y = None):
        """No-op; the model is loaded from disk in ``transform``."""
        return self

    def transform(self, X):
        """Load ``pretrained_voting2_model.pkl`` and return its ``predict(X)`` output."""
        estimator = joblib.load('pretrained_voting2_model.pkl')
        return estimator.predict(X)

In [13]:
# Run the four steps by hand, in pipeline order, feeding each output to the next step.
manual_result = test_df
for stage in (ProtoEncoding(), ColumnDropper(), FeatureScaler(), Predictor()):
    manual_result = stage.fit_transform(manual_result)
manual_result

array([1, 1, 1, ..., 0, 0, 1], dtype=int64)

## Pipeline

In [14]:
# Encode -> drop raw column -> scale -> predict, wrapped in a single sklearn Pipeline.
pipeline = Pipeline(steps=[
    ('Proto Encoding', ProtoEncoding()),
    ('Drop Columns', ColumnDropper()),
    ('Scaling', FeatureScaler()),
    ('Prediction', Predictor()),
])

In [15]:
pipeline

## Final Outcome

In [16]:
# No training happens here: each step's fit() either returns self or only loads saved
# statistics, so fit_transform simply runs inference and the last step returns predictions.
pred = pipeline.fit_transform(test_df)

In [17]:
pred

array([1, 1, 1, ..., 0, 0, 1], dtype=int64)

### Dumping the Pipeline

In [24]:
# NOTE(review): FeatureScaler and Predictor read their .pkl artifacts inside transform(),
# so those files are presumably still required next to wherever this pipeline is loaded,
# along with importable definitions of the custom step classes — confirm before deploying.
joblib.dump(pipeline, 'pipeline_intrusion_det.pkl')

['pipeline_intrusion_det.pkl']

In [18]:
# Re-read the full CSV: test_df was narrowed to the feature columns earlier and no
# longer carries the 'label' column needed for evaluation.
df = pd.read_csv('UNSW_NB15_testing-set.csv')

In [19]:
y = df['label']
y

0        0
1        0
2        0
3        0
4        0
        ..
82327    0
82328    0
82329    0
82330    0
82331    0
Name: label, Length: 82332, dtype: int64

In [20]:
# Import only the metrics used in this notebook instead of a wildcard, so it is clear
# where each name comes from and the namespace is not polluted.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [21]:
accuracy_score(y, pred)

0.887832191614439

In [22]:
accuracy_score(y, pred)

0.887832191614439

In [23]:
# Class balance of the ground-truth labels, as proportions (normalize expects a bool).
df['label'].value_counts(normalize=True)

label
1    0.5506
0    0.4494
Name: proportion, dtype: float64

In [79]:
# Compute the headline binary-classification metrics, then print them in a fixed order.
accuracy = accuracy_score(y, pred)
precision = precision_score(y, pred)
recall = recall_score(y, pred)
f1 = f1_score(y, pred)

for metric_name, metric_value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1', f1),
):
    print(metric_name, metric_value)

Accuracy 0.887832191614439
Precision 0.867055784913872
Recall 0.9404835436336363
F1 0.9022782345534004
