## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import pickle
import joblib

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
import warnings
# Suppress every warning for the rest of the notebook. This also hides
# deprecation/future-behaviour notices raised by the libraries used below,
# so comment this out when upgrading dependencies.
warnings.filterwarnings('ignore')

## Data Reading

In [3]:
# Load the evaluation data; the path is relative to the notebook's working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like a
# row index that was written into the CSV — confirm the saved scaler/model expect it.
test_df = pd.read_csv('UNSW_NB15_testing-set.csv')
test_df.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
0,1,1.1e-05,udp,-,INT,2,0,496,0,90909.0902,...,1,1,1,2,0,0,0,1,2,0
1,2,8e-06,udp,-,INT,2,0,1762,0,125000.0003,...,1,1,1,2,0,0,0,1,2,0
2,3,5e-06,udp,-,INT,2,0,1068,0,200000.0051,...,1,1,1,3,0,0,0,1,3,0
3,4,6e-06,udp,-,INT,2,0,900,0,166666.6608,...,2,2,1,3,0,0,0,2,3,0
4,5,1e-05,udp,-,INT,2,0,2126,0,100000.0025,...,2,2,1,3,0,0,0,2,3,0


## Creating Classes for Pipeline

### 1. Proto Smoothing

In [19]:
class ProtoEncoding(BaseEstimator, TransformerMixin):
    """Add a ``proto_target_encoded`` column using precomputed encoding statistics.

    Nothing is learned from the data passed to ``fit``; the statistics are read
    from ``stats_file``, whose ``'proto'`` entry must unpack into a
    ``(mapping, overall_mean)`` pair.

    Parameters
    ----------
    stats_file : str, default='encoding_stats.pkl'
        Path of the pickle file holding the encoding statistics.
    """

    def __init__(self, stats_file='encoding_stats.pkl'):
        self.stats_file = stats_file
        self.proto_stats = None
        self.proto_overall_mean = None

    def fit(self, X, y=None):
        """Load the ``'proto'`` statistics from ``stats_file``.

        ``X`` and ``y`` are accepted for scikit-learn API compatibility and are
        not used. Returns ``self``.
        """
        # NOTE: pickle.load can execute arbitrary code; stats_file must be trusted.
        with open(self.stats_file, 'rb') as f:
            encoding_stats = pickle.load(f)

        self.proto_stats, self.proto_overall_mean = encoding_stats['proto']
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``proto_target_encoded`` column added.

        Each ``proto`` value is looked up in the loaded mapping; rows that end
        up without a mapped value are filled with the loaded overall mean.
        The input frame is not modified.
        """
        X = X.copy()
        # Map and fill in one expression, then assign the result. Calling
        # fillna(inplace=True) on X[col] is chained assignment: with pandas
        # Copy-on-Write it updates a temporary Series and leaves X unfilled.
        X['proto_target_encoded'] = (
            X['proto'].map(self.proto_stats).fillna(self.proto_overall_mean)
        )
        return X

### 2. Service Encoding

In [20]:
class ServiceEncoding(BaseEstimator, TransformerMixin):
    """Add a ``service_target_encoded`` column using precomputed encoding statistics.

    Nothing is learned from the data passed to ``fit``; the statistics are read
    from ``stats_file``, whose ``'service'`` entry must unpack into a
    ``(mapping, overall_mean)`` pair.

    Parameters
    ----------
    stats_file : str, default='encoding_stats.pkl'
        Path of the pickle file holding the encoding statistics.
    """

    def __init__(self, stats_file='encoding_stats.pkl'):
        self.stats_file = stats_file
        self.service_stats = None
        self.service_overall_mean = None

    def fit(self, X, y=None):
        """Load the ``'service'`` statistics from ``stats_file``.

        ``X`` and ``y`` are accepted for scikit-learn API compatibility and are
        not used. Returns ``self``.
        """
        # NOTE: pickle.load can execute arbitrary code; stats_file must be trusted.
        with open(self.stats_file, 'rb') as f:
            encoding_stats = pickle.load(f)

        self.service_stats, self.service_overall_mean = encoding_stats['service']
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``service_target_encoded`` column added.

        Each ``service`` value is looked up in the loaded mapping; rows that end
        up without a mapped value are filled with the loaded overall mean.
        The input frame is not modified.
        """
        X = X.copy()
        # Map and fill in one expression, then assign the result. Calling
        # fillna(inplace=True) on X[col] is chained assignment: with pandas
        # Copy-on-Write it updates a temporary Series and leaves X unfilled.
        X['service_target_encoded'] = (
            X['service'].map(self.service_stats).fillna(self.service_overall_mean)
        )
        return X

### 3. State Encoding

In [21]:
class StateEncoding(BaseEstimator, TransformerMixin):
    """Add a ``state_target_encoded`` column using precomputed encoding statistics.

    Nothing is learned from the data passed to ``fit``; the statistics are read
    from ``stats_file``, whose ``'state'`` entry must unpack into a
    ``(mapping, overall_mean)`` pair.

    Parameters
    ----------
    stats_file : str, default='encoding_stats.pkl'
        Path of the pickle file holding the encoding statistics.
    """

    def __init__(self, stats_file='encoding_stats.pkl'):
        self.stats_file = stats_file
        self.state_stats = None
        self.state_overall_mean = None

    def fit(self, X, y=None):
        """Load the ``'state'`` statistics from ``stats_file``.

        ``X`` and ``y`` are accepted for scikit-learn API compatibility and are
        not used. Returns ``self``.
        """
        # NOTE: pickle.load can execute arbitrary code; stats_file must be trusted.
        with open(self.stats_file, 'rb') as f:
            encoding_stats = pickle.load(f)

        self.state_stats, self.state_overall_mean = encoding_stats['state']
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``state_target_encoded`` column added.

        Each ``state`` value is looked up in the loaded mapping; rows that end
        up without a mapped value are filled with the loaded overall mean.
        The input frame is not modified.
        """
        X = X.copy()
        # Map and fill in one expression, then assign the result. Calling
        # fillna(inplace=True) on X[col] is chained assignment: with pandas
        # Copy-on-Write it updates a temporary Series and leaves X unfilled.
        X['state_target_encoded'] = (
            X['state'].map(self.state_stats).fillna(self.state_overall_mean)
        )
        return X

### 4. Dropping Unnecessary Columns

In [22]:
class ColumnDropper(BaseEstimator, TransformerMixin):
    """Remove the raw categorical columns and the ``id`` column from a frame."""

    def fit(self, X, y=None):
        """No-op; present only to satisfy the transformer interface."""
        return self

    def transform(self, X):
        """Return ``X`` without ``proto``, ``service``, ``state`` and ``id``.

        ``DataFrame.drop`` raises ``KeyError`` if any of these columns is absent.
        """
        return X.drop(columns=['proto', 'service', 'state', 'id'])

### 5. Scaling

In [23]:
class FeatureScaler(BaseEstimator, TransformerMixin):
    """Scale feature columns with the scaler stored in ``robust_scaler_model.pkl``.

    The columns ``is_ftp_login``, ``ct_ftp_cmd``, ``is_sm_ips_ports`` and
    ``label`` are passed through unscaled; every other column present in the
    input is handed to the loaded scaler.
    """

    def fit(self, X, y=None):
        """No-op; the scaler is already fitted and is loaded in ``transform``."""
        return self

    def transform(self, X):
        """Return a scaled copy of ``X``; the caller's frame is left untouched."""
        # Work on a copy: the column assignment below would otherwise overwrite
        # the caller's DataFrame, unlike the encoder steps, which copy first.
        X = X.copy()
        # NOTE: pickle.load can execute arbitrary code; the scaler file must be trusted.
        with open('robust_scaler_model.pkl', 'rb') as f:
            loaded_scaler = pickle.load(f)
        exclude_columns = ['is_ftp_login', 'ct_ftp_cmd', 'is_sm_ips_ports', 'label']
        columns_to_scale = [col for col in X.columns if col not in exclude_columns]
        X[columns_to_scale] = loaded_scaler.transform(X[columns_to_scale])
        return X

### 6. Prediction

In [24]:
class Prediction(BaseEstimator, TransformerMixin):
    """Final step: run the model saved in ``pretrained_model.pkl`` on the features."""

    def fit(self, X, y=None):
        """No-op; the model is loaded from disk each time ``transform`` runs."""
        return self

    def transform(self, X):
        """Load the saved model with joblib and return ``model.predict(X)``."""
        return joblib.load('pretrained_model.pkl').predict(X)

In [25]:
# Run the steps by hand, innermost call first, as a sanity check before
# wiring the same sequence into a Pipeline.
Prediction().fit_transform(
    FeatureScaler().fit_transform(
        ColumnDropper().fit_transform(
            StateEncoding().fit_transform(
                ServiceEncoding().fit_transform(
                    ProtoEncoding().fit_transform(test_df)
                )
            )
        )
    )
)

array([1, 1, 1, ..., 0, 0, 1])

## Pipeline

In [26]:
# Steps run in list order: the three encoders add their columns, the raw
# columns are dropped, the features are scaled, and the last step predicts.
pipeline = Pipeline(
    steps=[
        ('Proto Encoding', ProtoEncoding()),
        ('Service Encoding', ServiceEncoding()),
        ('State Encoding', StateEncoding()),
        ('Drop Columns', ColumnDropper()),
        ('Scaling', FeatureScaler()),
        ('Predict', Prediction()),
    ]
)

In [27]:
# Display the pipeline so the step order can be checked at a glance.
pipeline

## Final Outcome

In [28]:
# fit_transform does not train anything on test_df: each step's fit() either
# loads precomputed statistics from disk or is a no-op, and the last step's
# transform() returns the saved model's predictions.
pred = pipeline.fit_transform(test_df)

In [29]:
# Show the predictions returned by the pipeline's final step.
pred

array([1, 1, 1, ..., 0, 0, 1])

### Dumping the Pipeline

In [30]:
# Persist the assembled pipeline. The custom step classes are pickled by
# reference, so the code that loads this file needs the same class definitions
# available (presumably under the same module name — verify in the consumer).
# FeatureScaler and Prediction also reopen 'robust_scaler_model.pkl' and
# 'pretrained_model.pkl' on every transform, so those files must be present
# in the working directory wherever the pipeline is used.
joblib.dump(pipeline, 'pipeline_intrusion_det.pkl')

['pipeline_intrusion_det.pkl']