In [1]:
# import all classes from src 
from src.data_ingestion import DataIngestionFactory
from src.data_cleaning import DataCleanerFactory, DropEmptyFeatures, DropMissingValues, FillMissingValuesStrategy
from src.data_preprocessing import DataPreprocessingFactory, RemoveOutliers, CapOutliers, CapWithValues
from src.feature_engineering import FeatureEngineeringFactory, FrequencyEncoding, TargetEncoding, TimeSeriesFeatureEngineering, ConvertToFloat
from src.feature_selection import FeatureSelectionFactory, RobustScaling, LogTransformation, DimensionalityReduction, UnderSampling, OverSampling

Data Ingestion

In [2]:
obj = DataIngestionFactory()
file_path = "sampled_dataset.csv"
ing = obj.get_ingestor(file_path)
data = ing.ingest(file_path)

In [5]:
data.shape 

(323148, 87)

Data Cleaning 

In [7]:
cleaner = DataCleanerFactory(strategy=DropEmptyFeatures())
df_cleaned = cleaner.handle_missing_values(data)

In [9]:
# lets try drop missing values
print(f"Before dropping: {df_cleaned.isna().sum().sum()}")

# lets try drop missing values
# set the new strategy
cleaner.set_strategy(DropMissingValues())
df_missing_dropped = cleaner.handle_missing_values(data)

print(f"After dropping: {df_missing_dropped.isna().sum().sum()}")

Before dropping: 20966
[info] --number of rows before dropping missing values: 323148--
After dropping: 0


Data Pre-processing

In [5]:
x = 'Flow Duration'
columns_to_cap = ['Total Fwd Packet', 'Total Bwd packets', 'Total Length of Fwd Packet', 'Total Length of Bwd Packet', 'Flow Bytes/s', 'Flow IAT Mean', 'Fwd Header Length', 'Bwd Header Length', 'Packet Length Variance']

In [6]:
processor = DataPreprocessingFactory(strategy=CapOutliers(features=columns_to_cap))

# print 
print(f"Description before treatment: {df_missing_dropped[columns_to_cap].describe()}")
df_outliers_removed = processor.preprocess_features(df_missing_dropped)
print("-____--------------------______")
print(f"Description after treatment: {df_outliers_removed[columns_to_cap].describe()}")

Description before treatment:        Total Fwd Packet  Total Bwd packets  Total Length of Fwd Packet  \
count     318941.000000      318941.000000                3.189410e+05   
mean         130.483939         107.153655                6.490474e+04   
std          687.012398         589.720274                6.274008e+05   
min            0.000000           0.000000                0.000000e+00   
25%            5.000000           5.000000                4.740000e+02   
50%            6.000000           5.000000                5.200000e+02   
75%            7.000000           6.000000                1.042000e+03   
max         7292.000000        7437.000000                1.703909e+07   

       Total Length of Bwd Packet  Flow Bytes/s  Flow IAT Mean  \
count                3.189410e+05  3.189410e+05   3.189410e+05   
mean                 1.877836e+05  1.543363e+06   2.740586e+05   
std                  1.279044e+06  1.946000e+07   1.524879e+06   
min                  0.000000e+00  0.00

In [7]:
# cap with specified values 
cap_dict = {
    'Flow Packets/s': 200000,
    'Down/Up Ratio': 4,
    'Average Packet Size': 2500,
    'Fwd Segment Size Avg': 2500,
    'Bwd Segment Size Avg': 3000,
    'Fwd Bytes/Bulk Avg': 250000,
    'Subflow Fwd Bytes': 1700, 
    'Subflow Bwd Bytes': 2500,
    'Bwd Init Win Bytes': 12000,
    'Active Mean': 10000000,
    'Idle Mean': 40000000,
    'Total TCP Flow Time': 10000000000
}

processor.set_strategy(strategy=CapWithValues(cap_dict=cap_dict))
df_capped = processor.preprocess_features(df_outliers_removed)

Feature engineering

In [8]:
type(df_capped)

pandas.core.frame.DataFrame

In [9]:
feature_eng = FeatureEngineeringFactory(strategy=FrequencyEncoding(cat_cols=['Flow ID']))
df_freq_encoded = feature_eng.engineer_features(data=df_capped)

In [10]:
# target encoding 
feature_eng.set_strategy(TargetEncoding(features=['Src IP', 'Dst IP'], target='Label'))
df_target_encoded = feature_eng.engineer_features(data=df_freq_encoded)

In [11]:
# time series 
feature_eng.set_strategy(TimeSeriesFeatureEngineering(features=['Timestamp'], target='Label'))
df_time_series = feature_eng.engineer_features(data=df_target_encoded)

In [12]:
df_time_series.head(2)

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,...,Idle Max,Idle Min,Total TCP Flow Time,year,month,day,hour,minute,second,Label
0,1.6e-05,0.169219,58172,0.0,8080,6,77583,6,5,517.0,...,0.0,0.0,77583,2023,12,6,22,9,38,0
1,3e-06,0.169219,42998,0.0,8080,6,1832,6,5,544.0,...,0.0,0.0,1832,2023,12,6,20,41,47,0


In [13]:
# convert to float
feature_eng.set_strategy(strategy=ConvertToFloat())
df_float = feature_eng.engineer_features(data=df_time_series)

Feature Selection

In [14]:
log_scaler = FeatureSelectionFactory(strategy=LogTransformation())
df_log_scaled = log_scaler.select_feature(df_float)

In [15]:
# PCA 
log_scaler.set_strategy(DimensionalityReduction(n_components=9))
df_pca = log_scaler.select_feature(df_log_scaled)

In [16]:
# UnderSampling 
log_scaler.set_strategy(strategy=UnderSampling())
df_undersampled = log_scaler.select_feature(df_pca)

In [17]:
# # OverSampling 
# log_scaler.set_strategy(strategy=OverSampling())
# df_resampled = log_scaler.select_feature(df_undersampled) 

In [18]:
from src.data_split import RandomSplitterFactory
from src.data_split import (
    RandomSplitter
)

In [19]:
splitter = RandomSplitterFactory(strategy=RandomSplitter())
X_train, X_test, y_train, y_test = splitter.split(df_undersampled)

In [20]:
# print shapes 
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (19248, 9)
X_test shape: (4812, 9)
y_train shape: (19248,)
y_test shape: (4812,)


In [21]:
# model 
from src.model_building import (
    ModelBuildingFactory,
    GradientBoostingClassifierModel, 
    KNNClassifierModel, 
    RandomForestModel,
    XGBoostModel,
)

In [25]:
model = ModelBuildingFactory(RandomForestModel())

# train 
pred, m = model.train_model(X_train, y_train, X_test)

[info] --Training time: 6.445322036743164 seconds--


In [27]:
model.set_model(model=KNNClassifierModel())
pred_1, m1 = model.train_model(X_train, y_train, X_test)

[info] --Training time: 0.06400609016418457 seconds--


In [32]:
model.set_model(model=XGBoostModel())
pred_2, m2 = model.train_model(X_train, y_train, X_test)

Parameters: { "use_label_encoder" } are not used.



[info] --Training time: 12.469817161560059 seconds--


In [None]:
# Model  Evaluation 
from src.model_evaluation import ModelEvaluationFactory
from src.model_evaluation import (
    AccuracyScore,
    ClassificationReport,
    ConfusionMatrix,
)

In [40]:
evaluator = ModelEvaluationFactory(strategy=AccuracyScore(y_test, pred))
accuracy = evaluator.evaluate()
print(f"Accuracy: {accuracy}") 

evaluator.set_strategy(strategy=ClassificationReport(y_test, pred))
classification_report = evaluator.evaluate()
print(f"Classification Report: {classification_report}")

Accuracy: 0.9673732335827099
Classification Report:               precision    recall  f1-score   support

         0.0       0.99      0.97      0.98      2012
         1.0       0.93      0.99      0.96      1550
         2.0       0.99      0.96      0.97      1028
         3.0       0.00      0.00      0.00         5
         4.0       0.00      0.00      0.00         3
         5.0       0.00      0.00      0.00         1
         6.0       1.00      0.50      0.67         4
         7.0       0.00      0.00      0.00         0
         8.0       0.33      0.11      0.17        18
        10.0       0.00      0.00      0.00         1
        11.0       1.00      0.97      0.99       190

    accuracy                           0.97      4812
   macro avg       0.48      0.41      0.43      4812
weighted avg       0.97      0.97      0.97      4812



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
