<a target="_blank" href="https://colab.research.google.com/github/akramdhaifullah/big-data-analysis/blob/master/malware/pcap_ml_uts.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

In [2]:
df = pd.read_csv('tcp.csv', encoding='ISO-8859-1')

# EDA

In [3]:
df.head()

Unnamed: 0,Time,Source,Src Port,Destination,Dest. Port,Protocol,Length,Differentiated Services Field,Flags,TCP Options,Severity,Info
0,2023-01-30 01:00:07,10.10.1.198,59620,172.217.194.95,443,TLSv1.2,64468,0x00,0x018,0101080a705310265824ea67,Warning,Ignored Unknown Record
1,2023-01-30 01:00:07,10.10.1.198,59620,172.217.194.95,443,TLSv1.2,64468,0x00,0x018,0101080a705310275824ea67,Warning,Ignored Unknown Record
2,2023-01-30 01:00:07,10.10.1.198,59620,172.217.194.95,443,TLSv1.2,64468,0x00,0x018,0101080a705310275824ea67,Warning,Ignored Unknown Record
3,2023-01-30 01:00:07,10.10.1.198,59620,172.217.194.95,443,TLSv1.2,64468,0x00,0x018,0101080a705310275824ea67,Warning,Ignored Unknown Record
4,2023-01-30 01:00:07,10.10.1.198,59620,172.217.194.95,443,TLSv1.2,64468,0x00,0x018,0101080a705310285824ea67,Warning,Ignored Unknown Record


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253391 entries, 0 to 253390
Data columns (total 12 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   Time                           253391 non-null  object
 1   Source                         253391 non-null  object
 2   Src Port                       253391 non-null  int64 
 3   Destination                    253391 non-null  object
 4   Dest. Port                     253391 non-null  int64 
 5   Protocol                       253391 non-null  object
 6   Length                         253391 non-null  int64 
 7   Differentiated Services Field  253391 non-null  object
 8   Flags                          253391 non-null  object
 9   TCP Options                    253263 non-null  object
 10  Severity                       175879 non-null  object
 11  Info                           253391 non-null  object
dtypes: int64(3), object(9)
memory usage: 23.2+ M

In [5]:
df.describe()

Unnamed: 0,Src Port,Dest. Port,Length
count,253391.0,253391.0,253391.0
mean,2564.706683,57484.365664,1957.140336
std,10973.375308,11009.771967,10500.699863
min,22.0,22.0,56.0
25%,443.0,59620.0,96.0
50%,443.0,59620.0,96.0
75%,443.0,59620.0,96.0
max,59620.0,59620.0,64468.0


# Preprocessing

In [7]:
df.isnull().sum()

Time                                 0
Source                               0
Src Port                             0
Destination                          0
Dest. Port                           0
Protocol                             0
Length                               0
Differentiated Services Field        0
Flags                                0
TCP Options                        128
Severity                         77512
Info                                 0
dtype: int64

In [8]:
df.dropna(inplace=True)

In [9]:
df.Severity.unique()



In [10]:
df.Protocol.unique()

array(['TLSv1.2', 'TCP', 'HTTP', 'HTTP/JSON'], dtype=object)

In [11]:
df.drop(columns=['Time', 'Info'], inplace=True)

In [12]:
cols_to_encode = ['Source','Destination','Protocol','Differentiated Services Field','Flags','TCP Options','Severity']

encoder = LabelEncoder()

df_encoded = df

for feature in df:
    if feature in cols_to_encode:
        df_encoded[feature] = encoder.fit_transform(df[feature])

df_encoded

Unnamed: 0,Source,Src Port,Destination,Dest. Port,Protocol,Length,Differentiated Services Field,Flags,TCP Options,Severity
0,1,59620,4,443,3,64468,0,3,175620,2
1,1,59620,4,443,3,64468,0,3,175621,2
2,1,59620,4,443,3,64468,0,3,175621,2
3,1,59620,4,443,3,64468,0,3,175621,2
4,1,59620,4,443,3,64468,0,3,175622,2
...,...,...,...,...,...,...,...,...,...,...
253344,4,443,1,59620,2,96,1,1,175611,1
253345,4,443,1,59620,2,96,1,1,175612,1
253387,4,443,1,59620,2,80,1,1,175616,1
253388,4,443,1,59620,2,88,1,1,175617,1


In [13]:
scaler = StandardScaler()

df_scaled = scaler.fit_transform(df_encoded)

# feature selection

In [14]:
X = df.drop(['Severity'], axis=1)
y = df['Severity']

In [15]:
# Handle missing values
imputer = SimpleImputer(strategy='mean')  # Choose the appropriate strategy
X_imputed = imputer.fit_transform(X)

# SMOTE()

In [21]:
oversample = SMOTE()

X_sampled, y_sampled = oversample.fit_resample(X_imputed, y)

# splitting

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_sampled, y_sampled, test_size=0.2, random_state=42)

In [23]:
clf = HistGradientBoostingClassifier()

clf.fit(X_train, y_train)

In [24]:
y_pred = clf.predict(X_test)

In [25]:
acc_score = accuracy_score(y_test, y_pred)
clf_report = classification_report(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)

print(f"Accuracy:\n{acc_score}\n\n{clf_report}\nConfusion matrix:\n{conf_mat}")

Accuracy:
0.9913707173981147

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     35056
           1       0.97      1.00      0.99     35104
           2       1.00      0.97      0.99     35179

    accuracy                           0.99    105339
   macro avg       0.99      0.99      0.99    105339
weighted avg       0.99      0.99      0.99    105339

Confusion matrix:
[[35056     0     0]
 [    0 35104     0]
 [    0   909 34270]]
