# Machine Learning Final Task:

# Imports:

In [1]:
import joblib
import warnings
import pandas            as pd
import numpy             as np
import matplotlib.pyplot as plt
import scipy.stats       as stats
import plotly.express    as px
import seaborn           as sns
from collections                          import Counter
from scipy.io                             import arff
from sklearn                              import metrics, linear_model
from sklearn.svm                          import SVC
from sklearn.feature_selection            import RFE
from sklearn.naive_bayes                  import GaussianNB
from sklearn.impute                       import SimpleImputer
from sklearn.neighbors                    import KNeighborsClassifier
from sklearn.linear_model                 import LogisticRegression , LinearRegression , Lasso
from sklearn.ensemble                     import RandomForestRegressor , RandomForestClassifier
from sklearn.tree                         import DecisionTreeClassifier , DecisionTreeRegressor
from sklearn.model_selection              import GridSearchCV , train_test_split , cross_val_score
from sklearn.preprocessing                import StandardScaler, LabelEncoder, RobustScaler , OneHotEncoder
from sklearn.metrics                      import accuracy_score , roc_auc_score , roc_curve , r2_score , f1_score
from sklearn.metrics                      import confusion_matrix , classification_report , mean_absolute_error , mean_squared_error
from imblearn.over_sampling               import SMOTE
from imblearn.under_sampling              import TomekLinks
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.compose                      import ColumnTransformer





# NOTE(review): this silences *every* warning for the rest of the notebook,
# including library deprecation and data-validation warnings. Consider
# narrowing the filter to the specific categories that are known to be noise.
warnings.filterwarnings('ignore')


# Reading and Cleaning first dataset

#### Reading the data:

In [2]:
# Load the labelled training set; the path is relative to the notebook's
# working directory. The bare `data` expression renders a truncated preview.
data = pd.read_csv('train_task.csv')
data

Unnamed: 0,ID,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,level,Class
0,7060,0,tcp,smtp,SF,1129,327,0,0,0,...,1.00,0.00,0.14,0.02,0.00,0.04,0.0,0.01,21,0
1,22036,0,tcp,http,SF,242,2147,0,0,0,...,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,21,0
2,19555,0,tcp,http,SF,299,3991,0,0,0,...,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,21,0
3,11393,0,udp,domain_u,SF,43,133,0,0,0,...,1.00,0.01,0.00,0.00,0.00,0.00,0.0,0.00,21,0
4,22783,0,tcp,telnet,S0,0,0,0,0,0,...,0.14,0.19,0.05,0.00,0.10,0.67,0.0,0.00,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14676,8562,0,tcp,smtp,SF,3873,337,0,0,0,...,0.83,0.03,0.01,0.01,0.00,0.00,0.0,0.00,21,0
14677,11680,0,tcp,smtp,SF,767,335,0,0,0,...,0.50,0.03,0.00,0.00,0.00,0.00,0.0,0.00,21,0
14678,1091,0,tcp,http,SF,162,19994,0,0,0,...,1.00,0.00,0.07,0.04,0.07,0.00,0.0,0.00,21,0
14679,2992,0,udp,domain_u,SF,44,80,0,0,0,...,1.00,0.01,0.00,0.00,0.00,0.00,0.0,0.00,20,0


In [3]:
# (rows, columns) of the raw training frame.
data.shape

(14681, 44)

In [4]:
# Per-column dtype and non-null count: shows which columns are still strings
# (dtype `object`) and whether any values are missing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14681 entries, 0 to 14680
Data columns (total 44 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   ID                           14681 non-null  int64  
 1   duration                     14681 non-null  int64  
 2   protocol_type                14681 non-null  object 
 3   service                      14681 non-null  object 
 4   flag                         14681 non-null  object 
 5   src_bytes                    14681 non-null  int64  
 6   dst_bytes                    14681 non-null  int64  
 7   land                         14681 non-null  int64  
 8   wrong_fragment               14681 non-null  int64  
 9   urgent                       14681 non-null  int64  
 10  hot                          14681 non-null  int64  
 11  num_failed_logins            14681 non-null  int64  
 12  logged_in                    14681 non-null  int64  
 13  num_compromised 

In [5]:
# Full list of column names; `Class` is the target used later on.
data.columns

Index(['ID', 'duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'level', 'Class'],
      dtype='object')

## EDA

In [6]:
# Print the distinct values of every column to spot constant columns and
# the categorical levels that will need encoding.
for column, series in data.items():
    print(f"Unique values in {column} column: {series.unique()}")

Unique values in ID column: [ 7060 22036 19555 ...  1091  2992 22066]
Unique values in duration column: [    0  9093     2     8    14   871     1     5     4 20252  4852    28
    36    29  5410     3     9    31    15    32    10  8621    26 12039
  3487    12  5066    27   159  1051 40025   638  2580  9195   315  9015
     6    40 42260 11014    11  5831 12417  6531 31905  2532  7247  8477
    25  1228  1464   261 12160  2712   398 17105  6141  9002    69  4816
  5044 37847  5057  4263   590    18  2814   432   149 13535  8035  8337
    16  4576 26070    23  3995 17399 16885 22821   475  5925 10835 13918
    21  2620  1730  9505 26848   310  4309   542  6522  4115    58  5080
  3949 12041   107   129  7025   121   282 14608 16872   192    20 12044
  8655  4764  2349    22   142   151 16553   771 14380    60   815 40806
 13417     7  5849    13 16533 15122   756   905  1721 17944  1398  8217
 39910    46 13430 15435  1208 10629  2639  2775  8649  4610  5042  1985
  1200 26525  1530  

In [7]:
# Re-check the shape: nothing has been dropped or added up to this point.
data.shape

(14681, 44)

In [8]:
# Distinct protocol names; these are the keys `protocol_type_encoding` must cover.
data['protocol_type'].unique()

array(['tcp', 'udp', 'icmp'], dtype=object)

In [9]:
# Distinct service names; these are the keys `service_encoding` must cover.
data['service'].unique()

array(['smtp', 'http', 'domain_u', 'telnet', 'ftp_data', 'eco_i',
       'private', 'urp_i', 'other', 'red_i', 'ftp', 'auth', 'nnsp',
       'Z39_50', 'domain', 'kshell', 'netstat', 'whois', 'systat',
       'imap4', 'ecr_i', 'klogin', 'discard', 'daytime', 'ssh', 'IRC',
       'uucp_path', 'hostnames', 'iso_tsap', 'name', 'sql_net', 'ctf',
       'finger', 'exec', 'sunrpc', 'netbios_ns', 'efs', 'uucp', 'courier',
       'csnet_ns', 'pop_3', 'login', 'ldap', 'netbios_ssn', 'ntp_u',
       'http_443', 'supdup', 'link', 'X11', 'gopher', 'vmnet', 'mtp',
       'netbios_dgm', 'bgp', 'time', 'nntp', 'echo', 'printer', 'shell',
       'urh_i', 'remote_job', 'pm_dump', 'rje', 'pop_2', 'tim_i'],
      dtype=object)

In [10]:
# Distinct connection flags; these are the keys `flag_encoding` must cover.
data['flag'].unique()

array(['SF', 'S0', 'REJ', 'RSTO', 'S2', 'RSTR', 'S1', 'RSTOS0', 'SH',
       'S3', 'OTH'], dtype=object)

#### Encoding:

In [11]:
# Integer code for every service name seen in the training data. The codes are
# arbitrary labels (order of first appearance), not an ordinal scale.
service_encoding = {
    'smtp': 0, 'http': 1, 'domain_u': 2, 'telnet': 3, 'ftp_data': 4, 'eco_i': 5, 'private': 6, 'urp_i': 7, 'other': 8,
    'red_i': 9, 'ftp': 10, 'auth': 11, 'nnsp': 12, 'Z39_50': 13, 'domain': 14, 'kshell': 15, 'netstat': 16, 'whois': 17,
    'systat': 18, 'imap4': 19, 'ecr_i': 20, 'klogin': 21, 'discard': 22, 'daytime': 23, 'ssh': 24, 'IRC': 25,
    'uucp_path': 26, 'hostnames': 27, 'iso_tsap': 28, 'name': 29, 'sql_net': 30, 'ctf': 31, 'finger': 32, 'exec': 33,
    'sunrpc': 34, 'netbios_ns': 35, 'efs': 36, 'uucp': 37, 'courier': 38, 'csnet_ns': 39, 'pop_3': 40, 'login': 41,
    'ldap': 42, 'netbios_ssn': 43, 'ntp_u': 44, 'http_443': 45, 'supdup': 46, 'link': 47, 'X11': 48, 'gopher': 49,
    'vmnet': 50, 'mtp': 51, 'netbios_dgm': 52, 'bgp': 53, 'time': 54, 'nntp': 55, 'echo': 56, 'printer': 57, 'shell': 58,
    'urh_i': 59, 'remote_job': 60, 'pm_dump': 61, 'rje': 62, 'pop_2': 63, 'tim_i': 64
}

# `Series.map` turns every value that is not a key of the dict into NaN without
# complaint. That also happens when this cell is run a second time (the column
# already holds integers), which would silently wipe the feature. Validate on a
# temporary Series and only overwrite the column when every row was mapped.
encoded_service = data['service'].map(service_encoding)
unmapped_service = data.loc[encoded_service.isna(), 'service'].unique()
if len(unmapped_service) > 0:
    raise ValueError(
        f"Cannot encode 'service': values {list(unmapped_service)} are not keys of "
        "service_encoding (unknown category, or this cell was already run)."
    )

data['service'] = encoded_service

In [12]:
# Integer code for every connection flag; the codes are arbitrary labels.
flag_encoding = {'S3' : 0 , 'SF' : 1, 'S1' : 2, 'SH' : 3, 'RSTR' : 4, 'RSTO' : 5, 'S2' : 6, 'REJ' : 7, 'RSTOS0' : 8, 'S0' : 9, 'OTH' : 10}

# `Series.map` yields NaN for any value missing from the dict (including the
# integers left behind by a previous run of this cell), so validate before
# overwriting the column.
encoded_flag = data['flag'].map(flag_encoding)
unmapped_flag = data.loc[encoded_flag.isna(), 'flag'].unique()
if len(unmapped_flag) > 0:
    raise ValueError(
        f"Cannot encode 'flag': values {list(unmapped_flag)} are not keys of "
        "flag_encoding (unknown category, or this cell was already run)."
    )

data['flag'] = encoded_flag

In [13]:
# Integer code for every protocol; the codes are arbitrary labels.
protocol_type_encoding = {'icmp' : 0 , 'udp' : 1, 'tcp' : 2 }

# `Series.map` yields NaN for any value missing from the dict (including the
# integers left behind by a previous run of this cell), so validate before
# overwriting the column.
encoded_protocol_type = data['protocol_type'].map(protocol_type_encoding)
unmapped_protocol_type = data.loc[encoded_protocol_type.isna(), 'protocol_type'].unique()
if len(unmapped_protocol_type) > 0:
    raise ValueError(
        f"Cannot encode 'protocol_type': values {list(unmapped_protocol_type)} are not keys of "
        "protocol_type_encoding (unknown category, or this cell was already run)."
    )

data['protocol_type'] = encoded_protocol_type

In [14]:
# One-way ANOVA of every feature against the target: for each column, test
# whether its mean differs between the classes in `Class`.
categorical_variable = 'Class'

# Every column except the target is tested.
# NOTE(review): protocol_type, service and flag hold arbitrary integer codes at
# this point, so their F-statistics depend on how the codes were assigned; a
# chi-square test may be a better fit for those three columns - confirm.
numerical_variables = [col for col in data.columns if col != categorical_variable]

# Significance level for rejecting "all class means are equal".
alpha = 0.05

# The class labels and the row mask of each class are the same for every
# feature, so they are computed once instead of once per column.
groups = data[categorical_variable].unique()
group_masks = [data[categorical_variable] == group for group in groups]

for num_var in numerical_variables:
    data_groups = [data.loc[mask, num_var] for mask in group_masks]

    # Perform one-way ANOVA
    statistic, p_value = stats.f_oneway(*data_groups)

    # Print the results
    print(f"\nANOVA for {categorical_variable} and {num_var}:")
    print("ANOVA F-statistic:", statistic)
    print("P-value:", p_value)

    # Check for significance. A NaN p-value (e.g. a column that is constant
    # across all rows) compares False against alpha and would otherwise be
    # reported as "no significant difference", which is not what the test says.
    if np.isnan(p_value):
        print("ANOVA is undefined for this column (p-value is NaN, e.g. the column is constant).")
    elif p_value < alpha:
        print("The means are significantly different.")
    else:
        print("No significant difference observed between the means.")



ANOVA for Class and ID:
ANOVA F-statistic: 1.2315328342957128
P-value: 0.2671273246360095
No significant difference observed between the means.

ANOVA for Class and duration:
ANOVA F-statistic: 29.307751224213508
P-value: 6.271860385052889e-08
The means are significantly different.

ANOVA for Class and protocol_type:
ANOVA F-statistic: 30.61472777611233
P-value: 3.2008139551572846e-08
The means are significantly different.

ANOVA for Class and service:
ANOVA F-statistic: 5840.032549072899
P-value: 0.0
The means are significantly different.

ANOVA for Class and flag:
ANOVA F-statistic: 21744.895085640565
P-value: 0.0
The means are significantly different.

ANOVA for Class and src_bytes:
ANOVA F-statistic: 1.2798158648421794
P-value: 0.25795175656628166
No significant difference observed between the means.

ANOVA for Class and dst_bytes:
ANOVA F-statistic: 0.00047189349660461727
P-value: 0.982669124473487
No significant difference observed between the means.

ANOVA for Class and land:
A

In [15]:
# Remove the row identifier and the features excluded from modelling.
# The same list is dropped from the test frame later in the notebook.
data = data.drop(
    columns=[
        'ID',
        "is_host_login",
        "urgent",
        "num_outbound_cmds",
        "src_bytes",
        "dst_bytes",
        "land",
        "hot",
        "num_failed_logins",
        "num_shells",
        "srv_count",
    ]
)

#### Handling outliers:

In [16]:

# Show the 10th and 97th percentiles of `duration` for reference.
for quantile_level in (0.10, 0.97):
    print(data['duration'].quantile(quantile_level))

# Replace every duration above 31 with 0.
# NOTE(review): the cut-off 31 matches neither percentile printed above, and
# values are zeroed rather than capped at the threshold - confirm both are
# intended. Whatever rule is used here must also be applied to any data the
# model later predicts on.
data['duration'] = data['duration'].mask(data['duration'] > 31, 0)

0.0
133.20000000000255


#### Splitting data:

In [17]:
# Feature matrix: every remaining column except the target, as a NumPy array
# (column names are discarded, so column order is what identifies a feature).
x1 = data.drop(columns='Class').to_numpy()
# Target vector.
y1 = data['Class'].to_numpy()

In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
x1_train, x1_test, y1_train, y1_test = train_test_split(
    x1,
    y1,
    test_size=0.2,
    random_state=42,
)

#### Training and saving the model:

In [19]:
# Define the parameter grid to search
# Hyperparameter grid for the random forest: 3 * 4 * 3 * 3 = 108 combinations.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [20]:
# Base estimator; the seed keeps the forest reproducible between runs.
rf_classifier = RandomForestClassifier(random_state=42)

# Exhaustive search over `param_grid` with 3-fold cross-validation on the
# training split, ranked by macro-averaged F1 and run on all available cores.
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    cv=3,
    scoring='f1_macro',
    n_jobs=-1,
)
grid_search.fit(x1_train, y1_train)

# Winning hyperparameter combination.
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Estimator refitted on the whole training split with the winning combination.
best_rf_model = grid_search.best_estimator_

# Score the held-out split, which the search never saw.
y1_pred = best_rf_model.predict(x1_test)
classification_rep = classification_report(y1_test, y1_pred)
print("Classification Report:\n", classification_rep)


Best Hyperparameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1741
           1       1.00      1.00      1.00      1196

    accuracy                           1.00      2937
   macro avg       1.00      1.00      1.00      2937
weighted avg       1.00      1.00      1.00      2937



In [21]:
# Macro-averaged F1 on the held-out split, expressed as a percentage.
f1 = 100 * f1_score(y1_test, y1_pred, average='macro')
print("F1 Score:", f1)

F1 Score: 99.82371018218055


In [22]:
# Persist the tuned forest to the working directory so it can be reloaded for
# inference. Only the estimator is saved: the encoding maps and the `duration`
# rule applied above are not part of the file and must be re-applied by hand.
joblib.dump(best_rf_model, 'best_model2.pkl')

['best_model2.pkl']

# Reading and Cleaning the second dataset

#### Reading the data:

In [23]:
# Load the unlabelled test set (its preview shows no `Class` column); the path
# is relative to the notebook's working directory.
df = pd.read_csv("test_task.csv")
df

Unnamed: 0,ID,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,level
0,15831,0,tcp,http,REJ,0,0,0,0,0,...,236,1.00,0.00,0.09,0.01,0.0,0.0,1.0,0.07,21
1,12617,0,tcp,ftp_data,SF,151,0,0,0,0,...,37,0.33,0.07,0.33,0.05,0.0,0.0,0.0,0.00,20
2,16328,0,udp,domain_u,SF,44,139,0,0,0,...,255,1.00,0.00,0.00,0.00,0.0,0.0,0.0,0.00,21
3,11403,0,tcp,http,SF,251,4014,0,0,0,...,255,1.00,0.00,0.00,0.01,0.0,0.0,0.0,0.00,21
4,11891,0,tcp,smtp,SF,1317,367,0,0,0,...,166,0.80,0.40,0.20,0.02,0.0,0.0,0.0,0.00,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5746,7222,0,tcp,private,S0,0,0,0,0,0,...,22,0.09,0.05,0.00,0.00,1.0,1.0,0.0,0.00,20
5747,9085,0,tcp,smtp,SF,2315,328,0,0,0,...,121,0.43,0.02,0.01,0.02,0.0,0.0,0.0,0.00,21
5748,22575,0,tcp,http,S0,0,0,0,0,0,...,25,0.10,0.05,0.00,0.00,1.0,1.0,0.0,0.00,19
5749,12005,0,tcp,http,SF,191,24802,0,0,0,...,255,1.00,0.00,0.00,0.00,0.0,0.0,0.0,0.00,21


In [24]:
# Drop exactly the columns that were removed from the training frame so the
# test frame keeps the same features in the same order.
df = df.drop(
    columns=[
        'ID',
        "is_host_login",
        "urgent",
        "num_outbound_cmds",
        "src_bytes",
        "dst_bytes",
        "land",
        "hot",
        "num_failed_logins",
        "num_shells",
        "srv_count",
    ]
)

In [25]:
# Check for missing values and see which test columns are still strings
# (dtype `object`) and therefore need encoding below.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5751 entries, 0 to 5750
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   duration                     5751 non-null   int64  
 1   protocol_type                5751 non-null   object 
 2   service                      5751 non-null   object 
 3   flag                         5751 non-null   object 
 4   wrong_fragment               5751 non-null   int64  
 5   logged_in                    5751 non-null   int64  
 6   num_compromised              5751 non-null   int64  
 7   root_shell                   5751 non-null   int64  
 8   su_attempted                 5751 non-null   int64  
 9   num_root                     5751 non-null   int64  
 10  num_file_creations           5751 non-null   int64  
 11  num_access_files             5751 non-null   int64  
 12  is_guest_login               5751 non-null   int64  
 13  count             

In [26]:
# Columns of the *training* frame (`data`, not `df`) after feature removal,
# shown as a reference for the layout the test frame has to match.
data.columns

Index(['duration', 'protocol_type', 'service', 'flag', 'wrong_fragment',
       'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
       'num_root', 'num_file_creations', 'num_access_files', 'is_guest_login',
       'count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
       'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
       'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
       'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
       'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
       'dst_host_serror_rate', 'dst_host_srv_serror_rate',
       'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'level', 'Class'],
      dtype='object')

In [27]:
# Protocols present in the test set; each must be a key of `protocol_type_encoding`.
df['protocol_type'].unique()

array(['tcp', 'udp', 'icmp'], dtype=object)

In [28]:
# Services present in the test set; each must be a key of `service_encoding`.
df['service'].unique()

array(['http', 'ftp_data', 'domain_u', 'smtp', 'pop_3', 'other', 'telnet',
       'eco_i', 'private', 'finger', 'echo', 'courier', 'ecr_i', 'Z39_50',
       'iso_tsap', 'netbios_ns', 'vmnet', 'kshell', 'ntp_u', 'rje',
       'http_443', 'ldap', 'daytime', 'csnet_ns', 'exec', 'systat',
       'imap4', 'mtp', 'ctf', 'discard', 'name', 'bgp', 'time', 'ssh',
       'urp_i', 'supdup', 'ftp', 'link', 'uucp', 'gopher', 'klogin',
       'uucp_path', 'pop_2', 'netbios_dgm', 'netstat', 'auth', 'nnsp',
       'sql_net', 'whois', 'sunrpc', 'netbios_ssn', 'X11', 'hostnames',
       'nntp', 'urh_i', 'efs', 'IRC', 'shell', 'printer', 'login',
       'remote_job', 'domain', 'red_i'], dtype=object)

In [29]:
# Flags present in the test set; each must be a key of `flag_encoding`.
df['flag'].unique()

array(['REJ', 'SF', 'S0', 'RSTR', 'RSTO', 'S3', 'SH', 'S1', 'S2',
       'RSTOS0', 'OTH'], dtype=object)

#### Encoding:

In [30]:
# Service codes for the test set. These must stay identical to the codes used
# when the model was trained, otherwise the same service would be presented to
# the model under a different number.
service_encoding = {
    'smtp': 0, 'http': 1, 'domain_u': 2, 'telnet': 3, 'ftp_data': 4, 'eco_i': 5, 'private': 6, 'urp_i': 7, 'other': 8,
    'red_i': 9, 'ftp': 10, 'auth': 11, 'nnsp': 12, 'Z39_50': 13, 'domain': 14, 'kshell': 15, 'netstat': 16, 'whois': 17,
    'systat': 18, 'imap4': 19, 'ecr_i': 20, 'klogin': 21, 'discard': 22, 'daytime': 23, 'ssh': 24, 'IRC': 25,
    'uucp_path': 26, 'hostnames': 27, 'iso_tsap': 28, 'name': 29, 'sql_net': 30, 'ctf': 31, 'finger': 32, 'exec': 33,
    'sunrpc': 34, 'netbios_ns': 35, 'efs': 36, 'uucp': 37, 'courier': 38, 'csnet_ns': 39, 'pop_3': 40, 'login': 41,
    'ldap': 42, 'netbios_ssn': 43, 'ntp_u': 44, 'http_443': 45, 'supdup': 46, 'link': 47, 'X11': 48, 'gopher': 49,
    'vmnet': 50, 'mtp': 51, 'netbios_dgm': 52, 'bgp': 53, 'time': 54, 'nntp': 55, 'echo': 56, 'printer': 57, 'shell': 58,
    'urh_i': 59, 'remote_job': 60, 'pm_dump': 61, 'rje': 62, 'pop_2': 63, 'tim_i': 64
}

# A service that is absent from the dict (or a column that was already encoded
# by an earlier run of this cell) would be mapped to NaN without any error and
# then handed to the model. Validate first and only then overwrite the column.
encoded_service = df['service'].map(service_encoding)
unmapped_service = df.loc[encoded_service.isna(), 'service'].unique()
if len(unmapped_service) > 0:
    raise ValueError(
        f"Cannot encode 'service': values {list(unmapped_service)} are not keys of "
        "service_encoding (category unseen in training, or this cell was already run)."
    )

df['service'] = encoded_service

In [31]:
# Flag codes for the test set; must stay identical to the codes used in training.
flag_encoding = {'S3' : 0 , 'SF' : 1, 'S1' : 2, 'SH' : 3, 'RSTR' : 4, 'RSTO' : 5, 'S2' : 6, 'REJ' : 7, 'RSTOS0' : 8, 'S0' : 9, 'OTH' : 10}

# Unknown flags (or an already-encoded column) would silently become NaN and
# be handed to the model, so validate before overwriting the column.
encoded_flag = df['flag'].map(flag_encoding)
unmapped_flag = df.loc[encoded_flag.isna(), 'flag'].unique()
if len(unmapped_flag) > 0:
    raise ValueError(
        f"Cannot encode 'flag': values {list(unmapped_flag)} are not keys of "
        "flag_encoding (category unseen in training, or this cell was already run)."
    )

df['flag'] = encoded_flag

In [32]:
# Protocol codes for the test set; must stay identical to the codes used in training.
protocol_type_encoding = {'icmp' : 0 , 'udp' : 1, 'tcp' : 2 }

# Unknown protocols (or an already-encoded column) would silently become NaN
# and be handed to the model, so validate before overwriting the column.
encoded_protocol_type = df['protocol_type'].map(protocol_type_encoding)
unmapped_protocol_type = df.loc[encoded_protocol_type.isna(), 'protocol_type'].unique()
if len(unmapped_protocol_type) > 0:
    raise ValueError(
        f"Cannot encode 'protocol_type': values {list(unmapped_protocol_type)} are not keys of "
        "protocol_type_encoding (category unseen in training, or this cell was already run)."
    )

df['protocol_type'] = encoded_protocol_type

In [33]:
# Sanity check: the column should now contain only the integer protocol codes.
df["protocol_type"].unique()

array([2, 1, 0], dtype=int64)

In [34]:
# The column count here has to equal the number of features the model was
# fitted on (the training columns minus `Class`).
df.shape

(5751, 32)

#### Loading Model and making predictions:

In [35]:
# Reload the estimator saved earlier in this notebook.
# NOTE(review): joblib files are pickle-based, so loading one can execute
# arbitrary code; only load model files that come from a trusted source.
loaded_model = joblib.load('best_model2.pkl')

In [36]:
# Making predictions:
# Work on a copy so `df` itself is left untouched and this cell can be re-run.
features = df.copy()

# The model was fitted on training rows in which every `duration` above 31 had
# been replaced by 0. The test frame never received that treatment, so apply
# the identical rule here; otherwise the model would see duration values it
# was never trained on.
features['duration'] = np.where(features['duration'] > 31, 0, features['duration'])

# Fail early if the feature count does not match what the model was fitted on.
if features.shape[1] != loaded_model.n_features_in_:
    raise ValueError(
        f"Model expects {loaded_model.n_features_in_} features but the test frame has {features.shape[1]}."
    )

# The model was fitted on a bare NumPy array (`.values`), so pass an array here
# as well; features are matched purely by column position.
predictions = loaded_model.predict(features.values)

In [37]:
# Wrap the predicted labels in a single-column frame; rows are in the same
# order as the test file.
predictions_df = pd.DataFrame(predictions, columns=['Prediction'])

In [38]:
# Write the predictions to a CSV file in the working directory, without the index.
# NOTE(review): the file has only a `Prediction` column, since `ID` was dropped
# from the test frame earlier, so rows can be matched to the test file by
# position only - confirm that is the expected submission format.
predictions_df.to_csv('predictions_output3.csv', index=False)