In [4]:
import pandas as pd
import numpy as np

In [5]:
# KDD Cup 1999 downloads: urls[0] is the 10% data sample, urls[1] the column-name listing.
KDD_DATA_URL = "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz"
KDD_NAMES_URL = "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.names"
urls = [KDD_DATA_URL, KDD_NAMES_URL]

In [6]:
# Load and prepare the KDD Cup 99 (10% sample) data set.
# Adapted from https://github.com/lironber/GOAD/blob/master/data_loader.py

# Column metadata: skip the first line of the names file and split every
# remaining line on ':' into a feature name and a feature type.
df_colnames = pd.read_csv(urls[1], skiprows=1, sep=':', names=['f_names', 'f_types'])
# Append a row for the label column ('status'), typed as symbolic.
df_colnames.loc[df_colnames.shape[0]] = ['status', ' symbolic.']

# The data file has no header row; column names come from the metadata above.
df = pd.read_csv(urls[0], header=None, names=df_colnames['f_names'].values)

# Literal (non-regex) matching: the trailing '.' is part of the type string,
# not an "any character" wildcard.
df_symbolic = df_colnames[df_colnames['f_types'].str.contains('symbolic.', regex=False)]
# NOTE(review): df_continuous is not referenced by the other cells visible here.
df_continuous = df_colnames[df_colnames['f_types'].str.contains('continuous.', regex=False)]

# One-hot encode the symbolic features. Both the frame slice and the column
# list drop their last entry, which is the 'status' label appended above.
samples = pd.get_dummies(df.iloc[:, :-1], columns=df_symbolic['f_names'].iloc[:-1])

# Binary target: 1 where status is 'normal.', 0 for every other status value.
labels = np.where(df['status'] == 'normal.', 1, 0)


In [7]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn import metrics

# Min-max scale every encoded feature column.
# NOTE(review): the scaler is fitted on all rows, before the train/test split,
# and the modelling cells visible in this notebook are fed `samples`, not
# `df_scaled` - confirm whether the scaled matrix is still needed.
scaler = MinMaxScaler().fit(samples)
df_scaled = scaler.transform(samples)

In [8]:
# Show the dimensions (rows, columns) of the scaled feature matrix.
df_scaled.shape

(494021, 121)

In [9]:
# Display the one-hot encoded feature frame (pandas truncates the rendering).
samples

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,flag_S3,flag_SF,flag_SH,land_0,land_1,logged_in_0,logged_in_1,is_host_login_0,is_guest_login_0,is_guest_login_1
0,0,181,5450,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
1,0,239,486,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
2,0,235,1337,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
3,0,219,1337,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
4,0,217,2032,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494016,0,310,1881,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
494017,0,282,2286,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
494018,0,203,1200,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
494019,0,291,1200,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0


In [10]:
# Share of rows with a non-zero label, i.e. rows whose status is 'normal.'.
np.count_nonzero(labels) / len(labels)

0.19691065764410826

In [11]:
# Requires the catboost package; if it is missing, run: %pip install catboost

In [12]:
from catboost import CatBoostClassifier, Pool

In [13]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    samples,
    labels,
    random_state=42,
    test_size=0.33,
)

In [14]:
# CatBoost classifier settings, gathered in one mapping so they are easy to adjust.
catboost_params = {
    "iterations": 100,
    "depth": 5,
    "learning_rate": 0.01,
    "loss_function": 'Logloss',
    "verbose": True,
}
model = CatBoostClassifier(**catboost_params)

In [15]:
# Train the classifier on the training split. As the cell's last expression,
# the object returned by fit() is also displayed below the training log.
model.fit(X_train, y_train)

0:	learn: 0.6531781	total: 87.5ms	remaining: 8.66s
1:	learn: 0.6161391	total: 120ms	remaining: 5.86s
2:	learn: 0.5801563	total: 152ms	remaining: 4.93s
3:	learn: 0.5477762	total: 183ms	remaining: 4.4s
4:	learn: 0.5136813	total: 214ms	remaining: 4.06s
5:	learn: 0.4845641	total: 245ms	remaining: 3.84s
6:	learn: 0.4589135	total: 274ms	remaining: 3.65s
7:	learn: 0.4306526	total: 310ms	remaining: 3.57s
8:	learn: 0.4064459	total: 345ms	remaining: 3.48s
9:	learn: 0.3843380	total: 375ms	remaining: 3.38s
10:	learn: 0.3620018	total: 408ms	remaining: 3.3s
11:	learn: 0.3419765	total: 441ms	remaining: 3.24s
12:	learn: 0.3231790	total: 474ms	remaining: 3.17s
13:	learn: 0.3103468	total: 505ms	remaining: 3.1s
14:	learn: 0.2982175	total: 540ms	remaining: 3.06s
15:	learn: 0.2813186	total: 573ms	remaining: 3.01s
16:	learn: 0.2639556	total: 605ms	remaining: 2.96s
17:	learn: 0.2542112	total: 636ms	remaining: 2.9s
18:	learn: 0.2393039	total: 669ms	remaining: 2.85s
19:	learn: 0.2268173	total: 695ms	remaining:

<catboost.core.CatBoostClassifier at 0x7f97200ebeb0>

In [16]:
# Score the held-out split: hard class labels plus per-class probabilities.
preds_class = model.predict(X_test)
preds_proba = model.predict_proba(X_test)
for _caption, _values in (("class = ", preds_class), ("proba = ", preds_proba)):
    print(_caption, _values)

class =  [0 0 0 ... 1 0 0]
proba =  [[0.99304808 0.00695192]
 [0.99304808 0.00695192]
 [0.99304808 0.00695192]
 ...
 [0.02974465 0.97025535]
 [0.99817744 0.00182256]
 [0.99304808 0.00695192]]


In [17]:
from sklearn.metrics import confusion_matrix

# Evaluate the hard class predictions against the held-out labels.
y_pred = preds_class
print(metrics.classification_report(y_test,y_pred ))
print('Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))

# Build the confusion matrix once and reuse it for both the printout and the
# per-cell counts. Pinning labels=[0, 1] guarantees a 2x2 matrix, so the
# four-way unpack below cannot fail when a class is absent from the data.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print(cm)
# Row-major order of the 2x2 matrix: tn, fp, fn, tp, with label 1 as the positive class.
tn, fp, fn, tp = cm.ravel()

print('\nTrue Positives(TP) = ', tp)
print('\nTrue Negatives(TN) = ', tn)
print('\nFalse Positives(FP) = ', fp)
print('\nFalse Negatives(FN) = ', fn)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    130860
           1       0.98      1.00      0.99     32167

    accuracy                           1.00    163027
   macro avg       0.99      1.00      0.99    163027
weighted avg       1.00      1.00      1.00    163027

Model accuracy score: 0.9962
[[130267    593]
 [    33  32134]]

True Positives(TP) =  32134

True Negatives(TN) =  130267

False Positives(FP) =  593

False Negatives(FN) =  33


In [18]:
# Persist the model to 'catboost_model.cbm' (relative path) in the "cbm" format.
model.save_model('catboost_model.cbm',
           format="cbm",
           export_parameters=None,
           pool=None)

In [19]:
# Display the one-hot encoded feature frame again (pandas truncates the rendering).
samples

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,flag_S3,flag_SF,flag_SH,land_0,land_1,logged_in_0,logged_in_1,is_host_login_0,is_guest_login_0,is_guest_login_1
0,0,181,5450,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
1,0,239,486,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
2,0,235,1337,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
3,0,219,1337,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
4,0,217,2032,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494016,0,310,1881,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
494017,0,282,2286,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
494018,0,203,1200,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
494019,0,291,1200,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0


In [83]:
# Export the training features (X_train only, row index dropped) to Parquet via pyarrow.
X_train.to_parquet("dataset.prq", index=False, engine='pyarrow')

In [85]:
# Export the training features (X_train only, row index dropped) to CSV.
X_train.to_csv("dataset.csv", index=False)