In [54]:
import pandas as pd
import numpy as np
# import torch
from sklearn.model_selection import train_test_split
from sklearn.model_selection import PredefinedSplit
from sklearn import preprocessing, decomposition

seed = 42
FILENAME = "train_dataset.csv"

In [55]:
# 1. read_csv
# 2. dropna
# 3. drop label
# 4. replace src_bytes
# 5. casting 
# 6. encoding (cat/num)
# 7. extract y 
# 8. drop type
# 9. label encoding on y

import

In [56]:
# 1, 2 e 3
df = pd.read_csv(FILENAME, sep=",", low_memory=False)
df = df.dropna()
df = df.drop(columns=["label"])

# 4
df["src_bytes"] = df["src_bytes"].replace("0.0.0.0", np.nan).astype(float)
mean_src_bytes = df["src_bytes"].mean()
df["src_bytes"] = df["src_bytes"].fillna(mean_src_bytes)

# 5
df.astype({'src_bytes': 'int64', 'ts': 'datetime64[ms]', 'dns_AA': 'bool', 'dns_RD': 'bool', 'dns_RA': 'bool', 'dns_rejected': 'bool', 'ssl_resumed': 'bool', 'ssl_established': 'bool', 'weird_notice': 'bool'}).dtypes

# 6
y = df["type"]
df = df.drop(columns=["type"])

print(df.select_dtypes(include=['object']).shape)

# 7
oe = preprocessing.OrdinalEncoder()
df_oe = oe.fit_transform(df.select_dtypes(include=['object']))
df.loc[:, df.select_dtypes(include=['object']).columns] = df_oe
X = df.to_numpy()

# 8
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

(617002, 27)


splitting

In [57]:
indeces = np.arange(X.shape[0])
train_idx, val_idx = train_test_split(indeces, test_size=0.2, stratify=y, random_state=seed)
val_idx, test_idx = train_test_split(val_idx, test_size=0.5, stratify=y[val_idx], random_state=seed)

fold = np.zeros(X.shape[0])
fold[train_idx] = -1
fold[val_idx] = 0
fold[test_idx] = 1

ps = PredefinedSplit(fold)
ps.get_n_splits()

X_test = X[test_idx,:]
y_test = y[test_idx]
X_train = X[train_idx,:]
y_train = y[train_idx]
X_val = X[val_idx,:]
y_val = y[val_idx]

preprocess

In [58]:
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

X = scaler.transform(X)

pca = decomposition.PCA(n_components='mle')
pca.fit(X_train)

X = pca.transform(X)

In [59]:
print(pca.explained_variance_ratio_)

[1.44900607e-01 1.33969802e-01 1.33675083e-01 4.92563102e-02
 4.70841450e-02 4.61255229e-02 4.54983184e-02 4.23647696e-02
 3.90549396e-02 3.04763781e-02 2.33517088e-02 2.29889283e-02
 2.21947914e-02 2.17003153e-02 2.08727979e-02 2.00212553e-02
 1.97292747e-02 1.88016761e-02 1.76938953e-02 1.65682323e-02
 1.54616886e-02 1.51850213e-02 9.87827754e-03 7.51437108e-03
 6.96394655e-03 5.98062813e-03 5.02077679e-03 4.85400224e-03
 2.14769257e-03 2.05417698e-03 1.90607313e-03 1.43796611e-03
 1.20337627e-03 1.14801797e-03 9.00030610e-04 7.01092490e-04
 3.94912281e-04 3.89787902e-04 3.12938584e-04 1.14936749e-04
 1.01535087e-04 1.77915237e-16]


In [60]:
from sklearn.svm import SVC
from sklearn import svm
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, balanced_accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import pickle

---
svc

In [62]:
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10, 100, 1000],  
}  

grid = GridSearchCV(svm.LinearSVC(random_state=seed), param_grid, verbose=10, cv=ps, scoring='balanced_accuracy', n_jobs=-1)
grid.fit(X, y)
file = open("svm.save", "wb")
pickle.dump(grid.best_estimator_, file)
print("Best hyper: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)
file.close()

Fitting 2 folds for each of 10 candidates, totalling 20 fits
[CV 2/2; 4/10] START C=1, penalty=l2............................................
[CV 1/2; 3/10] START C=1, penalty=l1............................................
[CV 1/2; 1/10] START C=0.1, penalty=l1..........................................
[CV 2/2; 3/10] START C=1, penalty=l1............................................[CV 2/2; 2/10] START C=0.1, penalty=l2..........................................

[CV 2/2; 1/10] START C=0.1, penalty=l1..........................................[CV 1/2; 4/10] START C=1, penalty=l2............................................

[CV 1/2; 2/10] START C=0.1, penalty=l2..........................................




[CV 2/2; 1/10] END ...........C=0.1, penalty=l1;, score=0.649 total time=59.8min
[CV 1/2; 5/10] START C=10, penalty=l1...........................................




[CV 1/2; 1/10] END ...........C=0.1, penalty=l1;, score=0.650 total time=62.7min
[CV 2/2; 5/10] START C=10, penalty=l1...........................................




[CV 1/2; 3/10] END .............C=1, penalty=l1;, score=0.666 total time=72.4min
[CV 1/2; 6/10] START C=10, penalty=l2...........................................




[CV 2/2; 3/10] END .............C=1, penalty=l1;, score=0.661 total time=76.0min
[CV 2/2; 6/10] START C=10, penalty=l2...........................................




[CV 2/2; 2/10] END ...........C=0.1, penalty=l2;, score=0.653 total time=81.6min
[CV 1/2; 7/10] START C=100, penalty=l1..........................................




[CV 1/2; 2/10] END ..........C=0.1, penalty=l2;, score=0.653 total time=112.3min
[CV 2/2; 7/10] START C=100, penalty=l1..........................................




[CV 1/2; 5/10] END ............C=10, penalty=l1;, score=0.666 total time=99.2min
[CV 1/2; 8/10] START C=100, penalty=l2..........................................




[CV 2/2; 5/10] END ............C=10, penalty=l1;, score=0.661 total time=99.0min
[CV 2/2; 8/10] START C=100, penalty=l2..........................................




[CV 1/2; 7/10] END ..........C=100, penalty=l1;, score=0.666 total time=105.0min
[CV 1/2; 9/10] START C=1000, penalty=l1.........................................




[CV 2/2; 7/10] END ..........C=100, penalty=l1;, score=0.661 total time=114.5min
[CV 2/2; 9/10] START C=1000, penalty=l1.........................................




[CV 1/2; 4/10] END ............C=1, penalty=l2;, score=0.670 total time=285.0min
[CV 1/2; 10/10] START C=1000, penalty=l2........................................




[CV 1/2; 9/10] END .........C=1000, penalty=l1;, score=0.666 total time=107.7min
[CV 2/2; 10/10] START C=1000, penalty=l2........................................




[CV 2/2; 9/10] END .........C=1000, penalty=l1;, score=0.662 total time=102.9min




[CV 2/2; 4/10] END ............C=1, penalty=l2;, score=0.668 total time=343.1min




[CV 1/2; 6/10] END ...........C=10, penalty=l2;, score=0.683 total time=273.5min




[CV 2/2; 6/10] END ...........C=10, penalty=l2;, score=0.688 total time=278.4min




[CV 1/2; 8/10] END ..........C=100, penalty=l2;, score=0.698 total time=195.9min




[CV 2/2; 8/10] END ..........C=100, penalty=l2;, score=0.700 total time=195.4min




[CV 1/2; 10/10] END .........C=1000, penalty=l2;, score=0.723 total time=80.2min




[CV 2/2; 10/10] END .........C=1000, penalty=l2;, score=0.719 total time=72.9min
Best hyper:  LinearSVC(C=1000, random_state=42)
Best score:  0.7207091302697002




In [63]:
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [10000, 100000, 1000000],  
}  

grid = GridSearchCV(svm.LinearSVC(random_state=seed), param_grid, verbose=10, cv=ps, scoring='balanced_accuracy', n_jobs=-1)
grid.fit(X, y)
file = open("svm_.save", "wb")
pickle.dump(grid.best_estimator_, file)
print("Best hyper: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)
file.close()

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV 1/2; 2/6] START C=10000, penalty=l2.........................................[CV 2/2; 2/6] START C=10000, penalty=l2.........................................
[CV 1/2; 4/6] START C=100000, penalty=l2........................................
[CV 2/2; 1/6] START C=10000, penalty=l1.........................................
[CV 1/2; 3/6] START C=100000, penalty=l1........................................
[CV 1/2; 1/6] START C=10000, penalty=l1.........................................
[CV 2/2; 4/6] START C=100000, penalty=l2........................................
[CV 2/2; 3/6] START C=100000, penalty=l1........................................



KeyboardInterrupt: 