In [None]:
import pandas as pd
import numpy as np
# import torch
from sklearn.model_selection import train_test_split
from sklearn.model_selection import PredefinedSplit
from sklearn import preprocessing

seed = 42

FILENAME = "train_dataset.csv"

# Load the training CSV and discard every row that contains a missing value.
df1 = pd.read_csv(
    FILENAME,
    encoding="ISO-8859-1",
    sep=",",
    low_memory=False,
).dropna()
# Debug helpers:
# print("EX1) #Rows: " + str(df1.shape[0]) + " #Columns: " + str(df1.shape[1]))
# print(df1.nunique())
# print(df1.shape)

# Features are every column except "type" and "label"; the target is "type".
X = df1.drop(columns=["type", "label"])
y = df1["type"]

# Rows whose src_bytes holds the string "0.0.0.0" are treated as malformed
# and removed from both the features and the target.
err_arr = X.loc[X["src_bytes"] == "0.0.0.0"]
malformed_rows = err_arr.index
X = X.drop(index=malformed_rows)
y = y.drop(index=malformed_rows)

# NOTE(review): the converted frame is not assigned back, so X itself keeps
# its original dtypes; this statement only evaluates the conversion and reads
# the resulting .dtypes (useful as a notebook display, a no-op otherwise).
X.astype(
    {
        "src_bytes": "int64",
        "ts": "datetime64[ms]",
        "dns_AA": "bool",
        "dns_RD": "bool",
        "dns_RA": "bool",
        "dns_rejected": "bool",
        "ssl_resumed": "bool",
        "ssl_established": "bool",
        "weird_notice": "bool",
    }
).dtypes

# From here on the pipeline works on plain NumPy arrays.
X = X.to_numpy()
y = y.to_numpy()

# Column names in the same order as the columns of X: X was built from df1
# without "type" and "label", so positions must be looked up in that reduced
# list. Looking them up in df1.columns is only correct when both dropped
# columns come after every inspected feature.
x_columns = df1.columns.drop(["type", "label"])

# For each listed feature, collect its distinct values and how often each
# occurs (exploratory; the report itself is left commented out).
for feature in [
    "dns_AA",
    "dns_RD",
    "dns_RA",
    "dns_rejected",
    "ssl_version",
    "ssl_cipher",
    "ssl_resumed",
    "ssl_established",
    "ssl_subject",
    "ssl_issuer",
    "http_trans_depth",
    "http_method",
    "http_uri",
    "http_referrer",
    "http_version",
    "http_request_body_len",
    "http_response_body_len",
    "http_status_code",
    "http_user_agent",
    "http_orig_mime_types",
    "http_resp_mime_types",
    "weird_name",
    "weird_addl",
    "weird_notice",
]:
    # print(f"Feature: {feature}")
    # First position of the feature among the columns of X.
    feature_index = np.where(x_columns == feature)[0][0]
    elements, counts = np.unique(X[:, feature_index], return_counts=True)

    # for element, count in zip(elements, counts):
    #     print(f"    Element: {element}, Count: {count}")


# Replace every feature column with ordinal codes.
# NOTE(review): both encoders are fitted on the complete dataset, i.e. before
# the train/validation/test split performed further down.
oe = preprocessing.OrdinalEncoder()
X = oe.fit_transform(X)

# Map the class names in y to integer class ids.
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)


# variances = np.var(X, axis=0)
# features = df1.columns
# for i, variance in enumerate(variances):
#     print(f"{features[i]} \t\t: Variance = {variance}")


from sklearn.feature_selection import VarianceThreshold

# Build a reduced copy of the feature matrix without low-variance columns
# (threshold 0.01); the full matrix X is kept alongside it.
selector = VarianceThreshold(threshold=0.01)
selector.fit(X)
X_mod = selector.transform(X)

# Report how many columns survived the filter.
print(f"Original shape: {X.shape}")
print(f"Modified shape: {X_mod.shape}")


# Stratified 80/10/10 split done on row indices, so that the same partition
# can be applied to y, to X and to the variance-filtered X_mod.
row_ids = np.arange(X_mod.shape[0])
train_idx, holdout_idx = train_test_split(
    row_ids,
    test_size=0.2,
    stratify=y,
    random_state=seed,
)
# The 20% hold-out is halved into test and validation indices.
test_idx, val_idx = train_test_split(
    holdout_idx,
    test_size=0.5,
    stratify=y[holdout_idx],
    random_state=seed,
)

# Targets.
y_train, y_test, y_val = y[train_idx], y[test_idx], y[val_idx]

# Variance-filtered features.
X_train_mod = X_mod[train_idx, :]
X_test_mod = X_mod[test_idx, :]
X_val_mod = X_mod[val_idx, :]

# Full feature set.
X_train = X[train_idx, :]
X_test = X[test_idx, :]
X_val = X[val_idx, :]

# Sanity check of the partition sizes.
print(X_train.shape, X_test.shape, X_val.shape)
print(X_train_mod.shape, X_test_mod.shape, X_val_mod.shape)
print(len(y_train), len(y_test), len(y_val))


from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import decomposition

## Scaling (threshold): statistics come from the training rows only and are
## then applied unchanged to the test and validation rows.
scaler = preprocessing.StandardScaler().fit(X_train_mod)
X_train_mod, X_test_mod, X_val_mod = [
    scaler.transform(part) for part in (X_train_mod, X_test_mod, X_val_mod)
]

## Scaling (no threshold): same procedure on the full feature set. The name
## `scaler` is rebound here, so afterwards it refers to this second scaler.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train, X_test, X_val = [
    scaler.transform(part) for part in (X_train, X_test, X_val)
]

print(X_train.shape, X_test.shape, X_val.shape)
print(X_train_mod.shape, X_test_mod.shape, X_val_mod.shape)


## PCA (threshold): fitted on the scaled training rows, then used to project
## all three partitions. No n_components is passed to PCA().
pca = decomposition.PCA().fit(X_train_mod)
X_train_mod_pca, X_test_mod_pca, X_val_mod_pca = [
    pca.transform(part) for part in (X_train_mod, X_test_mod, X_val_mod)
]

## PCA (no threshold): same procedure on the full feature set. The name
## `pca` is rebound here, so afterwards it refers to this second model.
pca = decomposition.PCA().fit(X_train)
X_train_pca, X_test_pca, X_val_pca = [
    pca.transform(part) for part in (X_train, X_test, X_val)
]

print(X_train_pca.shape, X_test_pca.shape, X_val_pca.shape)
print(X_train_mod_pca.shape, X_test_mod_pca.shape, X_val_mod_pca.shape)


## LDA (threshold): supervised projection fitted on the training rows and
## their labels, then applied to all three partitions.
# NOTE(review): n_components=9 is hard-coded; scikit-learn requires it to be
# at most min(n_classes - 1, n_features) - presumably the target has 10
# classes, confirm against the dataset.
lda = LinearDiscriminantAnalysis(n_components=9).fit(X_train_mod, y_train)
X_train_mod_lda, X_test_mod_lda, X_val_mod_lda = [
    lda.transform(part) for part in (X_train_mod, X_test_mod, X_val_mod)
]

## LDA (no threshold): same procedure on the full feature set. The name
## `lda` is rebound here, so afterwards it refers to this second model.
lda = LinearDiscriminantAnalysis(n_components=9).fit(X_train, y_train)
X_train_lda, X_test_lda, X_val_lda = [
    lda.transform(part) for part in (X_train, X_test, X_val)
]

# Additional names for the scaled full-feature partitions (same array
# objects, no copy is made).
X_train_std, X_test_std, X_val_std = X_train, X_test, X_val

print(X_train_lda.shape, X_test_lda.shape, X_val_lda.shape)
print(X_train_mod_lda.shape, X_test_mod_lda.shape, X_val_mod_lda.shape)

# Feed Forward networks