In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import category_encoders as ce
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


# --- Load data ---------------------------------------------------------------
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk, so no column ends up holding mixed types.
df = pd.read_csv("../EdgeIIoT-dataset.csv", low_memory=False)

print(f"Dimensioni del dataset: {df.shape}")

# Missing-value count per column, ascending.
print(df.isnull().sum().sort_values())

# --- Drop numeric columns that are almost entirely zero ----------------------
# A numeric column is discarded when more than `threshold` of its rows are 0.
threshold = 0.95

# Label columns must never be removed by the sparsity heuristic, whatever
# their share of zeros is.
label_cols = ['Attack_type', 'Attack_label']

numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Fraction of rows equal to zero, per numeric column.
frac_zeros = (df[numeric_cols] == 0).mean()

cols_to_drop = [
    col for col in frac_zeros[frac_zeros > threshold].index
    if col not in label_cols
]

print("Colonne numeriche con troppi zeri da droppare:")
print(cols_to_drop)
df = df.drop(columns=cols_to_drop)
print("\nShape dopo drop:", df.shape)

# Persist the surviving columns' dtypes for later inspection.
df.dtypes.to_csv("../tipi_di_dati_2.csv", header=["dtype"])

# --- Split remaining columns by kind -----------------------------------------
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

categorical_features = df.select_dtypes(include=['object', 'str']).columns.tolist()

print("Feature numeriche:", numeric_features)
print("Feature categoriche:", categorical_features)

# --- Encoding plan -----------------------------------------------------------
# Columns expanded into one indicator column per distinct value.
onehot_features = [
    'arp.dst.proto_ipv4',
    'arp.src.proto_ipv4',
    'http.request.method',
    'http.request.version',
    'mqtt.conack.flags',
    'mqtt.protoname',
]

# Columns replaced by an integer code per distinct (stringified) value.
labelencode_features = [
    'ip.src_host',
    'ip.dst_host',
    'tcp.srcport',
    'mqtt.topic',
    'frame.time',
    'dns.qry.name.len',
]

# Columns handed to category_encoders' BinaryEncoder.
binary_features = [
    'http.file_data',
    'http.request.uri.query',
    'http.referer',
    'http.request.full_uri',
    'tcp.options',
    'tcp.payload',
    'mqtt.msg',
]

target_feature = 'Attack_type'

# --- Apply the encoders ------------------------------------------------------
# Binary-encode the columns listed in `binary_features`.
be = ce.BinaryEncoder(cols=binary_features, return_df=True)
df = be.fit_transform(df)

# Integer-encode the target; `le_target` is kept so the original class names
# can be recovered later.
le_target = LabelEncoder()
df[target_feature] = le_target.fit_transform(df[target_feature])

# Integer-encode each listed column after casting its values to str.
for col in labelencode_features:
    le = LabelEncoder()
    values_as_text = df[col].astype(str)
    df[col] = le.fit_transform(values_as_text)

# One-hot encode the remaining categorical columns (no extra NaN indicator).
df_enc = pd.get_dummies(df, columns=onehot_features, dummy_na=False)

print("Shape finale:", df_enc.shape)

# --- Build the feature matrix and target -------------------------------------
y = df_enc['Attack_type']

# 'Attack_label' is the dataset's other label column; leaving it among the
# features would leak the answer into the model, so it is removed together
# with the target. errors="ignore" keeps this cell working if the column has
# already been dropped upstream.
X = df_enc.drop(columns=['Attack_type', 'Attack_label'], errors="ignore")

# Stratify on the target so train and test keep the same class proportions
# (the class distribution is printed below).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Relative frequency of each encoded class in the full dataset.
print(df_enc['Attack_type'].value_counts(normalize=True))

  df = pd.read_csv("../EdgeIIoT-dataset.csv")


Dimensioni del dataset: (2219201, 63)
frame.time            0
ip.src_host           0
ip.dst_host           0
arp.dst.proto_ipv4    0
arp.opcode            0
                     ..
mbtcp.len             0
mbtcp.trans_id        0
mbtcp.unit_id         0
Attack_label          0
Attack_type           0
Length: 63, dtype: int64
Colonne numeriche con troppi zeri da droppare:
['arp.opcode', 'arp.hw.size', 'icmp.transmit_timestamp', 'icmp.unused', 'http.content_length', 'http.response', 'http.tls_port', 'tcp.connection.synack', 'udp.port', 'udp.time_delta', 'dns.qry.name', 'dns.qry.qu', 'dns.qry.type', 'dns.retransmission', 'dns.retransmit_request', 'dns.retransmit_request_in', 'mqtt.conflag.cleansess', 'mqtt.conflags', 'mqtt.msg_decoded_as', 'mqtt.proto_len', 'mqtt.topic_len', 'mqtt.ver', 'mbtcp.len', 'mbtcp.trans_id', 'mbtcp.unit_id']

Shape dopo drop: (2219201, 38)
Feature numeriche: ['icmp.checksum', 'icmp.seq_le', 'tcp.ack', 'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin', 'tcp.conn

# AdaBoost

In [2]:

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier


from scipy.stats import uniform, randint
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, f1_score

# --- Class balancing ---------------------------------------------------------
# Each training sample is weighted by the inverse frequency of its class, so
# every class contributes the same total weight during fitting.
classes, counts = np.unique(y_train, return_counts=True)
class_weights = {cls: 1.0 / count for cls, count in zip(classes, counts)}
sample_weights = np.array([class_weights[label] for label in y_train])

# --- AdaBoost hyper-parameter grid -------------------------------------------
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0]
}

# The plain 'f1' scorer is binary-only and fails on a target with more than
# two classes; 'f1_macro' works for both cases and weighs every class equally.
ada_search = GridSearchCV(
    AdaBoostClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
)
ada_search.fit(X_train, y_train, sample_weight=sample_weights)

# best_score_ is the mean cross-validated value of the `scoring` metric.
print("best macro F1 (CV)", ada_search.best_score_)
print(ada_search.best_estimator_)

best_ada = ada_search.best_params_

# GridSearchCV (refit=True by default) has already refit the best
# configuration on the full training set with the same sample weights and
# random_state, so reuse it instead of training a second, unseeded model.
ada = ada_search.best_estimator_

# --- Evaluation on the held-out test set -------------------------------------
# Predict class labels.
y_pred_ada = ada.predict(X_test)

# LabelEncoder maps classes_[i] -> i, so the integer ids and the original
# class names line up by position.
class_ids = np.arange(len(le_target.classes_))
labels = [str(name) for name in le_target.classes_]

print(classification_report(
    y_test, y_pred_ada,
    labels=class_ids, target_names=labels, zero_division=0,
))

# Confusion matrix; passing `labels` fixes the row/column order so the tick
# labels stay aligned even if a class is absent from y_test or y_pred_ada.
cm_ada = confusion_matrix(y_test, y_pred_ada, labels=class_ids)

# Scale the figure with the number of classes so the annotations stay legible.
side = max(6, 0.6 * len(labels))
plt.figure(figsize=(side + 2, side))
sns.heatmap(cm_ada, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels,
            yticklabels=labels)

plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Matrice di Confusione - ada boost')
plt.show()


: 