# XGBoost

## Imports

In [1]:
%pip install xgboost
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


## Dataset

### Partial dataset (first `sample_size` rows)

In [None]:
# Load only the first `sample_size` rows of the CSV. `nrows` reads from the top
# of the file, so this is a prefix of the file, not a random sample.
dataset_path = './dataset/data/NF-UQ-NIDS-v2.csv'
sample_size = 7500000
data = pd.read_csv(dataset_path, nrows=sample_size)

# 'Attack' is the prediction target; 'Dataset' is removed from the inputs.
# NOTE(review): any other label-like column in the CSV stays in `features` and
# would leak the target -- confirm against the dataset schema.
features = data.drop(['Attack', 'Dataset'], axis=1)
target = data['Attack']

# Drop rows that contain NaNs and keep the target aligned by index.
features = features.dropna()
target = target.loc[features.index]

# Trim outliers column by column: keep rows inside the [1%, 99%] quantile range.
# The quantiles are recomputed on the already-filtered frame, so the column
# order matters and must not be changed.
for col in features.select_dtypes(include=[np.number]).columns:
    upper_limit = features[col].quantile(0.99)
    lower_limit = features[col].quantile(0.01)
    features = features[features[col] <= upper_limit]
    features = features[features[col] >= lower_limit]
# Only the final row set matters, so the target is re-aligned once after the loop.
target = target.loc[features.index]

# Integer-encode the non-numeric feature columns. Every column gets its own
# encoder so that none of them can be mistaken for the target encoder below.
non_numeric_cols = features.select_dtypes(include=['object']).columns
for col in non_numeric_cols:
    features[col] = LabelEncoder().fit_transform(features[col])

# Dedicated encoder for the target; this is the object persisted to disk and
# the one needed to map predicted class ids back to the 'Attack' values.
encoder = LabelEncoder()
target = encoder.fit_transform(target)

# Save encoder
label_encoder_path = './label_encoder.joblib'
joblib.dump(encoder, label_encoder_path)

# Split BEFORE scaling so that the scaler statistics come from the training
# rows only; fitting on the full frame would leak test-set information.
# NOTE(review): the split is not stratified, so a rare class may end up in only
# one of the two splits.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Normalize numeric columns: fit on train, apply the same transform to test.
numeric_cols = features.select_dtypes(include=[np.number]).columns
scaler = RobustScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [3]:
# Show which encoded classes are present in each split. If the two lists
# differ, run the "Intersection for the labels" cell before training.
for caption, labels in (
    ("Etiquetas ajustadas en y_train:", y_train),
    ("Etiquetas ajustadas en y_test:", y_test),
):
    print(caption, np.unique(labels))

Etiquetas ajustadas en y_train: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
Etiquetas ajustadas en y_test: [ 0  1  2  3  4  6  7  9 10 11 12 13 14]


#### Intersection for the labels

In [5]:
# Keep only the classes that occur in BOTH splits, then re-encode them to a
# contiguous 0..k-1 range.
#
# NOTE(review): `label_encoder` created here is not persisted by this cell,
# while './label_encoder.joblib' holds the encoder fitted in the dataset cell.
# When a class is dropped here, predicted ids have to be decoded through both:
# encoder.inverse_transform(label_encoder.inverse_transform(pred)).

# Work on plain arrays so boolean masking behaves the same on every run.
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

# np.unique is vectorized; building Python sets straight from arrays with
# millions of elements is needlessly slow.
common_labels = set(np.unique(y_train).tolist()) & set(np.unique(y_test).tolist())

# Filter both splits down to the shared classes.
mask_train = np.isin(y_train, list(common_labels))
mask_test = np.isin(y_test, list(common_labels))

# Reset the feature indices so rows and label positions line up after filtering.
X_train = X_train[mask_train].reset_index(drop=True)
y_train = y_train[mask_train]

X_test = X_test[mask_test].reset_index(drop=True)
y_test = y_test[mask_test]

# Re-check unique classes
unique_train_labels = set(np.unique(y_train).tolist())
unique_test_labels = set(np.unique(y_test).tolist())

print("Etiquetas únicas en y_train:", sorted(unique_train_labels))
print("Etiquetas únicas en y_test:", sorted(unique_test_labels))

# Fit the remapping encoder on the training labels and reuse it for the test
# labels so both splits share the same id mapping.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Print the mapping and the resulting ids; .tolist() yields plain Python ints
# so the printed lists look the same on every NumPy version.
print("Clases originales:", label_encoder.classes_)
print("Etiquetas transformadas en y_train:", np.unique(y_train).tolist())
print("Etiquetas transformadas en y_test:", np.unique(y_test).tolist())


Etiquetas únicas en y_train: [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16]
Etiquetas únicas en y_test: [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16]
Clases originales: [ 0  1  2  3  4  5  6  8  9 10 11 12 13 14 15 16]
Etiquetas transformadas en y_train: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Etiquetas transformadas en y_test: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]


In [9]:
# Report (rows, columns) of the train and test feature frames after the cleanup
for split in (X_train, X_test):
    print(split.shape)

(4719951, 45)
(1179988, 45)


### Full dataset

In [None]:
# Load the complete CSV (no row limit).
dataset_path = './dataset/data/NF-UQ-NIDS-v2.csv'
data = pd.read_csv(dataset_path)

# 'Attack' is the prediction target. 'Dataset' is removed from the inputs as
# well, matching the partial-dataset cell of this notebook.
# NOTE(review): any other label-like column in the CSV stays in `features` and
# would leak the target -- confirm against the dataset schema.
features = data.drop(['Attack', 'Dataset'], axis=1)
target = data['Attack']

# Drop rows that contain NaNs and keep the target aligned by index.
features = features.dropna()
target = target.loc[features.index]

# Trim outliers column by column: keep rows inside the [1%, 99%] quantile range.
# The quantiles are recomputed on the already-filtered frame, so the column
# order matters and must not be changed.
for col in features.select_dtypes(include=[np.number]).columns:
    upper_limit = features[col].quantile(0.99)
    lower_limit = features[col].quantile(0.01)
    features = features[features[col] <= upper_limit]
    features = features[features[col] >= lower_limit]
# Only the final row set matters, so the target is re-aligned once after the loop.
target = target.loc[features.index]

# Integer-encode the non-numeric feature columns, one fresh encoder per column.
non_numeric_cols = features.select_dtypes(include=['object']).columns
for col in non_numeric_cols:
    features[col] = LabelEncoder().fit_transform(features[col])

# Dedicated encoder for the target so `encoder` always refers to the mapping
# between 'Attack' values and class ids.
encoder = LabelEncoder()
target = encoder.fit_transform(target)

# Split BEFORE scaling so that the scaler statistics come from the training
# rows only; fitting on the full frame would leak test-set information.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Normalize numeric columns: fit on train, apply the same transform to test.
numeric_cols = features.select_dtypes(include=[np.number]).columns
scaler = RobustScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

### Just train (without test split)

In [None]:
# Load the complete CSV (no row limit).
dataset_path = './dataset/data/NF-UQ-NIDS-v2.csv'
data = pd.read_csv(dataset_path)

# Separate the inputs from the target. 'Dataset' is removed from the inputs as
# well, matching the partial-dataset cell of this notebook.
# NOTE(review): any other label-like column in the CSV stays in `features` and
# would leak the target -- confirm against the dataset schema.
features = data.drop(['Attack', 'Dataset'], axis=1)
target = data['Attack']

# Drop rows that contain NaNs and keep the target aligned by index.
features = features.dropna()
target = target.loc[features.index]

# Trim outliers column by column: keep rows inside the [1%, 99%] quantile range.
# The quantiles are recomputed on the already-filtered frame, so the column
# order matters and must not be changed.
for col in features.select_dtypes(include=[np.number]).columns:
    upper_limit = features[col].quantile(0.99)
    lower_limit = features[col].quantile(0.01)
    features = features[features[col] <= upper_limit]
    features = features[features[col] >= lower_limit]
# Only the final row set matters, so the target is re-aligned once after the loop.
target = target.loc[features.index]

# Integer-encode the non-numeric feature columns, one fresh encoder per column.
non_numeric_cols = features.select_dtypes(include=['object']).columns
for col in non_numeric_cols:
    features[col] = LabelEncoder().fit_transform(features[col])

# Dedicated encoder for the target so `encoder` always refers to the mapping
# between 'Attack' values and class ids.
encoder = LabelEncoder()
target = encoder.fit_transform(target)

# Normalize the numeric columns. There is no hold-out split in this cell, so
# fitting the scaler on every row does not leak anything.
numeric_cols = features.select_dtypes(include=[np.number]).columns
scaler = RobustScaler()
features[numeric_cols] = scaler.fit_transform(features[numeric_cols])

# Use the whole dataset for training. This cell defines no X_test / y_test;
# the "Testing" cell needs them from one of the other dataset cells.
X_train = features
y_train = target

## Model

In [10]:
# Gradient-boosted tree classifier for the multi-class 'Attack' target.
# 'mlogloss' is the multi-class log-loss metric; the binary 'logloss' metric
# is rejected by XGBoost as soon as an eval_set is evaluated on a multi-class
# model.
model = xgb.XGBClassifier(
    eval_metric='mlogloss',
    n_estimators=200,  # number of boosting rounds
    n_jobs=8,          # parallel threads used by XGBoost
    verbosity=1,
)

## Training

In [11]:
# Train the classifier on X_train / y_train as left by the dataset cell(s) run above.
model.fit(X_train, y_train)

## Testing

In [12]:
# Predict class ids for the held-out features and report the fraction of
# predictions that match the true labels.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

0.9979787930046746


## Save

In [None]:
# Persist the trained classifier. The target directory is created first so a
# missing folder cannot make the dump fail after a long training run.
model_path = './models_xgboost/7c5m_200.joblib'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)

['./models_xgboost/1c5m_200.joblib']