# XGBoost

## Imports

In [1]:
%pip install xgboost
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
import numpy as np

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


## Dataset

In [None]:
dataset_path = './dataset/data/NF-UQ-NIDS-v2.csv'
sample_size = 1000000
data = pd.read_csv(dataset_path, nrows=sample_size)

# Separar características y objetivo
features = data.drop(['Attack'], axis=1)
target = data['Attack']

# Eliminar filas con valores nulos (NaN)
features = features.dropna()
target = target.loc[features.index]  # Asegurarse de que target tenga las mismas filas

# Eliminar valores demasiado grandes (outliers)
# Definimos el umbral como el percentil 99
for col in features.select_dtypes(include=[np.number]).columns:
    upper_limit = features[col].quantile(0.99)  # Umbral superior en el percentil 99
    lower_limit = features[col].quantile(0.01)  # Umbral inferior en el percentil 1
    features = features[features[col] <= upper_limit]
    features = features[features[col] >= lower_limit]
    target = target.loc[features.index]  # Asegurarse de que target tenga las mismas filas

# Codificación de variables no numéricas
non_numeric_cols = features.select_dtypes(include=['object']).columns
encoder = LabelEncoder()
for col in non_numeric_cols:
    features[col] = encoder.fit_transform(features[col])

# Codificación de la variable objetivo
target = encoder.fit_transform(target)

# Normalización de las características numéricas
numeric_cols = features.select_dtypes(include=[np.number]).columns
scaler = RobustScaler()
features[numeric_cols] = scaler.fit_transform(features[numeric_cols])

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)


## Model

In [3]:
model = xgb.XGBClassifier(eval_metric='logloss')

## Training

In [4]:
model.fit(X_train, y_train)

XGBoostError: [11:29:07] /workspace/src/data/../common/../data/gradient_index.h:94: Check failed: valid: Input data contains `inf` or a value too large, while `missing` is not set to `inf`
Stack trace:
  [bt] (0) /home/carles/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x22dcbc) [0x7c82e382dcbc]
  [bt] (1) /home/carles/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x52d198) [0x7c82e3b2d198]
  [bt] (2) /home/carles/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x52dcff) [0x7c82e3b2dcff]
  [bt] (3) /home/carles/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x51b98d) [0x7c82e3b1b98d]
  [bt] (4) /home/carles/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x51dbbc) [0x7c82e3b1dbbc]
  [bt] (5) /home/carles/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x4cd1ca) [0x7c82e3acd1ca]
  [bt] (6) /home/carles/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(XGQuantileDMatrixCreateFromCallback+0x18c) [0x7c82e374556c]
  [bt] (7) /lib/x86_64-linux-gnu/libffi.so.8(+0x7e2e) [0x7c8353d7ee2e]
  [bt] (8) /lib/x86_64-linux-gnu/libffi.so.8(+0x4493) [0x7c8353d7b493]



## Testing

In [44]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print(accuracy)

0.986
