In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [2]:
# Split a raw DataFrame into features (X) and target (y)
def preprocess_data(df):
    """Separate a labelled DataFrame into model inputs and the target column.

    All work happens on a private copy, so the caller's frame is left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns ``is_benign``, ``attack`` and
        ``category``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds every column except the three named
        above and ``y`` is the ``is_benign`` column.

    Raises
    ------
    KeyError
        If any of the three expected columns is absent.
    """
    non_feature_columns = ['is_benign', 'attack', 'category']

    # Detach from the caller's data before deriving anything from it
    working = df.copy()

    # Label columns are removed first so a missing column is reported by drop()
    features = working.drop(columns=non_feature_columns)
    target = working['is_benign']

    return features, target

In [3]:
def train_xgboost(X, y, test_size=0.3, random_state=42, cv=5, param_grid=None):
    """Grid-search, fit and evaluate a binary XGBoost classifier.

    The data is split into stratified train/test partitions, a
    ``GridSearchCV`` scored by ROC AUC is run on the training partition, and
    the best estimator found is evaluated on the held-out test partition.
    The best parameters, classification report, ROC AUC, confusion matrix and
    the ten most important features are printed to stdout.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        Feature matrix. Column names label the feature importances when
        available; otherwise ``f0``, ``f1``, ... are used.
    y : pandas.Series or numpy.ndarray
        Binary target encoded as 0 (negative) and 1 (positive).
    test_size : float, default 0.3
        Passed to ``train_test_split`` as the held-out fraction.
    random_state : int, default 42
        Seed used for both the split and the classifier.
    cv : int, default 5
        Cross-validation setting passed to ``GridSearchCV``.
    param_grid : dict, optional
        Hyper-parameter grid to search. When omitted, a built-in grid over
        ``max_depth``, ``min_child_weight``, ``gamma``, ``subsample`` and
        ``colsample_bytree`` is used.

    Returns
    -------
    best_model : xgboost.XGBClassifier
        ``best_estimator_`` of the grid search.
    feature_importance : pandas.DataFrame
        Columns ``feature`` and ``importance``, sorted by descending
        importance.

    Raises
    ------
    ValueError
        If the training partition does not contain both label 0 and label 1.
    """
    # Hold out a test partition, stratified on the target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # The class ratio below divides by the positive count, and a one-class
    # problem cannot be scored by ROC AUC anyway, so fail fast with a clear
    # message instead of a ZeroDivisionError (or an error after a full search).
    n_negative = int((y_train == 0).sum())
    n_positive = int((y_train == 1).sum())
    if n_negative == 0 or n_positive == 0:
        raise ValueError(
            "Training split must contain both classes 0 and 1 "
            "(found {} negatives and {} positives)".format(n_negative, n_positive)
        )

    # Base model; scale_pos_weight = negatives / positives re-weights the
    # positive class to compensate for class imbalance
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        scale_pos_weight=n_negative / n_positive,
        random_state=random_state,
        n_estimators=100,
        learning_rate=0.1
    )

    # Default search space, used when the caller does not supply one
    if param_grid is None:
        param_grid = {
            'max_depth': [3, 5, 7],
            'min_child_weight': [1, 3, 5],
            'gamma': [0, 0.1, 0.2],
            'subsample': [0.8, 0.9],
            'colsample_bytree': [0.8, 0.9]
        }

    # Exhaustive search over the grid with cross-validation
    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=cv,
        scoring='roc_auc',
        n_jobs=-1,
        verbose=1
    )
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    # Evaluate on the held-out partition: hard labels for the report and
    # confusion matrix, positive-class probabilities for ROC AUC
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]

    print("\nBest parameters:", grid_search.best_params_)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, digits=5))
    print("\nROC AUC Score: {:.5f}".format(roc_auc_score(y_test, y_pred_proba)))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Label importances with column names when X has them; fall back to
    # positional names so an array input does not fail after the search
    importances = best_model.feature_importances_
    if hasattr(X, 'columns'):
        feature_names = X.columns
    else:
        feature_names = ['f{}'.format(i) for i in range(len(importances))]

    feature_importance = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False)

    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10))

    return best_model, feature_importance

In [None]:
# Locations of the input dataset and the serialized model
TRAIN_CSV_PATH = '/kaggle/input/dataset/train_binary.csv'
MODEL_OUTPUT_PATH = 'xgboost_network_traffic_model.json'

# Read the labelled training data from disk
df = pd.read_csv(TRAIN_CSV_PATH)

# Separate the feature columns from the target column
X, y = preprocess_data(df)

# Tune, fit and evaluate the classifier, then persist the selected model
best_model, feature_importance = train_xgboost(X, y)
best_model.save_model(MODEL_OUTPUT_PATH)