In [6]:
import pandas as pd
from ydata_profiling import ProfileReport
import pdfkit

In [3]:
# First, install required packages if they're not already installed
import sys
import subprocess

def install_package(package):
    print(f"Installing {package}...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

required_packages = ['pandas', 'numpy', 'scikit-learn', 'xgboost', 'lightgbm', 'catboost', 'imbalanced-learn']

for package in required_packages:
    try:
        __import__(package)
        print(f"{package} is already installed.")
    except ImportError:
        install_package(package)

# Now import all required libraries
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.impute import SimpleImputer, KNNImputer
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.combine import SMOTETomek, SMOTEENN
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
RANDOM_STATE = 42

# ============= NEW DATA CLEANING AND PREPROCESSING FUNCTIONS =============

def load_and_inspect_data(file_path):
    """Load the raw dataset and print basic information"""
    df = pd.read_csv(file_path)
    print(f"Dataset loaded successfully. Shape: {df.shape}")
    print("\nDataset Info:")
    print(df.info())
    print("\nMissing values per column:")
    print(df.isnull().sum())
    print("\nData sample:")
    print(df.head())
    return df

def clean_data(df):
    """Perform basic data cleaning operations"""
    # Make a copy of the original dataframe
    df_cleaned = df.copy()
    
    # 1. Handle duplicates
    original_rows = df_cleaned.shape[0]
    df_cleaned = df_cleaned.drop_duplicates()
    print(f"Removed {original_rows - df_cleaned.shape[0]} duplicate rows")
    
    # 2. Convert data types if needed
    # Convert object columns that should be numeric
    for col in df_cleaned.columns:
        if df_cleaned[col].dtype == 'object':
            try:
                # Try to convert to numeric
                df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')
                print(f"Converted {col} to numeric")
            except:
                print(f"Keeping {col} as object type")
    
    # 3. Handle missing values
    # First, identify columns with missing values
    missing_cols = df_cleaned.columns[df_cleaned.isnull().any()].tolist()
    print(f"Columns with missing values: {missing_cols}")
    
    # For numeric columns, impute with median - FIXED to handle one column at a time
    numeric_cols = df_cleaned.select_dtypes(include=['float64', 'int64']).columns
    missing_numeric_cols = [col for col in missing_cols if col in numeric_cols]
    
    if missing_numeric_cols:
        print(f"Imputing missing values in numeric columns with median")
        for col in missing_numeric_cols:
            if df_cleaned[col].isnull().any():
                # Create a 2D array for the imputer by reshaping
                col_data = df_cleaned[col].values.reshape(-1, 1)
                imputer = SimpleImputer(strategy='median')
                df_cleaned[col] = imputer.fit_transform(col_data).ravel()
    
    # For categorical columns, impute with most frequent value - FIXED to handle one column at a time
    categorical_cols = df_cleaned.select_dtypes(include=['object']).columns
    missing_categorical_cols = [col for col in missing_cols if col in categorical_cols]
    
    if missing_categorical_cols:
        print(f"Imputing missing values in categorical columns with most frequent value")
        for col in missing_categorical_cols:
            if df_cleaned[col].isnull().any():
                # Create a 2D array for the imputer
                col_data = df_cleaned[col].values.reshape(-1, 1)
                cat_imputer = SimpleImputer(strategy='most_frequent')
                df_cleaned[col] = cat_imputer.fit_transform(col_data).ravel()
    
    return df_cleaned

def handle_outliers(df, method='winsorize'):
    """Handle outliers in numeric columns"""
    df_processed = df.copy()
    numeric_cols = df_processed.select_dtypes(include=['float64', 'int64']).columns
    
    if method == 'winsorize':
        for col in numeric_cols:
            # Skip target column if present
            if col.lower() in ['target', 'label', 'class', 'y']:
                continue
                
            # Calculate Q1, Q3 and IQR
            Q1 = df_processed[col].quantile(0.25)
            Q3 = df_processed[col].quantile(0.75)
            IQR = Q3 - Q1
            
            # Define bounds for winsorization
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            
            # Winsorize (clip) values outside the bounds
            df_processed[col] = df_processed[col].clip(lower=lower_bound, upper=upper_bound)
        
        print("Applied winsorization to handle outliers")
    
    elif method == 'remove':
        for col in numeric_cols:
            # Skip target column if present
            if col.lower() in ['target', 'label', 'class', 'y']:
                continue
                
            # Calculate Q1, Q3 and IQR
            Q1 = df_processed[col].quantile(0.25)
            Q3 = df_processed[col].quantile(0.75)
            IQR = Q3 - Q1
            
            # Define bounds
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            
            # Flag outliers
            outliers = (df_processed[col] < lower_bound) | (df_processed[col] > upper_bound)
            
            # Replace outliers with NaN
            df_processed.loc[outliers, col] = np.nan
            
        # Re-impute after outlier removal
        for col in numeric_cols:
            if df_processed[col].isnull().any():
                col_data = df_processed[col].values.reshape(-1, 1)
                imputer = SimpleImputer(strategy='median')
                df_processed[col] = imputer.fit_transform(col_data).ravel()
        
        print("Identified and handled outliers by replacing with NaN and then imputing")
    
    return df_processed
    
def encode_categorical_features(df):
    """Encode categorical features using one-hot encoding"""
    df_encoded = df.copy()
    
    # Identify categorical columns (excluding the target column)
    categorical_cols = df_encoded.select_dtypes(include=['object']).columns
    
    # Apply one-hot encoding to categorical columns
    if len(categorical_cols) > 0:
        print(f"One-hot encoding {len(categorical_cols)} categorical features")
        df_encoded = pd.get_dummies(df_encoded, columns=categorical_cols, drop_first=True)
    
    return df_encoded

def scale_features(df, scaler_type='robust'):
    """Scale numeric features using the specified scaler"""
    # Make a copy
    df_scaled = df.copy()
    
    # Identify numeric columns (excluding the target column)
    numeric_cols = df_scaled.select_dtypes(include=['float64', 'int64']).columns
    # Remove potential target columns
    numeric_cols = [col for col in numeric_cols if col.lower() not in ['target', 'label', 'class', 'y']]
    
    if len(numeric_cols) > 0:
        # Apply scaling on a column-by-column basis to avoid shape issues
        if scaler_type == 'standard':
            for col in numeric_cols:
                scaler = StandardScaler()
                df_scaled[col] = scaler.fit_transform(df_scaled[col].values.reshape(-1, 1)).ravel()
        elif scaler_type == 'robust':
            for col in numeric_cols:
                scaler = RobustScaler()
                df_scaled[col] = scaler.fit_transform(df_scaled[col].values.reshape(-1, 1)).ravel()
        elif scaler_type == 'minmax':
            for col in numeric_cols:
                scaler = MinMaxScaler()
                df_scaled[col] = scaler.fit_transform(df_scaled[col].values.reshape(-1, 1)).ravel()
        else:
            raise ValueError(f"Unknown scaler type: {scaler_type}")
        
        print(f"Applied {scaler_type} scaling to {len(numeric_cols)} numeric features")
    
    return df_scaled

# ============= EXISTING HELPER FUNCTIONS UPDATED =============

def log_transform(X):
    """Apply log transformation to numeric features"""
    numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns
    # Skip potential target columns
    numeric_cols = [col for col in numeric_cols if col.lower() not in ['target', 'label', 'class', 'y']]
    
    # Add a small constant to avoid log(0)
    X_transformed = X.copy()
    for col in numeric_cols:
        # Skip columns with negative values or zeros
        if (X[col] <= 0).any():
            # For columns with zero or negative values, use log1p(x + |min| + 1) if needed
            if (X[col] < 0).any():
                min_val = abs(X[col].min()) + 1
                X_transformed[col] = np.log1p(X[col] + min_val)
            else:
                # For columns with zeros but no negative values
                X_transformed[col] = np.log1p(X[col])
        else:
            # Standard log for strictly positive columns
            X_transformed[col] = np.log(X[col])
    
    return X_transformed

def evaluate_model(model, X_test, y_test, model_name):
    """Evaluate model and return metrics"""
    start_time = time.time()
    try:
        y_pred = model.predict(X_test)
        
        # For binary classification ensure we have probability estimates for AUC
        try:
            y_pred_proba = model.predict_proba(X_test)[:, 1]
            auc = roc_auc_score(y_test, y_pred_proba)
        except:
            auc = None
        
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        inference_time = time.time() - start_time
        
        results = {
            'Model': model_name,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1,
            'ROC AUC': auc,
            'Inference Time (s)': inference_time,
            'Error': None
        }
    except Exception as e:
        results = {
            'Model': model_name,
            'Accuracy': None,
            'Precision': None,
            'Recall': None,
            'F1 Score': None,
            'ROC AUC': None,
            'Inference Time (s)': None,
            'Error': str(e)
        }
        print(f"Error evaluating {model_name}: {e}")
    
    return results

# ============= MAIN EXECUTION CODE =============

# Load raw dataset
print("Loading raw dataset...")
raw_data_path = r"C:\Machine Learning\Phase 2\Data\phase2_students_before_cleaning.csv"
raw_df = load_and_inspect_data(raw_data_path)

# Apply data cleaning and preprocessing
print("\nCleaning data...")
df_cleaned = clean_data(raw_df)

print("\nHandling outliers...")
df_no_outliers = handle_outliers(df_cleaned, method='winsorize')

print("\nEncoding categorical features...")
df_encoded = encode_categorical_features(df_no_outliers)

print("\nIdentifying target column...")
# Assuming the target column is called 'target', 'label', 'class', or 'y'
# Let's try to identify it automatically
target_candidates = [col for col in df_encoded.columns if col.lower() in ['target', 'label', 'class', 'y']]

if target_candidates:
    target_column = target_candidates[0]
    print(f"Target column identified: {target_column}")
else:
    # If no standard name is found, ask the user to provide the target column name
    print("No standard target column name found in the dataset.")
    print("Available columns:", df_encoded.columns.tolist())
    print("Assuming the last column is the target variable")
    target_column = df_encoded.columns[-1]
    print(f"Using '{target_column}' as the target variable")

# Split features and target
X = df_encoded.drop(columns=[target_column])
y = df_encoded[target_column]

# Ensure the target is numeric
try:
    y = y.astype(int)
    print(f"Target variable converted to integer type")
except:
    print(f"Warning: Could not convert target to integer type. Using as-is.")

# Check if target is binary
unique_classes = y.nunique()
if unique_classes > 2:
    print(f"Target has {unique_classes} classes. Treating as a multi-class problem.")
else:
    print("Target is binary. Treating as a binary classification problem.")

# Split into train and test sets
print("\nSplitting data into train and test sets...")
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y if y.nunique() > 1 else None
    )
except ValueError as e:
    print(f"Warning: Could not stratify split due to: {e}")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE
    )

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

try:
    print(f"Class distribution in y_train: {np.bincount(y_train.astype(int))}")
    print(f"Class distribution in y_test: {np.bincount(y_test.astype(int))}")
except ValueError as e:
    print(f"Warning: Could not print class distribution: {e}")
    print(f"y_train unique values: {y_train.unique()}")
    print(f"y_test unique values: {y_test.unique()}")

# Scale features
print("\nScaling features...")
X_train = scale_features(X_train, scaler_type='robust')
X_test = scale_features(X_test, scaler_type='robust')

# Continue with the existing ML pipeline from the original code
# Define imbalance handling techniques
imbalance_techniques = {
    'Original': None,
    'SMOTE': SMOTE(random_state=RANDOM_STATE),
    'ADASYN': ADASYN(random_state=RANDOM_STATE),
    'RandomOverSampler': RandomOverSampler(random_state=RANDOM_STATE),
    'RandomUnderSampler': RandomUnderSampler(random_state=RANDOM_STATE),
    'TomekLinks': TomekLinks(),
    'NearMiss': NearMiss(),
    'SMOTETomek': SMOTETomek(random_state=RANDOM_STATE),
    'SMOTEENN': SMOTEENN(random_state=RANDOM_STATE)
}

# Define feature selection techniques
feature_selection_techniques = {
    'NoSelection': None,
    'SelectFromModel_RF': SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)),
    # 'RFE_RF': RFE(RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE), n_features_to_select=X_train.shape[1] // 2)
}

# Define model creation functions instead of instances
def get_model(model_name, class_ratio=None):
    """Get a fresh model instance with appropriate parameters for class imbalance"""
    if model_name == 'XGBoost':
        model = XGBClassifier(
            n_estimators=100, 
            learning_rate=0.1, 
            max_depth=6, 
            random_state=RANDOM_STATE,
            use_label_encoder=False,
            eval_metric='logloss'
        )
        if class_ratio is not None:
            model.set_params(scale_pos_weight=class_ratio)
    elif model_name == 'LightGBM':
        model = LGBMClassifier(
            n_estimators=100, 
            learning_rate=0.1, 
            max_depth=6, 
            random_state=RANDOM_STATE
        )
        if class_ratio is not None:
            model.set_params(is_unbalance=True)
    elif model_name == 'CatBoost':
        model = CatBoostClassifier(
            iterations=100, 
            learning_rate=0.1, 
            depth=6, 
            random_seed=RANDOM_STATE,
            verbose=0
        )
        if class_ratio is not None:
            model.set_params(auto_class_weights='Balanced')
    elif model_name == 'HIST-G-Boosting':
        model = HistGradientBoostingClassifier(
            max_iter=100, 
            learning_rate=0.1, 
            max_depth=6, 
            random_state=RANDOM_STATE
        )
    elif model_name == 'ExtraTrees':
        model = ExtraTreesClassifier(
            n_estimators=100, 
            max_depth=6, 
            random_state=RANDOM_STATE
        )
    elif model_name == 'BalancedRandomForest':
        model = BalancedRandomForestClassifier(
            n_estimators=100, 
            max_depth=6, 
            random_state=RANDOM_STATE
        )
    elif model_name == 'RandomForest':
        model = RandomForestClassifier(
            n_estimators=100, 
            max_depth=6, 
            random_state=RANDOM_STATE
        )
    elif model_name == 'DecisionTree':
        model = DecisionTreeClassifier(
            max_depth=6, 
            random_state=RANDOM_STATE
        )
    else:
        raise ValueError(f"Unknown model: {model_name}")
    
    return model

# List of model names
model_names = [
    'XGBoost', 'LightGBM', 'CatBoost', 'HIST-G-Boosting', 
    'ExtraTrees', 'BalancedRandomForest', 'RandomForest', 'DecisionTree'
]

# Main execution loop for model training and evaluation
all_results = []

for imbalance_name, imbalance_technique in imbalance_techniques.items():
    print(f"\n{'='*50}")
    print(f"Applying imbalance technique: {imbalance_name}")
    print(f"{'='*50}")
    
    # Apply imbalance handling (if needed)
    try:
        if imbalance_technique:
            X_train_resampled, y_train_resampled = imbalance_technique.fit_resample(X_train, y_train)
            try:
                print(f"After resampling - X_train shape: {X_train_resampled.shape}, Class distribution: {np.bincount(y_train_resampled.astype(int))}")
            except ValueError:
                print(f"After resampling - X_train shape: {X_train_resampled.shape}")
        else:
            X_train_resampled, y_train_resampled = X_train.copy(), y_train.copy()
        
        # Apply Log Transformation
        X_train_transformed = log_transform(X_train_resampled)
        X_test_transformed = log_transform(X_test.copy())
        print("Applied Log Transformation")
        
        # Shuffle if required - FIX: use test_size=0.0001 instead of 0
        # This keeps almost all data in the training set but satisfies scikit-learn's validation
        X_train_shuffled, _, y_train_shuffled, _ = train_test_split(
            X_train_transformed, y_train_resampled, test_size=0.0001, random_state=RANDOM_STATE
        )
        print("Applied shuffling")
        
        # Calculate class ratio for imbalanced data handling if needed
        class_ratio = None
        try:
            if len(np.unique(y_train_shuffled)) > 1:
                class_counts = np.bincount(y_train_shuffled.astype(int))
                if len(class_counts) > 1 and class_counts[0] != class_counts[1]:
                    class_ratio = class_counts[0] / class_counts[1]
        except Exception as e:
            print(f"Warning: Could not calculate class ratio: {e}")
        
        for fs_name, fs_technique in feature_selection_techniques.items():
            print(f"\n{'-'*30}")
            print(f"Feature selection: {fs_name}")
            print(f"{'-'*30}")
            
            try:
                # Apply feature selection
                if fs_technique:
                    fs_technique.fit(X_train_shuffled, y_train_shuffled)
                    selected_features = X_train_shuffled.columns[fs_technique.get_support()]
                    X_train_selected = X_train_shuffled[selected_features]
                    X_test_selected = X_test_transformed[selected_features]
                    print(f"Selected {len(selected_features)} features")
                else:
                    X_train_selected = X_train_shuffled
                    X_test_selected = X_test_transformed
                
                # Train and evaluate models
                for model_name in model_names:
                    print(f"Training {model_name}...")
                    
                    try:
                        # Get a fresh model instance with appropriate parameters
                        model = get_model(model_name, class_ratio)
                        
                        # Train the model
                        start_time = time.time()
                        model.fit(X_train_selected, y_train_shuffled)
                        training_time = time.time() - start_time
                        
                        # Evaluate
                        results = evaluate_model(model, X_test_selected, y_test, model_name)
                        
                        # Add additional info
                        results.update({
                            'Imbalance Technique': imbalance_name,
                            'Feature Selection': fs_name,
                            'Training Time (s)': training_time,
                            'Number of Features': X_train_selected.shape[1]
                        })
                        
                        all_results.append(results)
                        print(f"Completed {model_name}. Training time: {training_time:.2f}s, Accuracy: {results['Accuracy']}")
                    except Exception as e:
                        print(f"Error training {model_name}: {e}")
                        all_results.append({
                            'Model': model_name,
                            'Imbalance Technique': imbalance_name,
                            'Feature Selection': fs_name,
                            'Accuracy': None,
                            'Precision': None,
                            'Recall': None,
                            'F1 Score': None,
                            'ROC AUC': None,
                            'Training Time (s)': None,
                            'Inference Time (s)': None,
                            'Number of Features': X_train_selected.shape[1] if 'X_train_selected' in locals() else None,
                            'Error': str(e)
                        })
            except Exception as e:
                print(f"Error in feature selection {fs_name}: {e}")
    except Exception as e:
        print(f"Error in imbalance technique {imbalance_name}: {e}")

# Convert to DataFrame and display results
results_df = pd.DataFrame(all_results)
print("\nComplete Results:")
print(results_df)

# Save results
results_df.to_csv('ml_pipeline_results_with_cleaning.csv', index=False)
print("\nResults saved to 'ml_pipeline_results_with_cleaning.csv'")

# Find best model based on F1 score if available
try:
    # Filter out rows with errors or None F1 scores
    valid_results = results_df.dropna(subset=['F1 Score'])
    
    if not valid_results.empty:
        best_model_idx = valid_results['F1 Score'].idxmax()
        best_model = results_df.iloc[best_model_idx]
        print("\nBest Model Configuration:")
        print(f"Model: {best_model['Model']}")
        print(f"Imbalance Technique: {best_model['Imbalance Technique']}")
        print(f"Feature Selection: {best_model['Feature Selection']}")
        print(f"F1 Score: {best_model['F1 Score']:.4f}")
        print(f"Accuracy: {best_model['Accuracy']:.4f}")
        print(f"ROC AUC: {best_model['ROC AUC']:.4f}" if best_model['ROC AUC'] else "ROC AUC: N/A")
    else:
        print("\nNo valid models with F1 scores found.")
except Exception as e:
    print(f"\nError finding best model: {e}")

pandas is already installed.
numpy is already installed.
Installing scikit-learn...
xgboost is already installed.
lightgbm is already installed.
catboost is already installed.
Installing imbalanced-learn...
Loading raw dataset...


MemoryError: Unable to allocate 256. KiB for an array with shape (32768,) and data type int64

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_selection import SelectFromModel
import lightgbm as lgb

# 1. Load data
data_path = "C:\Machine Learning\Phase 2\Data\phase2_students_before_cleaning.csv"

def load_data(path):
    df = pd.read_csv(path)
    return df

# 2. Data cleaning: dedupe, shuffle, winsorize + log1p
num_cols = [
    "flow_time", "header_size", "packet_duration", "overall_rate",
    "src_rate", "dst_rate", "fin_packets", "urg_packets",
    "rst_packets", "max_value", "value_covariance"
]

def clean_data(df):
    df = df.drop_duplicates()
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    for col in num_cols:
        lower, upper = np.percentile(df[col], [1, 99])
        df[col] = df[col].clip(lower, upper)
        df[col] = np.log1p(df[col])
    return df

# 3. Preprocess: encode target, scale, class weights, feature selection
def preprocess(df):
    df = clean_data(df)
    X = df.drop(columns=["label"])
    y = df["label"].values

    # Encode string labels to integers
    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    class_names = le.classes_

    # Scale numerical features
    scaler = StandardScaler()
    X[num_cols] = scaler.fit_transform(X[num_cols])

    # Handle class imbalance with sample weights
    class_weights = compute_class_weight(
        class_weight="balanced",
        classes=np.arange(len(class_names)),
        y=y_enc
    )
    sample_weights = class_weights[y_enc]

    # Feature selection using LightGBM importance
    selector = lgb.LGBMClassifier(
        objective="multiclass",
        num_class=len(class_names),
        n_estimators=50,
        learning_rate=0.1,
        class_weight="balanced",
        random_state=42
    )
    selector.fit(X, y_enc)
    sfm = SelectFromModel(selector, prefit=True, threshold="median")
    X_selected = sfm.transform(X)
    selected_features = X.columns[sfm.get_support()]
    X = pd.DataFrame(X_selected, columns=selected_features)

    return X, y_enc, sample_weights, class_names

# 4. Train with stratified K-fold and LightGBM
def train_model(X, y, sample_weights, folds=5):
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(y), dtype=int)

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"Training fold {fold+1}/{folds}...")
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        w_train = sample_weights[train_idx]

        lgb_train = lgb.Dataset(X_train, label=y_train, weight=w_train)
        lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

        params = {
            "objective": "multiclass",
            "num_class": len(np.unique(y)),
            "metric": "multi_logloss",
            "boosting_type": "gbdt",
            "learning_rate": 0.05,
            "num_leaves": 31,
            "max_depth": -1,
            "seed": 42,
            "verbosity": -1
        }

        model = lgb.train(
            params,
            lgb_train,
            num_boost_round=500,
            valid_sets=[lgb_train, lgb_val],
            valid_names=["train", "valid"],
            callbacks=[
                lgb.early_stopping(stopping_rounds=30),
                lgb.log_evaluation(period=100)
            ]
        )

        preds = np.argmax(model.predict(X_val), axis=1)
        oof_preds[val_idx] = preds

    print("OOF Accuracy:", accuracy_score(y, oof_preds))
    print(classification_report(y, oof_preds, target_names=class_names))
    return model

# 5. Main
def main():
    df = load_data(data_path)
    X, y, sample_weights, class_names = preprocess(df)
    model = train_model(X, y, sample_weights, folds=5)
    model.save_model("lgbm_cyber_attack_model.txt")
    print("Model saved.")

if __name__ == "__main__":
    main()


  data_path = "C:\Machine Learning\Phase 2\Data\phase2_students_before_cleaning.csv"


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054673 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2417
[LightGBM] [Info] Number of data points in the train set: 842396, number of used features: 20
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




Training fold 1/5...
Training until validation scores don't improve for 30 rounds
[100]	train's multi_logloss: 0.337805	valid's multi_logloss: 0.344537
[200]	train's multi_logloss: 0.292102	valid's multi_logloss: 0.326038
[300]	train's multi_logloss: 0.262308	valid's multi_logloss: 0.318283
[400]	train's multi_logloss: 0.239991	valid's multi_logloss: 0.313716
[500]	train's multi_logloss: 0.221219	valid's multi_logloss: 0.310867
Did not meet early stopping. Best iteration is:
[500]	train's multi_logloss: 0.221219	valid's multi_logloss: 0.310867
Training fold 2/5...
Training until validation scores don't improve for 30 rounds
[100]	train's multi_logloss: 0.337398	valid's multi_logloss: 0.347813
[200]	train's multi_logloss: 0.292207	valid's multi_logloss: 0.329002
[300]	train's multi_logloss: 0.262185	valid's multi_logloss: 0.320624
[400]	train's multi_logloss: 0.239823	valid's multi_logloss: 0.316033
[500]	train's multi_logloss: 0.221727	valid's multi_logloss: 0.313014
Did not meet early

  data_path = "C:\Machine Learning\Phase 2\Data\phase2_students_before_cleaning.csv"


NameError: name 'class_names' is not defined