In [None]:
import polars as pl
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb
import joblib
import json
from pathlib import Path

In [None]:
# Load processed data set
# The Parquet path is relative to the notebook's working directory, so the
# kernel must be started from the project root for this cell to succeed.
processed_df = pl.read_parquet("processed_data/comprehensive_eeg_features.parquet")

# Quick sanity check of what was loaded: (rows, columns) and the column names.
print(f"Shape: {processed_df.shape}")
print(f"Columns: {processed_df.columns}")

In [None]:
# Data preparation functions
def prepare_classification_data(df, target_column, feature_columns=None):
    """
    Split a feature table into labeled and unlabeled matrices for one target.

    Rows where ``target_column`` is non-null are treated as labeled; rows
    where it is null are treated as unlabeled.

    Parameters
    ----------
    df : polars.DataFrame
        Table holding both the features and the target column.
    target_column : str
        Name of the column to predict.
    feature_columns : list[str] | None
        Columns to use as features. When ``None``, every column is used
        except the known target/identifier columns and ``target_column``
        itself.

    Returns
    -------
    tuple
        ``(X_labeled, y_labeled, X_unlabeled, feature_columns)`` where the
        first three are NumPy arrays and the last is the list of feature
        names actually used (in column order).
    """
    # Get labeled data (where target is not null)
    labeled_df = df.filter(pl.col(target_column).is_not_null())
    unlabeled_df = df.filter(pl.col(target_column).is_null())

    # If feature columns not specified, use all except target/id columns
    if feature_columns is None:
        # The requested target is always excluded, even when it is not one of
        # the hard-coded names; otherwise the label would leak into X.
        exclude_cols = {
            'seizure_type', 'localization', 'lateralization',
            'patient_id', 'seizure_id', target_column,
        }
        feature_columns = [col for col in df.columns if col not in exclude_cols]

    X_labeled = labeled_df.select(feature_columns).to_numpy()
    y_labeled = labeled_df.select(target_column).to_numpy().ravel()

    X_unlabeled = unlabeled_df.select(feature_columns).to_numpy()

    return X_labeled, y_labeled, X_unlabeled, feature_columns

In [None]:
def encode_labels(y_train, y_test=None):
    """
    Map class labels to integer codes with a freshly fitted LabelEncoder.

    The encoder is fitted on ``y_train`` only. When ``y_test`` is supplied it
    is transformed with that same encoder and the result is a 3-tuple
    ``(train_codes, test_codes, encoder)``; otherwise the result is the
    2-tuple ``(train_codes, encoder)``.
    """
    encoder = LabelEncoder()
    train_codes = encoder.fit_transform(y_train)

    # Guard clause: nothing more to do when only training labels were given.
    if y_test is None:
        return train_codes, encoder

    test_codes = encoder.transform(y_test)
    return train_codes, test_codes, encoder

In [None]:
# Model training functions
def train_xgboost_classifier(X_train, y_train, X_test, y_test):
    """
    Train an XGBoost classifier with fixed basic parameters and predict on
    the test features.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for training and for prediction.
    y_train : array-like
        Training labels (strings or numbers).
    y_test : array-like
        Test labels. Only used so they can be validated/encoded together with
        ``y_train``; the encoded copy is not returned.

    Returns
    -------
    tuple
        ``(model, y_pred, label_encoder)``. ``label_encoder`` is ``None``
        when the labels were already the integers ``0..n_classes-1``;
        otherwise ``y_pred`` is expressed in the encoder's integer codes and
        must be passed through ``label_encoder.inverse_transform`` to get the
        original labels back.

    Raises
    ------
    ValueError
        If ``y_train`` is empty.
    """
    if len(y_train) == 0:
        raise ValueError("y_train is empty; cannot train a classifier")

    # XGBClassifier only accepts class labels that are exactly 0..k-1.
    # String labels obviously need encoding, but so do integer labels with
    # gaps (e.g. categorical codes, or a class absent from this split).
    if isinstance(y_train[0], str):
        needs_encoding = True
    else:
        observed = np.unique(y_train)
        needs_encoding = not np.array_equal(observed, np.arange(observed.size))

    if needs_encoding:
        y_train, y_test, label_encoder = encode_labels(y_train, y_test)
    else:
        label_encoder = None

    # A multi-class objective is only valid for more than two classes; the
    # sklearn wrapper does not set num_class for two-class problems.
    n_classes = np.unique(y_train).size
    objective = 'multi:softprob' if n_classes > 2 else 'binary:logistic'

    # Initialize and train model
    model = xgb.XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        objective=objective,
        random_state=42
    )

    model.fit(X_train, y_train)

    # Predictions are returned so the caller can evaluate them
    y_pred = model.predict(X_test)

    return model, y_pred, label_encoder

In [None]:
def evaluate_model(y_true, y_pred, label_encoder=None):
    """
    Print a classification report followed by a confusion matrix.

    When ``label_encoder`` is given, both ``y_true`` and ``y_pred`` are first
    passed through ``label_encoder.inverse_transform`` so the report is shown
    in terms of the decoded labels; otherwise the values are used as-is.
    """
    if label_encoder:
        actual = label_encoder.inverse_transform(y_true)
        predicted = label_encoder.inverse_transform(y_pred)
    else:
        actual, predicted = y_true, y_pred

    # Reporting is the same for both cases once the labels are resolved.
    print("Classification Report:")
    print(classification_report(actual, predicted))
    print("\nConfusion Matrix:")
    print(confusion_matrix(actual, predicted))

In [None]:
# Cross-validation function
def cross_validate_model(X, y, cv_folds=5):
    """
    Run k-fold cross-validation of the XGBoost classifier and print accuracy.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Labels (strings or numbers).
    cv_folds : int
        Value forwarded to ``cross_val_score`` as ``cv``.

    Returns
    -------
    numpy.ndarray
        The per-fold accuracy scores returned by ``cross_val_score``.

    Raises
    ------
    ValueError
        If ``y`` is empty.
    """
    if len(y) == 0:
        raise ValueError("y is empty; cannot cross-validate")

    # XGBClassifier requires labels to be exactly 0..k-1, so encode strings
    # and also integer labels that have gaps or do not start at zero.
    if isinstance(y[0], str):
        needs_encoding = True
    else:
        observed = np.unique(y)
        needs_encoding = not np.array_equal(observed, np.arange(observed.size))

    if needs_encoding:
        y_encoded, _ = encode_labels(y)
    else:
        y_encoded = y

    # A multi-class objective is only valid for more than two classes.
    n_classes = np.unique(y_encoded).size
    objective = 'multi:softprob' if n_classes > 2 else 'binary:logistic'

    model = xgb.XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        objective=objective,
        random_state=42
    )

    scores = cross_val_score(model, X, y_encoded, cv=cv_folds, scoring='accuracy')
    print(f"Cross-validation scores: {scores}")
    # The reported spread is two standard deviations of the fold scores.
    print(f"Mean accuracy: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

    return scores

In [None]:
# Prediction functions
def predict_unlabeled_seizures(model, X_unlabeled, label_encoder=None):
    """
    Run the model on unlabeled rows and return labels plus probabilities.

    Returns ``(predictions, probabilities)``: the output of ``model.predict``
    (decoded with ``label_encoder.inverse_transform`` when an encoder is
    supplied) and the output of ``model.predict_proba``.
    """
    class_ids = model.predict(X_unlabeled)
    class_probs = model.predict_proba(X_unlabeled)

    # Only the hard predictions are decoded; probabilities are left untouched.
    decoded = label_encoder.inverse_transform(class_ids) if label_encoder else class_ids

    return decoded, class_probs

In [None]:
def get_feature_importance(model, feature_names):
    """
    Rank features by the model's ``feature_importances_`` and print the top 10.

    Returns the full list of ``(feature_name, importance)`` tuples ordered
    from most to least important.
    """
    ranked = sorted(
        zip(feature_names, model.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True,
    )

    print("Top 10 Most Important Features:")
    for name, score in ranked[:10]:
        print(f"{name}: {score:.4f}")

    return ranked

In [None]:
# Pipeline function
def build_seizure_classifier(df, target_column):
    """
    Complete pipeline to build and evaluate a seizure classifier.

    Steps: split ``df`` into labeled/unlabeled rows for ``target_column``,
    hold out 20% of the labeled rows (stratified), train the XGBoost model,
    print evaluation metrics, print feature importances, run
    cross-validation on all labeled rows, and finally predict the unlabeled
    rows if there are any.

    Parameters
    ----------
    df : polars.DataFrame
        Table containing features and the target column.
    target_column : str
        Name of the column to predict.

    Returns
    -------
    tuple
        ``(model, label_encoder, predictions, probabilities)``.
        ``predictions`` and ``probabilities`` are ``None`` when there are no
        unlabeled rows.

    Raises
    ------
    ValueError
        If no row has a non-null value in ``target_column``.
    """
    # Prepare data
    X_labeled, y_labeled, X_unlabeled, feature_columns = prepare_classification_data(df, target_column)

    if len(y_labeled) == 0:
        raise ValueError(f"No labeled rows found for target '{target_column}'")

    # Split labeled data
    X_train, X_test, y_train, y_test = train_test_split(
        X_labeled, y_labeled, test_size=0.2, random_state=42, stratify=y_labeled
    )

    # Train model
    model, y_pred, label_encoder = train_xgboost_classifier(X_train, y_train, X_test, y_test)

    # Evaluate. When an encoder is in play, y_pred is in encoded space while
    # y_test still holds the raw labels; evaluate_model inverse-transforms
    # both arguments, so y_test must be encoded first.
    if label_encoder is not None:
        y_test_for_eval = label_encoder.transform(y_test)
    else:
        y_test_for_eval = y_test
    evaluate_model(y_test_for_eval, y_pred, label_encoder)

    # Print feature importance (return value not needed here)
    get_feature_importance(model, feature_columns)

    # Cross-validate on all labeled rows (prints its own summary)
    cross_validate_model(X_labeled, y_labeled)

    # Predict unlabeled
    if len(X_unlabeled) > 0:
        predictions, probabilities = predict_unlabeled_seizures(model, X_unlabeled, label_encoder)
        print(f"\nPredicted {len(predictions)} unlabeled seizures")
    else:
        predictions, probabilities = None, None

    return model, label_encoder, predictions, probabilities

In [None]:
# Save and load functions
def save_model(model, label_encoder, filepath_prefix):
    """
    Persist the model and its label encoder with joblib.

    Two files are written: ``<filepath_prefix>_model.pkl`` and
    ``<filepath_prefix>_encoder.pkl``.
    """
    artifacts = (
        (model, f"{filepath_prefix}_model.pkl"),
        (label_encoder, f"{filepath_prefix}_encoder.pkl"),
    )
    # Model first, then encoder.
    for artifact, destination in artifacts:
        joblib.dump(artifact, destination)

In [None]:
def load_model(filepath_prefix):
    """
    Read back a model / label-encoder pair with joblib.

    Expects ``<filepath_prefix>_model.pkl`` and
    ``<filepath_prefix>_encoder.pkl`` and returns ``(model, label_encoder)``.
    """
    model_path = f"{filepath_prefix}_model.pkl"
    encoder_path = f"{filepath_prefix}_encoder.pkl"

    # Tuple elements are evaluated left to right: model first, then encoder.
    return joblib.load(model_path), joblib.load(encoder_path)

In [None]:
def train_all_classifiers(processed_df, targets=('seizure_type', 'localization', 'lateralization')):
    """
    Train, evaluate and save one classifier per target column.

    Parameters
    ----------
    processed_df : polars.DataFrame
        Feature table containing every column named in ``targets``.
    targets : iterable of str
        Target columns to build a classifier for. Defaults to the three
        seizure annotation columns.

    Returns
    -------
    dict
        Maps each target name to a dict with keys ``'model'``, ``'encoder'``,
        ``'predictions'`` and ``'probabilities'``.

    Side effects
    ------------
    For every target, writes ``<target>_classifier_model.pkl`` and
    ``<target>_classifier_encoder.pkl`` via ``save_model``.
    """
    results = {}

    for target in targets:
        print(f"\n{'='*50}")
        print(f"Training classifier for: {target}")
        print('='*50)

        model, encoder, predictions, probabilities = build_seizure_classifier(processed_df, target)

        results[target] = {
            'model': model,
            'encoder': encoder,
            'predictions': predictions,
            'probabilities': probabilities
        }

        # Save model
        save_model(model, encoder, f"{target}_classifier")

    return results

## Data Encoding

In [None]:
def encode_categoricals(df, return_mappings=False):
    """
    Replace every string/object column with its integer categorical code.

    Each such column is cast to ``pl.Categorical`` and then to its physical
    (integer) representation. Null values stay null.

    Parameters
    ----------
    df : polars.DataFrame
        Input table; it is not modified.
    return_mappings : bool
        When ``True``, also return the label-to-code mapping of every
        encoded column so the integer codes can be translated back later.

    Returns
    -------
    polars.DataFrame or tuple
        The encoded frame, or ``(encoded_df, encoding_mappings)`` when
        ``return_mappings`` is true. ``encoding_mappings`` maps
        ``column name -> {label string: integer code}``.
    """
    # Copy dataframe to avoid modifying original
    encoded_df = df.clone()

    # Dictionary to store encoding mappings
    encoding_mappings = {}

    # Process each column
    for col in df.columns:
        dtype = df[col].dtype

        # Check if column is string/object type
        if dtype == pl.Utf8 or dtype == pl.Object:
            categorical = df[col].cast(pl.Categorical)
            codes = categorical.to_physical()

            # Build the mapping from the codes actually written to the frame.
            # Enumerating the sorted unique values does not, in general,
            # reproduce the physical codes.
            pairs = (
                pl.DataFrame({"label": categorical.cast(pl.Utf8), "code": codes})
                .drop_nulls()
                .unique()
                .sort("code")
            )
            encoding_mappings[col] = dict(
                zip(pairs["label"].to_list(), pairs["code"].to_list())
            )

            # Apply numeric encoding
            encoded_df = encoded_df.with_columns(codes.alias(col))

    if return_mappings:
        return encoded_df, encoding_mappings
    return encoded_df

In [None]:
# Replace string columns with integer categorical codes. This rebinds
# `processed_df`, so the original string values (including string-typed
# target columns) are no longer reachable under this name afterwards.
processed_df = encode_categoricals(processed_df)

In [None]:
# Train all classifiers
# One classifier is built per target column; besides returning the results,
# train_all_classifiers also writes each model/encoder pair to disk.
results = train_all_classifiers(processed_df)

In [None]:
# Access individual results
# `results` is keyed by target name; each entry is a dict with the keys
# 'model', 'encoder', 'predictions' and 'probabilities'. 'predictions' is
# None when the target had no unlabeled rows.
seizure_type_model = results['seizure_type']['model']
seizure_type_predictions = results['seizure_type']['predictions']