# First Classification attempt - Problem baseline

I'm going to use machine learning models to predict the injury status of the players with the data we have.
I do not expect to get good results, but I want a baseline to compare against after improving the data and trying more advanced techniques and models.

I'm going to use the following models:
- Logistic Regression
- Random Forest
- XGBoost
- Support Vector Machines

## Approach: Using Scikit-learn Pipelines and Grid Search

I'll implement a systematic approach using:
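1. **Pipelines**: To give every model the same fit/predict interface (each pipeline currently contains only the classifier step; preprocessing steps can be added to it later)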
2. **Grid Search**: To find optimal hyperparameters for each model
3. **Cross-validation**: To get reliable performance estimates
4. **Model comparison**: To identify the best performing model

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score, 
    precision_score, recall_score, f1_score, roc_auc_score,
    roc_curve, precision_recall_curve
)
import xgboost as xgb
# import warnings

import core.constants as c

# warnings.filterwarnings('ignore')

# Single seed reused by the train/test split, the cross-validation splitter
# and every estimator in this notebook, to make runs repeatable.
RANDOM_STATE = 42
# Also seed NumPy's global random generator with the same value.
np.random.seed(RANDOM_STATE)

## 1. Data Loading and Exploration

In [4]:
# Load the baseline feature matrix (X) and the injury annotations (Y).
# The first CSV column is used as the row index in both files.
X = pd.read_csv(c.RICKD_BASELINE_X_FILE, index_col=0)
Y = pd.read_csv(c.RICKD_BASELINE_Y_FILE, index_col=0)

# Same quick look for both tables: shape, column names, first rows.
for label, frame in (("X", X), ("Y", Y)):
    print(f"{label} Dataset shape: {frame.shape}")
    print(f"{label} Columns: {list(frame.columns)}")
    display(frame.head())

X Dataset shape: (1832, 93)
X Columns: ['speed_r', 'age', 'Height', 'Weight', 'YrsRunning', 'NumRaces', 'l_step_width', 'l_stride_rate', 'l_stride_length', 'l_swing_time', 'l_stance_time', 'l_pelvis_peak_drop_angle', 'l_pelvis_drop_excursion', 'l_ankle_df_peak_angle', 'l_ankle_eve_peak_angle', 'l_ankle_eve_percent_stance', 'l_ankle_eve_excursion', 'l_ankle_rot_peak_angle', 'l_ankle_rot_excursion', 'l_knee_flex_peak_angle', 'l_knee_add_peak_angle', 'l_knee_add_excursion', 'l_knee_abd_peak_angle', 'l_knee_abd_excursion', 'l_knee_rot_peak_angle', 'l_knee_rot_excursion', 'l_hip_ext_peak_angle', 'l_hip_add_peak_angle', 'l_hip_add_excursion', 'l_hip_rot_peak_angle', 'l_hip_rot_excursion', 'l_foot_prog_angle', 'l_foot_ang_at_hs', 'l_mhw_exc_from_to', 'l_ankle_eve_peak_vel', 'l_ankle_rot_peak_vel', 'l_knee_abd_peak_vel', 'l_knee_add_peak_vel', 'l_hip_abd_peak_vel', 'l_knee_rot_peak_vel', 'l_hip_rot_peak_vel', 'l_pronation_onset', 'l_pronation_offset', 'l_peak_hip_add_velocity', 'l_peak_pelvic_

Unnamed: 0_level_0,speed_r,age,Height,Weight,YrsRunning,NumRaces,l_step_width,l_stride_rate,l_stride_length,l_swing_time,...,r_peak_hip_add_velocity,r_peak_pelvic_drop_velocity,r_vertical_oscillation,Gender_female,Gender_male,DominantLeg_ambidextrous,DominantLeg_left,DominantLeg_right,Level_competitive,Level_recreational
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100433_20101005t132240,-2.406926,1.231321,-11.741497,-5.056746,0.578686,-0.393107,-0.307243,0.000986,-2.622528,-0.338913,...,-1.7437,0.239058,-1.935588,False,False,False,False,False,False,True
100434_20101117t132240,-1.095016,1.066816,-11.741497,-5.056746,1.308477,-0.393107,-1.83146,0.886585,-1.458975,-0.466027,...,0.764244,-0.390267,-1.34474,True,False,False,False,False,False,True
100537_20120703t102550,-1.325076,-3.128072,0.097259,-0.179365,-0.568129,-0.393107,-0.037324,-0.643565,-1.241474,-0.084685,...,2.255015,0.604189,-0.284941,True,False,False,False,True,False,True
100560_20120717t103748,-0.215279,-0.413733,0.521293,0.931754,-0.776641,-0.393107,0.677045,-1.61039,0.43924,-1.165155,...,3.526927,-1.746297,1.346349,True,False,False,False,True,False,True
101481_20120717t105021,-0.282875,-0.495986,0.316116,-0.828721,-0.776641,-0.393107,0.104643,-0.74601,-0.016982,-0.148242,...,0.475554,1.000454,0.657459,True,False,False,False,False,False,False


Y Dataset shape: (1832, 13)
Y Columns: ['injury_severity_code', 'injury_severity_value', 'injury_code', 'injury2_code', 'injury_desc', 'injury2_desc', 'injury_name', 'injury2_name', 'injured_joint_code', 'injured_joint2_code', 'injured_side_code', 'injured_side2_code', 'is_injured']


Unnamed: 0_level_0,injury_severity_code,injury_severity_value,injury_code,injury2_code,injury_desc,injury2_desc,injury_name,injury2_name,injured_joint_code,injured_joint2_code,injured_side_code,injured_side2_code,is_injured
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
100433_20101005t132240,volume_intensity,2.0,pain,no_injury,General sensation of discomfort without speci...,No injury has been diagnosed.,pain,no injury,knee,no_injury,right,right,1
100434_20101117t132240,volume_intensity,2.0,disc_dege,no_injury,Breakdown and gradual loss of spinal disc cush...,No injury has been diagnosed.,disc degeneration,no injury,lumbar_spine,no_injury,bilateral,right,1
100537_20120703t102550,missed_2_workouts,3.0,pain,no_injury,General sensation of discomfort without speci...,No injury has been diagnosed.,pain,no injury,hip_pelvis,no_injury,right,right,1
100560_20120717t103748,no_injury,0.0,no_injury,no_injury,No injury has been diagnosed.,No injury has been diagnosed.,no injury,no injury,no_injury,no_injury,right,right,0
101481_20120717t105021,no_injury,0.0,no_injury,no_injury,No injury has been diagnosed.,No injury has been diagnosed.,no injury,no injury,no_injury,no_injury,no_injury,no_injury,0


## 2. Data Preprocessing and Target Definition

In [6]:
# Binary target: the `is_injured` column of the annotation table.
y = Y["is_injured"]

# Hold out 20% of the rows for testing; stratify on the target so both
# partitions keep the same class proportions.
split = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)
X_train, X_test, y_train, y_test = split

# Report partition sizes first, then the per-class counts of each partition.
for label, features in (("Training", X_train), ("Test", X_test)):
    print(f"{label} set: {features.shape[0]} samples")
for label, target in (("Training", y_train), ("Test", y_test)):
    print(f"{label} target distribution: {np.bincount(target)}")

Training set: 1465 samples
Test set: 367 samples
Training target distribution: [532 933]
Test target distribution: [133 234]


## 3. Define Model Pipelines with Grid Search

In [9]:
# Search configuration per model. Each entry holds:
#   'pipeline'   - a scikit-learn Pipeline whose only step is named 'classifier'
#   'param_grid' - the hyperparameter grid handed to GridSearchCV; keys use the
#                  '<step>__<param>' convention, hence the 'classifier__' prefix
models_config = {
    'Logistic Regression': {
        'pipeline': Pipeline([
            ('classifier', LogisticRegression(random_state=RANDOM_STATE))
        ]),
        'param_grid': {
            'classifier__C': [0.1, 1, 10, 100],
            'classifier__penalty': ['l1', 'l2'],
            # liblinear is used because it supports both penalties above.
            'classifier__solver': ['liblinear']
        }
    },

    'Random Forest': {
        'pipeline': Pipeline([
            ('classifier', RandomForestClassifier(random_state=RANDOM_STATE))
        ]),
        'param_grid': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [None, 10, 20, 30],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4]
        }
    },

    'XGBoost': {
        'pipeline': Pipeline([
            ('classifier', xgb.XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'))
        ]),
        'param_grid': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [3, 6, 9],
            'classifier__learning_rate': [0.01, 0.1, 0.2],
            'classifier__subsample': [0.8, 0.9, 1.0]
        }
    },

    'SVM': {
        'pipeline': Pipeline([
            # probability=True is required because the evaluation code calls
            # predict_proba on the fitted search.
            ('classifier', SVC(random_state=RANDOM_STATE, probability=True))
        ]),
        # One sub-grid per kernel: `gamma` has no effect on the linear kernel,
        # so crossing it with 'linear' only repeats identical (and expensive)
        # fits. This keeps the same distinct candidates with fewer fits.
        'param_grid': [
            {
                'classifier__kernel': ['linear'],
                'classifier__C': [0.1, 1, 10]
            },
            {
                'classifier__kernel': ['rbf'],
                'classifier__C': [0.1, 1, 10],
                'classifier__gamma': ['scale', 'auto', 0.1, 0.01]
            }
        ]
    }
}


## 4. Train Models with Grid Search and Cross-Validation

In [10]:
# Seeded, shuffled, stratified 5-fold splitter shared by every grid search.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
trained_models = {}  # model name -> fitted GridSearchCV
results = {}         # model name -> CV summary, test metrics and predictions

# The target does not change inside the loop, so decide once which
# ROC AUC variant applies.
is_binary = np.unique(y).size == 2

for model_name, config in models_config.items():
    print("\n======================")
    print(f"Training {model_name}...")
    print("\n======================")

    # Exhaustive search over the model's grid, scored by weighted F1,
    # using all available cores.
    grid_search = GridSearchCV(
        config['pipeline'],
        config['param_grid'],
        scoring='f1_weighted',
        cv=cv,
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)
    trained_models[model_name] = grid_search

    print(f"Best CV Score: {grid_search.best_score_:.4f}")
    print(f"Best Parameters: {grid_search.best_params_}")

    # Predictions of the refitted best estimator on the held-out test set.
    y_pred = grid_search.predict(X_test)
    y_pred_proba = grid_search.predict_proba(X_test)

    # Class-weighted test metrics.
    test_accuracy = accuracy_score(y_test, y_pred)
    test_precision, test_recall, test_f1 = (
        metric(y_test, y_pred, average='weighted')
        for metric in (precision_score, recall_score, f1_score)
    )

    # Binary targets use the probability of the second class column;
    # otherwise fall back to one-vs-rest ROC AUC.
    test_roc_auc = (
        roc_auc_score(y_test, y_pred_proba[:, 1])
        if is_binary
        else roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
    )

    results[model_name] = {
        'best_score': grid_search.best_score_,
        'best_params': grid_search.best_params_,
        'cv_results': grid_search.cv_results_,
        'test_accuracy': test_accuracy,
        'test_precision': test_precision,
        'test_recall': test_recall,
        'test_f1': test_f1,
        'test_roc_auc': test_roc_auc,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba,
    }

    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Test Precision: {test_precision:.4f}")
    print(f"Test Recall: {test_recall:.4f}")
    print(f"Test F1-Score: {test_f1:.4f}")
    print(f"Test ROC AUC: {test_roc_auc:.4f}")


Training Logistic Regression...

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best CV Score: 0.7574
Best Parameters: {'classifier__C': 1, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Test Accuracy: 0.7766
Test Precision: 0.7793
Test Recall: 0.7766
Test F1-Score: 0.7632
Test ROC AUC: 0.7818

Training Random Forest...

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best CV Score: 0.7789
Best Parameters: {'classifier__max_depth': 20, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 50}
Test Accuracy: 0.7929
Test Precision: 0.8221
Test Recall: 0.7929
Test F1-Score: 0.7715
Test ROC AUC: 0.7643

Training XGBoost...

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best CV Score: 0.8041
Best Parameters: {'classifier__learning_rate': 0.2, 'classifier__max_depth': 9, 'classifier__n_estimators': 100, 'classifier__subsample': 0.8}
Test Accuracy: 0.8038
Test Precision: 0.8170
Test Reca

## 5. Model Comparison and Results Analysis

In [None]:
# Create comparison dataframe: one row per model with its best
# cross-validation score and its metrics on the held-out test set.
comparison_data = [
    {
        'Model': model_name,
        'CV F1-Score': result['best_score'],
        'Test Accuracy': result['test_accuracy'],
        'Test Precision': result['test_precision'],
        'Test Recall': result['test_recall'],
        'Test F1-Score': result['test_f1'],
        'Test ROC AUC': result['test_roc_auc']
    }
    for model_name, result in results.items()
]

comparison_df = pd.DataFrame(comparison_data)
# Rank by the cross-validation score, not by a test metric: choosing the
# winner on the test set would use that set for model selection and make the
# reported test performance optimistically biased.
comparison_df = comparison_df.sort_values('CV F1-Score', ascending=False)

print("Model Performance Comparison:")
print("="*80)
print(comparison_df.round(4))

# Find best model (first row after sorting by CV score)
best_row = comparison_df.iloc[0]
best_model_name = best_row['Model']
print(f"\nBest performing model: {best_model_name}")
print(f"Best CV F1-Score: {best_row['CV F1-Score']:.4f}")
print(f"Held-out Test F1-Score: {best_row['Test F1-Score']:.4f}")

In [None]:
# 2x2 overview figure: three bar charts (F1, accuracy, ROC AUC) plus a
# precision-vs-recall scatter, all computed on the test set.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Model Performance Comparison', fontsize=16, fontweight='bold')

# (axis, comparison_df column, y-axis label, bar colour) for each bar panel.
bar_panels = [
    (axes[0, 0], 'Test F1-Score', 'F1-Score', 'skyblue'),
    (axes[0, 1], 'Test Accuracy', 'Accuracy', 'lightgreen'),
    (axes[1, 1], 'Test ROC AUC', 'ROC AUC', 'orange'),
]
for ax, column, ylabel, colour in bar_panels:
    ax.bar(comparison_df['Model'], comparison_df[column], color=colour)
    ax.set_title(f'{column} Comparison')
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=45)

# Bottom-left panel: one labelled point per model in precision/recall space.
pr_ax = axes[1, 0]
pr_ax.scatter(comparison_df['Test Precision'], comparison_df['Test Recall'],
              s=100, alpha=0.7)
points = zip(
    comparison_df['Model'],
    comparison_df['Test Precision'],
    comparison_df['Test Recall'],
)
for model, precision, recall in points:
    pr_ax.annotate(model, (precision, recall),
                   xytext=(5, 5), textcoords='offset points')
pr_ax.set_xlabel('Precision')
pr_ax.set_ylabel('Recall')
pr_ax.set_title('Precision vs Recall')
pr_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Detailed Analysis of Best Model

In [None]:
# Detailed analysis of the best model
best_model = trained_models[best_model_name]
best_results = results[best_model_name]

# No label encoder is ever fitted in this notebook (the target `is_injured`
# is already 0/1), so derive the class labels and their display names from
# `y` itself. Labels without a known display name fall back to their string
# form, because the report and heatmap expect string names.
class_labels = np.unique(y)
class_display = {0: 'not_injured', 1: 'injured'}
class_names = [class_display.get(label, str(label)) for label in class_labels]

print(f"Detailed Analysis of {best_model_name}")
print("="*50)

# Classification report; passing `labels` keeps names and classes aligned
# even if one class is missing from the test predictions.
print("\nClassification Report:")
print(classification_report(y_test, best_results['y_pred'],
                            labels=class_labels,
                            target_names=class_names))

# Confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, best_results['y_pred'], labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.title(f'Confusion Matrix - {best_model_name}')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Feature importance (only for estimators exposing `feature_importances_`,
# i.e. the tree-based models)
classifier = best_model.best_estimator_.named_steps['classifier']
if hasattr(classifier, 'feature_importances_'):
    # Create feature importance dataframe, most important first
    importance_df = pd.DataFrame({
        'feature': X.columns,
        'importance': classifier.feature_importances_
    }).sort_values('importance', ascending=False)

    # Plot top 20 features
    plt.figure(figsize=(12, 8))
    top_features = importance_df.head(20)
    plt.barh(range(len(top_features)), top_features['importance'])
    plt.yticks(range(len(top_features)), top_features['feature'])
    plt.xlabel('Feature Importance')
    plt.title(f'Top 20 Feature Importances - {best_model_name}')
    plt.gca().invert_yaxis()  # largest importance at the top
    plt.tight_layout()
    plt.show()

    print("\nTop 10 Most Important Features:")
    print(importance_df.head(10))

## 7. Cross-Validation Results Analysis

In [None]:
# Analyze cross-validation results for all models.
# Each column holds the mean CV score of every hyperparameter candidate tried
# for that model. The grids have different sizes, so the columns are built as
# Series and padded with NaN; assigning raw arrays of unequal length to one
# DataFrame raises a ValueError.
cv_results_df = pd.DataFrame({
    model_name: pd.Series(result['cv_results']['mean_test_score'])
    for model_name, result in results.items()
})

# Plot CV score distributions (spread across candidates, not across folds)
plt.figure(figsize=(12, 6))
cv_results_df.boxplot()
plt.title('Cross-Validation Score Distributions')
plt.ylabel('F1-Score')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)
plt.show()

# Print CV statistics; the NaN padding is dropped so each model is
# summarised over its own candidates only.
print("Cross-Validation Statistics:")
print("="*50)
for model_name in cv_results_df.columns:
    scores = cv_results_df[model_name].dropna()
    print(f"\n{model_name}:")
    print(f"  Mean CV Score: {scores.mean():.4f}")
    print(f"  Std CV Score: {scores.std():.4f}")
    print(f"  Min CV Score: {scores.min():.4f}")
    print(f"  Max CV Score: {scores.max():.4f}")

## 8. Model Persistence and Summary

In [None]:
# Save the best model and results
import json
import os
from datetime import datetime

import joblib

# Create results directory if it doesn't exist
models_dir = '../../results/models'
os.makedirs(models_dir, exist_ok=True)

# Look the winner up here so this cell does not depend on the analysis cell.
best_model = trained_models[best_model_name]

# Class names and counts come straight from the target. No label encoder is
# ever fitted in this notebook (`is_injured` is already 0/1), so there is no
# encoder to persist and none is written.
class_counts = y.value_counts().sort_index()
class_display = {0: 'not_injured', 1: 'injured'}
class_names = [class_display.get(label, str(label)) for label in class_counts.index]

# Save best model (the fitted GridSearchCV object)
model_filename = f"{models_dir}/{best_model_name.lower().replace(' ', '_')}_best_model.pkl"
joblib.dump(best_model, model_filename)
print(f"Best model saved to: {model_filename}")

# Save comparison results
results_filename = f"{models_dir}/model_comparison_results.csv"
comparison_df.to_csv(results_filename, index=False)
print(f"Comparison results saved to: {results_filename}")

# Save detailed results as JSON
detailed_results = {
    'timestamp': datetime.now().isoformat(),
    'best_model': best_model_name,
    'best_model_params': results[best_model_name]['best_params'],
    'model_comparison': comparison_df.to_dict('records'),
    'data_info': {
        'n_samples': len(X),
        'n_features': X.shape[1],
        'n_classes': len(class_names),
        # Cast to built-in int: NumPy integers are not JSON serialisable.
        'class_distribution': {
            name: int(count) for name, count in zip(class_names, class_counts)
        }
    }
}

json_filename = f"{models_dir}/experiment_results.json"
with open(json_filename, 'w', encoding='utf-8') as f:
    json.dump(detailed_results, f, indent=2)
print(f"Detailed results saved to: {json_filename}")

# Print final summary
print("\n" + "="*60)
print("EXPERIMENT SUMMARY")
print("="*60)
print(f"Best Model: {best_model_name}")
print(f"Best Test F1-Score: {comparison_df.iloc[0]['Test F1-Score']:.4f}")
print(f"Best Test Accuracy: {comparison_df.iloc[0]['Test Accuracy']:.4f}")
print(f"Dataset: {len(X)} samples, {X.shape[1]} features")
print(f"Classes: {len(class_names)} ({', '.join(class_names)})")
print("="*60)