In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import (classification_report, confusion_matrix, accuracy_score,
                             f1_score, cohen_kappa_score)
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
import warnings
warnings.filterwarnings('ignore')

In [2]:
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False
    print("[INFO] XGBoost not installed. Run: pip install xgboost")

try:
    import shap
    SHAP_AVAILABLE = True
except ImportError:
    SHAP_AVAILABLE = False
    print("[INFO] SHAP not installed. Run: pip install shap")

try:
    import joblib
    JOBLIB_AVAILABLE = True
except ImportError:
    import pickle
    JOBLIB_AVAILABLE = False

np.random.seed(42)

In [3]:
def generate_training_data(n_samples=200000):
    """
    Generate realistic synthetic dataset with 200,000 samples.
    FIX 1: Sport profiles are now physiologically DISTINCT to improve separability.
    FIX 3: Age and Gender added as features.
    """
    data = []
    sports = ['Football', 'Volleyball', 'Swimming', 'Basketball', 'Athletics', 'Gymnastics']

    # =========================================================================
    # FIX 1 - DISTINCT SPORT PROFILES (changed from original overlapping ranges)
    # Key distinctors:
    #   Basketball  → VERY TALL (172-195 cm)
    #   Volleyball  → TALL (165-190 cm)
    #   Gymnastics  → SHORT + LIGHTEST (130-158 cm, 28-50 kg) + HIGHEST flexibility
    #   Athletics   → VERY LEAN (38-58 kg) + HIGHEST gait/walking
    #   Swimming    → HIGHEST flexibility (78-98) + LOW gait
    #   Football    → HIGH strength (72-95) + HIGH gait
    # =========================================================================
    sport_profiles = {
        'Football': {
            'height_range':      (148, 170),   # Medium height
            'weight_range':      (45, 68),
            'strength_range':    (72, 95),     # HIGH strength ← distinct
            'flexibility_range': (45, 65),     # LOW flexibility ← distinct
            'gait_range':        (75, 95),     # HIGH gait ← distinct
            'walking_range':     (70, 90),
            'gender_bias':       0.75,         # 75% male in school football
        },
        'Volleyball': {
            'height_range':      (165, 190),   # TALL ← distinct
            'weight_range':      (55, 78),
            'strength_range':    (55, 78),
            'flexibility_range': (65, 85),
            'gait_range':        (60, 80),
            'walking_range':     (58, 78),
            'gender_bias':       0.50,
        },
        'Swimming': {
            'height_range':      (155, 182),
            'weight_range':      (42, 63),     # LEAN
            'strength_range':    (50, 75),
            'flexibility_range': (78, 98),     # VERY HIGH flexibility ← distinct
            'gait_range':        (50, 70),     # LOW gait ← distinct
            'walking_range':     (48, 68),
            'gender_bias':       0.50,
        },
        'Basketball': {
            'height_range':      (172, 195),   # VERY TALL ← distinct
            'weight_range':      (60, 85),
            'strength_range':    (65, 88),
            'flexibility_range': (52, 72),
            'gait_range':        (78, 96),     # HIGH gait
            'walking_range':     (75, 92),
            'gender_bias':       0.60,
        },
        'Athletics': {
            'height_range':      (150, 175),
            'weight_range':      (38, 58),     # VERY LEAN ← distinct
            'strength_range':    (70, 95),     # HIGH strength
            'flexibility_range': (62, 82),
            'gait_range':        (85, 100),    # HIGHEST gait ← distinct
            'walking_range':     (82, 100),    # HIGHEST walking ← distinct
            'gender_bias':       0.50,
        },
        'Gymnastics': {
            'height_range':      (130, 158),   # SHORT ← distinct
            'weight_range':      (28, 50),     # LIGHTEST ← distinct
            'strength_range':    (65, 90),
            'flexibility_range': (85, 100),    # HIGHEST flexibility ← distinct
            'gait_range':        (68, 88),
            'walking_range':     (65, 85),
            'gender_bias':       0.20,         # 80% female in school gymnastics
        },
    }

    samples_per_sport = n_samples // len(sports)

    for sport in sports:
        profile = sport_profiles[sport]
        for _ in range(samples_per_sport):

            # FIX 2 - TIGHTER NOISE (reduced from ±3/5/4 to ±1.5/2.5/2.0)
            # Less noise = cleaner class boundaries = higher accuracy
            height       = np.random.uniform(*profile['height_range'])      + np.random.normal(0, 1.5)
            weight       = np.random.uniform(*profile['weight_range'])       + np.random.normal(0, 1.5)
            bmi          = weight / ((height / 100) ** 2)
            strength     = np.random.uniform(*profile['strength_range'])     + np.random.normal(0, 2.5)
            flexibility  = np.random.uniform(*profile['flexibility_range'])  + np.random.normal(0, 2.0)
            gait         = np.random.uniform(*profile['gait_range'])         + np.random.normal(0, 2.0)
            walking_score= np.random.uniform(*profile['walking_range'])      + np.random.normal(0, 1.5)

            # Clip to realistic ranges
            height        = np.clip(height,        130, 200)
            weight        = np.clip(weight,         28,  90)
            bmi           = np.clip(bmi,            14,  32)
            strength      = np.clip(strength,       20, 100)
            flexibility   = np.clip(flexibility,    20, 100)
            gait          = np.clip(gait,           30, 100)
            walking_score = np.clip(walking_score,  30, 100)

            # FIX 3 - AGE AND GENDER (new features)
            age    = np.random.randint(11, 15)   # 6th-8th standard = age 11-14
            gender = 1 if np.random.random() < profile['gender_bias'] else 0  # 1=Male, 0=Female

            data.append({
                'height_cm':          round(height,        1),
                'weight_kg':          round(weight,        1),
                'bmi':                round(bmi,           2),
                'strength_score':     round(strength,      1),
                'flexibility_score':  round(flexibility,   1),
                'gait_score':         round(gait,          1),
                'walking_score':      round(walking_score, 1),
                'age':                age,          # FIX 3 - NEW
                'gender':             gender,       # FIX 3 - NEW (1=Male, 0=Female)
                'sport_label':        sport
            })

    df = pd.DataFrame(data)
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    return df



In [4]:

def engineer_features(df):
    """Add domain-relevant composite features."""
    df = df.copy()

    df['athleticism_index'] = (
        df['strength_score']    * 0.3 +
        df['flexibility_score'] * 0.2 +
        df['gait_score']        * 0.25 +
        df['walking_score']     * 0.25
    )
    df['power_to_weight']     = df['strength_score'] / df['weight_kg']
    df['mobility_score']      = (df['gait_score'] + df['walking_score'] + df['flexibility_score']) / 3
    df['height_weight_ratio'] = df['height_cm'] / df['weight_kg']
    df['bmi_category']        = pd.cut(df['bmi'],
                                        bins=[0, 18.5, 23, 27.5, 100],
                                        labels=[0, 1, 2, 3]).astype(int)
    return df


In [5]:
def preprocess_data(df):
    """Encode labels, scale features, and split data."""
    df = engineer_features(df)

    # FIX 3 - 'age' and 'gender' added to feature list
    feature_cols = [
        'height_cm', 'weight_kg', 'bmi', 'strength_score', 'flexibility_score',
        'gait_score', 'walking_score', 'age', 'gender',
        'athleticism_index', 'power_to_weight',
        'mobility_score', 'height_weight_ratio', 'bmi_category'
    ]

    X = df[feature_cols]
    y = df['sport_label']

    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled  = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler, label_encoder, feature_cols


In [6]:
# =============================================================================
# 4. MODEL BUILDING
# =============================================================================

def build_models():
    """Define all candidate models."""
    models = {
        'Random Forest': RandomForestClassifier(
            n_estimators=300,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            max_features='sqrt',
            random_state=42,
            n_jobs=-1
        ),
        'Gradient Boosting': GradientBoostingClassifier(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=4,
            subsample=0.8,
            random_state=42
        ),
        'SGD Classifier': CalibratedClassifierCV(
            SGDClassifier(
                loss='modified_huber',
                max_iter=1000,
                tol=1e-3,
                random_state=42,
                n_jobs=-1
            ),
            cv=3
        ),
        'Neural Network': MLPClassifier(
            hidden_layer_sizes=(256, 128, 64),
            activation='relu',
            learning_rate='adaptive',
            max_iter=500,
            random_state=42,
            early_stopping=True,
            validation_fraction=0.1
        ),
    }

    if XGBOOST_AVAILABLE:
        models['XGBoost'] = xgb.XGBClassifier(
            n_estimators=300,
            learning_rate=0.1,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            use_label_encoder=False,
            eval_metric='mlogloss',
            random_state=42,
            n_jobs=-1
        )

    return models

In [7]:
# =============================================================================
# 5. ENSEMBLE MODEL
# =============================================================================

def build_ensemble(X_train, y_train):
    """Voting Ensemble: RF + GB + SGD."""
    rf  = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
    gb  = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=4, random_state=42)
    sgd = CalibratedClassifierCV(
        SGDClassifier(loss='modified_huber', max_iter=1000, random_state=42, n_jobs=-1),
        cv=3
    )

    ensemble = VotingClassifier(
        estimators=[('rf', rf), ('gb', gb), ('sgd', sgd)],
        voting='soft',
        n_jobs=-1
    )
    ensemble.fit(X_train, y_train)
    return ensemble


In [8]:
def tune_random_forest(X_train, y_train):
    param_grid = {
        'n_estimators':     [200, 300, 500],
        'max_depth':        [None, 10, 20],
        'min_samples_split':[2, 5],
        'max_features':     ['sqrt', 'log2'],
    }
    rf  = RandomForestClassifier(random_state=42, n_jobs=-1)
    cv  = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    gs  = GridSearchCV(rf, param_grid, cv=cv, scoring='f1_macro', n_jobs=-1, verbose=1)
    gs.fit(X_train, y_train)
    print(f"\nBest RF Parameters: {gs.best_params_}")
    print(f"Best CV Score     : {gs.best_score_:.4f}")
    return gs.best_estimator_



In [9]:
def evaluate_model(model, X_test, y_test, label_encoder, model_name="Model"):
    y_pred = model.predict(X_test)
    acc    = accuracy_score(y_test, y_pred)
    f1     = f1_score(y_test, y_pred, average='macro')
    kappa  = cohen_kappa_score(y_test, y_pred)

    print(f"\n{'='*60}")
    print(f"  {model_name} - Evaluation Results")
    print(f"{'='*60}")
    print(f"  Accuracy        : {acc:.4f} ({acc*100:.2f}%)")
    print(f"  F1 Score (Macro): {f1:.4f}")
    print(f"  Cohen's Kappa   : {kappa:.4f}")
    print(f"\n  Classification Report:")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
    return acc, f1, kappa, y_pred


def plot_confusion_matrix(y_test, y_pred, label_encoder, model_name):
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=label_encoder.classes_,
                yticklabels=label_encoder.classes_)
    plt.title(f'Confusion Matrix - {model_name}')
    plt.ylabel('Actual Sport')
    plt.xlabel('Predicted Sport')
    plt.tight_layout()
    plt.savefig(f'confusion_matrix_{model_name.replace(" ", "_")}.png', dpi=150)
    plt.close()
    print(f"  Confusion matrix saved.")


def plot_feature_importance(model, feature_cols):
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        fi_df = pd.DataFrame({'Feature': feature_cols, 'Importance': importances})
        fi_df = fi_df.sort_values('Importance', ascending=True)
        plt.figure(figsize=(10, 7))
        plt.barh(fi_df['Feature'], fi_df['Importance'], color='steelblue')
        plt.xlabel('Feature Importance')
        plt.title('Feature Importances - Random Forest')
        plt.tight_layout()
        plt.savefig('feature_importance.png', dpi=150)
        plt.close()
        print("  Feature importance chart saved.")


def compare_models(results_dict):
    names      = list(results_dict.keys())
    accuracies = [results_dict[n]['accuracy'] for n in names]
    f1s        = [results_dict[n]['f1']       for n in names]

    x     = np.arange(len(names))
    width = 0.35
    fig, ax = plt.subplots(figsize=(12, 6))
    bars1 = ax.bar(x - width/2, accuracies, width, label='Accuracy',         color='steelblue')
    bars2 = ax.bar(x + width/2, f1s,        width, label='F1 Score (Macro)', color='darkorange')
    ax.set_xlabel('Model')
    ax.set_ylabel('Score')
    ax.set_title('Model Comparison - Accuracy & F1 Score')
    ax.set_xticks(x)
    ax.set_xticklabels(names, rotation=15, ha='right')
    ax.legend()
    ax.set_ylim(0, 1)
    ax.bar_label(bars1, fmt='%.3f', padding=3)
    ax.bar_label(bars2, fmt='%.3f', padding=3)
    plt.tight_layout()
    plt.savefig('model_comparison.png', dpi=150)
    plt.close()
    print("  Model comparison chart saved.")


In [10]:
# =============================================================================
# 8. PREDICTION FUNCTION
# =============================================================================

def predict_sport(model, scaler, label_encoder, student_data: dict):
    """
    Predict sport for a single student.
    student_data keys: height_cm, weight_kg, bmi, strength_score,
                       flexibility_score, gait_score, walking_score,
                       age, gender  ← FIX 3: age & gender now required
    """
    df = pd.DataFrame([student_data])
    df = engineer_features(df)

    feature_cols = [
        'height_cm', 'weight_kg', 'bmi', 'strength_score', 'flexibility_score',
        'gait_score', 'walking_score', 'age', 'gender',
        'athleticism_index', 'power_to_weight',
        'mobility_score', 'height_weight_ratio', 'bmi_category'
    ]

    X            = scaler.transform(df[feature_cols])
    pred_encoded = model.predict(X)[0]
    pred_proba   = model.predict_proba(X)[0]
    sport        = label_encoder.inverse_transform([pred_encoded])[0]

    top3_idx    = np.argsort(pred_proba)[::-1][:3]
    top3_sports = label_encoder.inverse_transform(top3_idx)
    top3_proba  = pred_proba[top3_idx]

    return {
        'primary_recommendation': sport,
        'confidence': f"{pred_proba[pred_encoded]*100:.1f}%",
        'top_3_recommendations': [
            {'sport': s, 'probability': f"{p*100:.1f}%"}
            for s, p in zip(top3_sports, top3_proba)
        ]
    }

In [11]:
# =============================================================================
# 9. SAVE / LOAD MODEL
# =============================================================================

def save_model(model, scaler, label_encoder, filename='sport_prediction_model'):
    if JOBLIB_AVAILABLE:
        joblib.dump({'model': model, 'scaler': scaler, 'label_encoder': label_encoder},
                    f'{filename}.joblib')
        print(f"  Model saved as: {filename}.joblib")
    else:
        with open(f'{filename}.pkl', 'wb') as f:
            pickle.dump({'model': model, 'scaler': scaler, 'label_encoder': label_encoder}, f)
        print(f"  Model saved as: {filename}.pkl")


def load_model(filename='sport_prediction_model'):
    if JOBLIB_AVAILABLE:
        data = joblib.load(f'{filename}.joblib')
    else:
        with open(f'{filename}.pkl', 'rb') as f:
            data = pickle.load(f)
    return data['model'], data['scaler'], data['label_encoder']


In [12]:
# =============================================================================
# 10. CROSS-VALIDATION
# =============================================================================

def cross_validate_model(model, X_train, y_train, model_name):
    cv       = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores= cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1)
    print(f"\n  {model_name} - 5-Fold CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
    return cv_scores


In [13]:
# =============================================================================
# 11. MAIN PIPELINE
# =============================================================================

def main():
    print("\n" + "="*60)
    print("  SPORT SUITABILITY PREDICTION SYSTEM")
    print("  Multi-Class ML Model for School Children")
    print("  Fixes Applied: Distinct Profiles + Tighter Noise + Age/Gender")
    print("="*60)

    # Step 1: Generate training data
    print("\n[1] Generating training dataset (200,000 samples)...")
    df = generate_training_data(n_samples=200000)
    df.to_csv('sport_training_data.csv', index=False)
    print(f"    Dataset shape: {df.shape}")
    print(f"    Sport distribution:\n{df['sport_label'].value_counts().to_string()}")

    # Step 2: Preprocess
    print("\n[2] Preprocessing data & engineering features...")
    X_train, X_test, y_train, y_test, scaler, label_encoder, feature_cols = preprocess_data(df)
    print(f"    Train size   : {X_train.shape[0]}")
    print(f"    Test size    : {X_test.shape[0]}")
    print(f"    Feature count: {len(feature_cols)}")

    # Step 3: Train & evaluate all models
    print("\n[3] Training individual models...")
    models  = build_models()
    results = {}

    for name, model in models.items():
        print(f"\n    Training: {name}...")
        model.fit(X_train, y_train)
        acc, f1, kappa, y_pred = evaluate_model(model, X_test, y_test, label_encoder, name)
        cross_validate_model(model, X_train, y_train, name)
        plot_confusion_matrix(y_test, y_pred, label_encoder, name)
        results[name] = {'accuracy': acc, 'f1': f1, 'kappa': kappa}

    # Step 4: Ensemble
    print("\n[4] Building Ensemble Model (Recommended)...")
    ensemble = build_ensemble(X_train, y_train)
    acc, f1, kappa, y_pred = evaluate_model(ensemble, X_test, y_test, label_encoder, "Ensemble")
    plot_confusion_matrix(y_test, y_pred, label_encoder, "Ensemble")
    results['Ensemble'] = {'accuracy': acc, 'f1': f1, 'kappa': kappa}

    # Step 5: Feature importance
    print("\n[5] Generating feature importance...")
    plot_feature_importance(models['Random Forest'], feature_cols)

    # Step 6: Compare
    print("\n[6] Comparing all models...")
    compare_models(results)

    # Step 7: Best model
    best_model_name = max(results, key=lambda k: results[k]['accuracy'])
    print(f"\n[7] Best Model: {best_model_name}")
    print(f"    Accuracy   : {results[best_model_name]['accuracy']*100:.2f}%")

    best_model = ensemble

    # Step 8: Save
    print("\n[8] Saving trained model...")
    save_model(best_model, scaler, label_encoder)

    # Step 9: Example prediction
    # FIX 3 - age and gender now included in student data
    print("\n[9] Example Student Prediction:")
    example_student = {
        'height_cm':          162.0,
        'weight_kg':           55.0,
        'bmi':                 20.97,
        'strength_score':      78.0,
        'flexibility_score':   82.0,
        'gait_score':          75.0,
        'walking_score':       73.0,
        'age':                 13,     # FIX 3 - NEW FIELD
        'gender':               1,     # FIX 3 - NEW FIELD (1=Male, 0=Female)
    }

    prediction = predict_sport(best_model, scaler, label_encoder, example_student)
    print(f"\n    Student Data : {example_student}")
    print(f"\n    PRIMARY RECOMMENDATION : {prediction['primary_recommendation']}")
    print(f"    Confidence             : {prediction['confidence']}")
    print(f"\n    Top 3 Recommendations:")
    for i, rec in enumerate(prediction['top_3_recommendations'], 1):
        print(f"      {i}. {rec['sport']} - {rec['probability']}")

    print("\n" + "="*60)
    print("  PIPELINE COMPLETE!")
    print("  Files generated:")
    print("    - sport_training_data.csv        (200,000 row dataset)")
    print("    - sport_prediction_model.joblib  (trained ensemble model)")
    print("    - confusion_matrix_*.png         (per model)")
    print("    - feature_importance.png")
    print("    - model_comparison.png")
    print("="*60)

    return best_model, scaler, label_encoder, results


if __name__ == "__main__":
    best_model, scaler, label_encoder, results = main()



  SPORT SUITABILITY PREDICTION SYSTEM
  Multi-Class ML Model for School Children
  Fixes Applied: Distinct Profiles + Tighter Noise + Age/Gender

[1] Generating training dataset (200,000 samples)...
    Dataset shape: (199998, 10)
    Sport distribution:
sport_label
Athletics     33333
Basketball    33333
Volleyball    33333
Football      33333
Gymnastics    33333
Swimming      33333

[2] Preprocessing data & engineering features...
    Train size   : 159998
    Test size    : 40000
    Feature count: 14

[3] Training individual models...

    Training: Random Forest...

  Random Forest - Evaluation Results
  Accuracy        : 0.9861 (98.61%)
  F1 Score (Macro): 0.9861
  Cohen's Kappa   : 0.9834

  Classification Report:
              precision    recall  f1-score   support

   Athletics       0.98      0.98      0.98      6666
  Basketball       0.99      0.99      0.99      6666
    Football       0.98      0.97      0.98      6667
  Gymnastics       1.00      1.00      1.00      66