In [1]:
# Football Header Impact - Model Training and Export (.pkl)

import pandas as pd
import numpy as np
import joblib

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
# === 1. Load Data ===
# Read the heading-feature table from a CSV path relative to the working directory.
df = pd.read_csv(filepath_or_buffer="HeadingFeatures.csv")

In [3]:
# === 2. Define inputs ===
# Predictor columns, in the order they are passed to the scaler and the models.
features = [
    "1_Dist_Bef_Head",
    "2_Fli_Bef_Head",
    "3_Fli_Aft_Head",
    "4_Dist_Aft_Head",
    "5_Head_Type",
]

# Short target key (used in exported file names) -> label column in the data frame.
targets = dict(
    PLA="PLA (Based on Median)",
    PAA="PAA (Based on Median)",
)

In [4]:
# === 3. Define models with hyperparameters ===
# One estimator per model family. The keys are lower-cased by the training loop to
# build the exported file names. Every estimator is seeded with random_state=42.
models_dict = {
    "RandomForest": RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=42),
    # probability=True is required so that SVC exposes predict_proba.
    "SVM": SVC(probability=True, C=1, kernel='rbf', class_weight='balanced', random_state=42),
    "LogisticRegression": LogisticRegression(solver='liblinear', C=1, class_weight='balanced', random_state=42),
    # `use_label_encoder` is no longer passed: current XGBoost ignores it and emits a
    # 'Parameters: { "use_label_encoder" } are not used.' warning on every fit.
    # NOTE(review): scale_pos_weight=5.0 is a fixed positive-class weight, whereas the
    # other estimators use class_weight='balanced' - confirm it matches the class ratio
    # of both targets.
    "XGBoost": XGBClassifier(eval_metric='logloss', n_estimators=100, max_depth=5, scale_pos_weight=5.0, random_state=42)
}


In [5]:
# === 4. Training loop ===
# For every target: encode the labels, fit and save one StandardScaler, then fit each
# estimator on the scaled features and save it together with the median of its
# predicted positive-class probabilities (used as the decision threshold).
#
# Note: the estimators in `models_dict` are refit in place for each target, so after the
# loop they hold the fit for the last target only. The per-target fits live in the .pkl files.
label_codes = {"Low": 0, "High": 1}

for target_key, target_col in targets.items():
    print(f"\n📊 Training models for target: {target_key}")
    y_raw = df[target_col]

    # Numeric label columns are used as-is; anything else is encoded via `label_codes`.
    # Testing "not numeric" instead of `dtype == object` also covers pandas string and
    # categorical dtypes, which would otherwise skip the mapping and reach the
    # estimators as raw text labels.
    if pd.api.types.is_numeric_dtype(y_raw):
        y = y_raw.values
    else:
        y_encoded = y_raw.map(label_codes)

        # Series.map yields NaN for any label missing from the mapping. Fail here with
        # the offending values instead of handing NaN targets to the estimators.
        unmapped = y_encoded.isna()
        if unmapped.any():
            bad_labels = y_raw[unmapped].unique().tolist()
            raise ValueError(
                f"Column {target_col!r} contains labels outside {sorted(label_codes)}: {bad_labels}"
            )
        y = y_encoded.astype("int64").values

    X = df[features]

    # === Fit scaler ===
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Save one common scaler per target
    joblib.dump(scaler, f"scaler_{target_key}.pkl")

    for model_name, model in models_dict.items():
        print(f"🔧 Training {model_name}...")
        model.fit(X_scaled, y)

        # Predict positive-class probabilities on the training data and take their median
        # as the threshold. It is stored as a plain Python float rather than a NumPy scalar.
        proba = model.predict_proba(X_scaled)[:, 1]
        threshold = float(np.median(proba))

        # Save model and threshold
        joblib.dump(model, f"{model_name.lower()}_{target_key}.pkl")
        joblib.dump(threshold, f"threshold_{model_name.lower()}_{target_key}.pkl")

        print(f"✅ Saved model + threshold: {model_name.lower()}_{target_key}.pkl (median threshold: {threshold:.3f})")

print("\n🎉 All models and thresholds saved.")


📊 Training models for target: PLA
🔧 Training RandomForest...
✅ Saved model + threshold: randomforest_PLA.pkl (median threshold: 0.475)
🔧 Training SVM...
✅ Saved model + threshold: svm_PLA.pkl (median threshold: 0.478)
🔧 Training LogisticRegression...
✅ Saved model + threshold: logisticregression_PLA.pkl (median threshold: 0.571)
🔧 Training XGBoost...
✅ Saved model + threshold: xgboost_PLA.pkl (median threshold: 0.823)

📊 Training models for target: PAA
🔧 Training RandomForest...
✅ Saved model + threshold: randomforest_PAA.pkl (median threshold: 0.455)
🔧 Training SVM...
✅ Saved model + threshold: svm_PAA.pkl (median threshold: 0.487)
🔧 Training LogisticRegression...
✅ Saved model + threshold: logisticregression_PAA.pkl (median threshold: 0.426)
🔧 Training XGBoost...
✅ Saved model + threshold: xgboost_PAA.pkl (median threshold: 0.804)

🎉 All models and thresholds saved.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
