In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw games table and print a quick overview of it.
df = pd.read_csv('data/games.csv')
print(f"Dataset shape: {df.shape}")

print("\nFirst few rows:")
print(df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
print("\nColumn info:")
df.info()

# Summary statistics of the three score columns used as targets further down.
print("\nTarget variables stats:")
print(df[['away_score', 'home_score', 'total']].describe())

In [None]:
# Data preprocessing and feature engineering
# Remove rows with missing critical values (either score or either team).
# The result is copied explicitly so the column assignments below write to an
# independent frame instead of relying on chained assignment over a dropna() result.
df_clean = df.dropna(subset=['away_score', 'home_score', 'away_team', 'home_team']).copy()

# Fill missing values in QB and coach names before using them, so that a missing
# name becomes its own 'Unknown' category.
for name_col in ('away_qb_name', 'home_qb_name', 'away_coach', 'home_coach'):
    df_clean[name_col] = df_clean[name_col].fillna('Unknown')

# Extract features - EXPANDED with QB, coach, game type, weekday, division game, and betting odds
features_to_use = ['season', 'week', 'away_rest', 'home_rest', 'temp', 'wind']
categorical_features = ['away_team', 'home_team', 'roof', 'surface',
                        'away_qb_name', 'home_qb_name', 'away_coach', 'home_coach',
                        'game_type', 'weekday', 'div_game']
numeric_features_new = ['over_odds', 'spread_line']  # Betting lines (pre-game consensus)

# Create a working dataframe. The column order of X is:
# features_to_use, then categorical_features, then numeric_features_new.
# X keeps df_clean's row index.
all_features = features_to_use + categorical_features + numeric_features_new
X = df_clean[all_features].copy()
y_away = df_clean['away_score'].copy()
y_home = df_clean['home_score'].copy()
y_total = df_clean['total'].copy()

# Encode categorical variables as integer codes. Values are cast to str first;
# each fitted encoder is kept in le_dict so later cells can encode new rows the same way.
le_dict = {}
for cat_feature in categorical_features:
    le = LabelEncoder()
    X[cat_feature] = le.fit_transform(X[cat_feature].astype(str))
    le_dict[cat_feature] = le

# Fill missing values in numeric features with the column mean.
# NOTE(review): the means are computed over all rows, i.e. before the
# train/test split performed in a later cell.
X = X.fillna(X.mean(numeric_only=True))

print(f"Features shape: {X.shape}")
print(f"Features:\n{X.head()}")
print(f"\nAwayScore - Mean: {y_away.mean():.1f}, Std: {y_away.std():.1f}")
print(f"HomeScore - Mean: {y_home.mean():.1f}, Std: {y_home.std():.1f}")
print(f"Total - Mean: {y_total.mean():.1f}, Std: {y_total.std():.1f}")

In [None]:
# Summary of columns used in the model: lists the numeric inputs, the categorical
# inputs and the targets, then the CSV columns that are not used at all.
print("="*60)
print("MODEL INPUT FEATURES FROM GAMES.CSV")
print("="*60)

# Define all used columns
input_features = features_to_use + categorical_features + numeric_features_new
target_columns = ['away_score', 'home_score', 'total']
numeric_inputs = features_to_use + numeric_features_new

print(f"\nNUMERIC FEATURES ({len(numeric_inputs)}):")
for feat in numeric_inputs:
    print(f"  - {feat}")

print(f"\nCATEGORICAL FEATURES ({len(categorical_features)}):")
for feat in categorical_features:
    print(f"  - {feat}")

print(f"\nTARGET VARIABLES ({len(target_columns)}):")
for target in target_columns:
    print(f"  - {target}")

# Targets are not model inputs, so the feature count is reported on its own and
# the combined features + targets count is labelled as a column count.
print(f"\nTOTAL FEATURES USED: {len(input_features)}")
print(f"TOTAL COLUMNS USED (features + targets): {len(input_features) + len(target_columns)}")

# Show which columns from CSV were NOT used
all_csv_columns = set(df.columns)
used_columns = set(input_features + target_columns)
unused_columns = all_csv_columns - used_columns

if unused_columns:
    print(f"\nCOLUMNS NOT USED IN MODEL ({len(unused_columns)}):")
    for col in sorted(unused_columns):
        print(f"  - {col}")
else:
    print("\nAll columns from CSV were used in the model.")

## Expanded Model Features

This version includes the high and moderate potential features for improved predictions:

### Features Added (v2):
**High Potential Categorical Features:**
- QB names (away_qb_name, home_qb_name) - QB quality is critical to scoring
- Coaches (away_coach, home_coach) - Coaching philosophy affects scoring patterns
- Game type - Playoff games score differently than regular season
- Weekday - Thursday/Monday night games have different scoring than Sunday
- Division game - Rivalry effects may impact scoring

**Moderate Potential Numeric Features:**
- Betting odds (over_odds, spread_line) - Expert consensus on expected scoring

### Model Improvement:
The expanded feature set (19 total: 8 numeric and 11 categorical) should improve CV scores by capturing:
- Individual QB impact (not just team identity)
- Coaching philosophy variance
- Game context (playoff vs regular season)
- Expert market expectations

In [None]:
# Train/test split (80/20)
# A single call partitions the feature matrix and all three targets with the
# same shuffled row selection (fixed seed), so every target stays row-aligned
# with X_train / X_test.
(
    X_train, X_test,
    y_away_train, y_away_test,
    y_home_train, y_home_test,
    y_total_train, y_total_test,
) = train_test_split(X, y_away, y_home, y_total, test_size=0.2, random_state=42)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

In [None]:
# Train one Random Forest and one Gradient Boosting regressor per target
# (away score, home score) and report their hold-out MAE and R².
def _fit_and_report(y_train, y_test):
    """Fit RF and GB regressors on X_train, print their test metrics.

    Returns the fitted Random Forest, the fitted Gradient Boosting model and
    their respective predictions on X_test, in that order.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    forest.fit(X_train, y_train)

    booster = GradientBoostingRegressor(n_estimators=100, random_state=42)
    booster.fit(X_train, y_train)

    forest_pred = forest.predict(X_test)
    booster_pred = booster.predict(X_test)

    for model_label, pred in (("Random Forest", forest_pred), ("Gradient Boosting", booster_pred)):
        mae = mean_absolute_error(y_test, pred)
        r2 = r2_score(y_test, pred)
        print(f"{model_label} - MAE: {mae:.2f}, R²: {r2:.3f}")

    return forest, booster, forest_pred, booster_pred


# Away team score
print("=== AWAY TEAM SCORE ===")
rf_away, gb_away, y_away_pred_rf, y_away_pred_gb = _fit_and_report(y_away_train, y_away_test)

# Home team score
print("\n=== HOME TEAM SCORE ===")
rf_home, gb_home, y_home_pred_rf, y_home_pred_gb = _fit_and_report(y_home_train, y_home_test)

# No model is fitted for the total: it is derived as away + home so the three
# reported numbers always add up.
print("\n=== TOTAL SCORE ===")
print("Total score is calculated as Away + Home predictions (no separate model)")

In [None]:
# Cross-validation scores (5-fold)
from sklearn.model_selection import cross_val_score

divider = "=" * 60


def _cv_neg_mae(model, target):
    """Return the per-fold negative-MAE scores of `model` on (X, target), 5 folds."""
    return cross_val_score(model, X, target, cv=5, scoring='neg_mean_absolute_error')


def _print_cv_pair(rf_scores, gb_scores):
    """Print mean CV MAE (sign flipped back to positive) and its std for RF and GB."""
    for model_label, fold_scores in (("Random Forest", rf_scores), ("Gradient Boosting", gb_scores)):
        print(f"{model_label} - CV MAE: {-fold_scores.mean():.2f} (+/- {fold_scores.std():.2f})")


print(divider)
print("CROSS-VALIDATION SCORES (5-Fold)")
print(divider)

# Away team score
cv_rf_away = _cv_neg_mae(rf_away, y_away)
cv_gb_away = _cv_neg_mae(gb_away, y_away)

print("\n=== AWAY TEAM SCORE ===")
_print_cv_pair(cv_rf_away, cv_gb_away)

# Home team score
cv_rf_home = _cv_neg_mae(rf_home, y_home)
cv_gb_home = _cv_neg_mae(gb_home, y_home)

print("\n=== HOME TEAM SCORE ===")
_print_cv_pair(cv_rf_home, cv_gb_home)

# Summary comparison: one row per (target, model) pair
print("\n" + divider)
print("SUMMARY: Cross-Validation Scores")
print(divider)
print("\n(Total Score calculated as Away + Home, no separate model trained)")

summary_scores = [cv_rf_away, cv_gb_away, cv_rf_home, cv_gb_home]
cv_summary = pd.DataFrame({
    'Target': ['Away Score', 'Away Score', 'Home Score', 'Home Score'],
    'Model': ['RF', 'GB', 'RF', 'GB'],
    'CV MAE': [-fold_scores.mean() for fold_scores in summary_scores],
    'CV Std': [fold_scores.std() for fold_scores in summary_scores],
})
print(cv_summary.to_string(index=False))


=== AWAY TEAM SCORE ===
Random Forest - CV MAE: 7.83 (+/- 0.23)
Gradient Boosting - CV MAE: 7.72 (+/- 0.18)


## Understanding Cross-Validation Results

**CV MAE (Cross-Validation Mean Absolute Error)** measures average prediction error when the model is tested on data it hasn't seen:
- The number is the expected error in points for score predictions
- The ± value (standard deviation) shows consistency across different data splits
  - Smaller ± = more reliable predictions across different game scenarios
  - Larger ± = predictions vary more depending on which games are used

**Why it matters:** CV scores are more trustworthy than test set scores because they test on multiple different data subsets. A model that performs well on CV will generalize better to future games.

**Interpretation of our results:**
- **Gradient Boosting wins** across all targets (lower MAE = better)
- **Consistency:** Home scores are most consistent (±0.14-0.16), suggesting home team conditions are predictable
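- **Difficulty:** Total score is harder to predict (11.3 MAE) vs individual scores (8.1 MAE). Note: the current notebook derives the total as Away + Home and does not train or cross-validate a separate total-score model, so the total figure is not reproduced by the cells above.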
- **Practical:** Predictions will typically be within ±8-11 points of actual scores

In [None]:
# Make prediction for the target game: 2025_22_SEA_NE
target_game_id = '2025_22_SEA_NE'

# Look for the game in the dataset. df_clean only holds rows with both scores
# present, so a game without scores ends up in the "not found" branch.
target_game = df_clean[df_clean['game_id'] == target_game_id]

if not target_game.empty:
    print(f"Found game {target_game_id} in dataset")
    print(target_game[['away_team', 'home_team', 'away_score', 'home_score', 'total']])
else:
    print(f"Game {target_game_id} not in historical data (upcoming game).")
    print(f"Using trained models with synthesized features from recent games (2023+)...\n")
    # For upcoming games not in dataset, we construct the feature vector
    # using recent season average values to estimate typical conditions.

    # Extract game components; the id is read as <season>_<week>_<away>_<home>.
    parts = target_game_id.split('_')
    season = int(parts[0])
    week = int(parts[1])
    away_team = parts[2]
    home_team = parts[3]

    # Get recent data to estimate values
    recent_games = df_clean[df_clean['season'] >= 2023]
    away_avg_rest = recent_games['away_rest'].mean()
    home_avg_rest = recent_games['home_rest'].mean()
    avg_temp = recent_games['temp'].mean()
    avg_wind = recent_games['wind'].mean()

    # Get average betting odds from recent games (constant fallback if the column is absent)
    avg_over_odds = recent_games['over_odds'].mean() if 'over_odds' in recent_games.columns else 45.0
    avg_spread_line = recent_games['spread_line'].mean() if 'spread_line' in recent_games.columns else 0.0

    # Create feature vector for prediction with ALL features including new ones
    prediction_data = {
        'season': [season],
        'week': [week],
        'away_rest': [away_avg_rest],
        'home_rest': [home_avg_rest],
        'temp': [avg_temp],
        'wind': [avg_wind],
        'away_team': [away_team],
        'home_team': [home_team],
        'roof': ['outdoors'],  # Default assumption
        'surface': ['grass'],  # Default assumption
        'away_qb_name': ['Unknown'],  # No specific QB data
        'home_qb_name': ['Unknown'],  # No specific QB data
        'away_coach': ['Unknown'],  # No specific coach data
        'home_coach': ['Unknown'],  # No specific coach data
        'game_type': ['REG'],  # Regular season default
        'weekday': ['Sunday'],  # Most common NFL day
        'div_game': ['0'],  # Not a division game by default (as string for categorical encoding)
        'over_odds': [avg_over_odds],  # Average betting odds
        'spread_line': [avg_spread_line]  # Average spread
    }

    prediction_df = pd.DataFrame(prediction_data)

    # Encode categorical features using the encoders fitted during preprocessing.
    for cat_feature in categorical_features:
        encoder = le_dict[cat_feature]
        value_to_encode = prediction_df[cat_feature].iloc[0]

        # A label the encoder never saw cannot be transformed. The first known
        # class is substituted, and the substitution is reported because it
        # replaces the intended value with an arbitrary one.
        if value_to_encode not in encoder.classes_:
            fallback_value = encoder.classes_[0]
            print(f"WARNING: {cat_feature}='{value_to_encode}' was not seen in training; "
                  f"using '{fallback_value}' instead")
            value_to_encode = fallback_value

        prediction_df[cat_feature] = encoder.transform([value_to_encode])

    # Pin the column order to the training matrix so the models receive the
    # features in the same positions they were fitted on.
    prediction_df = prediction_df[list(X.columns)]

    print(f"\nPrediction features for {target_game_id}:")
    print(prediction_df)

In [None]:
# Generate ensemble predictions for the target game
print(f"\n{'='*60}")
print(f"SCORE PREDICTIONS FOR {target_game_id}")
print(f"{'='*60}")

if not target_game.empty:
    # X was built from df_clean and keeps its row index, so the matching rows of
    # X are the game's features exactly as the models saw them: label-encoded
    # and with missing numeric values already imputed. Taking the raw columns
    # from df_clean instead would pass un-imputed NaNs to the models.
    game_features = X.loc[target_game.index]
else:
    # Synthesized single-row feature frame built in the previous cell.
    game_features = prediction_df

# Get predictions from both models
away_score_rf = rf_away.predict(game_features)[0]
away_score_gb = gb_away.predict(game_features)[0]
away_score_ensemble = (away_score_rf + away_score_gb) / 2

home_score_rf = rf_home.predict(game_features)[0]
home_score_gb = gb_home.predict(game_features)[0]
home_score_ensemble = (home_score_rf + home_score_gb) / 2

# Calculate total as sum of individual scores (mathematical consistency)
total_score_rf = away_score_rf + home_score_rf
total_score_gb = away_score_gb + home_score_gb
total_score_ensemble = away_score_ensemble + home_score_ensemble

# Team labels come from the game id (<season>_<week>_<away>_<home>) so the
# printout stays correct when target_game_id is changed.
id_parts = target_game_id.split('_')
away_label = id_parts[2]
home_label = id_parts[3]

print(f"\n{'ENSEMBLE PREDICTION (Average of RF & GB):':<40}")
print(f"  Away Team ({away_label}) Score: {away_score_ensemble:.1f} points")
print(f"  Home Team ({home_label}) Score:  {home_score_ensemble:.1f} points")
print(f"  Total Score:           {total_score_ensemble:.1f} points (Away + Home)")

In [None]:
# Display model comparison: one table row per predicted quantity, one column
# per model plus the RF/GB average.
# Team abbreviations are the third and fourth '_'-separated fields of the game id.
game_parts = target_game_id.split('_')
away_abbr, home_abbr = game_parts[2], game_parts[3]

section_rule = '=' * 60
print(f"\n{section_rule}")
print("MODEL COMPARISON")
print(section_rule)
print(f"\n{'Metric':<20} {'Random Forest':<20} {'Gradient Boost':<20} {'Ensemble':<15}")
print('-' * 75)

comparison_rows = [
    (away_abbr + ' Score', away_score_rf, away_score_gb, away_score_ensemble),
    (home_abbr + ' Score', home_score_rf, home_score_gb, home_score_ensemble),
    ('Total Score', total_score_rf, total_score_gb, total_score_ensemble),
]
for row_label, rf_value, gb_value, ensemble_value in comparison_rows:
    print(f"{row_label:<20} {rf_value:<20.1f} {gb_value:<20.1f} {ensemble_value:<15.1f}")

In [None]:
# Display feature importance of the two Random Forest models.
print(f"\n{'='*60}")
print("FEATURE IMPORTANCE (Random Forest)")
print(f"{'='*60}")

# feature_importances_ follows the column order of the matrix the forest was
# fitted on. The names are therefore taken from X itself; rebuilding the list by
# hand in a different order attaches importances to the wrong features.
feature_names = list(X.columns)
assert len(feature_names) == len(rf_away.feature_importances_), \
    "feature name list does not match the fitted model's feature count"

rf_importance_away = pd.DataFrame({
    'feature': feature_names,
    'importance': rf_away.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\nAway Score - Top 10 Features:")
print(rf_importance_away.head(10))

rf_importance_home = pd.DataFrame({
    'feature': feature_names,
    'importance': rf_home.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\nHome Score - Top 10 Features:")
print(rf_importance_home.head(10))

print(f"\n(Total Score is calculated as Away + Home, not independently modeled)")