# In [None]:
# Import necessary libraries for data manipulation, modeling, and visualization
import pandas as pd  # For handling data in DataFrames
import numpy as np  # For numerical operations
import xgboost as xgb  # For the XGBoost machine learning algorithm
from xgboost import XGBRegressor, plot_importance  # Specific functions from XGBoost
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV  # For model evaluation and tuning
from sklearn.metrics import mean_absolute_error, make_scorer  # For evaluating model performance
import matplotlib.pyplot as plt  # For plotting graphs
import warnings  # To suppress warnings
warnings.filterwarnings('ignore')  # Ignore warnings for cleaner output

# ---------------------------------------------------------
# Step 1: Load and Inspect the Dataset
# ---------------------------------------------------------

# Read the historical quarterback (QB) statistics into a DataFrame.
# Point 'qb_stats.csv' at your own data file before running.
data = pd.read_csv('qb_stats.csv')

# Preview the leading rows (heading on one line, table below it) to check
# how the dataset is laid out.
print("First few rows of the dataset:", data.head(), sep="\n")

# ---------------------------------------------------------
# Step 2: Data Preprocessing
# ---------------------------------------------------------

# Fill missing values by carrying the last valid observation forward.
# DataFrame.ffill() is used instead of fillna(method='ffill'): the `method`
# keyword of fillna is deprecated in pandas 2.1+, and the blanket
# warnings.filterwarnings('ignore') at the top of this file would hide that
# deprecation until the call stops working.
# NOTE(review): the fill runs over the whole frame in row order, so a gap can be
# filled from the previous row even if that row belongs to a different QB, and
# gaps in the very first row stay NaN — confirm this is acceptable for your data.
data = data.ffill()

# Identify categorical features that need to be converted into numerical format
# For example, 'home_or_away' might be 'Home' or 'Away', and 'opponent_team' is the team the QB played against
categorical_features = ['home_or_away', 'opponent_team']

# Convert categorical variables into dummy/indicator variables
# This creates a new binary column for each category (one-hot encoding)
data = pd.get_dummies(data, columns=categorical_features)

# ---------------------------------------------------------
# Step 3: Feature Engineering
# ---------------------------------------------------------

# Put the games in chronological order before building any time-dependent feature.
# Both the rolling window below and the TimeSeriesSplit used later work on row
# position, so they are only meaningful when rows are ordered by date.
# The dates are parsed only for the sort key (the 'game_date' column itself is left
# unchanged), and mergesort is stable, so games sharing a date keep their file order.
# NOTE(review): assumes 'game_date' holds values pd.to_datetime can parse — confirm.
data = data.sort_values(
    'game_date', key=pd.to_datetime, kind='mergesort'
).reset_index(drop=True)

# Rolling average of passing yards over each QB's PREVIOUS 3 games.
# The shift(1) matters: without it the window would include the current game's
# passing yards, which is exactly the value the model is asked to predict
# (target leakage), and such a feature could never be computed for a game that
# has not been played yet. A QB's first game has no history and is left as NaN,
# which XGBoost treats as a missing value.
data['rolling_avg_passing_yards'] = data.groupby('qb_name')['passing_yards'].transform(
    lambda yards: yards.shift(1).rolling(window=3, min_periods=1).mean())

# If available, you can include additional features like the opponent's defensive ranking
# For example:
# data = data.merge(defensive_rankings, on='opponent_team', how='left')

# ---------------------------------------------------------
# Step 4: Prepare Features and Target Variable
# ---------------------------------------------------------

# Target 'y': the passing yards the model is trained to predict.
y = data['passing_yards']

# Feature matrix 'X': every remaining column once the target ('passing_yards'),
# the player identifier ('qb_name') and the raw 'game_date' column are removed.
X = data.drop(columns=['passing_yards', 'qb_name', 'game_date'])

# ---------------------------------------------------------
# Step 5: Set Up Cross-Validation Strategy
# ---------------------------------------------------------

# Use a 5-fold TimeSeriesSplit so that cross-validation respects the order of the
# rows instead of shuffling them, which is what time series data requires.
tscv = TimeSeriesSplit(n_splits=5)

# ---------------------------------------------------------
# Step 6: Train the XGBoost Model
# ---------------------------------------------------------

# Baseline settings for the regressor, collected in one place.
xgb_settings = {
    'objective': 'reg:squarederror',  # Squared-error regression task
    'n_estimators': 100,              # Number of trees to build
    'learning_rate': 0.05,            # Step size shrinkage to prevent overfitting
    'max_depth': 6,                   # Maximum depth of each tree
    'random_state': 42,               # Seed for reproducibility
}
model = XGBRegressor(**xgb_settings)

# Train on the full feature matrix X against the target y.
model.fit(X, y)

# ---------------------------------------------------------
# Step 7: Evaluate the Model
# ---------------------------------------------------------

# Score with Mean Absolute Error (MAE): the average size of the errors, ignoring
# their sign. greater_is_better=False makes the scorer return negated MAE values,
# which is why the sign is flipped back when the result is printed below.
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# Cross-validate the model on the time-ordered folds defined by tscv.
cv_scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring=mae_scorer,
    cv=tscv,
)

# Report the mean MAE across the folds.
print(f'\nCross-Validated MAE: {-cv_scores.mean():.2f}')

# ---------------------------------------------------------
# Step 8: Hyperparameter Tuning (Optional but Recommended)
# ---------------------------------------------------------

# Candidate values to search over (2 * 3 * 3 * 2 = 36 combinations).
param_grid = dict(
    n_estimators=[100, 200],          # Number of trees
    max_depth=[4, 6, 8],              # Tree depth
    learning_rate=[0.01, 0.05, 0.1],  # Learning rate
    subsample=[0.7, 1.0],             # Subsample ratio
)

# Exhaustive search over the grid, scored with MAE on the time-ordered folds.
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring=mae_scorer,
    cv=tscv,
    n_jobs=-1,  # Use all available CPU cores
)

# Run the search.
grid_search.fit(X, y)

# Report the winning combination.
print(f'\nBest Parameters from Grid Search: {grid_search.best_params_}')

# Keep the estimator built with the best parameters for the prediction steps.
best_model = grid_search.best_estimator_

# ---------------------------------------------------------
# Step 9: Prepare Data for Upcoming Games
# ---------------------------------------------------------

# Load the dataset containing information about upcoming games
# Replace 'upcoming_games.csv' with the path to your actual data file
upcoming_games = pd.read_csv('upcoming_games.csv')

# Forward-fill missing values, as done for the training data.
# DataFrame.ffill() is used because the `method` keyword of fillna is deprecated
# in pandas 2.1+.
upcoming_games = upcoming_games.ffill()

# 'rolling_avg_passing_yards' is engineered from game history in Step 3, so a raw
# schedule file will normally not contain it. Without this step the reindex below
# would silently set it to 0 for every QB. When the column is absent, derive it from
# the mean of each QB's last 3 games in the historical data; a column supplied in
# the file is used as-is. QBs with no history get NaN, which XGBoost treats as missing.
# NOTE(review): tail(3) relies on `data` being in chronological order per QB.
if 'rolling_avg_passing_yards' not in upcoming_games.columns:
    recent_form = data.groupby('qb_name')['passing_yards'].apply(
        lambda yards: yards.tail(3).mean())
    upcoming_games['rolling_avg_passing_yards'] = upcoming_games['qb_name'].map(recent_form)

# Build the model input in a SEPARATE frame. Reindexing to X.columns keeps only the
# training features (missing ones are filled with 0), which drops 'qb_name'; doing
# that on upcoming_games itself made the final report below fail with a KeyError.
upcoming_features = pd.get_dummies(upcoming_games, columns=categorical_features)
upcoming_features = upcoming_features.reindex(columns=X.columns, fill_value=0)

# ---------------------------------------------------------
# Step 10: Make Predictions on Upcoming Games
# ---------------------------------------------------------

# Use the trained model to predict passing yards for the upcoming games
predictions = best_model.predict(upcoming_features)

# Attach the predictions to the upcoming games, which still carry 'qb_name'
upcoming_games['predicted_passing_yards'] = predictions

# Display the predicted passing yards for each QB
print("\nPredicted Passing Yards for Upcoming Games:")
print(upcoming_games[['qb_name', 'predicted_passing_yards']])

# ---------------------------------------------------------
# Step 11: Analyze Feature Importance
# ---------------------------------------------------------

# Plot the feature importance to understand which features contribute most to the model.
# The axes are created explicitly and handed to plot_importance: when no `ax` is
# given, plot_importance opens a figure of its own, so a preceding
# plt.figure(figsize=...) would be left empty and its size would not apply to the chart.
_, importance_ax = plt.subplots(figsize=(12, 8))  # Set the size of the plot
plot_importance(
    best_model,
    ax=importance_ax,
    max_num_features=10,  # Show the top 10 features
    title='Feature Importance',
)
plt.show()
