# Pitcher Game K

Train a model to predict the number of K's a pitcher will get in a given game against a given opponent

In [33]:
import pandas as pd

# Load the pitcher game stats with opponent team stats
df = pd.read_csv('../../data/processed/pitcher_game_opponent_stats.csv')

In [34]:
# Define feature columns for modeling
feature_cols = [
    'height_inches',
    'weight',
    'throws',
    'age',
    'pit_ewma_k_pct',
    'pit_ewma_csw_pct',
    'opp',
    'opp_ewma_k_perc_vs_lhp',
    'opp_ewma_k_perc_vs_rhp',
    'opp_ewma_csw_perc_vs_rhp',
    'opp_ewma_csw_perc_vs_rhp',
    'condition',
    'temp',
    'wind_speed',
    'wind_direction',
]

# Extract features (X) and target variable (y)
X = df[feature_cols]
y = df['strikeouts']

In [36]:
X

Unnamed: 0,height_inches,weight,throws,age,pit_ewma_k_pct,pit_ewma_csw_pct,opp,opp_ewma_k_perc_vs_lhp,opp_ewma_k_perc_vs_rhp,opp_ewma_csw_perc_vs_rhp,opp_ewma_csw_perc_vs_rhp.1,condition,temp,wind_speed,wind_direction
0,79,230,1,40.561259,0.125000,0.300000,10,0.335252,0.418919,0.359181,0.359181,4,81,16,9
1,79,230,1,40.574949,0.190592,0.273763,10,0.312296,0.373455,0.404178,0.404178,7,79,6,4
2,79,230,1,40.676249,0.200968,0.281293,11,0.190137,0.222942,0.261990,0.261990,3,54,9,8
3,79,230,1,40.706366,0.182165,0.279434,24,0.234687,0.183608,0.228576,0.228576,4,79,10,7
4,79,230,1,40.750171,0.163086,0.276535,22,0.209629,0.245425,0.291892,0.291892,3,88,11,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7841,70,176,1,25.828884,0.280073,0.303608,11,0.108652,0.187423,0.263842,0.263842,0,77,9,4
7842,70,176,1,26.067077,0.261681,0.288778,4,0.234981,0.218052,0.274452,0.274452,0,75,6,5
7843,70,176,1,26.099932,0.279911,0.294923,7,0.386263,0.260813,0.263240,0.263240,7,81,4,4
7844,70,176,1,26.193018,0.261579,0.278716,18,0.228482,0.237869,0.305892,0.305892,4,77,5,4


In [35]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

def evaluate_model(model, X, y, n_splits=5):
    # Create KFold object
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # Initialize lists to store metrics
    train_scores = []
    val_scores = []
    
    # Train and evaluate model on each fold
    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        # Split data
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        # Train model
        model.fit(X_train, y_train)
        
        # Make predictions
        train_preds = model.predict(X_train)
        val_preds = model.predict(X_val)
        
        # Calculate RMSE
        train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
        val_rmse = np.sqrt(mean_squared_error(y_val, val_preds))
        
        train_scores.append(train_rmse)
        val_scores.append(val_rmse)
        
        print(f"Fold {fold+1}:")
        print(f"Training RMSE: {train_rmse:.3f}")
        print(f"Validation RMSE: {val_rmse:.3f}")
        print()
    
    print("Average scores across folds:")
    print(f"Training RMSE: {np.mean(train_scores):.3f} (+/- {np.std(train_scores):.3f})")
    print(f"Validation RMSE: {np.mean(val_scores):.3f} (+/- {np.std(val_scores):.3f})")
    
    return train_scores, val_scores

# Evaluate linear regression model
print("Linear Regression Results:")
linear_model = LinearRegression()
train_scores, val_scores = evaluate_model(linear_model, X, y)
print("\n" + "="*50 + "\n")

# Evaluate Ridge regression model
print("Ridge Regression Results:")
ridge_model = Ridge(alpha=1.0)
train_scores, val_scores = evaluate_model(ridge_model, X, y)
print("\n" + "="*50 + "\n")

# Evaluate Random Forest model
print("Random Forest Results:")
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
train_scores, val_scores = evaluate_model(rf_model, X, y)


Linear Regression Results:
Fold 1:
Training RMSE: 2.397
Validation RMSE: 2.396

Fold 2:
Training RMSE: 2.405
Validation RMSE: 2.365

Fold 3:
Training RMSE: 2.389
Validation RMSE: 2.427

Fold 4:
Training RMSE: 2.390
Validation RMSE: 2.425

Fold 5:
Training RMSE: 2.399
Validation RMSE: 2.390

Average scores across folds:
Training RMSE: 2.396 (+/- 0.006)
Validation RMSE: 2.401 (+/- 0.023)


Ridge Regression Results:
Fold 1:
Training RMSE: 2.399
Validation RMSE: 2.398

Fold 2:
Training RMSE: 2.406
Validation RMSE: 2.363

Fold 3:
Training RMSE: 2.391
Validation RMSE: 2.430

Fold 4:
Training RMSE: 2.391
Validation RMSE: 2.428

Fold 5:
Training RMSE: 2.400
Validation RMSE: 2.389

Average scores across folds:
Training RMSE: 2.397 (+/- 0.006)
Validation RMSE: 2.402 (+/- 0.025)


Random Forest Results:
Fold 1:
Training RMSE: 0.889
Validation RMSE: 2.389

Fold 2:
Training RMSE: 0.896
Validation RMSE: 2.361

Fold 3:
Training RMSE: 0.894
Validation RMSE: 2.401

Fold 4:
Training RMSE: 0.889
Validati