# Pitcher Game K

Train a model to predict the number of strikeouts (K's) a starting pitcher will record in a given game against a given opponent.

In [49]:
import pandas as pd

# Read the pitcher game stats (with opponent team stats) and, in the same
# expression, keep only the rows flagged as starting-pitcher appearances.
df = (
    pd.read_csv('../../data/training/pitcher_k_training.csv')
    .loc[lambda games: games['is_starter'] == 1]
)

In [50]:
# Feature columns for modeling, grouped by where they come from.
# The groups are concatenated in the order below, which is the column order of X.

# Pitcher-side columns: handedness, EWMA rates split by batter side, recent workload
pitcher_features = [
    'pitcher_handedness',
    'pit_ewma_k_perc_vs_lhh',
    'pit_ewma_k_perc_vs_rhh',
    'pit_ewma_csw_perc_vs_lhh',
    'pit_ewma_csw_perc_vs_rhh',
    'pit_trailing_5_bf_avg',
    'pit_trailing_5_pitches_avg',
]

# Opponent-side columns: EWMA rates split by pitcher hand
opponent_features = [
    'opp_ewma_k_perc_vs_lhp',
    'opp_ewma_k_perc_vs_rhp',
    'opp_ewma_csw_perc_vs_lhp',
    'opp_ewma_csw_perc_vs_rhp',
    'opp_ewma_contact_perc_vs_lhp',
    'opp_ewma_contact_perc_vs_rhp',
]

# Game-environment columns (presumably weather readings -- confirm against the data source)
environment_features = [
    'condition',
    'temp',
    'wind_speed',
    'wind_direction',
]

feature_cols = [*pitcher_features, *opponent_features, *environment_features]

# Feature matrix (X) and regression target (y)
y = df['strikeouts']
X = df[feature_cols]

In [51]:
# Preview the feature matrix; the rendered output elides the middle rows.
X

Unnamed: 0,pitcher_handedness,pit_ewma_k_perc_vs_lhh,pit_ewma_k_perc_vs_rhh,pit_ewma_csw_perc_vs_lhh,pit_ewma_csw_perc_vs_rhh,pit_trailing_5_bf_avg,pit_trailing_5_pitches_avg,opp_ewma_k_perc_vs_lhp,opp_ewma_k_perc_vs_rhp,opp_ewma_csw_perc_vs_lhp,opp_ewma_csw_perc_vs_rhp,opp_ewma_contact_perc_vs_lhp,opp_ewma_contact_perc_vs_rhp,condition,temp,wind_speed,wind_direction
5,1,0.137942,0.424529,0.163949,0.508851,14.0,47.6,0.222078,0.223292,0.276813,0.261947,0.747721,0.774553,0,72.0,8.0,9
6,0,0.399975,0.224109,0.400266,0.252443,17.0,56.0,0.251322,0.208856,0.280879,0.282128,0.785550,0.787354,0,72.0,8.0,9
8,1,0.316761,0.105911,0.209899,0.115275,13.0,33.0,0.245194,0.236666,0.282271,0.289273,0.743307,0.731709,8,88.0,10.0,9
13,0,0.179273,0.277836,0.277304,0.303599,16.4,55.4,0.301122,0.241208,0.256143,0.284024,0.759980,0.737480,8,88.0,10.0,9
21,1,0.331386,0.167163,0.501526,0.264176,19.0,62.0,0.332466,0.243809,0.264960,0.289045,0.771856,0.740133,0,73.0,9.0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51460,1,0.221583,0.238256,0.282449,0.324801,22.6,87.8,0.311160,0.238803,0.297263,0.255660,0.756030,0.768530,4,66.0,2.0,2
51466,0,0.248617,0.303542,0.231231,0.265571,4.2,18.0,0.279030,0.188738,0.310603,0.270851,0.730496,0.796388,7,73.0,0.0,5
51471,1,0.211879,0.233050,0.282138,0.258394,24.2,97.8,0.308364,0.251683,0.311102,0.252875,0.703606,0.720028,7,73.0,0.0,5
51478,0,0.345989,0.257587,0.348576,0.279731,25.8,94.8,0.326105,0.229839,0.315949,0.251653,0.684327,0.733898,7,73.0,0.0,5


In [52]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

def evaluate_model(model, X, y, n_splits=5, random_state=42, verbose=True):
    """Cross-validate ``model`` with shuffled K-fold splits and report RMSE.

    For every fold the model is re-fit on the training rows and then scored
    (root mean squared error) on both the training rows and the held-out rows.

    Parameters
    ----------
    model : estimator
        Any object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so after the call it holds the fit from the last fold.
    X : pandas.DataFrame or array-like
        Feature matrix, one row per sample.
    y : pandas.Series or array-like
        Target values aligned row-for-row with ``X``.
    n_splits : int, default 5
        Number of folds.
    random_state : int or None, default 42
        Seed forwarded to ``KFold`` for the shuffle. The default keeps the
        seed that used to be hard-coded, so existing calls split identically.
    verbose : bool, default True
        When true, print the per-fold RMSE and the mean/std across folds.

    Returns
    -------
    tuple of (list, list)
        ``(train_scores, val_scores)``: one RMSE per fold, in fold order.

    Notes
    -----
    NOTE(review): rows are shuffled before splitting. If the rows are
    chronological game logs, a time-ordered or grouped split may give a more
    honest estimate -- confirm with the data owner.
    """
    def _rows(data, idx):
        # pandas objects are sliced by position; anything else is treated as an array.
        return data.iloc[idx] if hasattr(data, "iloc") else np.asarray(data)[idx]

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # One RMSE per fold
    train_scores = []
    val_scores = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        X_train, X_val = _rows(X, train_idx), _rows(X, val_idx)
        y_train, y_val = _rows(y, train_idx), _rows(y, val_idx)

        # Re-fit on this fold's training rows (overwrites any previous fit)
        model.fit(X_train, y_train)

        train_preds = model.predict(X_train)
        val_preds = model.predict(X_val)

        train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
        val_rmse = np.sqrt(mean_squared_error(y_val, val_preds))

        train_scores.append(train_rmse)
        val_scores.append(val_rmse)

        if verbose:
            print(f"Fold {fold+1}:")
            print(f"Training RMSE: {train_rmse:.3f}")
            print(f"Validation RMSE: {val_rmse:.3f}")
            print()

    if verbose:
        print("Average scores across folds:")
        print(f"Training RMSE: {np.mean(train_scores):.3f} (+/- {np.std(train_scores):.3f})")
        print(f"Validation RMSE: {np.mean(val_scores):.3f} (+/- {np.std(val_scores):.3f})")

    return train_scores, val_scores

# Baseline run: score an ordinary linear regression with the cross-validation helper,
# then print a separator rule after its report.
linear_model = LinearRegression()
print("Linear Regression Results:")
train_scores, val_scores = evaluate_model(model=linear_model, X=X, y=y)
print("\n" + "=" * 50 + "\n")

Linear Regression Results:
Fold 1:
Training RMSE: 2.275
Validation RMSE: 2.293

Fold 2:
Training RMSE: 2.292
Validation RMSE: 2.226

Fold 3:
Training RMSE: 2.273
Validation RMSE: 2.304

Fold 4:
Training RMSE: 2.273
Validation RMSE: 2.304

Fold 5:
Training RMSE: 2.278
Validation RMSE: 2.284

Average scores across folds:
Training RMSE: 2.278 (+/- 0.007)
Validation RMSE: 2.282 (+/- 0.029)


