In [1]:
# Cell 1: Imports and Data Loading
import pandas as pd
import numpy as np
import os
import glob
from datetime import datetime
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import warnings
warnings.filterwarnings('ignore')

def load_data():
    # Find the most recent betting data file
    betting_files = glob.glob('betting_data_*.csv')
    
    if not betting_files:
        # Fallback to the default name if no date-specific files found
        betting_file = 'betting_data.csv'
        print(f"Usando arquivo padrão: {betting_file}")
    else:
        # Sort files by date (assuming format betting_data_YYYY-MM-DD.csv)
        betting_files.sort(key=lambda x: datetime.strptime(x.split('_')[2].split('.')[0], '%Y-%m-%d'), reverse=True)
        betting_file = betting_files[0]
        print(f"Usando arquivo mais recente: {betting_file}")
    
    # Load all datasets
    k_percentage_df = pd.read_csv('team_strikeout_percentage.csv')
    pitcher_data = pd.read_csv('pitchers_data.csv')
    betting_data = pd.read_csv(betting_file)
    
    # Merge and preprocess data
    pitcher_data = pitcher_data.merge(
        betting_data[['Name_abbreviation', 'Team']], 
        left_on='Pitcher', 
        right_on='Name_abbreviation', 
        how='left'
    )
    
    # Feature engineering
    pitcher_data['SO_per_IP'] = pitcher_data['SO'] / pitcher_data['IP']
    pitcher_data['BB_per_IP'] = pitcher_data['BB'] / pitcher_data['IP']
    pitcher_data['K-BB%'] = pitcher_data['SO_per_IP'] - pitcher_data['BB_per_IP']
    
    # Merge with team stats
    pitcher_data = pitcher_data.merge(k_percentage_df, on='Team', how='left')
    pitcher_data.rename(columns={'%K': 'Team_K%'}, inplace=True)
    pitcher_data = pitcher_data.merge(
        k_percentage_df.rename(columns={'%K': 'Opp_K%'}), 
        left_on='Opp', right_on='Team', how='left'
    )
    
    # Handle missing values
    pitcher_data.fillna({
        'SO_per_IP': pitcher_data['SO_per_IP'].mean(),
        'BB_per_IP': pitcher_data['BB_per_IP'].mean(),
        'Team_K%': pitcher_data['Team_K%'].mean(),
        'Opp_K%': pitcher_data['Opp_K%'].mean()
    }, inplace=True)
    
    return pitcher_data, k_percentage_df, betting_file

pitchers_df, k_percentage_df, betting_file_used = load_data()

Usando arquivo mais recente: betting_data_2025-03-30.csv


In [7]:
# Cell 2: Performance Calculation Function
def calculate_weighted_performance(pitcher_data, current_season, last_season=None):
    current_season_data = pitcher_data[pitcher_data['Season'] == current_season]
    last_5_games = current_season_data.tail(5)
    last_10_games = current_season_data.tail(10)
    
    # Calculate rolling averages
    current_season_data['SO_rolling_5'] = current_season_data['SO'].rolling(5).mean()
    current_season_data['SO_rolling_10'] = current_season_data['SO'].rolling(10).mean()
    
    # Calculate home/away splits
    home_stats = current_season_data[current_season_data['Home'] == 1.0].mean(numeric_only=True)
    away_stats = current_season_data[current_season_data['Home'] == 0.0].mean(numeric_only=True)

    if last_season is not None:
        last_season_data = pitcher_data[pitcher_data['Season'] == last_season]
        weight_current_season = 0.40
        weight_last_5_games = 0.25
        weight_last_10_games = 0.15
        weight_last_season = 0.20
    else:
        last_season_data = pd.DataFrame()
        weight_current_season = 0.50
        weight_last_5_games = 0.30
        weight_last_10_games = 0.20
        weight_last_season = 0.0

    metrics = ['IP', 'H', 'BB', 'ERA', 'FIP', 'SO', 'SO_rolling_5', 'SO_rolling_10']
    weighted_values = {}

    for metric in metrics:
        current_mean = current_season_data[metric].mean() if not current_season_data.empty else 0
        last_5_mean = last_5_games[metric].mean() if not last_5_games.empty else 0
        last_10_mean = last_10_games[metric].mean() if not last_10_games.empty else 0
        last_season_mean = last_season_data[metric].mean() if not last_season_data.empty else 0

        weighted_values[metric] = (
            weight_current_season * current_mean +
            weight_last_5_games * last_5_mean +
            weight_last_10_games * last_10_mean +
            weight_last_season * last_season_mean
        )
    
    # Add home/away splits
    weighted_values['Home_IP'] = home_stats.get('IP', 0)
    weighted_values['Away_IP'] = away_stats.get('IP', 0)
    weighted_values['Home_SO'] = home_stats.get('SO', 0)
    weighted_values['Away_SO'] = away_stats.get('SO', 0)
    weighted_values['Home'] = current_season_data['Home'].mean()
    
    return weighted_values

In [13]:
from sklearn.preprocessing import RobustScaler
import warnings

warnings.filterwarnings("ignore", category=UserWarning, module="lightgbm")

def train_model(pitchers_df, k_percentage_df):
    # Calculate weighted performance for each pitcher
    weighted_pitcher_data = []
    pitchers_df = pitchers_df[(pitchers_df['Season'] == 2023) | (pitchers_df['Season'] == 2024)]

    for pitcher in pitchers_df['Pitcher'].unique():
        pitcher_data = pitchers_df[pitchers_df['Pitcher'] == pitcher].copy()
        
        if 'SO' not in pitcher_data.columns:
            pitcher_data['SO'] = 0
            
        pitcher_data = pitcher_data.sort_values('Season')
        pitcher_data['SO_rolling_5'] = pitcher_data['SO'].rolling(5, min_periods=1).mean()
        pitcher_data['SO_rolling_10'] = pitcher_data['SO'].rolling(10, min_periods=1).mean()
        
        pitcher_data['Home_IP'] = pitcher_data[pitcher_data['Home'] == 1.0]['IP'].mean()
        pitcher_data['Away_IP'] = pitcher_data[pitcher_data['Home'] == 0.0]['IP'].mean()
        pitcher_data['Home_SO'] = pitcher_data[pitcher_data['Home'] == 1.0]['SO'].mean()
        pitcher_data['Away_SO'] = pitcher_data[pitcher_data['Home'] == 0.0]['SO'].mean()
        
        performance = calculate_weighted_performance(pitcher_data, current_season=2024, last_season=2023)
        performance['Pitcher'] = pitcher
        performance['Opp_K%'] = pitcher_data['Opp_K%'].iloc[0] if not pitcher_data.empty else k_percentage_df['%K'].mean()
        performance['Team_K%'] = pitcher_data['Team_K%'].iloc[0] if not pitcher_data.empty else k_percentage_df['%K'].mean()
        weighted_pitcher_data.append(performance)

    weighted_df = pd.DataFrame(weighted_pitcher_data)
    
    weighted_df['IP'] = weighted_df['IP'].replace(0, 1)
    weighted_df['SO_per_IP'] = weighted_df['SO'] / weighted_df['IP']
    weighted_df['BB_per_IP'] = weighted_df['BB'] / weighted_df['IP']
    weighted_df['K-BB%'] = weighted_df['SO_per_IP'] - weighted_df['BB_per_IP']
    
    required_features = [
        'IP', 'H', 'BB', 'ERA', 'FIP', 'SO_per_IP', 'BB_per_IP', 'K-BB%', 
        'Opp_K%', 'Team_K%', 'Home', 'SO_rolling_5', 'SO_rolling_10',
        'Home_IP', 'Away_IP', 'Home_SO', 'Away_SO'
    ]
    
    for feature in required_features:
        if feature not in weighted_df.columns:
            weighted_df[feature] = 0
            print(f"Warning: Initialized missing feature {feature} with zeros")
    
    X = weighted_df[required_features].fillna(0)
    y = weighted_df['SO']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    models = {
        'RandomForest': RandomForestRegressor(random_state=42),
        'XGBoost': XGBRegressor(random_state=42),
        'LightGBM': LGBMRegressor(random_state=42, num_leaves=31, min_data_in_leaf=1, max_depth=-1, verbose=-1),
        'GradientBoosting': GradientBoostingRegressor(random_state=42)
    }

    best_model = None
    best_score = -np.inf
    best_model_name = ""
    results = {}

    for name, model in models.items():
        pipeline = make_pipeline(RobustScaler(), model)
        
        scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
        avg_score = np.mean(scores)
        
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        test_r2 = r2_score(y_test, y_pred)
        test_mae = mean_absolute_error(y_test, y_pred)
        
        results[name] = {
            'CV_R2': avg_score,
            'Test_R2': test_r2,
            'Test_MAE': test_mae
        }
        
        print(f"{name:15} | CV R2: {avg_score:.4f} | Test R2: {test_r2:.4f} | Test MAE: {test_mae:.4f}")
        
        if avg_score > best_score:
            best_score = avg_score
            best_model = pipeline
            best_model_name = name

    print(f"\nBest model: {best_model_name} with R2 score: {best_score:.4f}")
    return best_model, results

model, model_results = train_model(pitchers_df, k_percentage_df)

RandomForest    | CV R2: 0.4990 | Test R2: 0.7676 | Test MAE: 0.6934
XGBoost         | CV R2: -0.4037 | Test R2: 0.5750 | Test MAE: 0.7233
LightGBM        | CV R2: -2.3210 | Test R2: 0.9730 | Test MAE: 0.2524
GradientBoosting | CV R2: -0.5121 | Test R2: 0.7739 | Test MAE: 0.5503

Best model: RandomForest with R2 score: 0.4990


In [5]:
def predict_strikeouts_with_confidence(model, pitchers_df, k_percentage_df, pitcher_name, opponent_team, strikeout_line):
    """
    Make prediction with confidence metrics for a pitcher vs opponent.
    
    Args:
        model: Trained ML pipeline
        pitchers_df: Pitchers historical data
        k_percentage_df: Team strikeout percentages
        pitcher_name: Pitcher abbreviation
        opponent_team: Opponent team abbreviation
        strikeout_line: Betting line for strikeouts
        
    Returns:
        Dictionary with prediction, recommendation and confidence metrics
        or None if prediction couldn't be made
    """
    # Create a copy to avoid SettingWithCopyWarning
    pitcher_data = pitchers_df[pitchers_df['Pitcher'] == pitcher_name].copy()
    
    if pitcher_data.empty:
        print(f"No data found for pitcher: {pitcher_name}")
        return None

    # Ensure we have required columns for rolling calculations
    required_columns = ['Season', 'SO', 'IP', 'Home']
    for col in required_columns:
        if col not in pitcher_data.columns:
            pitcher_data[col] = 0  # Initialize with default value
            
    # Calculate rolling features with proper sorting
    pitcher_data = pitcher_data.sort_values('Season')
    pitcher_data['SO_rolling_5'] = pitcher_data['SO'].rolling(5, min_periods=1).mean()
    pitcher_data['SO_rolling_10'] = pitcher_data['SO'].rolling(10, min_periods=1).mean()
    
    # Calculate home/away splits
    pitcher_data['Home_IP'] = pitcher_data[pitcher_data['Home'] == 1.0]['IP'].mean()
    pitcher_data['Away_IP'] = pitcher_data[pitcher_data['Home'] == 0.0]['IP'].mean()
    pitcher_data['Home_SO'] = pitcher_data[pitcher_data['Home'] == 1.0]['SO'].mean()
    pitcher_data['Away_SO'] = pitcher_data[pitcher_data['Home'] == 0.0]['SO'].mean()

    # Get opponent strikeout rate
    opponent_k = k_percentage_df.loc[k_percentage_df['Team'] == opponent_team, '%K'].mean()
    if np.isnan(opponent_k):
        opponent_k = k_percentage_df['%K'].mean()  # Fallback to league average
    
    # Calculate weighted performance metrics
    performance = calculate_weighted_performance(
        pitcher_data=pitcher_data,
        current_season=2024,
        last_season=2023 if not pitcher_data[pitcher_data['Season'] == 2023].empty else None
    )
    
    # Prepare features DataFrame
    features = pd.DataFrame([performance])
    
    # Add derived features with safety checks
    features['IP'] = features['IP'].replace(0, 1)  # Avoid division by zero
    features['SO_per_IP'] = features['SO'] / features['IP']
    features['BB_per_IP'] = features['BB'] / features['IP']
    features['K-BB%'] = features['SO_per_IP'] - features['BB_per_IP']
    features['Opp_K%'] = opponent_k
    features['Team_K%'] = pitcher_data['Team_K%'].iloc[0] if not pitcher_data.empty else k_percentage_df['%K'].mean()
    
    # Define and validate all required features
    model_features = [
        'IP', 'H', 'BB', 'ERA', 'FIP', 'SO_per_IP', 'BB_per_IP', 'K-BB%', 
        'Opp_K%', 'Team_K%', 'Home', 'SO_rolling_5', 'SO_rolling_10',
        'Home_IP', 'Away_IP', 'Home_SO', 'Away_SO'
    ]
    
    # Ensure all features exist in the DataFrame
    for feature in model_features:
        if feature not in features.columns:
            features[feature] = 0  # Initialize missing features with 0
            print(f"Warning: Initialized missing feature {feature} with zeros")
    
    input_features = features[model_features].fillna(0)
    
    try:
        # Make prediction
        predicted_strikeouts = model.predict(input_features)[0]
        
        # Calculate confidence metrics
        if hasattr(model, 'named_steps') and 'randomforestregressor' in model.named_steps:
            # For RandomForest - use tree variance
            predictions = [tree.predict(input_features) for tree in 
                         model.named_steps.randomforestregressor.estimators_]
            std_dev = np.std(predictions)
            confidence = max(0, min(1 - (std_dev / 3), 1))
        else:
            # For other models - use simple confidence based on line proximity
            std_dev = 0
            confidence = 0.8 - (abs(predicted_strikeouts - strikeout_line) / 10)
            confidence = max(0, min(confidence, 1))
        
        # Determine recommendation
        recommended_side = "Over" if predicted_strikeouts > strikeout_line else "Under"
        
        return {
            'predicted_value': float(predicted_strikeouts),
            'recommended_side': recommended_side,
            'confidence_percentage': float(confidence * 100),
            'std_dev': float(std_dev)
        }
        
    except Exception as e:
        print(f"Prediction failed for {pitcher_name}: {str(e)}")
        return None
    

In [21]:
def process_betting_data(model, pitchers_df, k_percentage_df, betting_data_path, output_dir=None):
    """
    Process betting data and add predictions using the trained model.
    
    Args:
        model: Trained ML pipeline
        pitchers_df: Pitchers historical data
        k_percentage_df: Team strikeout percentages
        betting_data_path: Path to betting data CSV
        output_dir: Directory to save the output file (defaults to current directory)
        
    Returns:
        Updated betting DataFrame with predictions (only rows with valid predictions and selected columns)
    """
    import pandas as pd
    import numpy as np
    from datetime import datetime
    import os
    
    # Load betting data
    betting_data = pd.read_csv(betting_data_path)
    
    # Initialize prediction columns
    betting_data['ML Strikeout Line'] = (betting_data['Over Line'] + betting_data['Under Line']) / 2
    betting_data['ML Predict Value'] = np.nan
    betting_data['ML Recommend Side'] = np.nan
    betting_data['ML Confidence Percentage'] = np.nan
    betting_data['Pitcher 2023'] = False
    
    def has_2023_data(pitcher_name):
        """Check if pitcher has 2023 data available."""
        return not pitchers_df[(pitchers_df['Pitcher'] == pitcher_name) & 
                             (pitchers_df['Season'] == 2023)].empty
    
    for index, row in betting_data.iterrows():
        pitcher_name = row['Name_abbreviation']
        opponent_team = row['Opponent']
        strikeout_line = row['ML Strikeout Line']
        
        pitcher_2023 = has_2023_data(pitcher_name)
        betting_data.at[index, 'Pitcher 2023'] = pitcher_2023
        
        # Make prediction
        result = predict_strikeouts_with_confidence(
            model=model,
            pitchers_df=pitchers_df,
            k_percentage_df=k_percentage_df,
            pitcher_name=pitcher_name,
            opponent_team=opponent_team,
            strikeout_line=strikeout_line
        )
        
        if result:
            betting_data.at[index, 'ML Predict Value'] = result['predicted_value']
            betting_data.at[index, 'ML Recommend Side'] = result['recommended_side']
            betting_data.at[index, 'ML Confidence Percentage'] = result['confidence_percentage']
            
            # Print progress
            print(f"{pitcher_name} vs {opponent_team}: "
                  f"Line {strikeout_line:.1f} → Pred {result['predicted_value']:.1f} "
                  f"({result['recommended_side']}, {result['confidence_percentage']:.0f}%)")
        else:
            print(f"⚠️ No prediction for {pitcher_name} vs {opponent_team} - missing data")
    
    # FILTRAR APENAS LINHAS COM PREVISÕES VÁLIDAS
    filtered_data = betting_data.dropna(subset=['ML Predict Value'])
    
    # REMOVER COLUNAS INDESEJADAS
    columns_to_drop = ['Opponent', 'Home Team', 'Away Team', 'Probability', 'Bet Rating']
    columns_to_keep = [col for col in filtered_data.columns if col not in columns_to_drop]
    filtered_data = filtered_data[columns_to_keep]
    
    # Extract date from input filename or use current date
    import re
    date_match = re.search(r'xPbetting_data_(\d{4}-\d{2}-\d{2})\.csv', betting_data_path)
    if date_match:
        file_date = date_match.group(1)
    else:
        file_date = datetime.now().strftime("%Y-%m-%d")
    
    # Create output filename
    output_filename = f"betting_data_predicted_{file_date}.csv"
    
    # Set output path
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, output_filename)
    else:
        output_path = output_filename
    
    # Save the filtered data
    filtered_data.to_csv(output_path, index=False)
    print(f"\nSaved predictions to {output_path}")
    print(f"Original rows: {len(betting_data)} | Filtered rows with predictions: {len(filtered_data)}")
    print(f"Columns kept: {len(columns_to_keep)} | Columns removed: {len(columns_to_drop)}")
    
    return filtered_data

In [22]:
# Usar o mesmo arquivo que foi carregado inicialmente
results_df = process_betting_data(
    model=model,
    pitchers_df=pitchers_df,
    k_percentage_df=k_percentage_df,
    betting_data_path=betting_file_used,  # usando a variável da célula 1
    output_dir='predictions'
)

No data found for pitcher: degroja
⚠️ No prediction for degroja vs nan - missing data
oberba vs nan: Line 4.8 → Pred 5.6 (Over, 77%)
bibeeta vs nan: Line 4.5 → Pred 5.9 (Over, 75%)
meyerma vs nan: Line 4.0 → Pred 3.9 (Under, 75%)
nolaaa vs nan: Line 5.2 → Pred 5.7 (Over, 77%)
bradlta vs nan: Line 6.5 → Pred 6.0 (Under, 75%)
heanean vs nan: Line 4.2 → Pred 4.8 (Over, 75%)
No data found for pitcher: suganto
⚠️ No prediction for suganto vs nan - missing data
kochaja vs nan: Line 2.8 → Pred 3.5 (Over, 77%)
pivetni vs nan: Line 5.5 → Pred 5.7 (Over, 76%)
boydma vs nan: Line 4.5 → Pred 5.5 (Over, 74%)
searsjp vs nan: Line 4.2 → Pred 3.9 (Under, 75%)
woobr vs nan: Line 5.5 → Pred 4.4 (Under, 77%)
pallaan vs nan: Line 4.0 → Pred 2.8 (Under, 75%)
martida vs nan: Line 4.2 → Pred 4.1 (Under, 75%)
No data found for pitcher: rodried
⚠️ No prediction for rodried vs nan - missing data
stromma vs nan: Line 3.8 → Pred 4.0 (Over, 75%)
wachami vs nan: Line 4.5 → Pred 4.8 (Over, 75%)
civalaa vs nan: Line 