In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Função para calcular o desempenho ponderado do pitcher
def calculate_weighted_performance(pitcher_data, current_season, last_season=None):
    current_season_data = pitcher_data[pitcher_data['Season'] == current_season]
    last_5_games = current_season_data.tail(5)

    if last_season is not None:
        last_season_data = pitcher_data[pitcher_data['Season'] == last_season]
        weight_current_season = 0.40
        weight_last_5_games = 0.30
        weight_last_season = 0.30
    else:
        last_season_data = pd.DataFrame()
        weight_current_season = 0.65
        weight_last_5_games = 0.35
        weight_last_season = 0.0

    def weighted_mean(data, weights):
        return np.average(data, weights=weights)

    metrics = ['IP', 'H', 'BB', 'ERA', 'FIP', 'SO']
    weighted_values = {}

    for metric in metrics:
        current_mean = current_season_data[metric].mean() if not current_season_data.empty else 0
        last_5_mean = last_5_games[metric].mean() if not last_5_games.empty else 0
        last_season_mean = last_season_data[metric].mean() if not last_season_data.empty else 0

        weighted_values[metric] = (
            weight_current_season * current_mean +
            weight_last_5_games * last_5_mean +
            weight_last_season * last_season_mean
        )
    
    return weighted_values

def load_data():
    pitchers_df = pd.DataFrame()

    # Carregar %K dos times
    k_percentage_df = pd.read_csv('team_strikeout_percentage.csv')

    # Lendo dados de pitchers
    pitcher_file = 'pitchers_data.csv'
    pitcher_data = pd.read_csv(pitcher_file)

    # Lendo dados de apostas
    betting_file = 'betting_data.csv'
    betting_data = pd.read_csv(betting_file)

    # Merge da betting_data com pitchers_data
    pitcher_data = pitcher_data.merge(betting_data[['Name_abbreviation', 'Team']], 
                                       left_on='Pitcher', 
                                       right_on='Name_abbreviation', 
                                       how='left')

    # Tratar possíveis valores ausentes, se necessário
    pitcher_data.fillna(0, inplace=True)  # Substitui NaNs por 0 ou outra lógica, se preferir

    pitchers_df = pd.concat([pitchers_df, pitcher_data], ignore_index=True)

    # Merge da %K com pitchers_df
    pitchers_df = pitchers_df.merge(k_percentage_df, on='Team', how='left')

    # Tratar possíveis valores ausentes após a mesclagem
    pitchers_df.fillna(0, inplace=True)

    return pitchers_df, k_percentage_df  # Retorna pitchers_df e k_percentage_df

# Função atualizada para treinar o modelo
def train_model(pitchers_df, k_percentage_df):
    if pitchers_df.empty:
        print("Não há dados de pitchers disponíveis para treinamento.")
        return None

    # Merge dos dados do pitcher com a %K média do time oponente
    batting_avg_k = k_percentage_df.copy()  # Usando o k_percentage_df diretamente
    batting_avg_k.rename(columns={'%K': '%K_Opponent'}, inplace=True)

    # Merge dos dados do pitcher com a %K média do time oponente
    pitchers_df = pitchers_df.merge(batting_avg_k, left_on='Opp', right_on='Team', how='left')

    # Calcular a performance ponderada
    weighted_pitcher_data = []
    for pitcher in pitchers_df['Pitcher'].unique():
        pitcher_data = pitchers_df[pitchers_df['Pitcher'] == pitcher]
        performance = calculate_weighted_performance(pitcher_data, current_season=2024, last_season=2023)
        performance['Pitcher'] = pitcher
        performance['%K_Opponent'] = pitcher_data['%K_Opponent'].iloc[0]
        weighted_pitcher_data.append(performance)

    weighted_df = pd.DataFrame(weighted_pitcher_data)

    # Definindo features e target
    X = weighted_df[['IP', 'H', 'BB', 'ERA', 'FIP', '%K_Opponent']].fillna(0)
    y = weighted_df['SO'] 
    
    # Dividir os dados em treino e teste
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Otimizar o modelo usando GridSearchCV para ajustar parâmetros do RandomForest
    pipeline = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=42))

    # Parâmetros para otimização
    param_grid = {
        'randomforestregressor__n_estimators': [100, 200],
        'randomforestregressor__max_depth': [10, 20, None],
        'randomforestregressor__min_samples_split': [2, 5],
    }

    # Usando validação cruzada e busca de grid
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    print(f"Melhores parâmetros: {grid_search.best_params_}")
    print(f"Acurácia média com validação cruzada: {-grid_search.best_score_:.2f}")

    return grid_search.best_estimator_

# Função para prever strikeouts
def predict_strikeouts_with_confidence(pipeline, pitchers_df, k_percentage_df, pitcher_name, opponent_team, strikeout_line):
    pitcher_data = pitchers_df[pitchers_df['Pitcher'] == pitcher_name]
    
    if pitcher_data.empty:
        return None

    opponent_k = k_percentage_df[k_percentage_df['Team'] == opponent_team]['%K'].mean()
    if np.isnan(opponent_k):
        return None

    performance = calculate_weighted_performance(pitcher_data, current_season=2024, last_season=2023)
    features = pd.DataFrame([performance])
    
    if 'SO' in features.columns:
        features = features.drop(columns=['SO'])
    
    features['%K_Opponent'] = opponent_k

    # Prevendo os strikeouts e calculando variância para confiança
    predicted_strikeouts = pipeline.predict(features)
    
    if isinstance(pipeline.named_steps['randomforestregressor'], RandomForestRegressor):
        # Extraindo variância das previsões das árvores
        predictions_per_tree = np.array([tree.predict(features) for tree in pipeline.named_steps['randomforestregressor'].estimators_])
        variance = np.var(predictions_per_tree, axis=0)
        confidence = 1 - variance / np.mean(predictions_per_tree)  # Confiança inversamente proporcional à variância
        confidence_percentage = max(min(confidence[0], 1), 0) * 100  # Converte para percentual e limita entre 0 e 100
    else:
        confidence_percentage = np.nan  # Caso não seja um RandomForest, a confiança não pode ser calculada

    recommended_side = "Over" if predicted_strikeouts[0] > strikeout_line else "Under"
    
    return {
        'predicted_value': predicted_strikeouts[0],
        'recommended_side': recommended_side,
        'confidence_percentage': confidence_percentage
    }

# Função para verificar se o pitcher tem dados de 2023
def has_2023_data(pitcher_data):
    return not pitcher_data[pitcher_data['Season'] == 2023].empty

# Carregar dados
pitchers_df, k_percentage_df = load_data()
# Treinando o modelo
pipeline = train_model(pitchers_df, k_percentage_df)


Melhores parâmetros: {'randomforestregressor__max_depth': 10, 'randomforestregressor__min_samples_split': 2, 'randomforestregressor__n_estimators': 200}
Acurácia média com validação cruzada: 0.40


In [6]:
import pandas as pd

betting_data = pd.read_csv('betting_data.csv')

# Preencher as variáveis para previsão
for index, row in betting_data.iterrows():
    pitcher_name = row['Name_abbreviation']  # Nome do pitcher
    opponent_team = row['Opponent']          # Time oponente
    strikeout_line = (row['Over Line'] + row['Under Line']) / 2  # Média da linha de strikeouts

    # Verificar se o pitcher tem dados de 2023
    pitcher_data = pitchers_df[pitchers_df['Pitcher'] == pitcher_name]
    pitcher_2023 = has_2023_data(pitcher_data)

    # Prevendo os strikeouts usando o modelo de ML
    result = predict_strikeouts_with_confidence(pipeline, pitchers_df, k_percentage_df, pitcher_name, opponent_team, strikeout_line)

    # Tratamento para garantir valores numéricos ou categóricos corretos
    if result is None:
        predicted_value = np.nan
        recommended_side = np.nan
        confidence_percentage = np.nan
    else:
        predicted_value = result.get('predicted_value', np.nan)
        recommended_side = result.get('recommended_side', np.nan)
        confidence_percentage = result.get('confidence_percentage', np.nan)

    # Preenchendo as colunas com os resultados, garantindo o tipo correto
    betting_data.at[index, 'ML Strikeout Line'] = strikeout_line
    betting_data.at[index, 'ML Predict Value'] = predicted_value
    betting_data.at[index, 'ML Recommend Side'] = recommended_side
    betting_data.at[index, 'ML Confidence Percentage'] = confidence_percentage  # Adiciona a confiança percentual
    betting_data.at[index, 'Pitcher 2023'] = pitcher_2023  # Adiciona a coluna booleana 'Pitcher 2023'

    # Exibir resultado (opcional)
    print(f"Pitcher: {pitcher_name}, Opponent: {opponent_team}, Strikeout Line: {strikeout_line:.2f} -> Predicted: {predicted_value}, Recommended: {recommended_side}, Confidence: {confidence_percentage:.2f}%, Pitcher 2023: {pitcher_2023}")

# Salvar o arquivo atualizado
betting_data.to_csv('betting_data.csv', index=False)



Pitcher: burneco, Opponent: KCR, Strikeout Line: 5.00 -> Predicted: 5.644153981942581, Recommended: Over, Confidence: 94.69%, Pitcher 2023: True
Pitcher: skubata, Opponent: HOU, Strikeout Line: 6.00 -> Predicted: 5.683862533394307, Recommended: Under, Confidence: 94.75%, Pitcher 2023: True
Pitcher: kingmi, Opponent: ATL, Strikeout Line: 5.50 -> Predicted: 5.4725398096285565, Recommended: Under, Confidence: 94.69%, Pitcher 2023: True
Pitcher: raganco, Opponent: BAL, Strikeout Line: 6.00 -> Predicted: 6.059873425300092, Recommended: Over, Confidence: 94.69%, Pitcher 2023: True
Pitcher: severlu, Opponent: MIL, Strikeout Line: 5.00 -> Predicted: 5.455930398153936, Recommended: Over, Confidence: 94.69%, Pitcher 2023: True
Pitcher: valdefr, Opponent: DET, Strikeout Line: 6.50 -> Predicted: 5.976315511506392, Recommended: Under, Confidence: 94.69%, Pitcher 2023: True
Pitcher: peralfr, Opponent: NYM, Strikeout Line: 6.50 -> Predicted: 6.118153606958544, Recommended: Under, Confidence: 94.69%, 

