In [79]:
import numpy as np
import pandas as pd

import os
import csv 

TEAMS = ['hou', 'laa', 'oak', 'sea', 'tex', 'bal', 'bos', 'nyy', 'tbr', 'tor', 
         'chw', 'cle', 'det', 'kcr', 'min', 'atl', 'mia', 'nym', 'phi', 'wsn', 
         'chc', 'cin', 'mil', 'pit', 'stl', 'ari', 'col', 'lad', 'sdp', 'sfg']

def make_csv_paths(teams, data_type):
    base_path = f"C:/data/2022/{data_type}"
    return [f"{base_path}/{team}-{data_type}.csv" for team in teams]

def read_csv_paths(paths):
    dataframes = [pd.read_csv(path) for path in paths]
    return pd.concat(dataframes)

game_data_files = make_csv_paths(TEAMS, 'games')
batting_data_files = make_csv_paths(TEAMS, 'batting')
pitching_data_files = make_csv_paths(TEAMS, 'pitching')

In [80]:
def clean_data(df, percentage_cols, numeric_cols):
    # Replace any string-based missing values with numpy NaN
    df.replace('?', np.nan, inplace=True)

    # Convert percentage columns to float
    for col in percentage_cols:
        df[col] = df[col].str.rstrip('%').astype('float') / 100.0

    # Convert all other numeric columns to float
    for col in numeric_cols:
        df[col] = df[col].astype(float)

    # Handle missing values
    for col in df.columns:
        if df[col].dtype == np.float64:
            df[col].fillna(df[col].mean(), inplace=True)

    return df

def clean_and_combine_data(clean_function, csv_files):
    # Create an empty DataFrame to store all the data
    all_data = pd.DataFrame()

    for file_path in csv_files:
        # Clean the data
        df = clean_function(file_path)

        # Add a new column to indicate the team
        team_name = os.path.basename(file_path).split('-')[0]
        df['Team'] = team_name

        all_data = pd.concat([all_data, df])

    # Reset the index of the combined DataFrame
    all_data.reset_index(drop=True, inplace=True)

    return all_data

def clean_batting_data(filename):
    df = pd.read_csv(filename)
    percentage_cols = ['BB%', 'K%']
    numeric_cols = ["G","PA","HR","R","RBI","SB","ISO","BABIP","AVG","OBP","SLG","wOBA","xwOBA","wRC+","BsR","Off","Def","WAR","playerid"]
    return clean_data(df, percentage_cols, numeric_cols)

def clean_pitching_data(filename):
    df = pd.read_csv(filename)
    percentage_cols = ['LOB%', 'GB%', 'HR/FB']
    numeric_cols = ["W","L","SV","G","GS","IP","K/9","BB/9","HR/9","BABIP","vFA (pi)","ERA","xERA","FIP","xFIP","WAR","playerid"]
    return clean_data(df, percentage_cols, numeric_cols)

def clean_game_data(filename):
    # Load the data
    df = pd.read_csv(filename)

    # Replace any string-based missing values with numpy NaN
    df.replace('?', np.nan, inplace=True)

    # Convert the W/L column to 1 for a win and 0 for a loss
    df['W/L'] = df['W/L'].apply(lambda x: 1 if 'W' in x else 0)

    # Convert GB to a numeric value
    df['GB'] = df['GB'].apply(gb_to_float)

    # Handle missing values for numeric columns
    numeric_cols = ["Gm#", "R", "RA", "Inn", "Rank", "Attendance", "cLI"]
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Handle missing values for non-numeric columns
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col].fillna('Unknown', inplace=True)

    return df




In [78]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

pitching_data = clean_and_combine_data(clean_pitching_data, pitching_data_files)
batting_data = clean_and_combine_data(clean_batting_data, batting_data_files)
game_data = clean_and_combine_data(clean_game_data, game_data_files)

# Get the number of games each team played
num_games = game_data['Team'].value_counts()

# List of numeric columns in the pitching and batting data
pitching_numeric_cols = ['W', 'L', 'SV', 'G', 'GS', 'IP', 'K/9', 'BB/9', 'HR/9', 'BABIP', 'LOB%', 'GB%', 'HR/FB', 'vFA (pi)', 'ERA', 'xERA', 'FIP', 'xFIP', 'WAR', 'playerid']
batting_numeric_cols = ['G', 'PA', 'HR', 'R', 'RBI', 'SB', 'BB%', 'K%', 'ISO', 'BABIP', 'AVG', 'OBP', 'SLG', 'wOBA', 'xwOBA', 'wRC+', 'BsR', 'Off', 'Def', 'WAR', 'playerid']

# Divide the seasonal statistics by the number of games
pitching_data_per_game = pitching_data.copy()
batting_data_per_game = batting_data.copy()


for team in num_games.index:
    pitching_data_per_game.loc[pitching_data_per_game['Team'] == team, pitching_numeric_cols] /= num_games[team]
    batting_data_per_game.loc[batting_data_per_game['Team'] == team, batting_numeric_cols] /= num_games[team]
    pitching_data_per_game['Gm#'] = pitching_data_per_game.groupby('Team').cumcount() + 1
    batting_data_per_game['Gm#'] = batting_data_per_game.groupby('Team').cumcount() + 1
    
print(f'Pitching Data: {pitching_data_per_game}')
print(f'Batting Data: {batting_data_per_game}')
print(f'Game Data: {game_data}')

# Function to get team stats for a given game
def get_team_stats(team, game_data, batting_data, pitching_data):
    game_stats = game_data[game_data['Team'] == team]
    batting_stats = batting_data[batting_data['Team'] == team]
    pitching_stats = pitching_data[pitching_data['Team'] == team]

    # Join the batting and pitching stats with the game stats
    team_stats = pd.merge(game_stats, batting_stats, on=['Team', 'Gm#'], how='outer')
    team_stats = pd.merge(team_stats, pitching_stats, on=['Team', 'Gm#'], how='outer')

    return team_stats

# Function to generate pairs of teams
def generate_pairs(game_data, batting_data, pitching_data):
    pairs = []
    labels = []

    # Get the unique list of games
    unique_games = game_data['Gm#'].unique()

    for game in unique_games:
        # Get the teams that played in this game
        teams = game_data[game_data['Gm#'] == game]['Team'].unique()
        print(teams)
        if len(teams) != 2:
            continue  # Skip this game if it does not have exactly two teams

        team_a, team_b = teams

        team_a_stats = get_team_stats(team_a, game_data, batting_data, pitching_data)
        team_b_stats = get_team_stats(team_b, game_data, batting_data, pitching_data)

        # Continuing from the previous cell
        stat_differences = team_a_stats.loc[team_a_stats['Gm#'] == game] - team_b_stats.loc[team_b_stats['Gm#'] == game]

        # Add the differences to the pairs list
        pairs.append(stat_differences)

        # Determine the outcome of the game
        if game_data.loc[(game_data['Gm#'] == game) & (game_data['Team'] == team_a), 'W/L'].values[0] == 1:
            labels.append(1)  # Team A won
        else:
            labels.append(0)  # Team B won

    return pairs, labels

# Generate pairs and labels
pairs, labels = generate_pairs(game_data, batting_data_per_game, pitching_data_per_game)

# Function to train a model
def train_model(pairs, labels):
    # Flatten the pairs into a 2D array 
    pairs_flattened = np.array([pair.values.flatten() for pair in pairs])

    # Split into a training set and a test set
    X_train, X_test, y_train, y_test = train_test_split(pairs_flattened, labels, test_size=0.2, random_state=42)

    # Train a logistic regression model
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Evaluate the model
    accuracy = model.score(X_test, y_test)
    print('Model accuracy:', accuracy)

    return model

model = train_model(pairs, labels)

# Function to predict the outcome of a game
def predict_game(team_a, team_b, model, game_data, batting_data, pitching_data):
    team_a_stats = get_team_stats(team_a, game_data, batting_data, pitching_data)
    team_b_stats = get_team_stats(team_b, game_data, batting_data, pitching_data)

    # Compute the differences in stats
    stat_differences = team_a_stats - team_b_stats

    # Predict the outcome of the game
    prediction = model.predict(stat_differences.values.reshape(1, -1))

    if prediction[0] == 1:
        print('Predicted winner:', team_a)
    else:
        print('Predicted winner:', team_b)

# Predict the outcome of a game
predict_game('ari', 'atl', model, game_data, batting_data_per_game, pitching_data_per_game)



Pitching Data:                    Name Team         W         L        SV         G  \
0        Framber Valdez  hou  0.104938  0.037037  0.000000  0.191358   
1       Cristian Javier  hou  0.067901  0.055556  0.000000  0.185185   
2           Luis Garcia  hou  0.092593  0.049383  0.000000  0.172840   
3          Hector Neris  hou  0.037037  0.024691  0.018519  0.432099   
4        Rafael Montero  hou  0.030864  0.012346  0.086420  0.438272   
..                  ...  ...       ...       ...       ...       ...   
558       Luis Gonzalez  sfg  0.000000  0.000000  0.000000  0.030864   
559  Anthony DeSclafani  sfg  0.000000  0.012346  0.000000  0.030864   
560         Cole Waites  sfg  0.000000  0.000000  0.000000  0.043210   
561          Sammy Long  sfg  0.006173  0.018519  0.006173  0.172840   
562     Thomas Szapucki  sfg  0.000000  0.006173  0.000000  0.067901   

           GS        IP       K/9      BB/9  ...       GB%     HR/FB  \
0    0.191358  1.241358  0.053519  0.018519  ...

ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.