In [1]:
from pandas.plotting import scatter_matrix
import pandas as pd
import requests
from datetime import datetime, timedelta
import statsapi
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures


from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error



pd.set_option('display.max_columns', None)

In [2]:
full_df = pd.read_csv('datasets/complex_pitchers.csv')
batter_df = pd.read_csv('datasets/complex_batters.csv')

  full_df = pd.read_csv('datasets/complex_pitchers.csv')
  batter_df = pd.read_csv('datasets/complex_batters.csv')


In [3]:
full_df['era'] = full_df['era'].replace('-.--', np.nan)

In [4]:
full_df['era'] = round(full_df['era'].astype(float),2)
full_df = full_df[full_df['current_inning'] >8]
full_df['winner_id'] = np.where(full_df['winning_team'] == full_df['home_name'], full_df['home_id'], full_df['away_id'])
full_df['loser_id'] = np.where(full_df['losing_team'] == full_df['home_name'], full_df['home_id'], full_df['away_id'])
full_df['isHome'] = full_df['Team'] == full_df['home_name']
full_df['opponent_id'] = np.where(full_df['Opponent'] == full_df['home_name'], full_df['home_id'], full_df['away_id'])
full_df['whip'] = round((full_df['h'] + full_df['bb']) / (full_df['pitching_outs'] / 3),3)

In [5]:
df = full_df.drop(['era','date','losing_Team', 'note','game_type', 'home_pitcher_note', 'away_pitcher_note','series_status','save_pitcher','Team','namefield', 'performance_category', 'Name', 'game_datetime','game_date','status','away_name', 'home_name', 'doubleheader', 'game_num', 'home_probable_pitcher', 'away_probable_pitcher', 'inning_state', 'venue_name', 'national_broadcasts', 'winning_pitcher', 'losing_pitcher', 'summary','name', 'winning_team', 'losing_team', 'Opponent'],axis=1)
df = df.dropna(subset=['pitch_count_MA3', 'strikeout_MA3', 'walks_MA3', 'h_MA3'])
for col in ['isWinner','isStarter','pitcherIsWinner','isHome']:
    df[col] = df[col].astype(int)

### Predicting Pitching Outs

In [76]:
# Pitcher Stats Tab
# Every pitcher's season long totals and averages per year

mini_df = full_df.drop(['namefield', 'name', 'note', 'game_datetime', 'game_date', 'game_type','status', 'doubleheader', 'game_num', 'home_probable_pitcher','away_probable_pitcher', 'home_pitcher_note', 'away_pitcher_note', 'inning_state', 'national_broadcasts', 'series_status', 'save_pitcher', 'summary', 'losing_Team'], axis=1)
mini_df = mini_df[mini_df['current_inning'] > 8]
pitcher_names = full_df['Name'].unique()

pitcher_stats_table = []

for pitcher_name in pitcher_names:
    pitcher_df = mini_df[mini_df['Name'] == pitcher_name]
    for seasonNumber in pitcher_df['seasonNumber'].unique():
        season_df = pitcher_df[pitcher_df['seasonNumber'] == seasonNumber].copy()

        season_dict = {
            'Name' : pitcher_name,
            'num_games_in_season' : len(list(mini_df[mini_df['seasonNumber'] == seasonNumber]['game_id'].unique())),
            'personId' : season_df['personId'].mode()[0],
            'seasonNumber' : seasonNumber,
            'team_id' : season_df['team_id'].mode()[0],
            'GamesPlayed' : len(season_df),
            'GamesStarted' : sum(season_df['isStarter']),
            'GamesRelieved' : len(season_df[season_df['isStarter'] == False]),
            'innings_pitched' : sum(season_df['ip']),
            'strikeouts' : sum(season_df['k']),
            'walks' : sum(season_df['bb']),
            'hits_allowed' : sum(season_df['h']),
            'earned_runs' : sum(season_df['er']),
            'pitching_outs' : sum(season_df['pitching_outs']),
            'home_runs' : sum(season_df['hr']),
            'complete_games' : len(season_df[season_df['ip'] == 9])
        }

        season_df.replace([np.inf, -np.inf], np.nan, inplace=True)
        season_df.dropna(inplace=True)

        season_dict['era'] = round(season_df['era'].mean(),2)
        season_dict['whip'] = round(season_df['whip'].mean(), 3)

        pitcher_stats_table.append(season_dict)

season_df
pitcher_stats_table = pd.DataFrame(pitcher_stats_table)

# lgERA = round(pitcher_stats_table.groupby('Season')['era'].mean(),3).reset_index()
# lgERA.rename(columns={'era': 'lgERA'}, inplace=True)
# pitcher_stats_table = pd.merge(pitcher_stats_table, lgERA, on='Season', how='left')


# lgBB = pitcher_stats_table.groupby('Season')['walks'].sum().reset_index()
# lgBB.rename(columns={'walks': 'lgBB'}, inplace=True)
# pitcher_stats_table = pd.merge(pitcher_stats_table, lgBB, on='Season', how='left')

lgPerSeason = pitcher_stats_table.groupby('seasonNumber').agg({
    'era' : 'mean',
    'walks' : 'sum',
    'strikeouts' : 'sum',
    'hits_allowed' : 'sum',
    'earned_runs' : 'sum',
    'pitching_outs' : 'sum',
    'home_runs' : 'sum'
}).reset_index()
lgPerSeason['era'] = round(lgPerSeason['era'], 2)
lgPerSeason['innings_pitched'] = round(lgPerSeason['pitching_outs']/3,2)

lgPerSeason.rename(columns={
    'era' : 'lgERA',
    'walks' : 'lgBB',
    'strikeouts' : 'lgK',
    'hits_allowed' : 'lgH',
    'earned_runs' : 'lgER',
    'pitching_outs' : 'lgPO',
    'home_runs' : 'lgHR',
    'innings_pitched' : 'lgIP'
}, inplace=True)

lgPerSeason['FIP Constant'] = lgPerSeason['lgERA'] - ((13 * lgPerSeason['lgHR'] + 3 * (lgPerSeason['lgBB']) - 2 * lgPerSeason['lgK']) / lgPerSeason['lgIP'])
#lgPerSeason['MLB_RPG'] = round(lgPerSeason['lgER'] / (81*30))

pitcher_stats_table = pd.merge(pitcher_stats_table, lgPerSeason, on='seasonNumber', how='left')
pitcher_stats_table['MLB_RPG'] = round(pitcher_stats_table['lgER'] / pitcher_stats_table['num_games_in_season'], 2)
pitcher_stats_table['RA9'] = (pitcher_stats_table['MLB_RPG'] * 9) / 0.294
pitcher_stats_table['FIP'] = ((13 * pitcher_stats_table['home_runs'] + 3 * (pitcher_stats_table['walks']) - 2 * pitcher_stats_table['strikeouts']) / pitcher_stats_table['innings_pitched']) + pitcher_stats_table['FIP Constant']
pitcher_stats_table['WAR'] = ((pitcher_stats_table['lgERA'] - pitcher_stats_table['FIP']) * (pitcher_stats_table['innings_pitched'] / 9)) / pitcher_stats_table['RA9']
pitcher_stats_table['K/9'] = round((pitcher_stats_table['strikeouts'] / (pitcher_stats_table['pitching_outs'] / 3)) * 9,2)
pitcher_stats_table['BB/9'] = round((pitcher_stats_table['walks'] / (pitcher_stats_table['pitching_outs'] / 3)) * 9,2)
pitcher_stats_table['HR/9'] = round((pitcher_stats_table['home_runs'] / (pitcher_stats_table['pitching_outs'] / 3)) * 9,2)
pitcher_stats_table['H/9'] = round((pitcher_stats_table['hits_allowed'] / (pitcher_stats_table['pitching_outs'] / 3)) * 9,2)

#lgPerSeason

### Grab Pitcher Gamelogs

In [96]:
gamelog_df = full_df[['game_id', 'personId', 'date', 'team_id', 'opponent_id', 'isHome', 'pitching_outs', 'k', 'bb', 'h', 'er', 'p', 'seasonNumber']].copy()
gamelog_df['ip'] = round(gamelog_df['pitching_outs'] / 3,2)
gamelog_df['WHIP'] = round((gamelog_df['h'] + gamelog_df['bb'])/ gamelog_df['ip'], 3)
gamelog_df['PE'] = round(gamelog_df['p']/gamelog_df['ip'], 2)
gamelog_df = gamelog_df.drop('')

Unnamed: 0,game_id,personId,date,team_id,opponent_id,isHome,pitching_outs,k,bb,h,er,p,seasonNumber,ip,WHIP,PE
0,413846,475243,2015-04-21,112,134,False,15,5,2,4,3,84,2015,5.00,1.200,16.80
1,413846,489295,2015-04-21,112,134,False,2,0,1,3,2,26,2015,0.67,5.970,38.81
2,413846,457435,2015-04-21,112,134,False,1,1,0,0,0,6,2015,0.33,0.000,18.18
3,413846,435400,2015-04-21,112,134,False,3,0,1,2,3,19,2015,1.00,3.000,19.00
4,413846,429719,2015-04-21,112,134,False,3,1,0,0,0,11,2015,1.00,0.000,11.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173960,747047,641482,2024-04-30,147,110,False,18,5,2,8,4,92,2024,6.00,1.667,15.33
173961,747047,596133,2024-04-30,147,110,False,6,3,0,1,0,25,2024,2.00,0.500,12.50
173962,747047,665152,2024-04-30,110,147,True,21,4,4,4,2,93,2024,7.00,1.143,13.29
173963,747047,669211,2024-04-30,110,147,True,2,1,0,1,0,10,2024,0.67,1.493,14.93


### Grab Opponents Stat Table

In [90]:
team_ids = batter_df['team_id'].unique()

mini_df = batter_df[['game_id','team_id', 'seasonNumber', 'ab','r','h','doubles','triples','hr', 'rbi', 'sb', 'bb', 'k', 'avg','personId', 'obp','slg', 'TB']]

opponent_stats_table = []

for team_id in team_ids:
    team_df = mini_df[mini_df['team_id'] == team_id]
    for seasonNumber in team_df['seasonNumber'].unique():
        season_df = team_df[team_df['seasonNumber'] == seasonNumber].copy()

        season_dict = {
            'opponent_id' : team_id,
            'seasonNumber' : seasonNumber,
            'opponent_GamesPlayed' : len(season_df['game_id'].unique()),
            'opponent_AtBats' : season_df['ab'].sum(),
            'opponent_RunsScored' : season_df['r'].sum(),
            'opponent_Homeruns' : season_df['hr'].sum(),
            'opponent_Strikeouts' : season_df['k'].sum(),
            'opponent_Walks' : season_df['bb'].sum(),
            'opponent_Total Bases' : season_df['TB'].sum(),
            'opponent_Hits' : season_df['h'].sum(),
            'opponent_AVG' : round(season_df['h'].sum() / season_df['ab'].sum(),3),
            'opponent_WHIP' : round((season_df['h'].sum() + season_df['bb'].sum())/ (len(season_df['game_id'].unique()) * 9),3),
            'opponent_SLG' : round(season_df['TB'].sum() / season_df['ab'].sum(), 3)
            

         }

        season_df.replace([np.inf, -np.inf], np.nan, inplace=True)
        season_df.dropna(inplace=True)

        # season_dict['era'] = round(season_df['era'].mean(),2)
        # season_dict['whip'] = round(season_df['whip'].mean(), 3)

        opponent_stats_table.append(season_dict)

opponent_stats_table = pd.DataFrame(opponent_stats_table)
opponent_stats_table['opponent_ISO'] = opponent_stats_table['opponent_SLG'] - opponent_stats_table['opponent_AVG']
opponent_stats_table['opponent_RC'] = round(((opponent_stats_table['opponent_Hits'] + opponent_stats_table['opponent_Walks']) / opponent_stats_table['opponent_AtBats']) * opponent_stats_table['opponent_Total Bases'],2)

In [91]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

In [92]:
features_to_scale = ['GamesPlayed', 'GamesStarted', 'GamesRelieved', 'innings_pitched', 'strikeouts', 'walks', 'hits_allowed', 'earned_runs', 'pitching_outs', 'home_runs', 'complete_games', 'era', 'whip']

pitcher_stats_scaled = scaler.fit_transform(pitcher_stats_table[features_to_scale])
pitcher_stats_table[features_to_scale] = pitcher_stats_scaled

In [93]:
features_to_scale = ['isHome', 'pitching_outs', 'k', 'bb', 'h', 'er', 'p']
gamelog_df_scaled = scaler.fit_transform(gamelog_df[features_to_scale])
gamelog_df[features_to_scale] = gamelog_df_scaled

In [94]:
features_to_scale = ['opponent_GamesPlayed', 'opponent_AtBats', 'opponent_RunsScored', 'opponent_Homeruns', 'opponent_Strikeouts', 'opponent_Walks', 'opponent_Total Bases', 'opponent_AVG', 'opponent_WHIP']
opponent_stats_table_scaled = scaler.fit_transform(opponent_stats_table[features_to_scale])
opponent_stats_table[features_to_scale] = opponent_stats_table_scaled
opponent_stats_table

Unnamed: 0,opponent_id,seasonNumber,opponent_GamesPlayed,opponent_AtBats,opponent_RunsScored,opponent_Homeruns,opponent_Strikeouts,opponent_Walks,opponent_Total Bases,opponent_Hits,opponent_AVG,opponent_WHIP,opponent_SLG,opponent_ISO,opponent_RC
0,112,2015,0.950000,0.919041,0.700466,0.541219,0.877122,0.825424,0.733782,1290,0.500000,0.580435,0.398,0.154,732.80
1,112,2016,0.964286,0.939947,0.856643,0.663082,0.778683,0.983051,0.820807,1382,0.675676,0.826087,0.432,0.175,873.93
2,112,2017,0.971429,0.941586,0.872960,0.741935,0.824168,0.933898,0.838608,1378,0.662162,0.756522,0.439,0.183,875.03
3,112,2018,0.935714,0.924780,0.762238,0.516129,0.784114,0.796610,0.762263,1368,0.689189,0.713043,0.410,0.152,778.87
4,112,2019,0.985714,0.948555,0.868298,0.860215,0.871690,0.872881,0.870253,1367,0.608108,0.647826,0.451,0.199,877.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,113,2020,0.221429,0.180365,0.173660,0.225806,0.215207,0.249153,0.159019,347,0.000000,0.221739,0.388,0.181,215.97
296,113,2021,0.957143,0.915556,0.801865,0.713262,0.828921,0.808475,0.789953,1300,0.540541,0.560870,0.427,0.180,784.78
297,113,2022,0.957143,0.905718,0.656177,0.494624,0.833673,0.640678,0.667326,1219,0.364865,0.284783,0.371,0.137,615.90
298,113,2023,0.964286,0.934618,0.821678,0.648746,0.880516,0.808475,0.792326,1334,0.567568,0.595652,0.420,0.171,787.51


In [95]:
df_merge = pd.merge(pitcher_stats_table, gamelog_df, on=['personId', 'seasonNumber'])
df_merge = pd.merge(df_merge, opponent_stats_table, on=['opponent_id', 'seasonNumber'])
df_merge

Unnamed: 0,Name,num_games_in_season,personId,seasonNumber,team_id_x,GamesPlayed,GamesStarted,GamesRelieved,innings_pitched,strikeouts,walks,hits_allowed,earned_runs,pitching_outs_x,home_runs,complete_games,era,whip,lgERA,lgBB,lgK,lgH,lgER,lgPO,lgHR,lgIP,FIP Constant,MLB_RPG,RA9,FIP,WAR,K/9,BB/9,HR/9,H/9,game_id,date,team_id_y,opponent_id,isHome,pitching_outs_y,k,bb,h,er,p,ip,WHIP,PE,opponent_GamesPlayed,opponent_AtBats,opponent_RunsScored,opponent_Homeruns,opponent_Strikeouts,opponent_Walks,opponent_Total Bases,opponent_Hits,opponent_AVG,opponent_WHIP,opponent_SLG,opponent_ISO,opponent_RC
0,Travis Wood,2388,475243,2015,112,0.629630,0.257143,0.524390,0.385485,0.346626,0.383838,0.357759,0.367521,0.418338,0.25,0.0,0.027901,0.098267,4.05,13831,36878,41424,18792,128109,4823,42703.00,3.337263,7.87,240.918367,3.686756,0.014860,10.45,3.51,1.02,7.67,413846,2015-04-21,112,134,0.0,0.555556,0.25,0.25,0.266667,0.230769,0.626866,5.00,1.200,16.80,0.978571,0.978069,0.726107,0.444444,0.769179,0.659322,0.772943,1447,0.716216,0.597826,0.395,0.135,750.94
1,Travis Wood,2388,475243,2015,112,0.629630,0.257143,0.524390,0.385485,0.346626,0.383838,0.357759,0.367521,0.418338,0.25,0.0,0.027901,0.098267,4.05,13831,36878,41424,18792,128109,4823,42703.00,3.337263,7.87,240.918367,3.686756,0.014860,10.45,3.51,1.02,7.67,413697,2015-04-10,112,115,0.0,0.518519,0.25,0.25,0.466667,0.230769,0.671642,4.67,1.927,19.27,0.950000,0.940357,0.763403,0.580645,0.735234,0.537288,0.819225,1430,0.783784,0.532609,0.431,0.166,779.51
2,Travis Wood,2388,475243,2015,112,0.629630,0.257143,0.524390,0.385485,0.346626,0.383838,0.357759,0.367521,0.418338,0.25,0.0,0.027901,0.098267,4.05,13831,36878,41424,18792,128109,4823,42703.00,3.337263,7.87,240.918367,3.686756,0.014860,10.45,3.51,1.02,7.67,413771,2015-04-15,112,113,1.0,0.777778,0.35,0.25,0.200000,0.000000,0.738806,7.00,0.714,14.14,0.957143,0.944661,0.648019,0.534050,0.709437,0.694915,0.744462,1342,0.554054,0.523913,0.394,0.146,715.19
3,Travis Wood,2388,475243,2015,112,0.629630,0.257143,0.524390,0.385485,0.346626,0.383838,0.357759,0.367521,0.418338,0.25,0.0,0.027901,0.098267,4.05,13831,36878,41424,18792,128109,4823,42703.00,3.337263,7.87,240.918367,3.686756,0.014860,10.45,3.51,1.02,7.67,413953,2015-04-28,112,134,1.0,0.777778,0.45,0.00,0.333333,0.153846,0.671642,7.00,0.714,12.86,0.978571,0.978069,0.726107,0.444444,0.769179,0.659322,0.772943,1447,0.716216,0.597826,0.395,0.135,750.94
4,Travis Wood,2388,475243,2015,112,0.629630,0.257143,0.524390,0.385485,0.346626,0.383838,0.357759,0.367521,0.418338,0.25,0.0,0.027901,0.098267,4.05,13831,36878,41424,18792,128109,4823,42703.00,3.337263,7.87,240.918367,3.686756,0.014860,10.45,3.51,1.02,7.67,414028,2015-05-04,112,138,0.0,0.555556,0.25,0.25,0.400000,0.461538,0.671642,5.00,1.600,18.00,0.957143,0.930314,0.668998,0.426523,0.731840,0.733898,0.736946,1354,0.635135,0.576087,0.396,0.142,731.96
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173163,Jake Bauers,390,641343,2024,158,0.000000,0.000000,0.012195,0.004346,0.000000,0.020202,0.004310,0.000000,0.004298,0.00,0.0,,,4.58,2573,6612,6266,3051,20816,779,6938.67,3.913878,7.82,239.387755,9.913878,-0.002476,0.00,18.00,0.00,9.00,746001,2024-04-28,158,147,1.0,0.111111,0.00,0.25,0.066667,0.000000,0.194030,1.00,3.000,26.00,0.021429,0.011068,0.066434,0.046595,0.019688,0.079661,0.026108,201,0.391892,0.619565,0.369,0.133,114.85
173164,Emmanuel Ramirez,390,642629,2024,146,0.024691,0.000000,0.036585,0.013472,0.009202,0.000000,0.004310,0.000000,0.014327,0.00,0.0,,,4.58,2573,6612,6266,3051,20816,779,6938.67,3.913878,7.82,239.387755,1.978394,0.003743,8.10,0.00,0.00,2.70,746077,2024-04-28,146,120,1.0,0.111111,0.05,0.00,0.066667,0.000000,0.126866,1.00,1.000,17.00,0.014286,0.005739,0.046620,0.032258,0.017651,0.035593,0.024525,195,0.391892,0.426087,0.376,0.140,104.92
173165,Emmanuel Ramirez,390,642629,2024,146,0.024691,0.000000,0.036585,0.013472,0.009202,0.000000,0.004310,0.000000,0.014327,0.00,0.0,,,4.58,2573,6612,6266,3051,20816,779,6938.67,3.913878,7.82,239.387755,1.978394,0.003743,8.10,0.00,0.00,2.70,746074,2024-04-29,146,120,1.0,0.148148,0.00,0.00,0.000000,0.000000,0.074627,1.33,0.000,7.52,0.014286,0.005739,0.046620,0.032258,0.017651,0.035593,0.024525,195,0.391892,0.426087,0.376,0.140,104.92
173166,Emmanuel Ramirez,390,642629,2024,146,0.024691,0.000000,0.036585,0.013472,0.009202,0.000000,0.004310,0.000000,0.014327,0.00,0.0,,,4.58,2573,6612,6266,3051,20816,779,6938.67,3.913878,7.82,239.387755,1.978394,0.003743,8.10,0.00,0.00,2.70,746073,2024-04-30,146,115,1.0,0.111111,0.10,0.00,0.000000,0.000000,0.074627,1.00,0.000,10.00,0.014286,0.008813,0.038462,0.014337,0.058384,0.000000,0.026108,205,0.486486,0.319565,0.374,0.131,100.26


In [74]:
pitcher_stats_table # Pitcher Season Splits
gamelog_df # All pitcher's gamelogs
opponent_stats_table # Opponent Batting Splits by year

pitcher_stats_table
opponent_stats_table

Unnamed: 0,team_id,seasonNumber,GamesPlayed,AtBats,RunsScored,Homeruns,Strikeouts,Walks,Total Bases,Hits,AVG,WHIP,SLG,ISO,RC
0,112,2015,0.950000,0.919041,0.700466,0.541219,0.877122,0.825424,0.733782,1290,0.500000,0.580435,0.398,0.154,732.80
1,112,2016,0.964286,0.939947,0.856643,0.663082,0.778683,0.983051,0.820807,1382,0.675676,0.826087,0.432,0.175,873.93
2,112,2017,0.971429,0.941586,0.872960,0.741935,0.824168,0.933898,0.838608,1378,0.662162,0.756522,0.439,0.183,875.03
3,112,2018,0.935714,0.924780,0.762238,0.516129,0.784114,0.796610,0.762263,1368,0.689189,0.713043,0.410,0.152,778.87
4,112,2019,0.985714,0.948555,0.868298,0.860215,0.871690,0.872881,0.870253,1367,0.608108,0.647826,0.451,0.199,877.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,113,2020,0.221429,0.180365,0.173660,0.225806,0.215207,0.249153,0.159019,347,0.000000,0.221739,0.388,0.181,215.97
296,113,2021,0.957143,0.915556,0.801865,0.713262,0.828921,0.808475,0.789953,1300,0.540541,0.560870,0.427,0.180,784.78
297,113,2022,0.957143,0.905718,0.656177,0.494624,0.833673,0.640678,0.667326,1219,0.364865,0.284783,0.371,0.137,615.90
298,113,2023,0.964286,0.934618,0.821678,0.648746,0.880516,0.808475,0.792326,1334,0.567568,0.595652,0.420,0.171,787.51


In [167]:
pitcher_stats = full_df[['team_id', 'personId', 'seasonNumber', 'ip', 'k', 'bb', 'h','er','era','whip']]
full_df

Unnamed: 0,Team,team_id,namefield,ip,h,r,er,bb,k,hr,p,s,era,name,personId,note,game_id,date,Name,game_datetime,game_date,game_type,status,away_name,home_name,away_id,home_id,doubleheader,game_num,home_probable_pitcher,away_probable_pitcher,home_pitcher_note,away_pitcher_note,away_score,home_score,current_inning,inning_state,venue_id,venue_name,national_broadcasts,series_status,winning_team,losing_team,winning_pitcher,losing_pitcher,save_pitcher,summary,losing_Team,isWinner,Opponent,isStarter,pitcherIsWinner,pitch_count_MA3,strikeout_MA3,walks_MA3,h_MA3,pitching_outs,seasonNumber,game_score,performance_category,winner_id,loser_id,isHome,opponent_id,whip
0,Chicago Cubs,112,"Wood, T",5.0,4,3,3,2,5,2,84,51,3.24,"Wood, T",475243,,413846,2015-04-21,Travis Wood,2015-04-21T23:05:00Z,2015-04-21,R,Final,Chicago Cubs,Pittsburgh Pirates,112,134,N,1,Francisco Liriano,Travis Wood,"Six times during 2014, Liriano pitched six-plu...","Wood is coming off his first win, in which he ...",9,8,9.0,Bottom,31,PNC Park,[],CHC leads 2-0,Chicago Cubs,Pittsburgh Pirates,Edwin Jackson,Mark Melancon,Héctor Rondón,2015-04-21 - Chicago Cubs (9) @ Pittsburgh Pir...,,True,Pittsburgh Pirates,True,False,,,,,15,2015,54,Average,112,134,False,134,1.200
1,Chicago Cubs,112,"Schlitter (BS, 2)",0.2,3,2,2,1,0,0,26,14,9.64,Schlitter,489295,"(BS, 2)",413846,2015-04-21,Brian Schlitter,2015-04-21T23:05:00Z,2015-04-21,R,Final,Chicago Cubs,Pittsburgh Pirates,112,134,N,1,Francisco Liriano,Travis Wood,"Six times during 2014, Liriano pitched six-plu...","Wood is coming off his first win, in which he ...",9,8,9.0,Bottom,31,PNC Park,[],CHC leads 2-0,Chicago Cubs,Pittsburgh Pirates,Edwin Jackson,Mark Melancon,Héctor Rondón,2015-04-21 - Chicago Cubs (9) @ Pittsburgh Pir...,,True,Pittsburgh Pirates,False,False,,,,,2,2015,30,Below Average,112,134,False,134,6.000
2,Chicago Cubs,112,Coke,0.1,0,0,0,0,1,0,6,3,6.75,Coke,457435,,413846,2015-04-21,Phil Coke,2015-04-21T23:05:00Z,2015-04-21,R,Final,Chicago Cubs,Pittsburgh Pirates,112,134,N,1,Francisco Liriano,Travis Wood,"Six times during 2014, Liriano pitched six-plu...","Wood is coming off his first win, in which he ...",9,8,9.0,Bottom,31,PNC Park,[],CHC leads 2-0,Chicago Cubs,Pittsburgh Pirates,Edwin Jackson,Mark Melancon,Héctor Rondón,2015-04-21 - Chicago Cubs (9) @ Pittsburgh Pir...,,True,Pittsburgh Pirates,False,False,,,,,1,2015,43,Average,112,134,False,134,0.000
3,Chicago Cubs,112,Motte,1.0,2,3,3,1,0,0,19,12,6.43,Motte,435400,,413846,2015-04-21,Jason Motte,2015-04-21T23:05:00Z,2015-04-21,R,Final,Chicago Cubs,Pittsburgh Pirates,112,134,N,1,Francisco Liriano,Travis Wood,"Six times during 2014, Liriano pitched six-plu...","Wood is coming off his first win, in which he ...",9,8,9.0,Bottom,31,PNC Park,[],CHC leads 2-0,Chicago Cubs,Pittsburgh Pirates,Edwin Jackson,Mark Melancon,Héctor Rondón,2015-04-21 - Chicago Cubs (9) @ Pittsburgh Pir...,,True,Pittsburgh Pirates,False,False,,,,,3,2015,31,Below Average,112,134,False,134,3.000
4,Chicago Cubs,112,"Jackson, E (W, 1-0)",1.0,0,0,0,0,1,0,11,7,0.00,"Jackson, E",429719,"(W, 1-0)",413846,2015-04-21,Edwin Jackson,2015-04-21T23:05:00Z,2015-04-21,R,Final,Chicago Cubs,Pittsburgh Pirates,112,134,N,1,Francisco Liriano,Travis Wood,"Six times during 2014, Liriano pitched six-plu...","Wood is coming off his first win, in which he ...",9,8,9.0,Bottom,31,PNC Park,[],CHC leads 2-0,Chicago Cubs,Pittsburgh Pirates,Edwin Jackson,Mark Melancon,Héctor Rondón,2015-04-21 - Chicago Cubs (9) @ Pittsburgh Pir...,,True,Pittsburgh Pirates,False,True,,,,,3,2015,47,Average,112,134,False,134,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173743,St. Louis Cardinals,138,Gallegos,0.2,0,0,0,2,0,0,21,11,5.00,Gallegos,606149,,745834,2024-04-28,Giovanny Gallegos,2024-04-28T17:40:00Z,2024-04-28,R,Final,St. Louis Cardinals,New York Mets,138,121,N,1,Jose Quintana,Lance Lynn,,,2,4,11.0,Bottom,3289,Citi Field,['MLB.tv Free Game'],STL wins 2-1,New York Mets,St. Louis Cardinals,Reed Garrett,Matthew Liberatore,,2024-04-28 - St. Louis Cardinals (2) @ New Yor...,,False,New York Mets,False,False,11.00,1.33,0.00,1.00,2,2024,40,Average,121,138,False,121,3.000
173744,St. Louis Cardinals,138,"Liberatore (L, 0-1)",3.0,3,3,2,1,0,1,38,31,3.14,Liberatore,669461,"(L, 0-1)",745834,2024-04-28,Matthew Liberatore,2024-04-28T17:40:00Z,2024-04-28,R,Final,St. Louis Cardinals,New York Mets,138,121,N,1,Jose Quintana,Lance Lynn,,,2,4,11.0,Bottom,3289,Citi Field,['MLB.tv Free Game'],STL wins 2-1,New York Mets,St. Louis Cardinals,Reed Garrett,Matthew Liberatore,,2024-04-28 - St. Louis Cardinals (2) @ New Yor...,,False,New York Mets,False,False,17.67,0.67,1.00,0.33,9,2024,44,Average,121,138,False,121,1.333
173745,New York Mets,121,Quintana,8.0,3,1,1,1,3,0,99,68,3.48,Quintana,500779,,745834,2024-04-28,Jose Quintana,2024-04-28T17:40:00Z,2024-04-28,R,Final,St. Louis Cardinals,New York Mets,138,121,N,1,Jose Quintana,Lance Lynn,,,2,4,11.0,Bottom,3289,Citi Field,['MLB.tv Free Game'],STL wins 2-1,New York Mets,St. Louis Cardinals,Reed Garrett,Matthew Liberatore,,2024-04-28 - St. Louis Cardinals (2) @ New Yor...,,True,St. Louis Cardinals,True,False,94.67,4.00,2.67,5.33,24,2024,80,Excellent,121,138,True,138,0.500
173746,New York Mets,121,Díaz,1.0,0,0,0,0,2,0,20,13,0.93,Díaz,621242,,745834,2024-04-28,Edwin Díaz,2024-04-28T17:40:00Z,2024-04-28,R,Final,St. Louis Cardinals,New York Mets,138,121,N,1,Jose Quintana,Lance Lynn,,,2,4,11.0,Bottom,3289,Citi Field,['MLB.tv Free Game'],STL wins 2-1,New York Mets,St. Louis Cardinals,Reed Garrett,Matthew Liberatore,,2024-04-28 - St. Louis Cardinals (2) @ New Yor...,,True,St. Louis Cardinals,False,False,17.33,1.33,1.00,0.33,3,2024,48,Average,121,138,True,138,0.000


In [155]:
#df.drop(['date'], axis=1).corr()

scaler = MinMaxScaler()
poly = PolynomialFeatures(degree=2)


In [156]:
X, y = df.drop(['k'], axis=1), df['k']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [157]:
scaler.fit(X_train)
#scaled_X_train = pd.DataFrame(scaler.fit_transform(X_train))
scaled_X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)  # This sets the column names instead of indexes


In [158]:
scaler.fit(y_train.to_frame())
scaled_y_train = pd.DataFrame(scaler.fit_transform(y_train.to_frame()))
#scaled_y_train = pd.DataFrame(scaler.fit_transform(y_train.to_frame()), columns=y_train.columns)

#pd.DataFrame(scaler.inverse_transform(scaled_y_train)) # revert to original df