In [1]:
# Imports for Analysis
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from itertools import combinations
%matplotlib inline

# Data Imports
We are going to look at the baseball_season_YEAR_full.json files for this. Each extract covers one season and contains inning-by-inning offensive runs, defensive runs allowed, home stats, away stats, and home and away outcomes. All of the information was scraped from Baseball Reference (www.baseball-reference.com) by the web scraping script https://github.com/anchorP34/Baseball-Reference-Extracts/blob/master/get_all_season_innings.py

In [2]:
# Seasons covered by the scraped extracts.
seasons = [2015, 2016, 2017, 2018]

# Read each season's JSON extract and publish it as a notebook-level variable
# named season_<YEAR> (e.g. season_2018); the cells below look the data up by
# that name.
for year in seasons:
    extract_path = 'baseball_season_{}_full.json'.format(year)
    with open(extract_path) as extract_file:
        globals()['season_{}'.format(year)] = json.load(extract_file)

In [7]:
# Report the Colorado Rockies' win total for every loaded season.
for year in seasons:
    season_data = globals()['season_{}'.format(year)]
    rockies_wins = season_data[str(year)]['Colorado Rockies']['TotalWins']
    print('{} Rockies Wins: {}'.format(year, rockies_wins))

2015 Rockies Wins: 68
2016 Rockies Wins: 75
2017 Rockies Wins: 87
2018 Rockies Wins: 91


# Inning by Inning Cleanup

Each team in each season has TotalOffenseInnings and TotalDefenseInnings variables, each holding one array per game played with one entry per inning. TotalOffenseInnings holds the runs scored by that team in each inning of that game, while TotalDefenseInnings holds the runs given up in each inning of that game.

Each array can be different in length, but must have at least 9 innings. You will also sometimes see an X as the 9th value of the array because the home team was already winning heading into the bottom of the 9th and therefore did not need to bat. We replace the X with a 0, since no runs were scored in a half-inning that was never played.

In [9]:
# Show the raw per-inning runs scored by the 2018 Rockies, one game per line.
rockies_offense_2018 = season_2018['2018']['Colorado Rockies']['TotalOffenseInnings']
for inning_runs in rockies_offense_2018:
    print(inning_runs)

['1', '0', '0', '0', '0', '1', '0', '0', '0']
['0', '2', '0', '5', '0', '0', '1', '0', '0']
['0', '0', '0', '0', '0', '1', '0', '1', '0']
['1', '0', '2', '0', '2', '0', '0', '0', '2']
['0', '0', '2', '1', '0', '0', '0', '0', '1']
['3', '2', '0', '0', '0', '0', '0', '0', '0']
['0', '0', '0', '0', '0', '0', '0', '0', '3']
['1', '0', '0', '2', '0', '0', '0', '0', '0']
['0', '0', '1', '1', '0', '0', '0', '0', '0', '1']
['0', '0', '0', '0', '0', '0', '0', '0', '0']
['0', '1', '0', '3', '0', '2', '0', '0', '0']
['2', '0', '0', '0', '0', '0', '0', '0', '0']
['0', '0', '5', '0', '0', '1', '0', '0', 'X']
['2', '1', '0', '0', '0', '2', '0', '0', '0']
['0', '0', '0', '1', '0', '1', '0', '0', '0']
['2', '0', '0', '0', '0', '0', '0', '0', '0']
['0', '0', '0', '1', '0', '3', '0', '1', '1']
['2', '0', '0', '3', '0', '0', '0', '1', '0']
['1', '0', '0', '1', '0', '0', '0', '0', '0']
['0', '0', '1', '0', '0', '0', '0', '1', '0']
['3', '0', '0', '0', '0', '0', '1', '0', '1']
['0', '0', '0', '0', '5', '0'

We want to go through each season and each team to get their offensive runs scored for the whole season, and for each inning individually. Since some teams had games that went 15 innings and others never made it past the 13th, all runs from the 10th inning onward are combined into a single column.

In [10]:
# Season-level offensive run totals per team.
# Columns: Season/Team identifiers, the season total, one column for each of
# innings 1-9, and a single bucket holding every extra inning (10th onward).
offense_columns = (
    ['Season', 'Team', 'TotalOffensiveRuns']
    + ['Off_Inning_{}'.format(inning_num) for inning_num in range(1, 10)]
    + ['Off_Inning_10+']
)

offense_rows = []
for season in seasons:
    season_teams = globals()['season_{}'.format(season)][str(season)]

    for team in season_teams:
        # Slots 0-8 accumulate innings 1-9; slot 9 accumulates all extra innings.
        inning_totals = [0] * 10

        for game in season_teams[team]['TotalOffenseInnings']:
            for idx, inning in enumerate(game):
                # 'X' marks a half-inning that was not played; count it as 0 runs.
                inning_totals[min(idx, 9)] += int(inning.replace('X', '0'))

        # The season total is the sum over every inning bucket.
        offense_rows.append([season, team, sum(inning_totals)] + inning_totals)

# Build the frame once from the collected rows: this avoids DataFrame.append
# (removed in pandas 2.0), is linear instead of quadratic, and gives every row
# a unique index.
offensive_games = pd.DataFrame(offense_rows, columns=offense_columns)

offensive_games

Unnamed: 0,Season,Team,TotalOffensiveRuns,Off_Inning_1,Off_Inning_2,Off_Inning_3,Off_Inning_4,Off_Inning_5,Off_Inning_6,Off_Inning_7,Off_Inning_8,Off_Inning_9,Off_Inning_10+
0,2015,St. Louis Cardinals,647,80,63,66,79,84,65,77,76,41,16
0,2015,Chicago Cubs,689,88,68,103,59,117,66,63,55,49,21
0,2015,San Francisco Giants,696,72,74,101,72,78,85,84,83,37,10
0,2015,Arizona Diamondbacks,720,84,102,64,87,86,85,73,69,55,15
0,2015,Pittsburgh Pirates,697,86,72,66,77,84,80,80,69,62,21
0,2015,Cincinnati Reds,640,96,64,71,60,67,88,72,54,60,8
0,2015,Minnesota Twins,696,92,83,95,92,63,75,69,60,59,8
0,2015,Detroit Tigers,689,102,58,80,90,79,74,75,71,44,16
0,2015,Cleveland Indians,669,68,69,106,67,78,80,69,78,43,11
0,2015,Houston Astros,729,82,75,73,83,83,77,103,78,58,17


Want to do the same analysis for defensive runs as well

In [11]:
# Season-level runs allowed per team.
# Columns: Season/Team identifiers, the season total, one column for each of
# innings 1-9, and a single bucket holding every extra inning (10th onward).
defense_columns = (
    ['Season', 'Team', 'TotalDefensiveRuns']
    + ['Def_Inning_{}'.format(inning_num) for inning_num in range(1, 10)]
    + ['Def_Inning_10+']
)

defense_rows = []
for season in seasons:
    season_teams = globals()['season_{}'.format(season)][str(season)]

    for team in season_teams:
        # Slots 0-8 accumulate innings 1-9; slot 9 accumulates all extra innings.
        inning_totals = [0] * 10

        for game in season_teams[team]['TotalDefenseInnings']:
            for idx, inning in enumerate(game):
                # 'X' marks a half-inning that was not played; count it as 0 runs.
                inning_totals[min(idx, 9)] += int(inning.replace('X', '0'))

        # The season total is the sum over every inning bucket.
        defense_rows.append([season, team, sum(inning_totals)] + inning_totals)

# Build the frame once from the collected rows: this avoids DataFrame.append
# (removed in pandas 2.0), is linear instead of quadratic, and gives every row
# a unique index.
defense_games = pd.DataFrame(defense_rows, columns=defense_columns)

defense_games

Unnamed: 0,Season,Team,TotalDefensiveRuns,Def_Inning_1,Def_Inning_2,Def_Inning_3,Def_Inning_4,Def_Inning_5,Def_Inning_6,Def_Inning_7,Def_Inning_8,Def_Inning_9,Def_Inning_10+
0,2015,St. Louis Cardinals,525,76,44,61,51,56,58,64,51,48,16
0,2015,Chicago Cubs,608,75,71,51,66,65,77,72,59,64,8
0,2015,San Francisco Giants,627,76,63,82,70,66,81,65,55,61,8
0,2015,Arizona Diamondbacks,713,100,73,69,59,114,87,51,72,70,18
0,2015,Pittsburgh Pirates,596,87,67,70,61,66,53,76,51,48,17
0,2015,Cincinnati Reds,754,89,73,88,91,102,83,84,78,47,19
0,2015,Minnesota Twins,700,72,77,93,91,73,82,70,80,48,14
0,2015,Detroit Tigers,803,86,78,109,94,91,89,109,80,52,15
0,2015,Cleveland Indians,640,90,66,69,94,63,67,71,59,54,7
0,2015,Houston Astros,618,105,75,55,50,87,67,57,60,47,15


Now we can bring the offensive and defensive innings together into one dataframe so we can find the net between the two.

In [12]:
# Join the offensive and defensive tables. No keys are given, so pandas merges
# on the columns the two frames have in common.
offense_and_defense = offensive_games.merge(defense_games)
offense_and_defense.head()

Unnamed: 0,Season,Team,TotalOffensiveRuns,Off_Inning_1,Off_Inning_2,Off_Inning_3,Off_Inning_4,Off_Inning_5,Off_Inning_6,Off_Inning_7,...,Def_Inning_1,Def_Inning_2,Def_Inning_3,Def_Inning_4,Def_Inning_5,Def_Inning_6,Def_Inning_7,Def_Inning_8,Def_Inning_9,Def_Inning_10+
0,2015,St. Louis Cardinals,647,80,63,66,79,84,65,77,...,76,44,61,51,56,58,64,51,48,16
1,2015,Chicago Cubs,689,88,68,103,59,117,66,63,...,75,71,51,66,65,77,72,59,64,8
2,2015,San Francisco Giants,696,72,74,101,72,78,85,84,...,76,63,82,70,66,81,65,55,61,8
3,2015,Arizona Diamondbacks,720,84,102,64,87,86,85,73,...,100,73,69,59,114,87,51,72,70,18
4,2015,Pittsburgh Pirates,697,86,72,66,77,84,80,80,...,87,67,70,61,66,53,76,51,48,17


Now we can take the "net" runs of each inning. We are just going to subtract the defensive innings from the offensive innings. This way we can see if a team scored more runs than they gave up for the specific inning of the game.

In [13]:
# Net runs = runs scored minus runs allowed, for innings 1-9, the extra-inning
# bucket, and finally the whole season.
inning_labels = list(range(1, 10)) + ['10+']
for label in inning_labels:
    runs_scored = offense_and_defense['Off_Inning_{}'.format(label)]
    runs_allowed = offense_and_defense['Def_Inning_{}'.format(label)]
    offense_and_defense['Net_Inning_{}'.format(label)] = runs_scored - runs_allowed

offense_and_defense['TotalNetRuns'] = (
    offense_and_defense['TotalOffensiveRuns'] - offense_and_defense['TotalDefensiveRuns']
)

In [19]:
# Preview only the inning-level net-run columns.
net_inning_cols = [name for name in offense_and_defense.columns if 'Net' in name and 'Inning' in name]
offense_and_defense[net_inning_cols].head()

Unnamed: 0,Net_Inning_1,Net_Inning_2,Net_Inning_3,Net_Inning_4,Net_Inning_5,Net_Inning_6,Net_Inning_7,Net_Inning_8,Net_Inning_9,Net_Inning_10+
0,4,19,5,28,28,7,13,25,-7,0
1,13,-3,52,-7,52,-11,-9,-4,-15,13
2,-4,11,19,2,12,4,19,28,-24,2
3,-16,29,-5,28,-28,-2,22,-3,-15,-3
4,-1,5,-4,16,18,27,4,18,14,4


In [21]:
# Season-level totals: runs scored, runs allowed, and their difference.
season_total_cols = ['Season','Team','TotalOffensiveRuns','TotalDefensiveRuns','TotalNetRuns']
offense_and_defense.loc[:, season_total_cols]

Unnamed: 0,Season,Team,TotalOffensiveRuns,TotalDefensiveRuns,TotalNetRuns
0,2015,St. Louis Cardinals,647,525,122
1,2015,Chicago Cubs,689,608,81
2,2015,San Francisco Giants,696,627,69
3,2015,Arizona Diamondbacks,720,713,7
4,2015,Pittsburgh Pirates,697,596,101
5,2015,Cincinnati Reds,640,754,-114
6,2015,Minnesota Twins,696,700,-4
7,2015,Detroit Tigers,689,803,-114
8,2015,Cleveland Indians,669,640,29
9,2015,Houston Astros,729,618,111


We need to now include our response variable, which is the total number of wins that the team had for that season

In [24]:
# One row per (season, team) holding that team's season win total, which is the
# response variable for the regressions.
win_rows = []
for season in seasons:
    season_teams = globals()['season_{}'.format(season)][str(season)]
    for team in season_teams:
        win_rows.append([season, team, season_teams[team]['TotalWins']])

# Build the frame once from the collected rows: this avoids DataFrame.append
# (removed in pandas 2.0) and gives every row a unique index.
team_wins = pd.DataFrame(win_rows, columns=['Season', 'Team', 'SeasonWins'])

team_wins

Unnamed: 0,Season,Team,SeasonWins
0,2015,St. Louis Cardinals,100
0,2015,Chicago Cubs,97
0,2015,San Francisco Giants,84
0,2015,Arizona Diamondbacks,79
0,2015,Pittsburgh Pirates,98
0,2015,Cincinnati Reds,64
0,2015,Minnesota Twins,83
0,2015,Detroit Tigers,74
0,2015,Cleveland Indians,81
0,2015,Houston Astros,86


Bring everything together

In [36]:
# Attach each team's season win total to the offensive/defensive/net run table.
total_analysis_df = offense_and_defense.merge(team_wins)

# Cast every column except the Season/Team identifiers to float so the
# regression cells receive float inputs.
value_cols = [name for name in total_analysis_df.columns if name not in ['Season','Team']]
for name in value_cols:
    total_analysis_df[name] = total_analysis_df[name].map(float)

total_analysis_df

Unnamed: 0,Season,Team,TotalOffensiveRuns,Off_Inning_1,Off_Inning_2,Off_Inning_3,Off_Inning_4,Off_Inning_5,Off_Inning_6,Off_Inning_7,...,Net_Inning_3,Net_Inning_4,Net_Inning_5,Net_Inning_6,Net_Inning_7,Net_Inning_8,Net_Inning_9,Net_Inning_10+,TotalNetRuns,SeasonWins
0,2015,St. Louis Cardinals,647.0,80.0,63.0,66.0,79.0,84.0,65.0,77.0,...,5.0,28.0,28.0,7.0,13.0,25.0,-7.0,0.0,122.0,100.0
1,2015,Chicago Cubs,689.0,88.0,68.0,103.0,59.0,117.0,66.0,63.0,...,52.0,-7.0,52.0,-11.0,-9.0,-4.0,-15.0,13.0,81.0,97.0
2,2015,San Francisco Giants,696.0,72.0,74.0,101.0,72.0,78.0,85.0,84.0,...,19.0,2.0,12.0,4.0,19.0,28.0,-24.0,2.0,69.0,84.0
3,2015,Arizona Diamondbacks,720.0,84.0,102.0,64.0,87.0,86.0,85.0,73.0,...,-5.0,28.0,-28.0,-2.0,22.0,-3.0,-15.0,-3.0,7.0,79.0
4,2015,Pittsburgh Pirates,697.0,86.0,72.0,66.0,77.0,84.0,80.0,80.0,...,-4.0,16.0,18.0,27.0,4.0,18.0,14.0,4.0,101.0,98.0
5,2015,Cincinnati Reds,640.0,96.0,64.0,71.0,60.0,67.0,88.0,72.0,...,-17.0,-31.0,-35.0,5.0,-12.0,-24.0,13.0,-11.0,-114.0,64.0
6,2015,Minnesota Twins,696.0,92.0,83.0,95.0,92.0,63.0,75.0,69.0,...,2.0,1.0,-10.0,-7.0,-1.0,-20.0,11.0,-6.0,-4.0,83.0
7,2015,Detroit Tigers,689.0,102.0,58.0,80.0,90.0,79.0,74.0,75.0,...,-29.0,-4.0,-12.0,-15.0,-34.0,-9.0,-8.0,1.0,-114.0,74.0
8,2015,Cleveland Indians,669.0,68.0,69.0,106.0,67.0,78.0,80.0,69.0,...,37.0,-27.0,15.0,13.0,-2.0,19.0,-11.0,4.0,29.0,81.0
9,2015,Houston Astros,729.0,82.0,75.0,73.0,83.0,83.0,77.0,103.0,...,18.0,33.0,-4.0,10.0,46.0,18.0,11.0,2.0,111.0,86.0


# Regression Analysis

We want to look at how well net runs in each inning predict regular season wins. We first need a baseline estimator across all of the seasons, so we can determine whether more variance is explained by using the innings individually than by the overall TotalNetRuns.

In [42]:
# Baseline model: season wins regressed on total net runs plus an intercept.
Net_Innings_Reg_y = total_analysis_df['SeasonWins']
Net_Innings_Reg_X = sm.add_constant(total_analysis_df['TotalNetRuns'])

est = sm.OLS(Net_Innings_Reg_y, Net_Innings_Reg_X)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:             SeasonWins   R-squared:                       0.864
Model:                            OLS   Adj. R-squared:                  0.863
Method:                 Least Squares   F-statistic:                     750.6
Date:                Sun, 25 Nov 2018   Prob (F-statistic):           5.57e-53
Time:                        11:02:49   Log-Likelihood:                -346.29
No. Observations:                 120   AIC:                             696.6
Df Residuals:                     118   BIC:                             702.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           80.9833      0.399    202.919   

We will use R-Squared and Adjusted R-Squared as the metrics to determine whether our models are improving or not. Our baseline R-Squared metric will be .864, but when we will use all of the innings, we need to look at adjusted R-Squared due to it punishing having more variables into the linear equation. 

In [40]:
# Fit the inning-level net-run model separately for each season.
reg_cols = [col for col in total_analysis_df.columns if ('Inning' in col) and ('Net' in col)]
for season in seasons:
    print(season)
    # Restrict both the features and the response to the current season's rows.
    season_rows = total_analysis_df[total_analysis_df['Season'] == season]
    Net_Innings_Reg_y = season_rows['SeasonWins']
    Net_Innings_Reg_X = sm.add_constant(season_rows[reg_cols])

    est = sm.OLS(Net_Innings_Reg_y, Net_Innings_Reg_X)
    est2 = est.fit()
    print('{}\n'.format(est2.summary()))

2015
                            OLS Regression Results                            
Dep. Variable:             SeasonWins   R-squared:                       0.833
Model:                            OLS   Adj. R-squared:                  0.746
Method:                 Least Squares   F-statistic:                     9.501
Date:                Sun, 25 Nov 2018   Prob (F-statistic):           1.79e-05
Time:                        09:22:09   Log-Likelihood:                -85.590
No. Observations:                  30   AIC:                             193.2
Df Residuals:                      19   BIC:                             208.6
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const             80.9667      0.963   

When we look at all the seasons at once, we get an adjusted R-squared of .868, which beats the baseline Total Net Runs model's adjusted R-squared of .863 by only .005. Given such a small gain, it may be more practical to just look at Total Net Runs rather than inning by inning, since it is much easier to interpret.

In [35]:
# Inning-level net-run model fitted on every season at once.
reg_cols = [col for col in total_analysis_df.columns if ('Inning' in col) and ('Net' in col)]
Net_Innings_Reg_y = total_analysis_df['SeasonWins']
Net_Innings_Reg_X = sm.add_constant(total_analysis_df[reg_cols])

est = sm.OLS(Net_Innings_Reg_y, Net_Innings_Reg_X)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:             SeasonWins   R-squared:                       0.879
Model:                            OLS   Adj. R-squared:                  0.868
Method:                 Least Squares   F-statistic:                     79.38
Date:                Sun, 25 Nov 2018   Prob (F-statistic):           2.42e-45
Time:                        09:18:08   Log-Likelihood:                -339.21
No. Observations:                 120   AIC:                             700.4
Df Residuals:                     109   BIC:                             731.1
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const             80.9833      0.391    206.

## That was looking at all of the innings, but is there a subset of innings that gives us better results?

So far we have used every single inning in our regression analysis, but there could be a subset of innings that gives us better predictions and a higher Adjusted R-Squared value.

Our first analysis is going to look at feature significance. We start with the single feature that gives the highest adjusted R-Squared, and then keep adding one inning at a time, each time keeping the set of innings that gives the best model, until adding another feature no longer produces a model whose terms are all significant or no longer improves the adjusted R-Squared. This is also known as forward selection.

In [44]:
# Forward selection over the inning-level net-run features.
# At each model size, try the current best model plus exactly one unused
# feature, keep only fits whose terms are all significant, and adopt the
# candidate with the highest adjusted R-squared if it beats the best so far.
complete_analysis = {}   # {model size: {"feat_a,feat_b": {'RSquared': adjusted R-squared}}}
max_r_square = 0         # best adjusted R-squared seen so far
selected_feats = []      # features of the best model found so far

reg_cols = [col for col in total_analysis_df.columns if ('Inning' in col) and ('Net' in col)]
Net_Innings_Reg_y = total_analysis_df['SeasonWins']

for num_of_feats in range(1, len(reg_cols) + 1):
    print(num_of_feats)
    complete_analysis[num_of_feats] = {}

    for new_feat in reg_cols:
        if new_feat in selected_feats:
            continue

        # Membership is tested against a list of names, never a joined string:
        # 'Net_Inning_1' is a substring of 'Net_Inning_10+', so a substring test
        # would miscount which features are already selected.
        # Features stay in reg_cols order so the combo key is stable.
        combo = [col for col in reg_cols if col == new_feat or col in selected_feats]

        Net_Innings_Reg_X = sm.add_constant(total_analysis_df[combo])
        est2 = sm.OLS(Net_Innings_Reg_y, Net_Innings_Reg_X).fit()

        # Keep the model only if the intercept and every feature have p < .05.
        if (est2.pvalues < .05).all():
            # The 'RSquared' key stores the *adjusted* R-squared.
            complete_analysis[num_of_feats][",".join(combo)] = {'RSquared': est2.rsquared_adj}

    improved = False
    for combo_string, stats in complete_analysis[num_of_feats].items():
        if stats['RSquared'] > max_r_square:
            new_best_model = combo_string
            max_r_square = stats['RSquared']
            improved = True

    # Stop as soon as no candidate of this size beats the best model so far.
    if not improved:
        break

    selected_feats = new_best_model.split(',')
    print("New Best Combo of Features: {} - {}".format(new_best_model, max_r_square))

1
New Best Combo of Features: Net_Inning_4 - 0.43199217451096794
2
New Best Combo of Features: Net_Inning_4,Net_Inning_5 - 0.6087499577173356
3
New Best Combo of Features: Net_Inning_2,Net_Inning_4,Net_Inning_5 - 0.6869373806781446
4
New Best Combo of Features: Net_Inning_2,Net_Inning_4,Net_Inning_5,Net_Inning_8 - 0.7523356863699837
5
New Best Combo of Features: Net_Inning_1,Net_Inning_2,Net_Inning_4,Net_Inning_5,Net_Inning_8 - 0.7913774507559423
6
New Best Combo of Features: Net_Inning_1,Net_Inning_2,Net_Inning_4,Net_Inning_5,Net_Inning_7,Net_Inning_8 - 0.8329279098706857
7
New Best Combo of Features: Net_Inning_1,Net_Inning_2,Net_Inning_3,Net_Inning_4,Net_Inning_5,Net_Inning_7,Net_Inning_8 - 0.847532142572351
8
New Best Combo of Features: Net_Inning_1,Net_Inning_2,Net_Inning_3,Net_Inning_4,Net_Inning_5,Net_Inning_7,Net_Inning_8,Net_Inning_10+ - 0.8586642026551802
9
New Best Combo of Features: Net_Inning_1,Net_Inning_2,Net_Inning_3,Net_Inning_4,Net_Inning_5,Net_Inning_7,Net_Inning_8,N

Since each of the innings is significant, we ended up using all of the features in the model. We can also see which innings are most influential in the model; for example, the 4th inning is the most influential individual inning when no other innings are considered. Since the data set is small and the number of features is small, we can fit every possible subset of features to see whether forward selection missed something.

In [45]:
# Exhaustive search: fit every subset of the inning-level net-run features,
# one model size at a time, and track the best adjusted R-squared among fits
# whose terms are all significant.
complete_analysis = {}
max_r_square = 0

reg_cols = [col for col in total_analysis_df.columns if ('Inning' in col) and ('Net' in col)]
for num_of_feats in range(1,len(reg_cols) + 1):
    print(num_of_feats)
    size_results = {}
    complete_analysis[num_of_feats] = size_results

    for feat_tuple in combinations(reg_cols, num_of_feats):
        combo = list(feat_tuple)
        combo_string = ",".join(combo)

        Net_Innings_Reg_y = total_analysis_df['SeasonWins']
        Net_Innings_Reg_X = sm.add_constant(total_analysis_df[combo])
        est2 = sm.OLS(Net_Innings_Reg_y, Net_Innings_Reg_X).fit()

        # Record the fit only when the intercept and every feature have p < .05.
        all_significant = sum(est2.pvalues < .05) == (len(combo) + 1)
        if all_significant:
            size_results[combo_string] = {'RSquared': est2.rsquared_adj}

    found_better = False
    for combo_string, scores in size_results.items():
        if scores['RSquared'] > max_r_square:
            new_best_model, max_r_square = combo_string, scores['RSquared']
            found_better = True

    # Stop once a whole model size fails to beat the best score so far.
    if not found_better:
        break
    print("New Best Combo of Features: {} - {}".format(new_best_model.split(','), max_r_square))

1
New Best Combo of Features: ['Net_Inning_4'] - 0.43199217451096794
2
New Best Combo of Features: ['Net_Inning_4', 'Net_Inning_5'] - 0.6087499577173356
3
New Best Combo of Features: ['Net_Inning_2', 'Net_Inning_4', 'Net_Inning_5'] - 0.6869373806781446
4
New Best Combo of Features: ['Net_Inning_2', 'Net_Inning_4', 'Net_Inning_5', 'Net_Inning_8'] - 0.7523356863699837
5
New Best Combo of Features: ['Net_Inning_1', 'Net_Inning_2', 'Net_Inning_5', 'Net_Inning_7', 'Net_Inning_8'] - 0.8040905946523569
6
New Best Combo of Features: ['Net_Inning_1', 'Net_Inning_2', 'Net_Inning_4', 'Net_Inning_5', 'Net_Inning_7', 'Net_Inning_8'] - 0.8329279098706857
7
New Best Combo of Features: ['Net_Inning_1', 'Net_Inning_2', 'Net_Inning_3', 'Net_Inning_4', 'Net_Inning_5', 'Net_Inning_7', 'Net_Inning_8'] - 0.847532142572351
8
New Best Combo of Features: ['Net_Inning_1', 'Net_Inning_2', 'Net_Inning_3', 'Net_Inning_4', 'Net_Inning_5', 'Net_Inning_7', 'Net_Inning_8', 'Net_Inning_10+'] - 0.8586642026551802
9
New 

Both approaches end at the same best adjusted R-squared of 0.868, regardless of the path taken through the features. The exhaustive search selects the same feature sets as forward selection at every model size shown except 5 features, where its best subset leaves out Net_Inning_4 and uses Net_Inning_7 instead.

# Conclusion

Comparing the Total Net Runs model with every subset of inning-level net runs shows that using all of the innings individually is the best way to predict how many wins a team will have at the end of the year...by a hair. By looking at the t values in the regression summary below, we can see that the 5th inning, 1st inning, and 8th inning are the 3 most influential innings. Surprisingly, the 6th inning is the least influential inning even though the 5th inning is the most influential. That probably has to do with a new pitcher coming in, since most starting pitchers try to get through 5 innings so they can be credited with the win (a starting pitcher has to complete a minimum of 5 innings to be eligible for the win).

In [48]:
# Refit the all-innings model explicitly rather than printing whatever est2 the
# most recently executed cell happened to leave behind, so the summary shown
# here always matches the model discussed in the conclusion.
final_cols = [col for col in total_analysis_df.columns if ('Inning' in col) and ('Net' in col)]
est2 = sm.OLS(total_analysis_df['SeasonWins'], sm.add_constant(total_analysis_df[final_cols])).fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:             SeasonWins   R-squared:                       0.879
Model:                            OLS   Adj. R-squared:                  0.868
Method:                 Least Squares   F-statistic:                     79.38
Date:                Sun, 25 Nov 2018   Prob (F-statistic):           2.42e-45
Time:                        13:42:37   Log-Likelihood:                -339.21
No. Observations:                 120   AIC:                             700.4
Df Residuals:                     109   BIC:                             731.1
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const             80.9833      0.391    206.