In [5]:
import pandas as pd
import numpy as np
import os
import pickle

In [6]:
import sys
# Make the project-local ``config`` module importable by adding every entry
# of the PYTHONPATH environment variable to sys.path.
# - An unset/empty PYTHONPATH contributes nothing (previously ``None`` was
#   appended to sys.path).
# - PYTHONPATH may hold several directories separated by os.pathsep; each one
#   is added individually, and entries already on sys.path are skipped.
for _pathEntry in filter(None, os.environ.get('PYTHONPATH', '').split(os.pathsep)):
    if _pathEntry not in sys.path:
        sys.path.append(_pathEntry)
from config import config

In [7]:
# Load the projected pitching/hitting stat lines and the historical
# player/team reference tables from <PROJECT_ROOT_DIR>/data.
_dataDir = os.path.join(config.PROJECT_ROOT_DIR, 'data')


def _readProjectCsv(*relativeParts):
    """Read a CSV located under the project's data directory (latin encoded)."""
    return pd.read_csv(os.path.join(_dataDir, *relativeParts), encoding='latin')


pitching = _readProjectCsv('projected', 'pitchingProjected.csv')
hitting = _readProjectCsv('projected', 'hittingProjected.csv')
players = _readProjectCsv('historical', 'players.csv')
teams = _readProjectCsv('historical', 'teams.csv')

In [8]:
# Load the two pickled models used for the ensemble prediction below.
# The ``with`` statement closes each file on exit, so no explicit close() is
# needed afterwards.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# model files that come from a trusted source.
# NOTE(review): the paths are relative, so the files are looked up in the
# kernel's current working directory — confirm that is intended.
with open('mvp_lr.model', 'rb') as f:
    model_lr = pickle.load(f)

with open('mvp_rf.model', 'rb') as f:
    model_rf = pickle.load(f)

In [9]:
def nineInningNormalize(df, statColumn, inningsPitchedColumn = 'ip'):
    """Scale a counting stat to a per-nine-innings rate.

    The innings column is interpreted so that its fractional part ``f``
    contributes ``f * 10 / 3`` innings (i.e. ``.1`` -> 1/3, ``.2`` -> 2/3),
    added to the rounded innings value.

    Parameters
    ----------
    df : DataFrame holding both columns.
    statColumn : name of the counting-stat column to normalize.
    inningsPitchedColumn : name of the innings-pitched column (default 'ip').

    Returns
    -------
    Series of ``stat * 9 / innings``, indexed like ``df``. Rows with zero
    innings yield inf/NaN from the division; callers are expected to clean
    those up.
    """
    innings = df[inningsPitchedColumn]
    # Both terms must come from ``df`` itself (not from a notebook-level
    # frame) so the function works for whatever frame is passed in.
    trueInnings = innings % 1 * 10 / 3 + innings.round()
    return df[statColumn] * 9 / trueInnings

In [10]:
def atBatNormalize(df, statColumn, atBatColumn):
    """Return the per-at-bat rate of a stat: ``df[statColumn] / df[atBatColumn]``.

    The division is element-wise; no guarding is done here, so rows with zero
    at-bats produce inf/NaN in the result.
    """
    statTotals = df[statColumn]
    atBats = df[atBatColumn]
    return statTotals / atBats

In [11]:
# Combine projected pitching and hitting lines per player. The outer join
# keeps players that appear in only one of the two tables; overlapping stat
# column names receive pandas' default _x (pitching) / _y (hitting) suffixes.
stats = pitching.merge(hitting, how='outer', on=['player_id'])

# Attach player reference data, stamp every row with the current season, and
# then pull in the team record for that (team_id, season) pair.
merged = stats.merge(players, how='left', on=['player_id'])
merged['season'] = config.CURRENT_SEASON
merged = merged.merge(teams, how='left', on=['team_id', 'season'])

In [12]:
minimumInningsPitched = 100
minimumAtBats = 300

# Keep rows whose primary_position is '1' only when they exceed the innings
# threshold, and all other rows only when they exceed the at-bat threshold.
# The explicit .copy() makes the filtered frame independent of the original,
# so the column assignments below do not raise SettingWithCopyWarning.
# NOTE(review): the comparison is against the string '1'; confirm that
# primary_position is read from the CSV as text rather than as an integer.
merged = merged[((merged['ip'] > minimumInningsPitched) & (merged['primary_position'] == '1')) |
                ((merged['ab'] > minimumAtBats) & (merged['primary_position'] != '1'))].copy()

# Derived pitching rates (columns suffixed _x come from the pitching table).
merged['sv_pct'] = merged['sv']/merged['svo']
merged['win_pct'] = merged['w']/(merged['w'] + merged['l'])
merged['hits_9'] = nineInningNormalize(merged, 'h_x')
merged['hrs_9'] = nineInningNormalize(merged, 'hr_x')
merged['bbs_9'] = nineInningNormalize(merged, 'bb_x')
merged['ks_9'] = nineInningNormalize(merged, 'so_x')
merged['ers_9'] = nineInningNormalize(merged, 'er')

# Derived hitting rates (columns suffixed _y come from the hitting table).
merged['hit_ab'] = atBatNormalize(merged, 'h_y', 'ab')
merged['hr_ab'] = atBatNormalize(merged, 'hr_y', 'ab')
merged['rbi_ab'] = atBatNormalize(merged, 'rbi', 'ab')
merged['bb_ab'] = atBatNormalize(merged, 'bb_y', 'ab')
merged['k_ab'] = atBatNormalize(merged, 'so_y', 'ab')

# Alternative feature set built from the derived rates above (currently unused):
# modelData = merged[['hits_9', 'hrs_9', 'bbs_9', 'ks_9', 'ers_9', 'sv_pct', 'win_pct', 'era', 'whip', 'hit_ab', 'hr_ab',
#                     'rbi_ab', 'bb_ab', 'k_ab', 'slg', 'obp', 'name_display_first_last', 'team_name', 'division_full',
#                     'primary_position', 'ab', 'ip']]

# Raw counting stats plus identifying columns. .copy() again gives an
# independent frame so the clean-up writes below are unambiguous.
modelData = merged[['h_x', 'hr_x', 'bb_x', 'so_x', 'er', 'sv', 'svo', 'w', 'l', 'ip', 'era', 'whip', 'h_y', 'hr_y',
                    'rbi', 'bb_y', 'so_y', 'slg', 'obp', 'ab', 'name_display_first_last', 'team_name', 'division_full',
                    'primary_position']].copy()


# modelData.loc[modelData['ip'] <= 5, 'ks_9'] = 0.0

# Some rate columns carry textual "no value" placeholders. Replace those with
# 0.0 and make the column numeric. isin() is used instead of ``==`` so the
# check is well defined whether the column was parsed as text or as floats.
placeholderValues = {
    'era': ['-.--', '*.**'],
    'whip': ['-.--', '*.**'],
    'slg': ['.---'],
    'obp': ['.---'],
}
for rateColumn, placeholders in placeholderValues.items():
    rateValues = modelData[rateColumn]
    modelData[rateColumn] = pd.to_numeric(rateValues.mask(rateValues.isin(placeholders), 0.0))

# Infinite values (e.g. from division by zero) and missing values become 0.
modelData = modelData.replace([np.inf, -np.inf], np.nan).fillna(0)

  result = method(y)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [13]:
# Columns that identify a player (or hold a previous prediction) and must not
# be passed to the models as features. 'prediction' is listed so that
# re-running this cell after it has already added that column still feeds the
# models the same feature set; errors='ignore' covers the first run, when the
# column does not exist yet.
nonFeatureColumns = ['name_display_first_last', 'team_name', 'division_full', 'primary_position', 'prediction']
predictData = modelData.drop(columns=nonFeatureColumns, errors='ignore')
# predictData = predictData.loc[:, predictData.columns != 'ab']
# predictData = predictData.loc[:, predictData.columns != 'ip']

# Ensemble: average the positive-class probability (column 1 of
# predict_proba) of the two loaded models.
probs = np.stack((model_lr.predict_proba(predictData)[:,1], model_rf.predict_proba(predictData)[:,1]))
avg_probs = np.mean(probs, axis=0)

# assign() returns a new frame instead of writing into modelData in place.
modelData = modelData.assign(prediction=avg_probs).sort_values(by = ['prediction'], ascending=False)

modelData

Unnamed: 0,h_x,hr_x,bb_x,so_x,er,sv,svo,w,l,ip,...,bb_y,so_y,slg,obp,ab,name_display_first_last,team_name,division_full,primary_position,prediction
893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,80.0,122.0,0.623,0.404,541.0,Cody Bellinger,Los Angeles Dodgers,National League West,3,9.145740e-01
935,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,88.0,126.0,0.601,0.409,546.0,Christian Yelich,Milwaukee Brewers,National League Central,7,7.939367e-01
873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,62.0,99.0,0.568,0.359,602.0,Nolan Arenado,Colorado Rockies,National League West,5,7.480807e-01
699,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,130.0,116.0,0.600,0.454,497.0,Mike Trout,Los Angeles Angels,American League West,8,7.218128e-01
605,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,70.0,131.0,0.562,0.384,555.0,J.D. Martinez,Boston Red Sox,American League East,7,6.441069e-01
836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,83.0,130.0,0.540,0.391,582.0,Freddie Freeman,Atlanta Braves,National League East,3,5.157203e-01
597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,88.0,97.0,0.511,0.390,585.0,Mookie Betts,Boston Red Sox,American League East,9,4.606251e-01
1040,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,64.0,93.0,0.552,0.391,496.0,Anthony Rendon,Washington Nationals,National League East,5,4.602591e-01
807,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,48.0,72.0,0.520,0.366,458.0,Vladimir Guerrero Jr.,Toronto Blue Jays,American League East,5,4.334936e-01
631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,54.0,87.0,0.510,0.353,539.0,Francisco Lindor,Cleveland Indians,American League Central,6,4.037892e-01
