In [15]:
import pandas as pd
import numpy as np
import psycopg2
import math
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(font_scale=1.5)

In [16]:
#Create connection and cursor
conn = psycopg2.connect("dbname=pfxbaseballdata")
cur = conn.cursor()

In [17]:
# Define query
query = """
SELECT * 
FROM batter_stats_table AS B
INNER JOIN player_id_table AS IB on IB.fg_id = B.playerid
INNER JOIN test_pitches AS GD on IB.mlb_id = GD.batter_id
INNER JOIN player_id_table AS IP on IP.mlb_id = GD.pitcher_id
INNER JOIN pitcher_stats_table AS P on P.playerid = IP.fg_id;
"""
#Execute the query
cur.execute(query)
# Return the next row
GameData = cur.fetchall()

In [18]:
GameData_df = pd.DataFrame(GameData, columns=['b_name', 'b_team', 'b_games', 'PA', 'HR', 'R', 'RBI', 'SB',
                                             'BB_percent', 'K_percent','b_ISO', 'b_BABIP', 'BA_AVG', 'OBP', 'SLG', 'wOBA',
                                              'WRC_plus', 'BSR', 'OFF', 'DEF', 'WAR', 'b_playerid',
                                              'b_mlb_id', 'b_mlb_name', 'b_mlb_pos', 'b_mlb_team', 'b_mlb_team_full',
                                              'b_bats', 'batter_throws', 'b_birth_year', 'b_bp_id', 'b_bref_id', 
                                              'b_bref_name','b_cbs_id', 'b_cbs_name', 'b_cbs_pos', 'b_espn_id', 
                                              'b_espn_name','b_espn_pos','b_fg_id','b_fg_name','b_fg_pos',
                                              'b_lahman_id', 'b_nfbc_id','b_nfbc_name','b_nfbc_pos','b_retro_id', 
                                              'b_retro_name', 'b_debut', 'b_yahoo_id','b_yahoo_name',
                                              'b_yahoo_pos','b_mlb_depth', 'b_ottoneu_id', 'b_ottoneu_name', 
                                              'b_ottoneu_pos','dataStamp', 'park_sv_id', 'play_guid', 'ab_total', 
                                              'ab_count','pitcher_id', 'batter_id', 'ab_id', 'des', 'pitcher_type',
                                              'pitch_id', 'sz_top', 'sz_bot', 'pfx_xDataFile', 'pfx_zDataFile',
                                              'mlbam_pitch_name', 'zone_location', 'pitch_con', 'stand',
                                              'strikes', 'balls', 'p_throws', 'gid', 'pdes', 'spin', 'norm_ht',
                                              'inning', 'pitcher_team', 'tstart', 'vystart', 'ftime', 'pfx_x',
                                              'pfx_z', 'uncorrected_pfx_x', 'uncorrected_pfx_z', 'x0',
                                              'y0', 'z0', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'start_speed',
                                              'px', 'pz', 'pxold', 'pzold', 'tm_spin', 'sb', 'p_mlb_id',
                                              'p_mlb_name', 'p_mlb_pos', 'p_mlb_team','p_mlb_team_full', 'p_bats', 
                                              'pfx_p_throws','p_birth_year', 'p_bp_id', 'p_bref_id', 'p_bref_name', 
                                              'p_cbs_id', 'p_cbs_name','p_cbs_pos', 'p_espn_id', 'p_espn_name', 
                                              'p_espn_pos', 
                                              'p_fg_id','p_fg_name','p_fg_pos', 'p_lahman_id', 'p_nfbc_id', 'p_nfbc_name', 
                                              'p_nfbc_pos', 'p_retro_id','p_retro_name', 'p_debut', 'p_yahoo_id', 'p_yahoo_name', 
                                              'p_yahoo_pos','p_mlb_depth', 'p_ottoneu_id', 'p_ottoneu_name', 'p_ottoneu_pos', 
                                              'p_name',
                                              'p_team', 'p_wins', 'p_losses', 'p_saves', 'p_games', 'p_games_started', 'p_innings_pitched',
                                              'p_k_per_9', 'p_bb_per_9', 'p_hr_per_9', 'p_babip', 'p_lob_percent',
                                              'p_gb_percent', 'p_hr_fb_percent', 'p_era', 'p_fip', 'p_xfip', 'p_war',
                                              'p_playerid'])

In [19]:
cleanGameData_df = GameData_df.loc[:,['b_games', 'PA', 'HR', 'R', 'RBI', 'SB',
                                    'BB_percent', 'K_percent','b_ISO', 'b_BABIP', 'BA_AVG', 'OBP', 'SLG', 'wOBA',
                                    'WRC_plus', 'BSR', 'OFF', 'DEF', 'WAR', 'b_playerid','b_mlb_id', 
                                    'b_fg_id','b_debut', 'ab_total', 'ab_count','pitcher_id', 
                                    'batter_id', 'ab_id','pitch_id', 'sz_top', 'sz_bot', 'pfx_xDataFile', 
                                    'pfx_zDataFile', 'pitch_con','strikes', 'balls', 'spin', 
                                    'norm_ht','inning','tstart', 'vystart', 'ftime', 'pfx_x','pfx_z', 
                                    'uncorrected_pfx_x','uncorrected_pfx_z', 'x0','y0', 'z0', 'vx0', 'vy0', 'vz0', 
                                    'ax', 'ay', 'az','start_speed','px', 'pz', 'pxold', 'pzold', 'tm_spin', 'sb', 
                                    'p_mlb_id','p_birth_year','p_fg_id','p_debut', 'p_wins', 'p_losses', 
                                    'p_saves','p_games', 'p_games_started', 'p_innings_pitched','p_k_per_9', 
                                    'p_bb_per_9','p_hr_per_9', 'p_babip', 'p_lob_percent','p_gb_percent', 
                                    'p_hr_fb_percent','p_era', 'p_fip', 'p_xfip', 'p_war','p_playerid']]

In [20]:
#convert right vs left hander batter to binary
batter_stance = []
for batter in GameData_df.loc[:,]['stand']:
    if batter == 'R':
        batter_stance.append(1)
    else:
        batter_stance.append(0)
cleanGameData_df['batter_stance'] = batter_stance

In [21]:
#convert right vs left hander pitcher to binary
right_left_handness = []
for pitcher in GameData_df.loc[:,]['pfx_p_throws']:
    if pitcher == 'R':
        right_left_handness.append(1)
    else:
        right_left_handness.append(0)
cleanGameData_df['pitcher_handedness'] = right_left_handness

In [22]:
#Binary Outcome for logistic regression
outcome = [] 
for pitch in GameData_df.loc[:,]['pdes']:
    if 'play' in pitch:
        if 'no' in pitch or 'run' in pitch:
            outcome.append(1) #1 indicates hit
        else:
            outcome.append(0)
    else:
        outcome.append(0) #0 indicates a non-hit outcome
cleanGameData_df['pitch_outcome'] = outcome

In [45]:
#Binary Outcome for logistic regression
ab_outcome = [] 
ball_in_play = ['Single', 'Double', 'Triple', 'Home Run']
for pitch in GameData_df.loc[:,]['des']:
    if pitch in ball_in_play and 'Play' not in pitch:
        ab_outcome.append(1)
    else:
        ab_outcome.append(0)
cleanGameData_df['ab_outcome'] = ab_outcome 

In [24]:
#Convert probabilities within data set to log odds for logistic regression analysis
#Log Odds: LO = LOGe[Odds]= LOGe[p/(1-p)] 
def proba_to_logodds(probability):
    odds = (probability / (1 - probability))
    logodds = math.log(odds)
    return logodds
#test    
proba_to_logodds(.2)

-1.3862943611198906

In [25]:
#Columns to convert to logodds
columns_to_convert = ["BB_percent", "K_percent", "BA_AVG", "OBP", "SLG", "p_BABIP", "p_lob_percent",
                      "p_gb_percent", "p_hr_fb_percent"]
labels = ["BB_logodds", "K_logodds", "BA_AVG_logodds", "b_OBP_logodds", "b_SLG_logodds", 
          "p_BABIP_logodds", "p_lob_logodds","p_gb_logodds", "p_hr_fb_logodds"]

In [29]:
def convert_to_logodds(list_DFs, labels):
    for idx, DF in enumerate(list_DFs):
        temp_lst = []
        for i in GameData_df.iloc[:,][DF]:
            temp_lst.append(proba_to_logodds(i))
        cleanGameData_df[labels[idx]] = temp_lst
        
convert_to_logodds(columns_to_convert, labels)

KeyError: 'p_BABIP'

In [34]:
cleanGameData_df = cleanGameData_df.drop(columns=["BB_percent", "K_percent", "BA_AVG", "OBP", "SLG", "p_lob_percent",
                      "p_gb_percent", "p_hr_fb_percent"])

In [46]:
cleanGameData_df

Unnamed: 0,b_games,PA,HR,R,RBI,SB,b_ISO,b_BABIP,wOBA,WRC_plus,...,p_war,p_playerid,batter_stance,pitcher_handedness,BB_logodds,K_logodds,BA_AVG_logodds,b_OBP_logodds,b_SLG_logodds,ab_outcome
0,899,3692,116,443,527,14,0.169,0.322,0.365,136,...,2.0,9975,1,1,-2.277543,-1.992430,-0.814182,-0.519368,-0.096074,0
1,899,3692,116,443,527,14,0.169,0.322,0.365,136,...,2.0,9975,1,1,-2.277543,-1.992430,-0.814182,-0.519368,-0.096074,0
2,899,3692,116,443,527,14,0.169,0.322,0.365,136,...,2.0,9975,1,1,-2.277543,-1.992430,-0.814182,-0.519368,-0.096074,0
3,899,3692,116,443,527,14,0.169,0.322,0.365,136,...,2.0,9975,1,1,-2.277543,-1.992430,-0.814182,-0.519368,-0.096074,0
4,809,2994,59,304,346,21,0.141,0.298,0.307,97,...,2.0,9975,0,1,-2.376273,-1.417843,-1.087974,-0.758371,-0.434719,1
5,809,2994,59,304,346,21,0.141,0.298,0.307,97,...,2.0,9975,0,1,-2.376273,-1.417843,-1.087974,-0.758371,-0.434719,1
6,555,1839,34,224,187,105,0.129,0.301,0.311,92,...,2.0,9975,1,1,-2.883007,-1.848918,-0.979455,-0.781485,-0.397139,1
7,555,1839,34,224,187,105,0.129,0.301,0.311,92,...,2.0,9975,1,1,-2.883007,-1.848918,-0.979455,-0.781485,-0.397139,1
8,555,1839,34,224,187,105,0.129,0.301,0.311,92,...,2.0,9975,1,1,-2.883007,-1.848918,-0.979455,-0.781485,-0.397139,1
9,555,1839,34,224,187,105,0.129,0.301,0.311,92,...,2.0,9975,1,1,-2.883007,-1.848918,-0.979455,-0.781485,-0.397139,1


In [38]:
cleanGameData_df = cleanGameData_df.drop(columns=['pitch_outcome', 'ab_outcome'])

Unnamed: 0,b_name,b_team,b_games,PA,HR,R,RBI,SB,BB_percent,K_percent,...,p_hr_per_9,p_babip,p_lob_percent,p_gb_percent,p_hr_fb_percent,p_era,p_fip,p_xfip,p_war,p_playerid
0,Buster Posey,Giants,899,3692,116,443,527,14,0.093,0.120,...,0.73,0.267,0.750,0.462,0.083,3.46,3.78,4.11,2.0,9975
1,Buster Posey,Giants,899,3692,116,443,527,14,0.093,0.120,...,0.73,0.267,0.750,0.462,0.083,3.46,3.78,4.11,2.0,9975
2,Buster Posey,Giants,899,3692,116,443,527,14,0.093,0.120,...,0.73,0.267,0.750,0.462,0.083,3.46,3.78,4.11,2.0,9975
3,Buster Posey,Giants,899,3692,116,443,527,14,0.093,0.120,...,0.73,0.267,0.750,0.462,0.083,3.46,3.78,4.11,2.0,9975
4,Brandon Crawford,Giants,809,2994,59,304,346,21,0.085,0.195,...,0.73,0.267,0.750,0.462,0.083,3.46,3.78,4.11,2.0,9975
5,Brandon Crawford,Giants,809,2994,59,304,346,21,0.085,0.195,...,0.73,0.267,0.750,0.462,0.083,3.46,3.78,4.11,2.0,9975
6,Eduardo Nunez,- - -,555,1839,34,224,187,105,0.053,0.136,...,0.73,0.267,0.750,0.462,0.083,3.46,3.78,4.11,2.0,9975
7,Eduardo Nunez,- - -,555,1839,34,224,187,105,0.053,0.136,...,0.73,0.267,0.750,0.462,0.083,3.46,3.78,4.11,2.0,9975
8,Eduardo Nunez,- - -,555,1839,34,224,187,105,0.053,0.136,...,0.73,0.267,0.750,0.462,0.083,3.46,3.78,4.11,2.0,9975
9,Eduardo Nunez,- - -,555,1839,34,224,187,105,0.053,0.136,...,0.73,0.267,0.750,0.462,0.083,3.46,3.78,4.11,2.0,9975


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=16)

In [174]:
# fit a logistic regression model and store the class predictions
logreg = LogisticRegression(C=1e9)
X = np.array(cleanGameData_df.iloc[:,0:83])
y = np.array(cleanGameData_df['ab_outcome'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=16)
logreg.fit(X_train, y_train)
#hit_proba = logreg.predict_proba(X)
logodds_pred = logreg.predict_log_proba(X_test)
proba_pred = logreg.predict_proba(X_test)
basic_log_score = logreg.score(X_test, y_test)

Unnamed: 0,b_games,PA,HR,R,RBI,SB,b_ISO,b_BABIP,wOBA,WRC_plus,...,p_xfip,p_war,p_playerid,batter_stance,pitcher_handedness,BB_logodds,K_logodds,BA_AVG_logodds,b_OBP_logodds,b_SLG_logodds
0,899,3692,116,443,527,14,0.169,0.322,0.365,136,...,4.11,2.0,9975,1,1,-2.277543,-1.992430,-0.814182,-0.519368,-0.096074
1,899,3692,116,443,527,14,0.169,0.322,0.365,136,...,4.11,2.0,9975,1,1,-2.277543,-1.992430,-0.814182,-0.519368,-0.096074
2,899,3692,116,443,527,14,0.169,0.322,0.365,136,...,4.11,2.0,9975,1,1,-2.277543,-1.992430,-0.814182,-0.519368,-0.096074
3,899,3692,116,443,527,14,0.169,0.322,0.365,136,...,4.11,2.0,9975,1,1,-2.277543,-1.992430,-0.814182,-0.519368,-0.096074
4,809,2994,59,304,346,21,0.141,0.298,0.307,97,...,4.11,2.0,9975,0,1,-2.376273,-1.417843,-1.087974,-0.758371,-0.434719
5,809,2994,59,304,346,21,0.141,0.298,0.307,97,...,4.11,2.0,9975,0,1,-2.376273,-1.417843,-1.087974,-0.758371,-0.434719
6,555,1839,34,224,187,105,0.129,0.301,0.311,92,...,4.11,2.0,9975,1,1,-2.883007,-1.848918,-0.979455,-0.781485,-0.397139
7,555,1839,34,224,187,105,0.129,0.301,0.311,92,...,4.11,2.0,9975,1,1,-2.883007,-1.848918,-0.979455,-0.781485,-0.397139
8,555,1839,34,224,187,105,0.129,0.301,0.311,92,...,4.11,2.0,9975,1,1,-2.883007,-1.848918,-0.979455,-0.781485,-0.397139
9,555,1839,34,224,187,105,0.129,0.301,0.311,92,...,4.11,2.0,9975,1,1,-2.883007,-1.848918,-0.979455,-0.781485,-0.397139


In [138]:
probabilities = []
for i in logodds_pred:
    logodds.append(i[1])

In [140]:
print(basic_log_score)
print(logodds_pred)

0.803489439853
[[-0.27622856 -1.42146368]
 [-0.23798861 -1.55216793]
 [-0.1914373  -1.74738704]
 ..., 
 [-0.21226836 -1.65416142]
 [-0.30378987 -1.33947158]
 [-0.2131799  -1.65031596]]


In [159]:
proba_pred[:,1]

array([ 0.24136049,  0.21178833,  0.1742286 , ...,  0.19125236,
        0.26198407,  0.19198924])

In [162]:
roc_curve = metrics.roc_curve(y_true=y_test, y_score=proba_pred[:,1])

roc_auc = metrics.roc_auc_score(y_true=y_test, y_score=proba_pred[:,1])
print(roc_auc)

0.487556742323


In [132]:
cleanGameData_df = cleanGameData_df.drop(columns=['ab_hit_logodds'])

0       0
1       0
2       0
3       0
4       1
5       1
6       1
7       1
8       1
9       1
10      0
11      0
12      0
13      0
14      0
15      0
16      0
17      0
18      0
19      0
20      0
21      0
22      0
23      0
24      0
25      0
26      1
27      1
28      1
29      1
       ..
3269    0
3270    0
3271    0
3272    1
3273    1
3274    1
3275    1
3276    1
3277    0
3278    0
3279    0
3280    0
3281    0
3282    0
3283    0
3284    1
3285    1
3286    0
3287    0
3288    0
3289    0
3290    1
3291    1
3292    0
3293    0
3294    0
3295    0
3296    0
3297    0
3298    0
Name: ab_outcome, Length: 3299, dtype: int64

In [127]:
cleanGameData_df['ab_outcome'].mean()

0.2003637465898757

In [128]:
logreg.coef_

array([[ -2.52617754e-08,   3.46399290e-08,  -5.15462205e-08,
         -8.81443873e-09,  -4.94112706e-08,   1.02624032e-08,
         -4.90749486e-11,   1.95678115e-11,  -1.20987762e-11,
         -8.66649914e-09,  -5.23156454e-10,  -2.88074928e-08,
         -4.71338862e-08,  -7.86468150e-09,   5.05143632e-07,
          8.57942294e-08,   5.05143632e-07,   3.26670678e-06,
         -4.53773991e-09,  -2.30666482e-09,   1.19850022e-06,
          8.57942294e-08,  -3.67112083e-08,  -3.02528686e-07,
          1.26437336e-10,   1.10786666e-11,  -2.42802485e-09,
          4.46846548e-09,   2.06828957e-10,  -9.90415658e-10,
         -1.06620857e-09,   4.01430793e-08,   5.11534183e-10,
         -4.14891573e-09,  -3.39224571e-12,  -5.04031587e-09,
         -1.41295142e-11,  -2.38382158e-09,   3.92513833e-09,
         -2.42802485e-09,   4.46846548e-09,  -9.45851854e-10,
          6.70909739e-12,   2.96577830e-10,   3.12420833e-09,
         -5.05555973e-09,  -1.44504092e-09,  -3.28158920e-09,
        

In [129]:
basic_log_score

0.79963625341012423

In [130]:
cleanGameData_df['ab_hit_logodds'].max()

-1.3813206684065871

In [131]:
cleanGameData_df

Unnamed: 0,b_games,PA,HR,R,RBI,SB,b_ISO,b_BABIP,wOBA,WRC_plus,...,p_playerid,batter_stance,pitcher_handedness,BB_logodds,K_logodds,BA_AVG_logodds,b_OBP_logodds,b_SLG_logodds,ab_outcome,ab_hit_logodds
0,899,3692,116,443,527,14,0.169,0.322,0.365,136,...,9975,1,1,-2.277543,-1.992430,-0.814182,-0.519368,-0.096074,0,-1.765066
1,899,3692,116,443,527,14,0.169,0.322,0.365,136,...,9975,1,1,-2.277543,-1.992430,-0.814182,-0.519368,-0.096074,0,-1.765361
2,899,3692,116,443,527,14,0.169,0.322,0.365,136,...,9975,1,1,-2.277543,-1.992430,-0.814182,-0.519368,-0.096074,0,-1.766352
3,899,3692,116,443,527,14,0.169,0.322,0.365,136,...,9975,1,1,-2.277543,-1.992430,-0.814182,-0.519368,-0.096074,0,-1.766352
4,809,2994,59,304,346,21,0.141,0.298,0.307,97,...,9975,0,1,-2.376273,-1.417843,-1.087974,-0.758371,-0.434719,1,-1.703226
5,809,2994,59,304,346,21,0.141,0.298,0.307,97,...,9975,0,1,-2.376273,-1.417843,-1.087974,-0.758371,-0.434719,1,-1.704704
6,555,1839,34,224,187,105,0.129,0.301,0.311,92,...,9975,1,1,-2.883007,-1.848918,-0.979455,-0.781485,-0.397139,1,-1.741578
7,555,1839,34,224,187,105,0.129,0.301,0.311,92,...,9975,1,1,-2.883007,-1.848918,-0.979455,-0.781485,-0.397139,1,-1.741727
8,555,1839,34,224,187,105,0.129,0.301,0.311,92,...,9975,1,1,-2.883007,-1.848918,-0.979455,-0.781485,-0.397139,1,-1.741517
9,555,1839,34,224,187,105,0.129,0.301,0.311,92,...,9975,1,1,-2.883007,-1.848918,-0.979455,-0.781485,-0.397139,1,-1.741724


In [209]:
#Random Forest Model
'''sklearn.ensemble.RandomForestClassifier(n_estimators=10, criterion=’gini’, max_depth=None, 
                                              min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                                              max_features=’auto’, max_leaf_nodes=None, min_impurity_decrease=0.0, 
                                              min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=1, 
                                              random_state=None, verbose=0, warm_start=False, class_weight=None)
'''
RFC = RandomForestClassifier(random_state=16)
RFC.fit(X_train, y_train)
rf_predict_proba = RFC.predict_proba(X_test)
tree_split = RFC.decision_path(X_test)
feature_importance = RFC.feature_importances_
feature_importance

NameError: name '_typelessdata' is not defined

In [182]:
roc_curve = metrics.roc_curve(y_true=y_test, y_score=rf_predict_proba[:,1])

roc_auc = metrics.roc_auc_score(y_true=y_test, y_score=rf_predict_proba[:,1])
print(roc_auc)

0.971684913218


1.0

In [196]:
0.04485167 / 0.012048192771084338

3.72268861

In [206]:
fpr, tpr, _ = metrics.roc_curve(y_test, rf_predict_proba[:,1])

In [54]:
logreg = LogisticRegression()
X = np.array(cleanGameData_df['WAR'])
X = X.reshape(-1,1)
y = np.array(cleanGameData_df['ab_outcome'])
logreg.fit(X, y)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [47]:
# fit a logistic regression model and store the class predictions
logreg = LogisticRegression(C=1e9)
X = np.array(cleanGameData_df['WAR'])
y = np.array(cleanGameData_df['ab_outcome'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=16)
logreg.fit(X_train, y_train)
#hit_proba = logreg.predict_proba(X)
logodds_pred = logreg.predict_log_proba(X_test)
proba_pred = logreg.predict_proba(X_test)
basic_log_score = logreg.score(X_test, y_test)

ValueError: Expected 2D array, got 1D array instead:
array=[ 4.2 17.5 15.9 ... 17.4 10.7  8.9].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.