In [19]:
import pandas as pd
from gamescout_db import db, cur
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
import numpy as np
import math
from sklearn import metrics

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

import statsmodels.discrete.discrete_model as sm

In [20]:
# DATA COLLECTION

In [21]:
# Column subsets applied to the raw BatterStats / AtBats / PitcherStats query
# results; any other column returned by the queries is dropped.
batter_stats_COLUMNS = [
    'HITS',
    '1_AGO_AVG', '2_AGO_AVG', '3_AGO_AVG', '4_AGO_AVG',
    '5_AGO_AVG', '6_AGO_AVG', '7_AGO_AVG',
    'P_ID', 'G_ID', 'BAT_ORDER', 'G_DATE', 'TEAM',
]

at_bats_COLUMNS = [
    'BATTER', 'PITCHER', 'G_ID',
    'BATTER_LR', 'PITCHER_LR', 'EVENT',
]

pitcher_stats_COLUMNS = [
    'P_ID', 'G_ID', 'GAME_SCORE', 'BATTERS_FACED', 'TEAM',
    'GAME_SCORE_1AGO', 'GAME_SCORE_2AGO', 'GAME_SCORE_3AGO',
]

In [22]:
# Load BatterStats and keep only the columns named in batter_stats_COLUMNS.
batter_stats = pd.read_sql('select * from BatterStats;', con=db)[batter_stats_COLUMNS]

# Outcome flags from the HITS column: GOT_HIT is 1 when HITS > 0,
# NOT_HIT is 1 when HITS == 0 (a missing HITS value yields 0 for both).
hit_counts = batter_stats["HITS"]
batter_stats["GOT_HIT"] = [1 if h > 0 else 0 for h in hit_counts]
batter_stats["NOT_HIT"] = [1 if h == 0 else 0 for h in hit_counts]

# Collapse each N_AGO_AVG column into a binary N_AGO flag: 0 when the value
# equals 0, otherwise 1 (NaN compares unequal to 0, so it also maps to 1).
for lag in range(1, 8):
    lag_values = batter_stats["{0}_AGO_AVG".format(lag)]
    batter_stats["{0}_AGO".format(lag)] = [0 if v == 0 else 1 for v in lag_values]

# Final column order: outcome flags, the seven lag flags, then identifiers.
lag_flag_columns = ["{0}_AGO".format(lag) for lag in range(1, 8)]
batter_stats = batter_stats[
    ['GOT_HIT', 'NOT_HIT']
    + lag_flag_columns
    + ['P_ID', 'G_ID', 'BAT_ORDER', 'G_DATE', 'TEAM']
]

In [23]:
# Read the full PitcherStats table, then keep only the columns listed in
# pitcher_stats_COLUMNS.
pitcher_stats = pd.read_sql(
    sql='select * from PitcherStats;',
    con=db,
)[pitcher_stats_COLUMNS]

In [24]:
# Read the full AtBats table, then keep only the columns listed in
# at_bats_COLUMNS.
at_bats = pd.read_sql(
    sql='select * from AtBats;',
    con=db,
)[at_bats_COLUMNS]

In [25]:
# Keep only the batter rows whose G_ID also occurs in the at_bats table.
game_ids_with_at_bats = at_bats['G_ID'].tolist()
has_at_bat_data = batter_stats['G_ID'].isin(game_ids_with_at_bats)
batter_stats = batter_stats[has_at_bat_data]

In [26]:
# Build matchup features for each batter row against the opposing pitcher.
#
# For each processed row of batter_stats this cell:
#   1. finds the pitcher rows for the same game (G_ID) on the other TEAM and
#      picks the one with the most BATTERS_FACED (treated here as the
#      opposing starting pitcher);
#   2. counts the at-bats, and the at-bats that ended in a hit, between this
#      batter and that pitcher in at_bats;
#   3. records the pitcher's P_ID and GAME_SCORE_{1,2,3}AGO values.
# The results are written back onto batter_stats as hist_AB, hist_H,
# starting_P_ID, GS1AGO, GS2AGO and GS3AGO (and BAT_ORDER is re-assigned).
#
# NOTE(review): the date filter below is disabled and at_bats_COLUMNS has no
# G_DATE column, so hist_AB / hist_H count every at-bat between the pair in
# the table, which may include the current game and later games. Confirm
# whether that is acceptable for a predictive feature.

# Number of leading batter rows to process; None processes every row.
N_BATTER_ROWS = 100
# EVENT values counted as a hit.
HIT_EVENTS = ['Single', 'Double', 'Triple', 'Home Run']

hist_AB_series = []
hist_H_series = []
OPP_ID_series = []
GS1AGO_series = []
GS2AGO_series = []
GS3AGO_series = []
order_series = []

# Positional selection: batter_stats was filtered earlier, so its index
# labels are not guaranteed to be a contiguous 0..n-1 range.
batter_sample = batter_stats.iloc[:N_BATTER_ROWS]

for i in range(len(batter_sample)):
    batter = batter_sample.iloc[i]

    # Both conditions are evaluated on pitcher_stats itself so the boolean
    # mask and the frame share one index.
    opposing_pitchers = pitcher_stats[
        (pitcher_stats['G_ID'] == batter['G_ID'])
        & (pitcher_stats['TEAM'] != batter['TEAM'])
    ]
    starting_pitcher = opposing_pitchers.sort_values('BATTERS_FACED', ascending=False).head(1)

    if starting_pitcher.empty:
        # No opposing pitcher row for this game: record missing values
        # instead of failing on .iloc[0].
        hist_AB_series.append(np.nan)
        hist_H_series.append(np.nan)
        OPP_ID_series.append(np.nan)
        GS1AGO_series.append(np.nan)
        GS2AGO_series.append(np.nan)
        GS3AGO_series.append(np.nan)
    else:
        opposing_id = starting_pitcher['P_ID'].iloc[0]
        matchups = at_bats[
            (at_bats['BATTER'] == batter['P_ID'])
            & (at_bats['PITCHER'] == opposing_id)
        ]
        #matchups = matchups[at_bats['G_DATE'] < batter['G_DATE']]
        matchups_hits = matchups[matchups['EVENT'].isin(HIT_EVENTS)]

        hist_AB_series.append(len(matchups))
        hist_H_series.append(len(matchups_hits))
        OPP_ID_series.append(opposing_id)

        GS1AGO_series.append(starting_pitcher['GAME_SCORE_1AGO'].iloc[0])
        GS2AGO_series.append(starting_pitcher['GAME_SCORE_2AGO'].iloc[0])
        GS3AGO_series.append(starting_pitcher['GAME_SCORE_3AGO'].iloc[0])

    order_series.append(batter['BAT_ORDER'])

    # Lightweight progress indicator.
    if i % 100 == 0:
        print(i)

# Attach the results using the processed rows' own index labels so every
# value lands on the row it was computed from. Rows that were not processed
# receive NaN in each assigned column (including BAT_ORDER).
processed_index = batter_sample.index
batter_stats['hist_AB'] = pd.Series(hist_AB_series, index=processed_index)
batter_stats['hist_H'] = pd.Series(hist_H_series, index=processed_index)
batter_stats['starting_P_ID'] = pd.Series(OPP_ID_series, index=processed_index)
batter_stats['GS1AGO'] = pd.Series(GS1AGO_series, index=processed_index)
batter_stats['GS2AGO'] = pd.Series(GS2AGO_series, index=processed_index)
batter_stats['GS3AGO'] = pd.Series(GS3AGO_series, index=processed_index)
batter_stats['BAT_ORDER'] = pd.Series(order_series, index=processed_index)



0
100


In [None]:
# batter_stats.to_csv('SAVED.csv')

In [30]:
# Preview the first rows only; printing the whole frame floods the notebook
# output and gets truncated anyway.
batter_stats.head(20)

      GOT_HIT  NOT_HIT  1_AGO  2_AGO  3_AGO  4_AGO  5_AGO  6_AGO  7_AGO  \
0           1        0      1      1      1      1      1      1      1   
1           0        1      1      1      1      1      1      1      1   
2           1        0      1      1      1      1      1      1      1   
3           0        1      1      1      1      1      1      1      1   
4           1        0      1      1      1      1      1      1      1   
5           0        1      1      1      1      1      1      1      1   
6           1        0      1      1      1      1      1      1      1   
7           0        1      1      1      1      1      1      1      1   
8           1        0      1      1      1      1      1      1      1   
9           1        0      1      1      1      1      1      1      1   
10          0        1      1      1      1      1      1      1      1   
11          0        1      1      1      1      1      1      1      1   
12          1        0   

In [2]:
# Load the entire Compiled table; the modelling cells work from a copy of it.
compiled = pd.read_sql(sql='select * from Compiled;', con=db)

In [None]:
# Build the label vector and the feature matrix from the Compiled table.

# Rows missing any of the three GS*AGO values are dropped.
GAME_SCORE_COLUMNS = ['GS1AGO', 'GS2AGO', 'GS3AGO']
data = compiled.dropna(subset=GAME_SCORE_COLUMNS)

labels = data['GOT_HIT']
data = data[['1_AGO', '2_AGO', '3_AGO', '4_AGO', '5_AGO', '6_AGO', '7_AGO', 'GS1AGO', 'GS2AGO', 'GS3AGO',
             'starting_P_ID', 'P_ID', 'hist_AB', 'hist_H']]

# One-hot encode both ID columns, dropping the first level of each as the
# reference category. The slice is open-ended (rather than a hard-coded
# column count) so no ID is silently discarded when the data grows, and each
# set carries its own prefix so an ID that appears in both starting_P_ID and
# P_ID cannot produce two columns with the same label.
pitcher_dummies = pd.get_dummies(data['starting_P_ID'], prefix='starting_P_ID').iloc[:, 1:]
batter_dummies = pd.get_dummies(data['P_ID'], prefix='P_ID').iloc[:, 1:]
data = pd.concat([data, pitcher_dummies, batter_dummies], axis=1)

assert data.columns.is_unique, "duplicate feature column labels"

In [None]:
# H^1.2 - (.1 * AB^1.2)
# NOTE(review): the formula in the comment above raises both terms to the
# power 1.2, but the expression below applies no exponent (it computes
# hist_H - 0.1 * hist_AB). Confirm which form is intended.

data['Gamma'] = data['hist_H'].sub(data['hist_AB'].mul(.1))
#plt.scatter(data['hist_H'], data['hist_AB'], s=.1)
plt.hist(data['Gamma'])

In [None]:
# Drop the raw ID and matchup-count columns; every other column of `data`
# (including Gamma and the dummy columns) stays in the model input.
non_feature_columns = ['starting_P_ID', 'P_ID', 'hist_AB', 'hist_H']
feature_columns = [col for col in data.columns if col not in non_feature_columns]
data_done = data[feature_columns]

In [None]:
# Short aliases used by the modelling cells: x = feature matrix, y = labels.
x, y = data_done, labels

In [None]:
# LOGISTIC REGRESSION MODEL



In [None]:
# Sequential (unshuffled) train/test split: the first TRAIN_ROWS rows are
# used for training and all remaining rows for testing. The test slice
# starts exactly where the training slice ends, so no row is skipped.
TRAIN_ROWS = 250000

train_x = x.iloc[:TRAIN_ROWS]
test_x = x.iloc[TRAIN_ROWS:]

train_y = y.iloc[:TRAIN_ROWS]
test_y = y.iloc[TRAIN_ROWS:]

assert len(train_x) + len(test_x) == len(x)
assert len(train_y) + len(test_y) == len(y)

In [None]:
# Fit a logistic regression with balanced class weights on the training rows.
model = linear_model.LogisticRegression(class_weight='balanced')
model.fit(train_x, train_y)

# One line per feature: the column name followed by its fitted coefficient.
for col, coefficient in zip(x.columns, model.coef_[0]):
    print("{0}: {1}".format(col, coefficient))

In [None]:
# statsmodels Logit on the full label vector and feature matrix (train and
# test rows together); y is the response (endog), x the regressors (exog).
logit = sm.Logit(endog=y, exog=x)

In [None]:
# Fit the statsmodels Logit model defined above and keep the results object.
f = logit.fit()

In [None]:
# Display the summary of the fitted Logit results as the cell output.
f.summary()


In [None]:
# Histogram of the first predict_proba column over the test rows.
# NOTE(review): this plots column 0, whereas the calibration cells further
# down use column 1 — presumably column 0 is the "no hit" class; confirm
# that this is the distribution intended here.
plt.hist(model.predict_proba(test_x)[:,0])

In [None]:
# Compare the model's hard predictions on the test rows with the true labels.
expected = test_y
predicted = model.predict(test_x)

# Per-class precision/recall report first, then the raw confusion matrix.
for build_report in (metrics.classification_report, metrics.confusion_matrix):
    print(build_report(expected, predicted))

In [None]:
# Score the fitted model on the held-out rows (shown as the cell output).
model.score(test_x, test_y)

In [None]:
# DO OUR FEATURES CORRELATE TO PERFORMANCE?

In [None]:
# Second predict_proba column for every test row, paired with the true label
# as (expected, probability).
probs = model.predict_proba(test_x)[:,1]
# Materialise the pairs: on Python 3 a bare zip() is a one-shot iterator, so
# re-running the binning cell would otherwise see no pairs at all.
pairs = list(zip(expected, probs))

In [None]:
# Group the (expected, probability) pairs into 1%-wide probability bins.
# prob_bins maps a string key "k" to the list of pairs whose probability
# falls in [k/100, (k+1)/100); a probability of exactly 1.0 lands in "100".
prob_bins = {}
bins = np.linspace(0, 1, num=101)

# np.digitize returns the 1-based position of each value among the bin
# edges, hence the "- 1" when forming the key. The probabilities are kept in
# their own local name so the feature matrix `x` is not overwritten.
inds = np.digitize(np.asarray(probs), bins)

for i, pair in enumerate(pairs):
    key = str(inds[i] - 1)
    prob_bins.setdefault(key, []).append(pair)

In [None]:
# Observed hit rate per probability bin, in ascending bin order.
# The keys of prob_bins are strings, so they are sorted by their integer
# value; a plain string sort would put "10" before "9" and misalign pct_hit
# with the numerically sorted x-axis used when plotting.
pct_hit = []
for key in sorted(prob_bins.keys(), key=int):
    bucket = prob_bins[key]
    # Each pair is (expected, probability); expected is 1 for a hit, so the
    # sum of the first elements is the number of hits in the bin.
    hits = sum(outcome for outcome, _ in bucket)
    total = len(bucket)
    pct = float(hits)/float(total)
    pct_hit.append(pct)

#     print("Bin: " + str(key))
#     print("# players w/ hit: " + str(hits))
#     print("# player (total): " + str(total))
#     print("Pct. Hit: " + str(float(hits)/float(total)))
#     print("")

In [None]:
# Observed hit rate against the lower edge of each probability bin, with a
# y = x reference segment drawn between 0.35 and 0.65.
bin_lower_edges = sorted(float(key) / 100 for key in prob_bins)
plt.scatter(bin_lower_edges, pct_hit)
plt.plot([0.35, 0.65], [0.35, 0.65], "-")