In [1]:
from datetime import date, timedelta
from gamescout_db import db, cur 
from sklearn import linear_model
import pandas as pd
import requests
import json
import re

In [2]:
def _fetch_scalar(query):
    """Execute `query` on the shared cursor and return the first column of
    the first row of its result set."""
    cur.execute(query)
    return cur.fetchall()[0][0]


# Entire Compiled table as a DataFrame; later cells derive features from it.
compiled = pd.read_sql('SELECT * FROM Compiled;', con=db)

# Distinct starting_P_ID values in Compiled.
num_pitchers = _fetch_scalar("""
  SELECT
     COUNT(DISTINCT starting_P_ID)
  FROM
     Compiled
""")

# Distinct P_ID values in Compiled.
num_batters = _fetch_scalar("""
  SELECT
     COUNT(DISTINCT P_ID)
  FROM
     Compiled
""")

In [3]:
# Number of leading rows used for fitting; every remaining row is held out.
TRAIN_ROWS = 250000

# Keep only rows where all three GS*AGO columns are present.
data = compiled.dropna(subset=['GS1AGO', 'GS2AGO', 'GS3AGO'])

# Target column, taken before narrowing `data` to the feature columns.
labels = data['GOT_HIT']

# Explicit .copy(): 'Gamma' is added below, and writing into a column subset
# of another frame would otherwise trigger SettingWithCopyWarning.
data = data[['1_AGO', '2_AGO', '3_AGO', '4_AGO', '5_AGO', '6_AGO', '7_AGO',
             'GS1AGO', 'GS2AGO', 'GS3AGO', 'starting_P_ID', 'P_ID',
             'hist_AB', 'hist_H']].copy()

# One-hot encode the pitcher and batter ids. The slice starts at 1, so the
# first dummy column of each encoding is dropped; the upper bound is the
# distinct-id count computed over the whole Compiled table.
# Column labels are left as get_dummies produces them because later cells
# iterate over `pitch_dummies` / `bat_dummies` to build prediction rows.
pitch_dummies = pd.get_dummies(data['starting_P_ID']).iloc[:, 1:num_pitchers]
bat_dummies = pd.get_dummies(data['P_ID']).iloc[:, 1:num_batters]

# Derived feature: hist_H minus one tenth of hist_AB.
data['Gamma'] = data['hist_H'] - (.1 * data['hist_AB'])

data = pd.concat([data, pitch_dummies, bat_dummies], axis=1)

# The raw id and history columns are replaced by the dummies and 'Gamma'.
# `columns=` is used because the positional axis argument to drop() was
# removed in pandas 2.0.
data = data.drop(columns=['starting_P_ID', 'P_ID', 'hist_AB', 'hist_H'])

y = labels
x = data

# Positional split. The hold-out part starts exactly where the training part
# ends, so no row is skipped between the two.
train_x = x.iloc[:TRAIN_ROWS]
test_x = x.iloc[TRAIN_ROWS:]

train_y = y.iloc[:TRAIN_ROWS]
test_y = y.iloc[TRAIN_ROWS:]

model = linear_model.LogisticRegression(class_weight='balanced')
model.fit(train_x, train_y)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [21]:
matchups = []
tomorrow = date.today() + timedelta(days=1)

# Directory listing of tomorrow's games on the MLB gameday server.
url = ("http://gd.mlb.com/components/game/mlb/year_{y}/month_{m:02d}/"
       "day_{d:02d}/").format(y=tomorrow.year, m=tomorrow.month, d=tomorrow.day)

games_index = requests.get(url).text
# Game directory names are pulled from the anchor text of the listing.
games = re.findall(r'> (gid.*mlb.*mlb.*)/</a>', games_index)

# Templates with every dummy column set to [0]. Each key gets its OWN list:
# dict.fromkeys(cols, [0]) would make all keys share a single list object, so
# mutating one entry in place would change every column at once.
p_dummies = {col: [0] for col in pitch_dummies}
b_dummies = {col: [0] for col in bat_dummies}


for game_id in games:
    print(game_id)
    info_url = '{url}{gid}/linescore.json'.format(url=url, gid=game_id)
    response = requests.get(info_url)
    # Fail with a clear HTTP error rather than an opaque JSON decode error
    # when the linescore is missing.
    response.raise_for_status()
    info = json.loads(response.text)

    game = info['data']['game']

    # Each probable pitcher is paired with the opposing team's name.
    game_matchups = [
        {
            'pitcher': game['home_probable_pitcher']['id'],
            'opp_team': game['away_team_name'],
        },
        {
            'pitcher': game['away_probable_pitcher']['id'],
            'opp_team': game['home_team_name'],
        },
    ]
    matchups.extend(game_matchups)

    # Query only the two matchups added for this game. Looping over the whole
    # accumulated `matchups` list here would re-run every earlier query for
    # each new game.
    for matchup in game_matchups:
        # The season is bound from `tomorrow` instead of being hard-coded, so
        # it stays consistent with the date used to build `url`.
        cur.execute("""
        SELECT
         DISTINCT P_ID, NAME
        FROM
         BatterStats
        WHERE
         TEAM=%s AND YEAR(G_DATE)=%s""", [matchup['opp_team'], tomorrow.year]
        )
        # NOTE(review): `rows` is overwritten on every iteration; only the
        # last matchup's result survives the loop.
        rows = cur.fetchall()



gid_2017_05_23_anamlb_tbamlb_1
gid_2017_05_23_chamlb_arimlb_1
gid_2017_05_23_clemlb_cinmlb_1
gid_2017_05_23_colmlb_phimlb_1
gid_2017_05_23_detmlb_houmlb_1
gid_2017_05_23_kcamlb_nyamlb_1
gid_2017_05_23_miamlb_oakmlb_1
gid_2017_05_23_minmlb_balmlb_1
gid_2017_05_23_pitmlb_atlmlb_1
gid_2017_05_23_sdnmlb_nynmlb_1
gid_2017_05_23_seamlb_wasmlb_1
gid_2017_05_23_sfnmlb_chnmlb_1
gid_2017_05_23_slnmlb_lanmlb_1
gid_2017_05_23_texmlb_bosmlb_1
gid_2017_05_23_tormlb_milmlb_1


[[ 0.35714618  0.64285382]] - Jonathan Villar 607352
[[ 0.35714618  0.64285382]] - Javier Betancourt 607352
[[ 0.35714618  0.64285382]] - Keon Broxton 607352
[[ 0.35714618  0.64285382]] - Ivan De Jesus Jr. 607352
[[ 0.35714618  0.64285382]] - Ryan Braun 607352
[[ 0.35714618  0.64285382]] - Trent Clark 607352
[[ 0.35714618  0.64285382]] - Travis Shaw 607352
[[ 0.35714618  0.64285382]] - Jacob Nottingham 607352
[[ 0.35714618  0.64285382]] - Domingo Santana 607352
[[ 0.35714618  0.64285382]] - Monte Harrison 607352
[[ 0.35714618  0.64285382]] - Jesus Aguilar 607352
[[ 0.35714618  0.64285382]] - Jake Gatewood 607352
[[ 0.35714618  0.64285382]] - Hernan Perez 607352
[[ 0.35714618  0.64285382]] - Manny Pina 607352
[[ 0.35714618  0.64285382]] - Eric Sogard 607352
[[ 0.35714618  0.64285382]] - Orlando Arcia 607352
[[ 0.35714618  0.64285382]] - Nick Noonan 607352
[[ 0.35714618  0.64285382]] - Matt Garza 607352
[[ 0.35714618  0.64285382]] - Tim Dillard 607352
[[ 0.35714618  0.64285382]] - Jhan M