In [318]:
import pandas as pd
from collections import defaultdict
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import math
from keras.layers import Flatten

In [23]:
# Wide-format table (year1_* ... year7_* columns per player).
# It does not contain 'H' or 'AB' data.
condensed_path = 'src/data/condensed.csv'
condensed_df = pd.read_csv(condensed_path)

In [26]:
# List the column labels to confirm the year1_* ... year7_* / pos_* layout.
condensed_df.columns

Index(['playerID', 'year1_2B', 'year1_3B', 'year1_BB', 'year1_CS', 'year1_G',
       'year1_GIDP', 'year1_HBP', 'year1_HR', 'year1_IBB',
       ...
       'year7_SF', 'year7_SH', 'year7_SO', 'year7_avg', 'pos_1B', 'pos_2B',
       'pos_3B', 'pos_C', 'pos_OF', 'pos_SS'],
      dtype='object', length=119)

In [27]:
# Features: every column whose name does not mention 'year7' (this keeps
# 'playerID' and the pos_* indicators). Target: the seventh-year average.
six_year_cols = list(filter(lambda name: 'year7' not in name, condensed_df.columns))
six_year_df = condensed_df.loc[:, six_year_cols]
y = condensed_df.loc[:, 'year7_avg']

In [82]:
# 'playerID' is an identifier, not a feature. Casting it along with the stats
# fails if it holds string IDs (presumably the same 'aaronha01'-style keys as
# the per-season table — verify). Move it into the index first so the float
# cast only touches the numeric columns.
if 'playerID' in six_year_df.columns:
    six_year_df2 = six_year_df.set_index('playerID').astype(float)
else:
    six_year_df2 = six_year_df.astype(float)

In [439]:
# Load the per-season table, discard the 'Unnamed: 0' column (presumably a
# saved row index — verify), and one-hot encode 'pos' into pos_* columns.
uncondensed_df = pd.get_dummies(
    pd.read_csv('src/data/uncondensed.csv').drop(columns=['Unnamed: 0']),
    columns=['pos'],
)

In [440]:
# Discard 'yearID' and key the rows by 'playerID'. Later steps select each
# player's seasons by row position, so this assumes the rows are already in
# chronological order within each player — TODO confirm against the CSV.
uncondensed_df = uncondensed_df.drop(columns=['yearID']).set_index('playerID')

In [441]:
# Cast every remaining column (stats and pos_* indicators) to 64-bit floats,
# then display the frame.
uncondensed_df = uncondensed_df.astype('float64')
uncondensed_df

Unnamed: 0_level_0,G,R,2B,3B,HR,RBI,SB,CS,BB,SO,...,SH,SF,GIDP,avg,pos_1B,pos_2B,pos_3B,pos_C,pos_OF,pos_SS
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aaronha01,122.0,58.0,27.0,6.0,13.0,69.0,2.0,2.0,28.0,39.0,...,6.0,4.0,13.0,0.279915,0.0,0.0,0.0,0.0,1.0,0.0
aaronha01,153.0,105.0,37.0,9.0,27.0,106.0,3.0,1.0,49.0,61.0,...,7.0,4.0,20.0,0.313953,0.0,0.0,0.0,0.0,1.0,0.0
aaronha01,153.0,106.0,34.0,14.0,26.0,92.0,2.0,4.0,37.0,54.0,...,5.0,7.0,21.0,0.328407,0.0,0.0,0.0,0.0,1.0,0.0
aaronha01,151.0,118.0,27.0,6.0,44.0,132.0,1.0,1.0,57.0,58.0,...,0.0,3.0,13.0,0.321951,0.0,0.0,0.0,0.0,1.0,0.0
aaronha01,153.0,109.0,34.0,4.0,30.0,95.0,4.0,1.0,59.0,49.0,...,0.0,3.0,21.0,0.326123,0.0,0.0,0.0,0.0,1.0,0.0
aaronha01,154.0,116.0,46.0,7.0,39.0,123.0,8.0,0.0,51.0,54.0,...,0.0,9.0,19.0,0.354531,0.0,0.0,0.0,0.0,1.0,0.0
aaronha01,153.0,102.0,20.0,11.0,40.0,126.0,16.0,7.0,60.0,63.0,...,0.0,12.0,8.0,0.291525,0.0,0.0,0.0,0.0,1.0,0.0
abbotku01,101.0,41.0,17.0,3.0,9.0,33.0,3.0,0.0,16.0,98.0,...,3.0,2.0,5.0,0.249275,0.0,0.0,0.0,0.0,0.0,1.0
abbotku01,120.0,60.0,18.0,7.0,17.0,60.0,4.0,3.0,36.0,110.0,...,2.0,5.0,6.0,0.254762,0.0,0.0,0.0,0.0,0.0,1.0
abbotku01,109.0,37.0,18.0,7.0,8.0,33.0,3.0,3.0,22.0,99.0,...,4.0,0.0,7.0,0.253125,0.0,0.0,0.0,0.0,0.0,1.0


In [404]:
# X = uncondensed_df.groupby('playerID').apply(lambda x: x.iloc[:-1, :]).reset_index(0, drop=True)

# y = uncondensed_df.groupby('playerID').apply(lambda x: x.iloc[-1, :])

# y = pd.DataFrame(y['avg'])
# y

Unnamed: 0_level_0,avg
playerID,Unnamed: 1_level_1
aaronha01,0.291525
abbotku01,0.216561
abreubo01,0.299827
adairje01,0.270531
adamsbo03,0.282575
adamssp01,0.260204
adcocjo01,0.290749
ageeto01,0.227488
ainsmed01,0.191429
aldremi01,0.266667


In [442]:
def dict_convert(df, n_years=7):
    """Regroup a per-season frame into one nested dict per season number.

    For each season position ``i`` in ``0 .. n_years - 1`` the ``i``-th row of
    every ``playerID`` group is selected, transposed, and stored under the key
    ``'year<i+1>'`` as ``{column: {playerID: value}}``.

    Rows are picked purely by position within each group, so the caller is
    responsible for ordering each player's rows chronologically beforehand.
    Players with fewer than ``i + 1`` rows are simply absent from that
    season's inner dicts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that can be grouped by ``'playerID'`` (index level or column).
    n_years : int, default 7
        Number of leading seasons to extract per player.

    Returns
    -------
    collections.defaultdict
        Mapping ``'year1' .. 'year<n_years>'`` -> ``{column: {playerID: value}}``.
    """
    # The grouping does not depend on the season index; build it once.
    seasons_by_player = df.groupby('playerID')
    # defaultdict is retained so the return type stays what callers already get.
    def_dict = defaultdict(dict)
    for i in range(n_years):
        year = 'year' + str(i + 1)
        def_dict[year] = seasons_by_player.nth(i).T.to_dict('index')
    return def_dict

In [443]:
# Split the target off the features: pop() removes 'avg' from uncondensed_df
# in place and returns it, so re-running this cell raises KeyError.
y_values = uncondensed_df.pop('avg')

In [444]:
# Stack the per-season dicts into 3-D containers, one item per season.
# NOTE(review): pd.Panel was deprecated in pandas 0.20 and removed in 0.25,
# so this cell only runs on older pandas releases.
X_panel = pd.Panel(dict_convert(uncondensed_df))
y_panel = pd.Panel(dict_convert(y_values.to_frame()))

(16436,)

In [446]:
# Pull the raw ndarrays out of the panels. `.values` replaces `as_matrix()`,
# which pandas deprecated in 0.23; both return the same array.
# Features are downcast to float32; the targets keep their original dtype.
X_data = X_panel.values.astype('float32')
y_data = y_panel.values
print(X_data.shape, y_data.shape)

(7, 2348, 21) (7, 2348, 1)


In [447]:
# scaler = MinMaxScaler(feature_range=(0, 2))
# data_norm = scaler.fit_transform(dataset.reshape(dataset.shape[0], -1)).reshape(dataset.shape)

In [448]:
# Exchange the first two axes so samples come first:
# (seasons, players, features) -> (players, seasons, features).
X_new_data = np.swapaxes(X_data, 0, 1)
y_new_data = np.swapaxes(y_data, 0, 1)
print(X_new_data.shape, y_new_data.shape)

(2348, 7, 21) (2348, 7, 1)


In [410]:
# Spot check: feature 0 of the first sample across every time step.
X_new_data[0, :, 0]

array([27., 37., 34., 27., 34., 46., nan], dtype=float32)

In [449]:
# Hold out the default 25% of players for testing. The split is seeded so the
# partition, and every score computed from it, is reproducible across kernel
# restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_new_data, y_new_data, random_state=42
)

In [450]:
# Inspect the held-out targets; at this point they still carry one average
# per time step for each sample.
y_test

array([[[0.28818444],
        [0.28705882],
        [0.2690678 ],
        ...,
        [0.26908397],
        [0.28762542],
        [0.2244489 ]],

       [[0.28150573],
        [0.24525316],
        [0.25988701],
        ...,
        [0.28692699],
        [0.23333333],
        [0.21988528]],

       [[0.2244898 ],
        [0.2962963 ],
        [0.22358722],
        ...,
        [0.2251816 ],
        [0.20740741],
        [0.2816092 ]],

       ...,

       [[0.29064039],
        [0.26259947],
        [0.29650092],
        ...,
        [0.30167598],
        [0.28729282],
        [0.31165919]],

       [[0.25619835],
        [0.29133858],
        [0.26      ],
        ...,
        [0.23370787],
        [0.26262626],
        [0.2195122 ]],

       [[0.3125    ],
        [0.27740492],
        [0.26130653],
        ...,
        [0.27130435],
        [0.24587156],
        [0.30927835]]])

In [451]:
# Keep only the batting average at time-step index 6 (the last of the seven)
# as the regression target, then report the shape of each split.
y_train, y_test = y_train[:, 6], y_test[:, 6]
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(1761, 7, 21)
(1761, 1)
(587, 7, 21)
(587, 1)


In [453]:
# Network: a 4-unit LSTM reading sequences of 7 time steps x 21 features,
# followed by a 7-unit and a 1-unit dense layer. It is trained with Adam on
# mean squared error, one sample per gradient step.
# NOTE(review): inputs are fed unscaled (the MinMaxScaler cell is commented out).
model = Sequential([
    LSTM(4, input_shape=(7, 21)),
    Dense(7),
    Dense(1),
])
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train, y_train, epochs=5, batch_size=1, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1c4ee04f28>

In [454]:
# Run the fitted network over both splits (train first, then test).
trainPred, testPred = model.predict(X_train), model.predict(X_test)

In [455]:
# One prediction per sample is expected for each split.
for preds in (trainPred, testPred):
    print(preds.shape)

(1761, 1)
(587, 1)


In [456]:
# Targets and predictions must share a shape before they are scored.
for arr in (y_test, testPred):
    print(arr.shape)

(587, 1)
(587, 1)


In [457]:
# Root-mean-squared error on each split, in the same units as the target.
test_mse = mean_squared_error(y_test, testPred)
train_mse = mean_squared_error(y_train, trainPred)
testScore, trainScore = math.sqrt(test_mse), math.sqrt(train_mse)
print('Test Score: {:.3f}'.format(testScore))
print('Train Score: {:.3f}'.format(trainScore))

Test Score: 0.033
Train Score: 0.034


In [459]:
# Loss on the held-out split; the model was compiled with mean squared error
# and no extra metrics, so this is the test MSE (the square of the RMSE).
model.evaluate(X_test, y_test)



0.0011166594076697006

In [466]:
# Spot check: all features of the first training sample at time-step index 1.
X_train[0,1,:]

array([ 15.,   2.,  25.,   4., 127.,   0.,   1.,   0.,   0.,  41.,  46.,
        13.,   0.,  11.,  27.,   0.,   0.,   0.,   0.,   0.,   1.],
      dtype=float32)

In [461]:
# Inspect the training targets (one final-step average per sample).
y_train

array([[0.24152542],
       [0.24017467],
       [0.30353818],
       ...,
       [0.27613941],
       [0.25384615],
       [0.24161074]])