In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from src import data_cleaning as dc

from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [5]:
people_df = pd.read_csv('src/data/baseballdatabank-master/core/People.csv')
batting_df = pd.read_csv('src/data/baseballdatabank-master/core/Batting.csv')
fielding_df = pd.read_csv('src/data/baseballdatabank-master/core/Fielding.csv')

In [6]:
batting_df2 = dc.initial_drop(batting_df)
batting_df3 = dc.combine_stints(batting_df2)
bat_with_pos = dc.map_position(batting_df3, fielding_df)
bat_with_pos['AVG'] = bat_with_pos['H'] / bat_with_pos['AB']
trimmed_df = dc.trim_batters(bat_with_pos)

Combining multiple stints into single years...
Mapping positions to batting stats...


In [7]:
condensed_df = dc.condense_df(trimmed_df)

In [8]:
condensed_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2349 entries, 0 to 2348
Columns: 119 entries, playerID to pos_SS
dtypes: float64(70), int64(42), object(1), uint8(6)
memory usage: 2.1+ MB


In [9]:
X = condensed_df.drop('playerID', axis=1)

In [10]:
kmeans = KMeans(n_clusters=20, random_state=0).fit(X)

In [11]:
kmeans.cluster_centers_

array([[1.09344262e+01, 2.01092896e+00, 2.46973908e-01, ...,
        4.26229508e-01, 2.07650273e-01, 7.10382514e-02],
       [1.87397260e+01, 7.31506849e+00, 2.83220937e-01, ...,
        1.94289029e-16, 4.24657534e-01, 1.78082192e-01],
       [1.13267327e+01, 2.22772277e+00, 2.44960104e-01, ...,
        1.98019802e-01, 3.56435644e-01, 9.90099010e-02],
       ...,
       [8.80254777e+00, 3.26751592e+00, 2.59474826e-01, ...,
        1.08280255e-01, 3.63057325e-01, 1.01910828e-01],
       [1.22000000e+01, 2.28235294e+00, 2.55208728e-01, ...,
        1.17647059e-01, 4.00000000e-01, 1.05882353e-01],
       [1.09436620e+01, 2.04225352e+00, 2.52829338e-01, ...,
        4.22535211e-02, 4.22535211e-01, 7.04225352e-02]])

In [12]:
kmeans.labels_

array([10,  2,  3, ..., 13, 11, 19], dtype=int32)

In [13]:
condensed_df['label'] = kmeans.labels_

In [14]:
condensed_df

Unnamed: 0,playerID,year1_2B,year1_3B,year1_AVG,year1_BB,year1_CS,year1_G,year1_GIDP,year1_HBP,year1_HR,...,year7_SF,year7_SH,year7_SO,pos_1B,pos_2B,pos_3B,pos_C,pos_OF,pos_SS,label
0,aaronha01,27,6,0.279915,28,2.0,122,13.0,3.0,13,...,12.0,0.0,63.0,0,0,0,0,1,0,10
1,abbotku01,17,3,0.249275,16,0.0,101,5.0,5.0,9,...,1.0,0.0,51.0,0,0,0,0,0,1,2
2,abreubo01,10,2,0.250000,21,2.0,59,0.0,1.0,3,...,7.0,0.0,126.0,0,0,0,0,1,0,3
3,adairje01,21,1,0.264249,35,2.0,133,6.0,2.0,9,...,3.0,6.0,52.0,0,1,0,0,0,0,14
4,adamsbo03,13,3,0.244373,18,0.0,94,7.0,3.0,4,...,0.0,8.0,67.0,0,0,1,0,0,0,7
5,adamssp01,12,0,0.289389,26,19.0,95,0.0,1.0,4,...,0.0,5.0,5.0,0,0,0,0,0,1,8
6,adcocjo01,16,1,0.293011,24,0.0,102,12.0,0.0,8,...,2.0,11.0,86.0,1,0,0,0,0,0,5
7,ageeto01,27,8,0.273450,41,18.0,160,17.0,10.0,22,...,3.0,1.0,92.0,0,0,0,0,1,0,13
8,ainsmed01,1,2,0.192308,6,0.0,33,0.0,0.0,0,...,0.0,16.0,48.0,0,0,0,1,0,0,16
9,aldremi01,18,3,0.250000,33,3.0,84,3.0,2.0,2,...,0.0,3.0,45.0,0,0,0,0,1,0,0


In [16]:
condensed_df[condensed_df['label'] == 5]

Unnamed: 0,playerID,year1_2B,year1_3B,year1_AVG,year1_BB,year1_CS,year1_G,year1_GIDP,year1_HBP,year1_HR,...,year7_SF,year7_SH,year7_SO,pos_1B,pos_2B,pos_3B,pos_C,pos_OF,pos_SS,label
6,adcocjo01,16,1,0.293011,24,0.0,102,12.0,0.0,8,...,2.0,11.0,86.0,1,0,0,0,0,0,5
15,alleyge01,3,1,0.210526,21,1.0,81,3.0,2.0,6,...,4.0,2.0,70.0,0,0,0,0,0,1,5
81,barmecl01,19,1,0.288571,16,4.0,81,4.0,6.0,10,...,2.0,8.0,106.0,0,0,0,0,0,1,5
164,blaloha01,8,0,0.210884,20,0.0,49,2.0,1.0,3,...,2.0,0.0,40.0,0,0,1,0,0,0,5
172,blowemi01,4,0,0.187500,12,0.0,48,3.0,1.0,5,...,4.0,2.0,116.0,0,0,1,0,0,0,5
217,breamsi01,7,0,0.229730,18,2.0,50,4.0,0.0,6,...,4.0,3.0,51.0,1,0,0,0,0,0,5
232,brookhu01,21,2,0.307263,23,5.0,98,9.0,1.0,4,...,4.0,0.0,72.0,0,0,0,0,1,0,5
253,buechst01,6,3,0.219178,14,2.0,69,11.0,2.0,6,...,3.0,11.0,97.0,0,0,1,0,0,0,5
256,bumbral01,15,11,0.337079,34,10.0,110,5.0,3.0,7,...,4.0,2.0,74.0,0,0,0,0,1,0,5
261,burksel01,30,2,0.272401,41,6.0,133,1.0,2.0,20,...,8.0,3.0,97.0,0,0,0,0,1,0,5


In [17]:
avg_cols = [col for col in condensed_df.columns if 'AVG' in col]
avg_cols.append('playerID')
avg_cols

['year1_AVG',
 'year2_AVG',
 'year3_AVG',
 'year4_AVG',
 'year5_AVG',
 'year6_AVG',
 'year7_AVG',
 'playerID']

In [18]:
# for label in condensed_df['label'].unique():
#     avgs_df = condensed_df.loc[condensed_df['label'] == label, avg_cols].set_index('playerID')
#     avgs_df.T.plot()
#     plt.gca().get_legend().remove()
# plt.show()

In [19]:
test_df = condensed_df.loc[condensed_df['label'] == 0, avg_cols].set_index('playerID', drop=True)

In [40]:
def get_metrics(X_test, y_true, y_pred, model):
    mse = mean_squared_error(y_true, y_pred)
    rmse = mse ** 0.5
    score = model.score(X_test, y_true)
    print('MSE:\t{:0.7f}\n'\
          'RMSE:\t{:0.7f}\n'\
          'Score:\t{:0.3f}'.format(mse, rmse, score))

# DecisionTreeRegressor
---

In [21]:
X_tree = condensed_df.drop('label', axis=1).set_index('playerID')

In [22]:
y_tree = X_tree.pop('year7_AVG')

In [23]:
# regr_tree_2_base = DecisionTreeRegressor(max_depth=2).fit(X_tree, y_tree)
regr_tree_4_base = DecisionTreeRegressor(max_depth=4).fit(X_tree, y_tree)
# regr_tree_6_base = DecisionTreeRegressor(max_depth=6).fit(X_tree, y_tree)
# regr_tree_8_base = DecisionTreeRegressor(max_depth=8).fit(X_tree, y_tree)

In [24]:
# feat_import_tree_2 = regr_tree_2_base.feature_importances_
feat_import_tree_4 = regr_tree_4_base.feature_importances_
# feat_import_tree_6 = regr_tree_6_base.feature_importances_
# feat_import_tree_8 = regr_tree_8_base.feature_importances_

In [25]:
# import_feats_tree_2 = np.where(feat_import_tree_2 > 0)[0]
import_feats_tree_4 = np.where(feat_import_tree_4 > 0)[0]
# import_feats_tree_6 = np.where(feat_import_tree_6 > 0)[0]
# import_feats_tree_8 = np.where(feat_import_tree_8 > 0)[0]

In [26]:
# X_tree_2 = X_tree.iloc[:, import_feats_tree_2]
X_tree_4 = X_tree.iloc[:, import_feats_tree_4]
# X_tree_6 = X_tree.iloc[:, import_feats_tree_6]
# X_tree_8 = X_tree.iloc[:, import_feats_tree_8]

In [27]:
# X_train_tree_2, X_test_tree_2, y_train_tree_2, y_test_tree_2 = train_test_split(X_tree_2, y_tree, test_size=0.2)
X_train_tree_4, X_test_tree_4, y_train_tree_4, y_test_tree_4 = train_test_split(X_tree_4, y_tree, test_size=0.2)
# X_train_tree_6, X_test_tree_6, y_train_tree_6, y_test_tree_6 = train_test_split(X_tree_6, y_tree, test_size=0.2)
# X_train_tree_8, X_test_tree_8, y_train_tree_8, y_test_tree_8 = train_test_split(X_tree_8, y_tree, test_size=0.2)

In [28]:
# regr_tree_2 = DecisionTreeRegressor(max_depth=2).fit(X_train_tree_2, y_train_tree_2)
regr_tree_4 = DecisionTreeRegressor(max_depth=4).fit(X_train_tree_4, y_train_tree_4)
# regr_tree_6 = DecisionTreeRegressor(max_depth=6).fit(X_train_tree_6, y_train_tree_6)
# regr_tree_8 = DecisionTreeRegressor(max_depth=8).fit(X_train_tree_8, y_train_tree_8)

In [29]:
# y_pred_tree_2 = regr_tree_2.predict(X_test_tree_2)
y_pred_tree_4 = regr_tree_4.predict(X_test_tree_4)
# y_pred_tree_6 = regr_tree_6.predict(X_test_tree_6)
# y_pred_tree_8 = regr_tree_8.predict(X_test_tree_8)

In [41]:
# print('max_depth=2')
# get_metrics(X_test_tree_2, y_test_tree_2, y_pred_tree_2, regr_tree_2)
print('max_depth=4')
get_metrics(X_test_tree_4, y_test_tree_4, y_pred_tree_4, regr_tree_4)
# print('max_depth=6')
# get_metrics(X_test_tree_6, y_test_tree_6, y_pred_tree_6, regr_tree_6)
# print('max_depth=8')
# get_metrics(X_test_tree_8, y_test_tree_8, y_pred_tree_8, regr_tree_8)

max_depth=4
MSE:	0.0009244
RMSE:	0.0304032
Score:	0.399


# RandomForestRegressor
---

In [42]:
X_rf = condensed_df.drop('label', axis=1).set_index('playerID')
y_rf = X_rf.pop('year7_AVG')

In [43]:
# regr_forest_2_base = RandomForestRegressor(max_depth=2).fit(X_rf, y_rf)
regr_forest_4_base = RandomForestRegressor(max_depth=4).fit(X_rf, y_rf)
# regr_forest_6_base = RandomForestRegressor(max_depth=6).fit(X_rf, y_rf)
# regr_forest_8_base = RandomForestRegressor(max_depth=8).fit(X_rf, y_rf)

In [44]:
# feat_import_rf_2 = regr_forest_2_base.feature_importances_
feat_import_rf_4 = regr_forest_4_base.feature_importances_
# feat_import_rf_6 = regr_forest_6_base.feature_importances_
# feat_import_rf_8 = regr_forest_8_base.feature_importances_

In [45]:
# import_feats_rf_2 = np.where(feat_import_rf_2 > 0)[0]
import_feats_rf_4 = np.where(feat_import_rf_4 > 0)[0]
# import_feats_rf_6 = np.where(feat_import_rf_6 > 0)[0]
# import_feats_rf_8 = np.where(feat_import_rf_8 > 0)[0]

In [46]:
# X_rf_2 = X_rf.iloc[:, import_feats_rf_2]
X_rf_4 = X_rf.iloc[:, import_feats_rf_4]
# X_rf_6 = X_rf.iloc[:, import_feats_rf_6]
# X_rf_8 = X_rf.iloc[:, import_feats_rf_8]

In [47]:
# X_train_rf_2, X_test_rf_2, y_train_rf_2, y_test_rf_2 = train_test_split(X_rf_2, y_rf, test_size=0.2)
X_train_rf_4, X_test_rf_4, y_train_rf_4, y_test_rf_4 = train_test_split(X_rf_4, y_rf, test_size=0.2)
# X_train_rf_6, X_test_rf_6, y_train_rf_6, y_test_rf_6 = train_test_split(X_rf_6, y_rf, test_size=0.2)
# X_train_rf_8, X_test_rf_8, y_train_rf_8, y_test_rf_8 = train_test_split(X_rf_8, y_rf, test_size=0.2)

In [48]:
# regr_forest_2 = RandomForestRegressor(max_depth=2).fit(X_train_rf_2, y_train_rf_2)
regr_forest_4 = RandomForestRegressor(max_depth=4).fit(X_train_rf_4, y_train_rf_4)
# regr_forest_6 = RandomForestRegressor(max_depth=6).fit(X_train_rf_6, y_train_rf_6)
# regr_forest_8 = RandomForestRegressor(max_depth=8).fit(X_train_rf_8, y_train_rf_8)

In [49]:
# y_pred_rf_2 = regr_forest_2.predict(X_test_rf_2)
y_pred_rf_4 = regr_forest_4.predict(X_test_rf_4)
# y_pred_rf_6 = regr_forest_6.predict(X_test_rf_6)
# y_pred_rf_8 = regr_forest_8.predict(X_test_rf_8)

In [50]:
# print('max_depth=2')
# get_metrics(X_test_rf_2, y_test_rf_2, y_pred_rf_2, regr_forest_2)
print('max_depth=4')
get_metrics(X_test_rf_4, y_test_rf_4, y_pred_rf_4, regr_forest_4)
# print('max_depth=6')
# get_metrics(X_test_rf_6, y_test_rf_6, y_pred_rf_6, regr_forest_6)
# print('max_depth=8')
# get_metrics(X_test_rf_8, y_test_rf_8, y_pred_rf_8, regr_forest_8)

max_depth=4
MSE:	0.0007535
RMSE:	0.0274505
Score:	0.496


In [54]:
def display_results(model, X, y):
    results = pd.DataFrame(y)
    results.columns = ['actual']
    results['predicted'] = model.predict(X)
    results = results.round(3)
    results['difference'] = (results['actual'] - results['predicted']).abs()
    return results.round(3)

In [55]:
results_tree = display_results(regr_tree_4, X_tree_4, y_tree)

In [56]:
results_forest = display_results(regr_forest_4, X_rf_4, y_rf)

In [58]:
results_tree.sort_values("difference")

Unnamed: 0_level_0,actual,predicted,difference
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
sosasa01,0.268,0.268,0.000
erstada01,0.283,0.283,0.000
hilldo01,0.264,0.264,0.000
vittos01,0.240,0.240,0.000
cannich01,0.240,0.240,0.000
burnsge01,0.303,0.303,0.000
nunamle01,0.259,0.259,0.000
cowenal01,0.268,0.268,0.000
mattido01,0.303,0.303,0.000
hoffmgl01,0.212,0.212,0.000


In [59]:
results_forest.sort_values('difference')

Unnamed: 0_level_0,actual,predicted,difference
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
wardpe01,0.246,0.246,0.000
bumbral01,0.285,0.285,0.000
crawfsa01,0.297,0.297,0.000
spencji01,0.266,0.266,0.000
mcdougi01,0.289,0.289,0.000
burnsoy01,0.285,0.285,0.000
snodgfr01,0.249,0.249,0.000
nashbi01,0.276,0.276,0.000
nettlgr01,0.267,0.267,0.000
smithse01,0.266,0.266,0.000


In [64]:
len(results_tree[results_tree['difference'] == .000])

25

In [60]:
len(results_forest[results_forest['difference'] == .000])

48

In [62]:
print(results_tree['difference'].max())

0.137


In [63]:
print(results_forest['difference'].max())

0.116
