In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV

In [2]:
pitching_df = pd.read_csv('data/Pitching.csv')
salary_df = pd.read_csv('data/Salaries.csv')

joint_df = pd.merge(pitching_df,salary_df, on=['teamID','playerID','yearID','lgID'])

joint_df = joint_df.sort_values(by=['playerID','yearID','stint'])

joint_df['salary_lag'] = joint_df.groupby('playerID')['salary'].shift(1)

def split_train_test( data, test_ratio):
    shuffled_indices = np.random.permutation( len( data))
    test_set_size = int( len( data) * test_ratio)
    test_indices = shuffled_indices[: test_set_size]
    train_indices = shuffled_indices[ test_set_size:]
    return data.iloc[ train_indices], data.iloc[ test_indices]


In [3]:
#Cols to use
cols_to_use = ['yearID','W','L','GS','SV','BK','R','H','ERA','SO', 'salary_bucket', 'salary_bucket_lag']


In [4]:
#Bucketize
num_buckets = 100
joint_df['salary_bucket'] = pd.cut(joint_df['salary'], num_buckets, labels=False)

In [5]:
#Lag Buckets
joint_df['salary_bucket_lag'] = joint_df.groupby('playerID')['salary_bucket'].shift(1)

In [6]:
#Drop player's final years since there is no previous-year-salary to regress on
joint_df = joint_df.dropna()
df_train, df_test = split_train_test(joint_df, .2)
df_train = df_train.filter(cols_to_use)
df_test = df_test.filter(cols_to_use)

In [8]:
#Train
x = df_train.drop('salary_bucket',1)
y = df_train['salary_bucket']
clf = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=0)
clf.fit(x,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [9]:
#Predict
x = df_test.drop('salary_bucket',1)
y = df_test['salary_bucket']
predictions = clf.predict(x)


In [10]:
scores = cross_val_score(clf, x, y, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)

print("Scores: ", scores)
print("Mean: ", scores.mean())
print("Std Dev: ", scores.std())



Scores:  [-380.35909091 -210.84577114 -153.13265306 -125.32984293 -106.33513514
  -84.48888889  -79.97727273  -64.46820809 -114.31137725  -57.4382716 ]
Mean:  -137.66865117407607
Std Dev:  91.71906801685098


In [18]:
param_grid = [
    {'n_estimators':[3, 10, 30, 50, 100], 'max_features': [2,4,6,8,10,11], 'max_depth':[5,10,20,40,50,100]}
]
forest_reg = RandomForestClassifier()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(df_train, df_train['salary_bucket'])



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [3, 10, 30, 50, 100], 'max_features': [2, 4, 6, 8, 10, 11], 'max_depth': [5, 10, 20, 40, 50, 100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [19]:
grid_search.best_params_

{'max_depth': 100, 'max_features': 11, 'n_estimators': 100}

In [20]:
#Train
x = df_train
y = df_train['salary_bucket']
clf = RandomForestClassifier(n_estimators=100, max_depth=100, random_state=0, max_features=11)
clf.fit(x,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=100, max_features=11, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [21]:
scores = cross_val_score(clf, x, y, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)

print("Scores: ", scores)
print("Mean: ", scores.mean())
print("Std Dev: ", scores.std())



Scores:  [-1.64441592 -0.12223667 -0.00398406 -0.         -0.         -0.
 -0.         -0.         -0.         -0.        ]
Mean:  -0.17706366525897094
Std Dev:  0.49046365531084063
