## Machine Learning for Pitcher Dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
df_pitcher = pd.read_csv('/Users/christopher/Desktop/Springboard-/df_pitcher_v3.csv')
df_pitcher.head()

Unnamed: 0,playerName,salary,adj_salary_filled,flag,Age,HT,WT,Bats,Throws,year,...,WP,BK,ERA,h9,hr9,bb9,so9,WHIP,total_years_mlb,minimum_year
0,AJ Achter,,518363.7,1,25,6-5,190,R,R,2014,...,0,0,3.27,11.45,1.64,2.45,4.09,1.55,1,480000
1,AJ Achter,507500.0,547411.9,0,26,6-5,190,R,R,2015,...,0,0,6.75,8.1,2.7,4.05,9.45,1.35,2,507500
2,AJ Achter,,540592.3,1,27,6-5,190,R,R,2016,...,0,0,3.11,10.27,1.67,2.87,3.35,1.46,3,507500
3,AJ Burnett,16500000.0,19345216.4,0,33,6-5,205,R,R,2010,...,16,0,5.26,9.84,1.21,3.76,6.99,1.51,12,400000
4,AJ Burnett,16500000.0,18753264.2,0,34,6-5,205,R,R,2011,...,25,0,5.15,8.98,1.47,3.92,8.18,1.43,13,414000


In [3]:
df_pitcher.columns

Index(['playerName', 'salary', 'adj_salary_filled', 'flag', 'Age', 'HT', 'WT',
       'Bats', 'Throws', 'year', 'teamName', 'posit', 'borndate', 'Place',
       'LeagueAbbr', 'W', 'L', 'G', 'GS', 'CG', 'SHO', 'GF', 'SV', 'IP', 'H',
       'HR', 'R', 'ER', 'BB', 'IBB', 'SO', 'WP', 'BK', 'ERA', 'h9', 'hr9',
       'bb9', 'so9', 'WHIP', 'total_years_mlb', 'minimum_year'],
      dtype='object')

## Baseline Regressor Model

In [4]:
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

#drop columns not needed
df_pitcher_fin = df_pitcher.drop(columns=['salary', 'borndate', 
                                    'Place', 'playerName', 'HT', 'teamName'], axis=1)

#encode categorical data 
dfh_obj = df_pitcher_fin.select_dtypes(include=['object']).copy()
dfh_obj.columns
dummies = pd.get_dummies(df_pitcher_fin[['Bats', 'Throws', 'LeagueAbbr', 'posit']], drop_first=True)


# Drop the column with the independent variable (Salary), and columns for which we created dummy variables
X_ = df_pitcher_fin.drop(['Bats', 'Throws', 'LeagueAbbr', 'posit', 'adj_salary_filled' ], axis=1).astype('float64')

# Define the feature set X and y
X = pd.concat([X_, dummies], axis=1)
y = np.log(df_pitcher_fin.adj_salary_filled)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#scale data 
std = StandardScaler()
X_train_scaled = std.fit_transform(X_train)
X_test_scaled = std.transform(X_test)

# Create a dummy regressor
dummy_mean = DummyRegressor(strategy='mean')

# "Train" dummy regressor
dummy_mean.fit(X_train_scaled, y_train)

y_pred = dummy_mean.predict(X_test_scaled)
print('R-squared:', r2_score(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_pred))))



R-squared: -6.815789559411201e-07
RMSE: 4683385.526601847


## CatBoost Regressor

In [5]:
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

df_pitcher_fin = df_pitcher.drop(columns=['salary', 'borndate', 
                                    'Place', 'playerName', 'HT', 'teamName'], axis=1)


X = df_pitcher_fin.drop(columns=['adj_salary_filled'])
y = df_pitcher_fin.adj_salary_filled
y_log = np.log(df_pitcher_fin.adj_salary_filled)

X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.2, random_state=42)


cat = CatBoostRegressor(iterations=3000, verbose=200, 
                        learning_rate=0.03,loss_function='RMSE',
                        early_stopping_rounds= 200,
                        random_seed=42)
cat.fit(X_train, y_train, cat_features=['Bats', 'Throws', 'LeagueAbbr', 'posit'],  eval_set=(X_test, y_test)
       )
y_pred = cat.predict(X_test)
y_pred_train = cat.predict(X_train)
rmse = np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_pred)))
print('Train r2 score: ', r2_score(y_pred_train, y_train))
print('Test r2 score: ', r2_score(y_test, y_pred))
print(rmse)

#check top features 
top_feat=pd.DataFrame()
top_feat['columns']=X.columns
top_feat['importances'] = cat.feature_importances_
top_feat.sort_values(by='importances',ascending=False,inplace=True)

print(top_feat[:20])

#check predictions
results = pd.DataFrame()
results['predicted'] = list(np.exp(y_pred))
results['actual'] = list(np.exp(y_test))
results.round(1).head(20)



0:	learn: 1.1066952	test: 1.0969334	best: 1.0969334 (0)	total: 65.4ms	remaining: 3m 16s
200:	learn: 0.5359457	test: 0.5434441	best: 0.5434441 (200)	total: 960ms	remaining: 13.4s
400:	learn: 0.5048601	test: 0.5361761	best: 0.5361761 (400)	total: 1.8s	remaining: 11.7s
600:	learn: 0.4778803	test: 0.5314679	best: 0.5314069 (597)	total: 2.65s	remaining: 10.6s
800:	learn: 0.4509485	test: 0.5299979	best: 0.5295711 (735)	total: 3.67s	remaining: 10.1s
1000:	learn: 0.4273065	test: 0.5285072	best: 0.5285072 (1000)	total: 4.84s	remaining: 9.68s
1200:	learn: 0.4064724	test: 0.5282910	best: 0.5279973 (1087)	total: 5.8s	remaining: 8.69s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.5279973174
bestIteration = 1087

Shrink model to first 1088 iterations.
Train r2 score:  0.8225891937818299
Test r2 score:  0.7773951221427647
2754297.969240993
            columns  importances
32  total_years_mlb    39.445909
0              flag    15.092379
1               Age     4.438605
11      

Unnamed: 0,predicted,actual
0,485164.3,468974.9
1,575820.0,557998.1
2,2904430.9,1670276.7
3,551642.9,559953.7
4,542355.0,554875.3
5,560384.2,557998.1
6,479937.5,468974.9
7,632007.9,566583.6
8,1921405.9,556580.4
9,1572400.7,5025000.0


In [6]:
from catboost import cv
from catboost import Pool
params = {"iterations": 3000,
          "learning_rate": 0.03,
          "eval_metric": "RMSE",
          "verbose": False,
         'early_stopping_rounds':200,
         'random_seed':42,
         'loss_function':'RMSE'} # Default Parameters

# Categorical features list
cat_feat = ['Bats', 'Throws', 'LeagueAbbr', 'posit']
cv_dataset = Pool(data=X_train, label=y_train, cat_features=cat_feat )

# CV scores
scores = cv(cv_dataset, params, fold_count=5)
scores
print((scores["test-RMSE-mean"]).tail(1))

Stopped by overfitting detector  (200 iterations wait)
2097    0.555412
Name: test-RMSE-mean, dtype: float64


## HistGradientBoosting Regressor

In [7]:
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df_pitcher_fin = df_pitcher.drop(columns=['salary', 'borndate', 
                                    'Place', 'playerName', 'HT', 'teamName'], axis=1)

#encode categorical features
dfh_obj = df_pitcher_fin.select_dtypes(include=['object']).copy()
dfh_obj.columns
dummies = pd.get_dummies(df_pitcher_fin[['Bats', 'Throws', 'LeagueAbbr', 'posit']], drop_first=True)


# Drop the column with the independent variable (Salary), and columns for which we created dummy variables
X_ = df_pitcher_fin.drop(['Bats', 'Throws', 'LeagueAbbr', 'posit','adj_salary_filled'], axis=1)

# Define the feature set X and y
X = pd.concat([X_, dummies], axis=1)

y = df_pitcher_fin.adj_salary_filled
y_log = np.log(df_pitcher_fin.adj_salary_filled)

X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.2, random_state=42)

std = StandardScaler()
X_train_scaled = std.fit_transform(X_train)
X_test_scaled = std.transform(X_test)


from sklearn.experimental import enable_hist_gradient_boosting  
from sklearn.ensemble import HistGradientBoostingRegressor

hist = HistGradientBoostingRegressor()
hist.fit(X_train_scaled, y_train)
y_pred = hist.predict(X_test_scaled)
y_pred_train_ = hist.predict(X_train_scaled)
rmse = np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_pred)))
print('Train r2 score: ', r2_score(y_pred_train_, y_train))
print('Test r2 score: ', r2_score(y_test, y_pred))
print(rmse)

#check predictions
results = pd.DataFrame()
results['y_pred'] = list(np.exp(y_pred))
results['y_test'] = list(np.exp(y_test))
results.round(1)

Train r2 score:  0.8799288841264248
Test r2 score:  0.7708972551938514
2757535.3765084916


Unnamed: 0,y_pred,y_test
0,478104.2,468974.9
1,566841.0,557998.1
2,3236473.3,1670276.7
3,556816.1,559953.7
4,554574.4,554875.3
...,...,...
1535,469688.7,468974.9
1536,549157.0,555000.0
1537,537964.7,540592.3
1538,574896.0,540592.3


## LightGBM Regressor

In [8]:
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
df_pitcher_fin = df_pitcher.drop(columns=['salary', 'borndate', 
                                    'Place', 'playerName', 'HT', 'teamName'], axis=1)

# convert categorical features
dfh_obj = df_pitcher_fin.select_dtypes(include=['object']).copy()
dfh_obj.columns
dummies = pd.get_dummies(df_pitcher_fin[['Bats', 'Throws', 'LeagueAbbr', 'posit']], drop_first=True)


# Drop the column with the independent variable (Salary), and columns for which we created dummy variables
X_ = df_pitcher_fin.drop(['Bats', 'Throws', 'LeagueAbbr', 'posit','adj_salary_filled'], axis=1)

# Define the feature set X and y
X = pd.concat([X_, dummies], axis=1)

y = df_pitcher_fin.adj_salary_filled
y_log = np.log(df_pitcher_fin.adj_salary_filled)

X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.2, random_state=42)

std = StandardScaler()
X_train_scaled = std.fit_transform(X_train)
X_test_scaled = std.transform(X_test)

from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

params = {'max_depth':[5,6,7],
         'learning_rate': [0.05],
         'n_estimators':[150],
         'colsample_bytree': [0.7, 0.8, 0.9]}


lgbm = LGBMRegressor()

grid= GridSearchCV(estimator=lgbm, param_grid=params, cv=5, 
                   scoring='neg_mean_squared_error')

grid.fit(X_train_scaled, y_train)
y_pred = grid.predict(X_test_scaled)
y_pred_train = grid.predict(X_train_scaled)
print("Best parameters found: ", grid.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(grid.best_score_)))
print('Train r2 score: ', r2_score(y_pred_train, y_train))
print('Test r2 score: ', r2_score(y_test, y_pred))

rmse = np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_pred)))

print(rmse)

#check predictions
results = pd.DataFrame()
results['y_pred'] = list(np.exp(y_pred))
results['y_test'] = list(np.exp(y_test))
results.round(1)

Best parameters found:  {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 150}
Lowest RMSE found:  0.5559192642072937
Train r2 score:  0.8272368769333853
Test r2 score:  0.7659977719441128
2818405.847366279


Unnamed: 0,y_pred,y_test
0,477185.3,468974.9
1,527982.2,557998.1
2,2860847.1,1670276.7
3,560988.1,559953.7
4,545180.9,554875.3
...,...,...
1535,472678.4,468974.9
1536,533056.8,555000.0
1537,536152.7,540592.3
1538,650127.5,540592.3


## XGB Regressor

In [9]:
from sklearn.metrics import r2_score
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

df_pitcher_fin = df_pitcher.drop(columns=['salary', 'borndate', 
                                    'Place', 'playerName', 'HT', 'teamName'], axis=1)

# convert categorical features
dfh_obj = df_pitcher_fin.select_dtypes(include=['object']).copy()
dfh_obj.columns
dummies = pd.get_dummies(df_pitcher_fin[['Bats', 'Throws', 'LeagueAbbr', 'posit']], drop_first=True)


# Drop the column with the independent variable (Salary), and columns for which we created dummy variables
X_ = df_pitcher_fin.drop(['Bats', 'Throws', 'LeagueAbbr', 'posit','adj_salary_filled'], axis=1)

# Define the feature set X and y
X = pd.concat([X_, dummies], axis=1)
y = df_pitcher_fin.adj_salary_filled
y_log = np.log(df_pitcher_fin.adj_salary_filled)

X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.2, random_state=42)

xgb_model = xgb.XGBRegressor(n_estimators=1000)
params = {'eta':[0.05, 0.10, 0.15, 0.20, 0.30],
         'max_depth':[2, 3, 5, 7],
         'gamma': [0.0, 0.1, 0.3, 0.5],
         'colsample_bytree': [0.3, 0.5, 0.7],
         'reg_lambda':[0, 0.01, 0.1, 1, 100]}

ran = RandomizedSearchCV(estimator=xgb_model, param_distributions=params, 
                         cv=5, scoring='neg_mean_squared_error', random_state=42)
ran.fit(X_train, y_train)
y_pred = ran.predict(X_test)
y_pred_train = ran.predict(X_train)
rmse = np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_pred)))

print(rmse)

print("Best parameters found: ", ran.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(ran.best_score_)))
print('Train r2 score: ', r2_score(y_pred_train, y_train))
print('Test r2 score: ', r2_score(y_test, y_pred))

# checking predictions
results_ = pd.DataFrame()
results_['y_pred'] = list(np.exp(y_pred))
results_['y_test'] = list(np.exp(y_test))
print(results_.round(1).head(30))

2832288.261709321
Best parameters found:  {'reg_lambda': 0.01, 'max_depth': 3, 'gamma': 0.5, 'eta': 0.05, 'colsample_bytree': 0.5}
Lowest RMSE found:  0.5625789614900701
Train r2 score:  0.7994370435567463
Test r2 score:  0.7621050739347917
        y_pred     y_test
0     467921.1   468974.9
1     529740.9   557998.1
2    2533092.0  1670276.7
3     610945.8   559953.7
4     582533.9   554875.3
5     532852.5   557998.1
6     455262.9   468974.9
7     620634.2   566583.6
8    2112585.5   556580.4
9    1244214.1  5025000.0
10    549827.9   562511.2
11    544080.2   554875.3
12    508882.2   470536.4
13    529918.3   565820.5
14    499366.1   526772.6
15    483929.3   557998.1
16    466309.8   547411.9
17    573558.9   540592.3
18    626271.9   547411.9
19    518559.8   540592.3
20    535519.4   534488.5
21   1362152.2  2346721.0
22   2685248.0  6153623.9
23    611007.5   547411.9
24    613415.9   521734.6
25    741101.6   552381.4
26    545794.6   555000.0
27    490734.6   534488.5
28   

## RandomForest Regressor

In [10]:
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

df_pitcher_fin = df_pitcher.drop(columns=['salary', 'borndate', 
                                    'Place', 'playerName', 'HT', 'teamName'], axis=1)

# convert categorical features
dfh_obj = df_pitcher_fin.select_dtypes(include=['object']).copy()
dfh_obj.columns
dummies = pd.get_dummies(df_pitcher_fin[['Bats', 'Throws', 'LeagueAbbr', 'posit']], drop_first=True)


# Drop the column with the independent variable (Salary), and columns for which we created dummy variables
X_ = df_pitcher_fin.drop(['Bats', 'Throws', 'LeagueAbbr', 'posit','adj_salary_filled'], axis=1)

# Define the feature set X and y
X = pd.concat([X_, dummies], axis=1)

y = df_pitcher_fin.adj_salary_filled
y_log = np.log(df_pitcher_fin.adj_salary_filled)

X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.2, random_state=42)

#scale data
std = StandardScaler()
X_train_scaled = std.fit_transform(X_train)
X_test_scaled = std.transform(X_test)


#build model 
rand = RandomForestRegressor(random_state=42)
param_grid = {'bootstrap': [True, False],
 'max_depth': [5,7,10],
 'max_features': ['auto', 'sqrt', 'log2'],
 'n_estimators': [200, 300]}

grid = GridSearchCV(estimator=rand, cv=5, param_grid=param_grid,
                    scoring='neg_mean_squared_error')
grid.fit(X_train_scaled, y_train)
y_pred = grid.predict(X_test_scaled)
y_pred_train = grid.predict(X_train_scaled)

rmse = np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_pred)))

print(rmse)

print("Best parameters found: ", grid.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(grid.best_score_)))
print('Train r2 score: ', r2_score(y_pred_train, y_train))
print('Test r2 score: ', r2_score(y_test, y_pred))

#check predictions
results = pd.DataFrame()
results['y_pred'] = list(np.exp(y_pred))
results['y_test'] = list(np.exp(y_test))
results.round(1)

2796006.200844836
Best parameters found:  {'bootstrap': True, 'max_depth': 10, 'max_features': 'auto', 'n_estimators': 200}
Lowest RMSE found:  0.5671195929735632
Train r2 score:  0.8519717383302176
Test r2 score:  0.7698665021019827


Unnamed: 0,y_pred,y_test
0,494077.8,468974.9
1,557983.9,557998.1
2,2714775.1,1670276.7
3,559640.9,559953.7
4,554717.4,554875.3
...,...,...
1535,500129.7,468974.9
1536,555359.1,555000.0
1537,545091.4,540592.3
1538,629781.6,540592.3


# Model Selection Summary 

### Best Model
In the pitcher dataset the best model was the CatBoost Regressor. This model's final metric was an  R-squared of 0.77 and RMSE of 2,754,297. The hyperparameters ‘learning rate’ were set to 0.03 and ‘iterations’ to 3000. I also implemented the use of early stopping rounds in this model to minimize any possible overfitting. 
Working with this model was rather interesting because it encodes any categorical variables in the dataset. 

### Future Improvements 
Overall I was able to get over a 40% reduction in the root mean squared error of the baseline model in the pitcher  dataset.  For future iterations I will gather data on player injuries and medical history. Collecting information on a player's past injuries will help predict any future injuries as well as affect their total value. Therefore leading to a more precise salary prediction. Another feature that could be added is a player's place of origin. This could be within the states or at the international level, as they each get paid different bonuses.