<a href="https://colab.research.google.com/github/aslyldrm/python-baseball-salary-prediction/blob/main/Baseball_Salary_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Baseball Salary Prediction**

# **Importing the libraries**

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset

In [2]:
dataset = pd.read_csv('hitters.csv')
X = dataset.iloc[:, :].values
y = dataset.loc[:, ['Salary']].values

In [3]:
X = np.delete(X, 18, axis=1)

In [4]:

X.shape

(322, 19)

In [5]:
print(X)

[[293 66 1 ... 33 20 'A']
 [315 81 7 ... 43 10 'N']
 [479 130 18 ... 82 14 'A']
 ...
 [475 126 3 ... 113 7 'A']
 [573 144 9 ... 131 12 'A']
 [631 170 9 ... 4 3 'A']]


# Taking care of missing data

In [6]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy='median')
imputer.fit(y)
y = imputer.transform(y)



# Encoding categorical data

In [7]:
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder
# ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(), [-1,13,14])], remainder='passthrough')
# X = np.array(ct.fit_transform(X))

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
values = [13,14,-1]

for i in values:
    X[:,i] = le.fit_transform(X[:,i])



In [8]:
# np.set_printoptions(threshold=np.inf)
# print(X)

# Splitting the dataset into the Training set and Test set

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.2, random_state=1)

# Feature Scaling

In [10]:
#For not dominating some features to others :

# -  Standardisation between 3 and -3  Always work
# - Normalization between 1 and -1  Works for which has normal distributions
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()

X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)



# Multiple Linear Regression

In [11]:
from sklearn.linear_model import LinearRegression
regressor_lin_mul = LinearRegression()
regressor_lin_mul.fit(X_train, y_train)

In [12]:
y_pred_mul = regressor_lin_mul.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred_mul.reshape(len(y_pred_mul),1), y_test.reshape(len(y_test),1)),1))

[[ 361.27  191.  ]
 [ 322.01  450.  ]
 [ 322.58  425.  ]
 [ 621.24  425.  ]
 [ 177.65  425.  ]
 [ 315.18  425.  ]
 [ 863.65  950.  ]
 [1424.51 2412.5 ]
 [ 923.74 1925.57]
 [ 337.73  172.  ]
 [ 343.62  145.  ]
 [ 627.91  425.  ]
 [ 113.47  110.  ]
 [ 538.82  225.  ]
 [ 270.54  416.67]
 [1012.18 1350.  ]
 [ 432.05  425.  ]
 [ 441.51  425.  ]
 [ 299.17   70.  ]
 [ 456.02  600.  ]
 [ 461.23  400.  ]
 [ 341.44  475.  ]
 [ 308.36  245.  ]
 [ 248.66  425.  ]
 [ 750.22  365.  ]
 [ 750.44  875.  ]
 [ 475.08  750.  ]
 [ 709.02  425.  ]
 [ 579.27  120.  ]
 [ 261.9   512.5 ]
 [1022.55  425.  ]
 [ 878.67 1670.  ]
 [ 482.38  450.  ]
 [ 398.11  595.  ]
 [ 557.83  550.  ]
 [ 816.55  700.  ]
 [ 209.65  300.  ]
 [ 464.29  550.  ]
 [ 467.84  750.  ]
 [ 432.52  140.  ]
 [ 671.56 1000.  ]
 [ 543.97  900.  ]
 [ 696.15 1300.  ]
 [ 396.87  425.  ]
 [ 901.    535.  ]
 [ 448.93  326.67]
 [ 601.39 1940.  ]
 [ 439.77  300.  ]
 [ 203.21   75.  ]
 [ 492.62  400.  ]
 [ 292.67  362.5 ]
 [ 391.34  431.5 ]
 [ 172.43  3

In [13]:
from sklearn.metrics import r2_score
r2_score_multiple_lin = r2_score(y_test, y_pred_mul)

# Polynomial Regression

In [38]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
poly_reg = PolynomialFeatures(degree = 2)
X_poly = poly_reg.fit_transform(X_train)
regressor = LinearRegression()
regressor.fit(X_poly, y_train)

In [39]:
y_pred_poly = regressor.predict(poly_reg.transform(X_test))
np.set_printoptions(precision=2)
print(np.concatenate((y_pred_poly.reshape(len(y_pred_poly),1), y_test.reshape(len(y_test),1)),1))

[[-1.31e+00 -8.01e-01]
 [ 5.35e-01 -1.27e-01]
 [-1.81e+00 -1.92e-01]
 [ 2.01e-01 -1.92e-01]
 [ 1.46e-01 -1.92e-01]
 [-8.68e-01 -1.92e-01]
 [ 3.46e+00  1.18e+00]
 [ 4.53e+00  4.99e+00]
 [ 6.38e+00  3.72e+00]
 [ 7.32e-02 -8.51e-01]
 [ 3.42e-02 -9.21e-01]
 [-2.65e+00 -1.92e-01]
 [-3.15e-01 -1.01e+00]
 [-7.81e-01 -7.13e-01]
 [-8.75e-01 -2.14e-01]
 [ 1.93e+00  2.22e+00]
 [ 6.45e-02 -1.92e-01]
 [-2.74e-01 -1.92e-01]
 [-1.93e+00 -1.12e+00]
 [ 2.30e+00  2.64e-01]
 [-5.84e-01 -2.57e-01]
 [-2.76e-01 -6.16e-02]
 [-1.05e+00 -6.61e-01]
 [-6.38e-01 -1.92e-01]
 [ 1.53e+00 -3.48e-01]
 [-1.04e+01  9.80e-01]
 [-1.55e+00  6.55e-01]
 [ 6.53e+00 -1.92e-01]
 [-1.03e+00 -9.86e-01]
 [-1.69e+00  3.61e-02]
 [ 1.77e+00 -1.92e-01]
 [-5.55e+00  3.05e+00]
 [-7.41e+00 -1.27e-01]
 [-1.81e+00  2.51e-01]
 [-1.49e-01  1.34e-01]
 [-9.59e-01  5.25e-01]
 [-1.43e+00 -5.17e-01]
 [ 9.85e-01  1.34e-01]
 [-1.77e+00  6.55e-01]
 [ 4.14e-01 -9.34e-01]
 [ 6.24e+00  1.31e+00]
 [ 4.39e+00  1.05e+00]
 [ 1.46e-01  2.09e+00]
 [ 4.37e+00

In [40]:
from sklearn.metrics import r2_score
r2_score_poly_reg = r2_score(y_test, y_pred_poly)

# Random Forest Regressor

In [17]:
from sklearn.ensemble import RandomForestRegressor
regressor_rg = RandomForestRegressor(n_estimators = 25,min_samples_split = 5)
regressor_rg.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [18]:
y_pred_rg = regressor_rg.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred_rg.reshape(len(y_pred_rg),1), y_test.reshape(len(y_test),1)),1))

[[ 371.05  191.  ]
 [ 462.41  450.  ]
 [ 231.67  425.  ]
 [ 675.16  425.  ]
 [ 468.01  425.  ]
 [ 221.6   425.  ]
 [1255.59  950.  ]
 [1529.04 2412.5 ]
 [1328.15 1925.57]
 [ 236.04  172.  ]
 [ 223.31  145.  ]
 [1134.36  425.  ]
 [ 164.35  110.  ]
 [ 442.8   225.  ]
 [ 407.55  416.67]
 [1011.61 1350.  ]
 [ 262.19  425.  ]
 [ 358.96  425.  ]
 [ 156.51   70.  ]
 [ 724.01  600.  ]
 [ 423.6   400.  ]
 [ 245.42  475.  ]
 [ 387.16  245.  ]
 [ 631.18  425.  ]
 [1006.09  365.  ]
 [ 884.56  875.  ]
 [ 789.72  750.  ]
 [ 793.3   425.  ]
 [ 135.04  120.  ]
 [ 497.13  512.5 ]
 [ 645.13  425.  ]
 [1067.03 1670.  ]
 [ 549.21  450.  ]
 [ 632.96  595.  ]
 [ 610.1   550.  ]
 [ 496.5   700.  ]
 [ 371.21  300.  ]
 [ 407.79  550.  ]
 [ 684.76  750.  ]
 [ 124.    140.  ]
 [ 779.49 1000.  ]
 [1054.96  900.  ]
 [1002.76 1300.  ]
 [ 792.18  425.  ]
 [1471.51  535.  ]
 [ 540.62  326.67]
 [ 843.43 1940.  ]
 [ 406.38  300.  ]
 [ 148.8    75.  ]
 [ 438.24  400.  ]
 [ 448.68  362.5 ]
 [ 333.91  431.5 ]
 [ 325.66  3

In [19]:
re2_score_rfg = r2_score(y_test, y_pred_rg)

# XGBoost

In [20]:
from xgboost import XGBRegressor
xgb_r = XGBRegressor()  #objective ='reg:linear', n_estimators = 10, seed = 123
xgb_r.fit(X_train, y_train)

In [21]:

pred_xcb = xgb_r.predict(X_test)
from sklearn.metrics import r2_score
r2_score_xcb_r = r2_score(y_test, pred_xcb)



In [22]:
r2_score_xcb_r

0.629985276580993

In [23]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = xgb_r, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 29.23 %
Standard Deviation: 26.49 %


In [24]:
accuracies

array([ 0.17,  0.39,  0.1 , -0.11,  0.44,  0.6 ,  0.69,  0.52, -0.09,
        0.22])

# Support Vector Regression (SVR)

In [25]:
y_train = y_train.reshape(len(y_train),1)
y_test = y_test.reshape(len(y_test),1)


sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)
y_test = sc_y.transform(y_test)

In [26]:
from sklearn.svm import SVR
regressor_svr = SVR(kernel = 'rbf')
regressor_svr.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [27]:
y_pred_svr = regressor_svr.predict(X_test)



In [28]:
r2_score_svr = r2_score(y_test, y_pred_svr)

# Results

In [41]:
print("Support Vector Regression: " + str(r2_score_svr))
print("Polynomial Regression: " + str(r2_score_poly_reg))
print("Random Forest Regression: " + str(re2_score_rfg))
print("Multiple Linear regression: " + str(r2_score_multiple_lin))
print("XGBoost regressor: "+ str(r2_score_xcb_r))


Support Vector Regression: 0.5073497144296917
Polynomial Regression: -1.0998643151674072e+16
Random Forest Regression: 0.5810535006715443
Multiple Linear regression: 0.4548304456567659
XGBoost regressor: 0.629985276580993
