### Import Module

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from keras import optimizers
from keras.models import Sequential, load_model
from keras.layers import Dense,Activation, LSTM, Dropout, TimeDistributed, Flatten
# from keras.layers.normalization import BatchNormalization
from tensorflow.keras.layers import BatchNormalization
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

from sklearn.utils import class_weight
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC 
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from math import sqrt
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from collections import Counter

### Data Preparation

In [2]:
training_load = pd.read_csv('training_1961_2016.csv', delimiter=',')
testing_2017_load = pd.read_csv('testing_2017.csv', delimiter=',')
testing_2018_load = pd.read_csv('testing_2018.csv', delimiter=',')
testing_2019_load = pd.read_csv('testing_2019.csv', delimiter=',')

In [3]:
training_load.head(5)

Unnamed: 0,FirstYear_index,FirstYear_playerID,FirstYear_yearID,FirstYear_teamID,FirstYear_lgID,FirstYear_stint,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,...,SixthYear_HBP,SixthYear_SH,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age
0,41261,aaronha01,1961,ML1,NL,1.0,155.0,603.0,115.0,197.0,...,1.0,0.0,8.0,14.0,Hank,Aaron,Henry Louis,180.0,72.0,32.0
1,41959,aaronha01,1962,ML1,NL,1.0,156.0,592.0,127.0,191.0,...,0.0,0.0,6.0,11.0,Hank,Aaron,Henry Louis,180.0,72.0,33.0
2,42719,aaronha01,1963,ML1,NL,1.0,161.0,631.0,121.0,201.0,...,1.0,0.0,5.0,21.0,Hank,Aaron,Henry Louis,180.0,72.0,34.0
3,43471,aaronha01,1964,ML1,NL,1.0,145.0,570.0,103.0,187.0,...,2.0,0.0,3.0,14.0,Hank,Aaron,Henry Louis,180.0,72.0,35.0
4,44225,aaronha01,1965,ML1,NL,1.0,150.0,570.0,109.0,181.0,...,2.0,0.0,6.0,13.0,Hank,Aaron,Henry Louis,180.0,72.0,36.0


In [5]:
var_name = ['index', 'playerID', 'teamID', 'lgID', 'stint','nameFirst', 'nameLast', 'nameGiven']

First = []
for i in range(len(var_name)):
    First.append(f"FirstYear_{var_name[i]}")

Second = []
for i in range(len(var_name)):
    Second.append(f"SecondYear_{var_name[i]}")

Third = []
for i in range(len(var_name)):
    Third.append(f"ThirdYear_{var_name[i]}")
    
Fourth = []
for i in range(len(var_name)):
    Fourth.append(f"FourthYear_{var_name[i]}")
    
Fifth = []
for i in range(len(var_name)):
    Fifth.append(f"FifthYear_{var_name[i]}")
    
FileColumnName = First + Second + Third + Fourth + Fifth

In [6]:
x_train = training_load.iloc[:,0:-29].drop(FileColumnName, axis=1).values
y_train = training_load['SixthYear_HR'].values
x_test_2017 = testing_2017_load.iloc[:,0:-29].drop(FileColumnName, axis=1).values
y_test_2017 = testing_2017_load['SixthYear_HR'].values
x_test_2018 = testing_2018_load.iloc[:,0:-29].drop(FileColumnName, axis=1).values
y_test_2018 = testing_2018_load['SixthYear_HR'].values
x_test_2019 = testing_2019_load.iloc[:,0:-29].drop(FileColumnName, axis=1).values
y_test_2019 = testing_2019_load['SixthYear_HR'].values

In [7]:
y_train_class = np.select([np.floor(y_train/5) < 8, np.floor(y_train/5) > 7], [np.floor(y_train/5), 8])
y_test_2017_class = np.select([np.floor(y_test_2017/5) < 8, np.floor(y_test_2017/5) > 7], [np.floor(y_test_2017/5), 8])
y_test_2018_class = np.select([np.floor(y_test_2018/5) < 8, np.floor(y_test_2018/5) > 7], [np.floor(y_test_2018/5), 8])
y_test_2019_class = np.select([np.floor(y_test_2019/5) < 8, np.floor(y_test_2019/5) > 7], [np.floor(y_test_2019/5), 8])

In [8]:
y_train_cat = np_utils.to_categorical(y_train_class, 9)
y_test_2017_cat = np_utils.to_categorical(y_test_2017_class, 9)
y_test_2018_cat = np_utils.to_categorical(y_test_2018_class, 9)
y_test_2019_cat = np_utils.to_categorical(y_test_2019_class, 9)

In [9]:
x_train_2017 = np.concatenate((x_train, x_test_2017))
y_train_2017 = np.concatenate((y_train, y_test_2017))
y_train_2017_class = np.concatenate((y_train_class, y_test_2017_class))
y_train_2017_cat = np.concatenate((y_train_cat, y_test_2017_cat))

In [10]:
x_train_2018 = np.concatenate((x_train, x_test_2017, x_test_2018))
y_train_2018 = np.concatenate((y_train, y_test_2017, y_test_2018))
y_train_2018_class = np.concatenate((y_train_class, y_test_2017_class, y_test_2018_class))
y_train_2018_cat = np.concatenate((y_train_cat, y_test_2017_cat, y_test_2018_cat))

In [11]:
testing_2018 = testing_2018_load.drop(FileColumnName, axis=1).copy()
testing_2019 = testing_2019_load.drop(FileColumnName, axis=1).copy()

In [12]:
from sklearn.utils import class_weight
class_weight = np.array(class_weight.compute_class_weight(class_weight='balanced'
                                               ,classes=np.unique(y_train_class)
                                               ,y=y_train_class))

In [13]:
scaler = StandardScaler().fit(training_load.iloc[:,0:-29].drop(FileColumnName, axis=1).values)

### Linear Regression

In [14]:
regr_2017 = LinearRegression()
regr_2017.fit(x_train_2017,y_train_2017)

In [15]:
t = 0
a = 0
b = 0
c = 0
d = 0
g = []
l = []
m = []
m1 = []
n = []
n1 = []
p = []
p1 = []
q = []
q1 = []
predictions = regr_2017.predict(x_test_2018)
for i in range(len(y_test_2018)):
    k = np.round(predictions)[i] - y_test_2018[i]
    if abs(k) < 11:
        d += 1
        g.append(y_test_2018[i])
        if abs(k) < 6:
            c += 1
            l.append(y_test_2018[i])
            if abs(k) < 4:
                b += 1
                m.append(y_test_2018[i])
                m1.append(i)
                if abs(k) < 2:
                    a += 1
                    n.append(y_test_2018[i])
                    n1.append(i)
                    if abs(k) == 0:
                        t += 1
                        p.append(y_test_2018[i])
                        p1.append(i)
    else:
        q.append(y_test_2018[i])
        q1.append(i)
print("correct answer：%.4f" % (t/len(y_test_2018)))
print("plus or minus 1：%.4f" % (a/len(y_test_2018)))
print("plus or minus 3：%.4f" % (b/len(y_test_2018)))
print("plus or minus 5：%.4f" % (c/len(y_test_2018)))
print("plus or minus 10：%.4f" % (d/len(y_test_2018)))

答對：0.0435
正負1：0.1739
正負3：0.4130
正負5：0.6033
正負10：0.9022


In [16]:
print("mean_absolute_error:",mean_absolute_error(predictions,y_test_2018))
print("mean_squared_error:",mean_squared_error(predictions,y_test_2018))
print("rmse:",sqrt(mean_squared_error(predictions,y_test_2018)))

mean_absolute_error: 5.205214881378671
mean_squared_error: 43.39065000137966
rmse: 6.587157960864432


In [17]:
Counter(sorted(p))

Counter({10.0: 2, 0.0: 1, 5.0: 1, 16.0: 1, 20.0: 1, 22.0: 1, 23.0: 1})

In [18]:
Counter(sorted(n))

Counter({9.0: 3,
         10.0: 3,
         0.0: 2,
         2.0: 2,
         4.0: 2,
         5.0: 2,
         11.0: 2,
         21.0: 2,
         23.0: 2,
         1.0: 1,
         3.0: 1,
         6.0: 1,
         12.0: 1,
         13.0: 1,
         14.0: 1,
         15.0: 1,
         16.0: 1,
         19.0: 1,
         20.0: 1,
         22.0: 1,
         32.0: 1})

In [19]:
Counter(sorted(m))

Counter({1.0: 6,
         10.0: 6,
         6.0: 5,
         9.0: 5,
         11.0: 5,
         4.0: 4,
         5.0: 4,
         8.0: 4,
         14.0: 4,
         15.0: 4,
         2.0: 3,
         12.0: 3,
         21.0: 3,
         0.0: 2,
         3.0: 2,
         13.0: 2,
         16.0: 2,
         19.0: 2,
         20.0: 2,
         23.0: 2,
         7.0: 1,
         18.0: 1,
         22.0: 1,
         24.0: 1,
         32.0: 1,
         38.0: 1})

In [20]:
Counter(sorted(q))

Counter({9.0: 2,
         23.0: 2,
         36.0: 2,
         2.0: 1,
         3.0: 1,
         4.0: 1,
         6.0: 1,
         8.0: 1,
         10.0: 1,
         12.0: 1,
         22.0: 1,
         27.0: 1,
         39.0: 1,
         43.0: 1,
         48.0: 1})

In [21]:
print('0:',p1)
print('1:',n1)
print('3:',m1)
print('NA:',q1)

0: [71, 77, 83, 108, 119, 129, 131, 159]
1: [2, 11, 13, 16, 22, 34, 47, 50, 58, 62, 71, 73, 77, 83, 107, 108, 114, 115, 119, 123, 129, 131, 133, 144, 152, 154, 155, 158, 159, 162, 180, 182]
3: [1, 2, 8, 10, 11, 13, 15, 16, 17, 22, 23, 28, 34, 39, 47, 48, 49, 50, 51, 54, 58, 59, 60, 62, 69, 70, 71, 73, 77, 78, 81, 83, 86, 90, 91, 92, 94, 97, 98, 99, 107, 108, 113, 114, 115, 116, 119, 120, 123, 124, 129, 131, 133, 136, 142, 144, 146, 152, 154, 155, 157, 158, 159, 160, 161, 162, 165, 166, 169, 171, 173, 176, 177, 178, 180, 182]
NA: [9, 18, 21, 24, 25, 30, 38, 41, 44, 52, 87, 88, 104, 111, 145, 167, 175, 179]


In [22]:
regr_2018_dataframe = testing_2018.copy()
regr_2018_dataframe['Prediction'] = np.round(predictions)
regr_2018_dataframe['Difference'] = regr_2018_dataframe.SixthYear_HR - regr_2018_dataframe.Prediction

In [23]:
regr_2018_dataframe[regr_2018_dataframe.SixthYear_HR > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference
38,2013,56.0,136.0,27.0,38.0,10.0,0.0,11.0,27.0,3.0,...,7.0,16.0,Khris,Davis,Khristopher Adrian,205.0,71.0,31.0,30.0,18.0
111,2013,86.0,296.0,24.0,74.0,17.0,0.0,7.0,36.0,2.0,...,7.0,19.0,J. D.,Martinez,Julio Daniel,230.0,75.0,31.0,29.0,14.0


In [24]:
regr_2018 = LinearRegression()
regr_2018.fit(x_train_2018,y_train_2018)

In [25]:
t = 0
a = 0
b = 0
c = 0
d = 0
g = []
l = []
m = []
m1 = []
n = []
n1 = []
p = []
p1 = []
q = []
q1 = []
predictions = regr_2018.predict(x_test_2019)
for i in range(len(y_test_2019)):
    k = np.round(predictions)[i] - y_test_2019[i]
    if abs(k) < 11:
        d += 1
        g.append(y_test_2019[i])
        if abs(k) < 6:
            c += 1
            l.append(y_test_2019[i])
            if abs(k) < 4:
                b += 1
                m.append(y_test_2019[i])
                m1.append(i)
                if abs(k) < 2:
                    a += 1
                    n.append(y_test_2019[i])
                    n1.append(i)
                    if abs(k) == 0:
                        t += 1
                        p.append(y_test_2019[i])
                        p1.append(i)
    else:
        q.append(y_test_2019[i])
        q1.append(i)
print("correct answer：%.4f" % (t/len(y_test_2019)))
print("plus or minus 1：%.4f" % (a/len(y_test_2019)))
print("plus or minus 3：%.4f" % (b/len(y_test_2019)))
print("plus or minus 5：%.4f" % (c/len(y_test_2019)))
print("plus or minus 10：%.4f" % (d/len(y_test_2019)))

答對：0.0314
正負1：0.1466
正負3：0.3874
正負5：0.5131
正負10：0.8010


In [26]:
print("mean_absolute_error:",mean_absolute_error(predictions,y_test_2019))
print("mean_squared_error:",mean_squared_error(predictions,y_test_2019))
print("rmse:",sqrt(mean_squared_error(predictions,y_test_2019)))

mean_absolute_error: 6.702257525858455
mean_squared_error: 79.74969471323746
rmse: 8.930268456952314


In [27]:
Counter(sorted(p))

Counter({5.0: 3, 7.0: 1, 12.0: 1, 15.0: 1})

In [28]:
Counter(sorted(n))

Counter({5.0: 4,
         12.0: 4,
         2.0: 3,
         9.0: 3,
         7.0: 2,
         13.0: 2,
         1.0: 1,
         3.0: 1,
         6.0: 1,
         10.0: 1,
         11.0: 1,
         14.0: 1,
         15.0: 1,
         16.0: 1,
         17.0: 1,
         32.0: 1})

In [29]:
Counter(sorted(m))

Counter({12.0: 9,
         9.0: 7,
         5.0: 5,
         7.0: 5,
         11.0: 5,
         1.0: 4,
         2.0: 4,
         3.0: 4,
         6.0: 3,
         8.0: 3,
         13.0: 3,
         0.0: 2,
         10.0: 2,
         14.0: 2,
         15.0: 2,
         17.0: 2,
         18.0: 2,
         23.0: 2,
         4.0: 1,
         16.0: 1,
         20.0: 1,
         21.0: 1,
         22.0: 1,
         27.0: 1,
         29.0: 1,
         32.0: 1})

In [30]:
Counter(sorted(q))

Counter({33.0: 4,
         2.0: 3,
         1.0: 2,
         3.0: 2,
         23.0: 2,
         24.0: 2,
         31.0: 2,
         34.0: 2,
         35.0: 2,
         6.0: 1,
         12.0: 1,
         16.0: 1,
         17.0: 1,
         18.0: 1,
         21.0: 1,
         26.0: 1,
         27.0: 1,
         28.0: 1,
         37.0: 1,
         38.0: 1,
         39.0: 1,
         41.0: 1,
         44.0: 1,
         45.0: 1,
         48.0: 1,
         49.0: 1})

In [31]:
print('0:',p1)
print('1:',n1)
print('3:',m1)
print('NA:',q1)

0: [2, 72, 75, 98, 136, 160]
1: [2, 21, 22, 25, 36, 42, 59, 67, 72, 75, 80, 91, 97, 98, 108, 111, 113, 114, 117, 119, 123, 130, 136, 148, 153, 160, 165, 177]
3: [2, 4, 7, 9, 11, 13, 20, 21, 22, 23, 25, 27, 29, 36, 42, 48, 55, 56, 57, 59, 60, 67, 69, 72, 74, 75, 78, 80, 88, 91, 94, 95, 96, 97, 98, 102, 103, 108, 110, 111, 113, 114, 117, 118, 119, 121, 123, 124, 125, 130, 131, 134, 136, 137, 140, 142, 146, 147, 148, 153, 154, 157, 158, 159, 160, 163, 164, 165, 167, 168, 175, 177, 184, 185]
NA: [0, 6, 15, 24, 34, 38, 41, 47, 54, 61, 63, 65, 66, 71, 79, 89, 99, 100, 106, 107, 126, 128, 129, 139, 141, 145, 155, 162, 166, 169, 170, 172, 173, 174, 179, 180, 181, 187]


In [32]:
regr_2019_dataframe = testing_2019.copy()
regr_2019_dataframe['Prediction'] = np.round(predictions)
regr_2019_dataframe['Difference'] = regr_2019_dataframe.SixthYear_HR - regr_2019_dataframe.Prediction

In [33]:
regr_2019_dataframe[regr_2019_dataframe.SixthYear_HR > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference
8,2014,111.0,432.0,58.0,124.0,34.0,2.0,18.0,61.0,2.0,...,8.0,14.0,Nolan,Arenado,Nolan James,215.0,74.0,28.0,33.0,8.0
38,2014,159.0,613.0,87.0,166.0,32.0,2.0,40.0,108.0,4.0,...,3.0,14.0,Nelson,Cruz,Nelson Ramon,230.0,74.0,39.0,28.0,13.0
170,2014,24.0,89.0,11.0,26.0,8.0,1.0,5.0,20.0,1.0,...,4.0,16.0,Jorge,Soler,Jorge Carlos,235.0,76.0,27.0,10.0,38.0
174,2014,85.0,244.0,33.0,59.0,9.0,1.0,4.0,23.0,3.0,...,6.0,12.0,Eugenio,Suarez,Eugenio Alejandro,213.0,71.0,28.0,24.0,25.0
179,2014,157.0,602.0,115.0,173.0,39.0,9.0,36.0,111.0,16.0,...,4.0,5.0,Mike,Trout,Michael Nelson,235.0,74.0,28.0,33.0,12.0
187,2014,144.0,582.0,94.0,165.0,30.0,6.0,9.0,54.0,21.0,...,3.0,8.0,Christian,Yelich,Christian Stephen,195.0,75.0,28.0,24.0,20.0


### SVM

In [34]:
from sklearn.utils import class_weight
class_weight = np.array(class_weight.compute_class_weight(class_weight='balanced'
                                               ,classes=np.unique(y_train_2017)
                                               ,y=y_train_2017))

clf_2017 = SVC(class_weight=dict(zip(list(np.unique(y_train_2017)), list(class_weight))))
clf_2017.fit(scaler.transform(x_train_2017), y_train_2017)

In [35]:
t = 0
a = 0
b = 0
c = 0
d = 0
g = []
l = []
m = []
m1 = []
n = []
n1 = []
p = []
p1 = []
q = []
q1 = []
predictions = clf_2017.predict(scaler.transform(x_test_2018))
for i in range(len(y_test_2018)):
    k = np.round(predictions)[i] - y_test_2018[i]
    if abs(k) < 11:
        d += 1
        g.append(y_test_2018[i])
        if abs(k) < 6:
            c += 1
            l.append(y_test_2018[i])
            if abs(k) < 4:
                b += 1
                m.append(y_test_2018[i])
                m1.append(i)
                if abs(k) < 2:
                    a += 1
                    n.append(y_test_2018[i])
                    n1.append(i)
                    if abs(k) == 0:
                        t += 1
                        p.append(y_test_2018[i])
                        p1.append(i)
    else:
        q.append(y_test_2018[i])
        q1.append(i)
print("correct answer：%.4f" % (t/len(y_test_2018)))
print("plus or minus 1：%.4f" % (a/len(y_test_2018)))
print("plus or minus 3：%.4f" % (b/len(y_test_2018)))
print("plus or minus 5：%.4f" % (c/len(y_test_2018)))
print("plus or minus 10：%.4f" % (d/len(y_test_2018)))

答對：0.0870
正負1：0.1467
正負3：0.3641
正負5：0.5217
正負10：0.7880


In [36]:
print("mean_absolute_error:",mean_absolute_error(predictions,y_test_2018))
print("mean_squared_error:",mean_squared_error(predictions,y_test_2018))
print("rmse:",sqrt(mean_squared_error(predictions,y_test_2018)))

mean_absolute_error: 6.706521739130435
mean_squared_error: 78.3913043478261
rmse: 8.853886397951246


In [37]:
Counter(sorted(p))

Counter({14.0: 4,
         2.0: 2,
         38.0: 2,
         0.0: 1,
         1.0: 1,
         7.0: 1,
         8.0: 1,
         13.0: 1,
         22.0: 1,
         27.0: 1,
         34.0: 1})

In [38]:
Counter(sorted(n))

Counter({2.0: 4,
         14.0: 4,
         1.0: 3,
         3.0: 2,
         13.0: 2,
         38.0: 2,
         0.0: 1,
         5.0: 1,
         7.0: 1,
         8.0: 1,
         10.0: 1,
         15.0: 1,
         22.0: 1,
         27.0: 1,
         33.0: 1,
         34.0: 1})

In [39]:
Counter(sorted(m))

Counter({1.0: 5,
         4.0: 5,
         5.0: 5,
         11.0: 5,
         14.0: 5,
         2.0: 4,
         10.0: 4,
         9.0: 3,
         13.0: 3,
         16.0: 3,
         21.0: 3,
         0.0: 2,
         3.0: 2,
         8.0: 2,
         15.0: 2,
         27.0: 2,
         38.0: 2,
         6.0: 1,
         7.0: 1,
         12.0: 1,
         20.0: 1,
         22.0: 1,
         28.0: 1,
         33.0: 1,
         34.0: 1,
         37.0: 1,
         43.0: 1})

In [40]:
Counter(sorted(q))

Counter({13.0: 5,
         1.0: 3,
         8.0: 3,
         9.0: 3,
         12.0: 3,
         2.0: 2,
         6.0: 2,
         7.0: 2,
         11.0: 2,
         15.0: 2,
         23.0: 2,
         0.0: 1,
         3.0: 1,
         4.0: 1,
         5.0: 1,
         10.0: 1,
         18.0: 1,
         21.0: 1,
         27.0: 1,
         36.0: 1,
         37.0: 1})

In [41]:
print('0:',p1)
print('1:',n1)
print('3:',m1)
print('NA:',q1)

0: [7, 8, 13, 15, 34, 41, 60, 77, 80, 81, 109, 135, 136, 165, 169, 174]
1: [2, 7, 8, 13, 15, 16, 22, 34, 41, 47, 49, 60, 66, 77, 78, 80, 81, 85, 94, 102, 109, 115, 135, 136, 165, 169, 174]
3: [0, 2, 4, 7, 8, 13, 15, 16, 17, 22, 32, 33, 34, 35, 41, 43, 47, 48, 49, 51, 59, 60, 62, 66, 67, 71, 76, 77, 78, 79, 80, 81, 83, 85, 94, 98, 99, 102, 107, 109, 111, 112, 113, 115, 117, 123, 126, 129, 130, 132, 135, 136, 138, 152, 154, 155, 156, 160, 162, 164, 165, 169, 171, 172, 173, 174, 177]
NA: [9, 18, 21, 24, 27, 28, 30, 40, 42, 44, 45, 52, 55, 56, 57, 68, 69, 74, 82, 86, 87, 88, 104, 106, 118, 121, 124, 125, 127, 128, 137, 139, 141, 145, 151, 175, 176, 179, 181]


In [42]:
SVM_2018_dataframe = testing_2018.copy()
SVM_2018_dataframe['Prediction'] = np.round(predictions)
SVM_2018_dataframe['Difference'] = SVM_2018_dataframe.SixthYear_HR - SVM_2018_dataframe.Prediction

In [43]:
SVM_2018_dataframe[SVM_2018_dataframe.SixthYear_HR > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference
38,2013,56.0,136.0,27.0,38.0,10.0,0.0,11.0,27.0,3.0,...,7.0,16.0,Khris,Davis,Khristopher Adrian,205.0,71.0,31.0,43.0,5.0
111,2013,86.0,296.0,24.0,74.0,17.0,0.0,7.0,36.0,2.0,...,7.0,19.0,J. D.,Martinez,Julio Daniel,230.0,75.0,31.0,40.0,3.0


In [44]:
from sklearn.utils import class_weight
class_weight = np.array(class_weight.compute_class_weight(class_weight='balanced'
                                               ,classes=np.unique(y_train_2018)
                                               ,y=y_train_2018))

clf_2018 = SVC(class_weight=dict(zip(list(np.unique(y_train_2018)), list(class_weight))))
clf_2018.fit(scaler.transform(x_train_2018), y_train_2018)

In [45]:
t = 0
a = 0
b = 0
c = 0
d = 0
g = []
l = []
m = []
m1 = []
n = []
n1 = []
p = []
p1 = []
q = []
q1 = []
predictions = clf_2018.predict(scaler.transform(x_test_2019))
for i in range(len(y_test_2019)):
    k = np.round(predictions)[i] - y_test_2019[i]
    if abs(k) < 11:
        d += 1
        g.append(y_test_2019[i])
        if abs(k) < 6:
            c += 1
            l.append(y_test_2019[i])
            if abs(k) < 4:
                b += 1
                m.append(y_test_2019[i])
                m1.append(i)
                if abs(k) < 2:
                    a += 1
                    n.append(y_test_2019[i])
                    n1.append(i)
                    if abs(k) == 0:
                        t += 1
                        p.append(y_test_2019[i])
                        p1.append(i)
    else:
        q.append(y_test_2019[i])
        q1.append(i)
print("correct answer：%.4f" % (t/len(y_test_2019)))
print("plus or minus 1：%.4f" % (a/len(y_test_2019)))
print("plus or minus 3：%.4f" % (b/len(y_test_2019)))
print("plus or minus 5：%.4f" % (c/len(y_test_2019)))
print("plus or minus 10：%.4f" % (d/len(y_test_2019)))

答對：0.0785
正負1：0.1937
正負3：0.3246
正負5：0.4974
正負10：0.7330


In [46]:
print("mean_absolute_error:",mean_absolute_error(predictions,y_test_2019))
print("mean_squared_error:",mean_squared_error(predictions,y_test_2019))
print("rmse:",sqrt(mean_squared_error(predictions,y_test_2019)))

mean_absolute_error: 7.780104712041885
mean_squared_error: 115.63350785340315
rmse: 10.753302183673773


In [47]:
Counter(sorted(p))

Counter({2.0: 4,
         7.0: 2,
         23.0: 2,
         1.0: 1,
         3.0: 1,
         6.0: 1,
         11.0: 1,
         12.0: 1,
         13.0: 1,
         14.0: 1})

In [48]:
Counter(sorted(n))

Counter({1.0: 5,
         2.0: 4,
         12.0: 4,
         7.0: 3,
         14.0: 3,
         3.0: 2,
         5.0: 2,
         6.0: 2,
         13.0: 2,
         23.0: 2,
         0.0: 1,
         8.0: 1,
         9.0: 1,
         11.0: 1,
         24.0: 1,
         26.0: 1,
         29.0: 1,
         32.0: 1})

In [49]:
Counter(sorted(m))

Counter({2.0: 6,
         12.0: 6,
         1.0: 5,
         5.0: 5,
         3.0: 3,
         7.0: 3,
         9.0: 3,
         11.0: 3,
         13.0: 3,
         14.0: 3,
         0.0: 2,
         6.0: 2,
         20.0: 2,
         22.0: 2,
         23.0: 2,
         29.0: 2,
         4.0: 1,
         8.0: 1,
         24.0: 1,
         25.0: 1,
         26.0: 1,
         27.0: 1,
         28.0: 1,
         32.0: 1,
         34.0: 1,
         41.0: 1})

In [50]:
Counter(sorted(q))

Counter({2.0: 6,
         12.0: 4,
         3.0: 3,
         10.0: 3,
         15.0: 3,
         17.0: 3,
         23.0: 3,
         1.0: 2,
         6.0: 2,
         11.0: 2,
         16.0: 2,
         35.0: 2,
         4.0: 1,
         7.0: 1,
         8.0: 1,
         9.0: 1,
         13.0: 1,
         19.0: 1,
         20.0: 1,
         24.0: 1,
         27.0: 1,
         28.0: 1,
         31.0: 1,
         32.0: 1,
         37.0: 1,
         44.0: 1,
         48.0: 1,
         49.0: 1})

In [51]:
print('0:',p1)
print('1:',n1)
print('3:',m1)
print('NA:',q1)

0: [21, 23, 57, 67, 74, 80, 88, 90, 93, 119, 124, 148, 150, 154, 164]
1: [2, 4, 13, 19, 21, 23, 42, 57, 67, 73, 74, 75, 80, 86, 88, 90, 91, 93, 94, 98, 110, 111, 119, 121, 124, 137, 138, 148, 149, 150, 153, 154, 157, 160, 164, 165, 175]
3: [2, 4, 9, 13, 18, 19, 21, 23, 28, 31, 37, 38, 42, 44, 55, 57, 62, 67, 69, 73, 74, 75, 76, 78, 80, 82, 86, 88, 90, 91, 93, 94, 98, 104, 108, 109, 110, 111, 119, 121, 124, 130, 135, 136, 137, 138, 140, 142, 148, 149, 150, 153, 154, 155, 157, 159, 160, 164, 165, 171, 175, 186]
NA: [1, 3, 5, 6, 14, 16, 26, 30, 36, 39, 40, 41, 45, 47, 49, 50, 52, 54, 56, 65, 66, 70, 71, 81, 87, 99, 100, 105, 106, 115, 122, 126, 128, 129, 141, 145, 147, 151, 152, 158, 161, 170, 173, 174, 176, 177, 181, 183, 187, 189, 190]


In [52]:
SVM_2019_dataframe = testing_2019.copy()
SVM_2019_dataframe['Prediction'] = np.round(predictions)
SVM_2019_dataframe['Difference'] = SVM_2019_dataframe.SixthYear_HR - SVM_2019_dataframe.Prediction

In [53]:
SVM_2019_dataframe[SVM_2019_dataframe.SixthYear_HR > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference
8,2014,111.0,432.0,58.0,124.0,34.0,2.0,18.0,61.0,2.0,...,8.0,14.0,Nolan,Arenado,Nolan James,215.0,74.0,28.0,33.0,8.0
38,2014,159.0,613.0,87.0,166.0,32.0,2.0,40.0,108.0,4.0,...,3.0,14.0,Nelson,Cruz,Nelson Ramon,230.0,74.0,39.0,39.0,2.0
170,2014,24.0,89.0,11.0,26.0,8.0,1.0,5.0,20.0,1.0,...,4.0,16.0,Jorge,Soler,Jorge Carlos,235.0,76.0,27.0,7.0,41.0
174,2014,85.0,244.0,33.0,59.0,9.0,1.0,4.0,23.0,3.0,...,6.0,12.0,Eugenio,Suarez,Eugenio Alejandro,213.0,71.0,28.0,34.0,15.0
179,2014,157.0,602.0,115.0,173.0,39.0,9.0,36.0,111.0,16.0,...,4.0,5.0,Mike,Trout,Michael Nelson,235.0,74.0,28.0,39.0,6.0
187,2014,144.0,582.0,94.0,165.0,30.0,6.0,9.0,54.0,21.0,...,3.0,8.0,Christian,Yelich,Christian Stephen,195.0,75.0,28.0,29.0,15.0


### Random Forest

In [54]:
forest_2017 = RandomForestClassifier(n_estimators=200)
forest_2017.fit(x_train_2017, y_train_2017)

In [55]:
t = 0
a = 0
b = 0
c = 0
d = 0
g = []
l = []
m = []
m1 = []
n = []
n1 = []
p = []
p1 = []
q = []
q1 = []
predictions = forest_2017.predict(x_test_2018)
for i in range(len(y_test_2018)):
    k = np.round(predictions)[i] - y_test_2018[i]
    if abs(k) < 11:
        d += 1
        g.append(y_test_2018[i])
        if abs(k) < 6:
            c += 1
            l.append(y_test_2018[i])
            if abs(k) < 4:
                b += 1
                m.append(y_test_2018[i])
                m1.append(i)
                if abs(k) < 2:
                    a += 1
                    n.append(y_test_2018[i])
                    n1.append(i)
                    if abs(k) == 0:
                        t += 1
                        p.append(y_test_2018[i])
                        p1.append(i)
    else:
        q.append(y_test_2018[i])
        q1.append(i)
print("correct answer：%.4f" % (t/len(y_test_2018)))
print("plus or minus 1：%.4f" % (a/len(y_test_2018)))
print("plus or minus 3：%.4f" % (b/len(y_test_2018)))
print("plus or minus 5：%.4f" % (c/len(y_test_2018)))
print("plus or minus 10：%.4f" % (d/len(y_test_2018)))

答對：0.0489
正負1：0.2065
正負3：0.3533
正負5：0.5217
正負10：0.7989


In [56]:
print("mean_absolute_error:",mean_absolute_error(predictions,y_test_2018))
print("mean_squared_error:",mean_squared_error(predictions,y_test_2018))
print("rmse:",sqrt(mean_squared_error(predictions,y_test_2018)))

mean_absolute_error: 6.586956521739131
mean_squared_error: 74.07608695652173
rmse: 8.606746595347264


In [57]:
Counter(sorted(p))

Counter({0.0: 1,
         3.0: 1,
         4.0: 1,
         11.0: 1,
         12.0: 1,
         14.0: 1,
         20.0: 1,
         23.0: 1,
         27.0: 1})

In [58]:
Counter(sorted(n))

Counter({1.0: 6,
         2.0: 3,
         6.0: 3,
         13.0: 3,
         0.0: 2,
         4.0: 2,
         5.0: 2,
         14.0: 2,
         15.0: 2,
         23.0: 2,
         3.0: 1,
         7.0: 1,
         8.0: 1,
         9.0: 1,
         11.0: 1,
         12.0: 1,
         19.0: 1,
         20.0: 1,
         21.0: 1,
         27.0: 1,
         37.0: 1})

In [59]:
Counter(sorted(m))

Counter({1.0: 8,
         4.0: 8,
         2.0: 4,
         7.0: 4,
         14.0: 4,
         0.0: 3,
         3.0: 3,
         6.0: 3,
         8.0: 3,
         11.0: 3,
         13.0: 3,
         5.0: 2,
         9.0: 2,
         15.0: 2,
         19.0: 2,
         21.0: 2,
         23.0: 2,
         10.0: 1,
         12.0: 1,
         20.0: 1,
         27.0: 1,
         30.0: 1,
         33.0: 1,
         37.0: 1})

In [60]:
Counter(sorted(q))

Counter({23.0: 6,
         21.0: 3,
         9.0: 2,
         24.0: 2,
         25.0: 2,
         27.0: 2,
         36.0: 2,
         0.0: 1,
         3.0: 1,
         4.0: 1,
         6.0: 1,
         8.0: 1,
         10.0: 1,
         12.0: 1,
         13.0: 1,
         17.0: 1,
         20.0: 1,
         22.0: 1,
         29.0: 1,
         34.0: 1,
         37.0: 1,
         38.0: 1,
         39.0: 1,
         43.0: 1,
         48.0: 1})

In [61]:
print('0:',p1)
print('1:',n1)
print('3:',m1)
print('NA:',q1)

0: [2, 17, 53, 117, 127, 131, 135, 155, 169]
1: [2, 6, 13, 17, 31, 33, 35, 36, 46, 47, 49, 53, 58, 72, 77, 78, 86, 95, 96, 102, 115, 117, 120, 127, 131, 132, 134, 135, 136, 139, 144, 147, 151, 155, 158, 169, 172, 181]
3: [2, 6, 8, 13, 15, 17, 22, 23, 31, 33, 34, 35, 36, 46, 47, 49, 51, 53, 54, 58, 66, 72, 73, 77, 78, 79, 85, 86, 93, 94, 95, 96, 98, 99, 101, 102, 115, 117, 118, 120, 121, 123, 127, 130, 131, 132, 134, 135, 136, 139, 142, 144, 147, 150, 151, 152, 155, 158, 169, 170, 172, 173, 176, 177, 181]
NA: [0, 3, 4, 9, 12, 18, 19, 21, 24, 25, 29, 30, 32, 38, 41, 44, 52, 63, 64, 69, 75, 76, 80, 82, 87, 104, 106, 110, 111, 140, 143, 148, 149, 163, 165, 167, 179]


In [62]:
RF_2018_dataframe = testing_2018.copy()
RF_2018_dataframe['Prediction'] = np.round(predictions)
RF_2018_dataframe['Difference'] = RF_2018_dataframe.SixthYear_HR - RF_2018_dataframe.Prediction

In [63]:
RF_2018_dataframe[RF_2018_dataframe.SixthYear_HR > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference
38,2013,56.0,136.0,27.0,38.0,10.0,0.0,11.0,27.0,3.0,...,7.0,16.0,Khris,Davis,Khristopher Adrian,205.0,71.0,31.0,23.0,25.0
111,2013,86.0,296.0,24.0,74.0,17.0,0.0,7.0,36.0,2.0,...,7.0,19.0,J. D.,Martinez,Julio Daniel,230.0,75.0,31.0,27.0,16.0


In [64]:
forest_2018 = RandomForestClassifier(n_estimators=200)
forest_2018.fit(x_train_2018, y_train_2018)

In [74]:
t = 0
a = 0
b = 0
c = 0
d = 0
g = []
l = []
m = []
m1 = []
n = []
n1 = []
p = []
p1 = []
q = []
q1 = []
predictions = forest_2018.predict(x_test_2019)
for i in range(len(y_test_2019)):
    k = np.round(predictions)[i] - y_test_2019[i]
    if abs(k) < 11:
        d += 1
        g.append(y_test_2019[i])
        if abs(k) < 6:
            c += 1
            l.append(y_test_2019[i])
            if abs(k) < 4:
                b += 1
                m.append(y_test_2019[i])
                m1.append(i)
                if abs(k) < 2:
                    a += 1
                    n.append(y_test_2019[i])
                    n1.append(i)
                    if abs(k) == 0:
                        t += 1
                        p.append(y_test_2019[i])
                        p1.append(i)
    else:
        q.append(y_test_2019[i])
        q1.append(i)
print("correct answer：%.4f" % (t/len(y_test_2019)))
print("plus or minus 1：%.4f" % (a/len(y_test_2019)))
print("plus or minus 3：%.4f" % (b/len(y_test_2019)))
print("plus or minus 5：%.4f" % (c/len(y_test_2019)))
print("plus or minus 10：%.4f" % (d/len(y_test_2019)))

答對：0.0366
正負1：0.1466
正負3：0.3194
正負5：0.4503
正負10：0.6545


In [75]:
print("mean_absolute_error:",mean_absolute_error(predictions,y_test_2019))
print("mean_squared_error:",mean_squared_error(predictions,y_test_2019))
print("rmse:",sqrt(mean_squared_error(predictions,y_test_2019)))

mean_absolute_error: 8.471204188481675
mean_squared_error: 123.70680628272251
rmse: 11.12235614798962


In [67]:
Counter(sorted(p))

Counter({1.0: 3, 2.0: 2, 5.0: 1, 9.0: 1})

In [68]:
Counter(sorted(n))

Counter({2.0: 5,
         1.0: 3,
         5.0: 3,
         7.0: 3,
         12.0: 3,
         0.0: 2,
         3.0: 2,
         9.0: 2,
         6.0: 1,
         11.0: 1,
         14.0: 1,
         15.0: 1,
         20.0: 1})

In [69]:
Counter(sorted(m))

Counter({2.0: 9,
         1.0: 6,
         3.0: 6,
         5.0: 5,
         12.0: 5,
         7.0: 4,
         0.0: 3,
         6.0: 3,
         8.0: 3,
         9.0: 3,
         4.0: 2,
         11.0: 2,
         20.0: 2,
         27.0: 2,
         10.0: 1,
         13.0: 1,
         14.0: 1,
         15.0: 1,
         16.0: 1,
         41.0: 1})

In [70]:
Counter(sorted(q))

Counter({23.0: 6,
         17.0: 4,
         24.0: 4,
         21.0: 3,
         22.0: 3,
         33.0: 3,
         34.0: 3,
         2.0: 2,
         12.0: 2,
         13.0: 2,
         16.0: 2,
         19.0: 2,
         20.0: 2,
         26.0: 2,
         28.0: 2,
         29.0: 2,
         35.0: 2,
         1.0: 1,
         3.0: 1,
         6.0: 1,
         14.0: 1,
         15.0: 1,
         18.0: 1,
         25.0: 1,
         27.0: 1,
         30.0: 1,
         31.0: 1,
         32.0: 1,
         36.0: 1,
         37.0: 1,
         38.0: 1,
         39.0: 1,
         41.0: 1,
         44.0: 1,
         45.0: 1,
         48.0: 1,
         49.0: 1})

In [71]:
print('0:',p1)
print('1:',n1)
print('3:',m1)
print('NA:',q1)

0: [4, 88, 93, 102, 138, 142, 160]
1: [2, 4, 21, 23, 40, 44, 57, 67, 72, 74, 82, 86, 88, 90, 91, 93, 94, 95, 102, 119, 137, 138, 142, 151, 160, 165, 167, 175]
3: [2, 4, 7, 8, 16, 21, 23, 28, 32, 36, 40, 42, 43, 44, 48, 57, 67, 70, 72, 73, 74, 78, 80, 82, 86, 88, 90, 91, 93, 94, 95, 96, 97, 102, 104, 105, 108, 110, 119, 122, 134, 137, 138, 140, 142, 147, 148, 151, 156, 157, 158, 159, 160, 161, 165, 167, 171, 175, 184, 189, 190]
NA: [6, 10, 14, 15, 18, 19, 24, 30, 33, 34, 37, 38, 39, 47, 52, 53, 54, 58, 60, 61, 63, 64, 65, 66, 76, 87, 89, 92, 99, 100, 101, 106, 107, 109, 112, 115, 116, 121, 124, 126, 127, 129, 133, 135, 139, 141, 144, 145, 149, 150, 152, 155, 162, 163, 164, 166, 168, 170, 172, 173, 174, 176, 179, 180, 182, 187]


In [72]:
RF_2019_dataframe = testing_2019.copy()
RF_2019_dataframe['Prediction'] = np.round(predictions)
RF_2019_dataframe['Difference'] = RF_2019_dataframe.SixthYear_HR - RF_2019_dataframe.Prediction

In [73]:
RF_2019_dataframe[RF_2019_dataframe.SixthYear_HR > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference
8,2014,111.0,432.0,58.0,124.0,34.0,2.0,18.0,61.0,2.0,...,8.0,14.0,Nolan,Arenado,Nolan James,215.0,74.0,28.0,38.0,3.0
38,2014,159.0,613.0,87.0,166.0,32.0,2.0,40.0,108.0,4.0,...,3.0,14.0,Nelson,Cruz,Nelson Ramon,230.0,74.0,39.0,16.0,25.0
170,2014,24.0,89.0,11.0,26.0,8.0,1.0,5.0,20.0,1.0,...,4.0,16.0,Jorge,Soler,Jorge Carlos,235.0,76.0,27.0,2.0,46.0
174,2014,85.0,244.0,33.0,59.0,9.0,1.0,4.0,23.0,3.0,...,6.0,12.0,Eugenio,Suarez,Eugenio Alejandro,213.0,71.0,28.0,19.0,30.0
179,2014,157.0,602.0,115.0,173.0,39.0,9.0,36.0,111.0,16.0,...,4.0,5.0,Mike,Trout,Michael Nelson,235.0,74.0,28.0,33.0,12.0
187,2014,144.0,582.0,94.0,165.0,30.0,6.0,9.0,54.0,21.0,...,3.0,8.0,Christian,Yelich,Christian Stephen,195.0,75.0,28.0,18.0,26.0


### Neural Networks

In [76]:
model_2017 = Sequential()
model_2017.add(Dense(1024, input_dim = 105))
model_2017.add(Dropout(rate=0.5))
model_2017.add(Dense(512, activation="relu"))
model_2017.add(Dropout(rate=0.5))
model_2017.add(Dense(128, activation="relu"))
model_2017.add(Dense(1, activation='relu'))
adam = optimizers.Adam(lr = 0.001)
model_2017.compile(loss = "mse", optimizer=adam, metrics=["accuracy"])
model_2017.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1024)              108544    
                                                                 
 dropout (Dropout)           (None, 1024)              0         
                                                                 
 dense_1 (Dense)             (None, 512)               524800    
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense_2 (Dense)             (None, 128)               65664     
                                                                 
 dense_3 (Dense)             (None, 1)                 129       
                                                                 
Total params: 699,137
Trainable params: 699,137
Non-trai

  super().__init__(name, **kwargs)


In [77]:
#model_9 LSTM128, FC9

model_2018 = Sequential()
model_2018.add(Dense(1024, input_dim = 105))
model_2018.add(Dropout(rate=0.5))
model_2018.add(Dense(512, activation="relu"))
model_2018.add(Dropout(rate=0.5))
model_2018.add(Dense(128, activation="relu"))
model_2018.add(Dense(1, activation='relu'))
adam = optimizers.Adam(lr = 0.001)
model_2018.compile(loss = "mse", optimizer=adam, metrics=["accuracy"])
model_2018.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   


 dense_4 (Dense)             (None, 1024)              108544    
                                                                 
 dropout_2 (Dropout)         (None, 1024)              0         
                                                                 
 dense_5 (Dense)             (None, 512)               524800    
                                                                 
 dropout_3 (Dropout)         (None, 512)               0         
                                                                 
 dense_6 (Dense)             (None, 128)               65664     
                                                                 
 dense_7 (Dense)             (None, 1)                 129       
                                                                 
Total params: 699,137
Trainable params: 699,137
Non-trainable params: 0
_________________________________________________________________


### Training

In [95]:
from sklearn.utils import class_weight
""" class_weight = class_weight.compute_class_weight(class_weight='balanced'
                                               ,classes=np.unique(y_train_class)
                                               ,y=y_train_class)
# Convert the class weights to a dictionary because Keras expects class weights in this format
class_weights_dict = dict(zip(np.unique(y_train_2017), class_weight))

class_weights_dict_int_keys = {int(key): value for key, value in class_weights_dict.items()}

filepath = "saved-model-{epoch:03d}-{val_accuracy:.4f}-2018.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
model_2017_stat = model_2017.fit(x_train_2017, y_train_2017, 
                              batch_size=128, 
                              epochs=10, 
                              validation_data=(x_test_2018, y_test_2018),
                              callbacks=[checkpoint],
                              shuffle=True,
                              class_weight=class_weights_dict_int_keys
                             ) """

# Correctly compute class weights
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                  classes=np.unique(y_train_2017),
                                                  y=y_train_2017)

# Convert the class weights to a dictionary because Keras expects class weights in this format
class_weights_dict = dict(zip(np.unique(y_train_2017), class_weights))

class_weights_dict_int_keys = {int(key): value for key, value in class_weights_dict.items()}

# Detecting the highest class label
max_class_label = max(class_weights_dict_int_keys.keys())

# Ensuring continuity by replacing missing class with 0 as weight
adjusted_class_weights = {i: class_weights_dict_int_keys.get(i, 0) for i in range(max_class_label + 1)}

filepath = "saved-model-{epoch:03d}-{val_accuracy:.4f}-2018.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
model_2017_stat = model_2017.fit(x_train_2017, y_train_2017, 
                              batch_size=128, 
                              epochs=10, 
                              validation_data=(x_test_2018, y_test_2018),
                              callbacks=[checkpoint],
                              shuffle=True,
                              class_weight=adjusted_class_weights
                             )

Epoch 1/10
 1/77 [..............................] - ETA: 2s - loss: 341.4172 - accuracy: 0.1328

Epoch 1: val_accuracy improved from -inf to 0.02717, saving model to saved-model-001-0.0272-2018.hdf5
Epoch 2/10
Epoch 2: val_accuracy did not improve from 0.02717
Epoch 3/10
Epoch 3: val_accuracy did not improve from 0.02717
Epoch 4/10
Epoch 4: val_accuracy did not improve from 0.02717
Epoch 5/10
Epoch 5: val_accuracy did not improve from 0.02717
Epoch 6/10
Epoch 6: val_accuracy did not improve from 0.02717
Epoch 7/10
Epoch 7: val_accuracy did not improve from 0.02717
Epoch 8/10
Epoch 8: val_accuracy did not improve from 0.02717
Epoch 9/10
Epoch 9: val_accuracy did not improve from 0.02717
Epoch 10/10
Epoch 10: val_accuracy did not improve from 0.02717


In [90]:
class_weights_dict_int_keys

{0: 0.18552147239263803,
 1: 0.17104072398190046,
 2: 0.20712328767123286,
 3: 0.2507462686567164,
 4: 0.301195219123506,
 5: 0.301195219123506,
 6: 0.32238805970149254,
 7: 0.4053619302949062,
 8: 0.4108695652173913,
 9: 0.45,
 10: 0.504,
 11: 0.5361702127659574,
 12: 0.5073825503355704,
 13: 0.5727272727272728,
 14: 0.6096774193548387,
 15: 0.6967741935483871,
 16: 0.7411764705882353,
 17: 0.723444976076555,
 18: 0.8307692307692308,
 19: 1.0216216216216216,
 20: 0.8307692307692308,
 21: 1.0285714285714285,
 22: 0.9219512195121952,
 23: 1.1368421052631579,
 24: 1.1454545454545455,
 25: 1.3147826086956522,
 26: 1.6434782608695653,
 27: 1.4679611650485438,
 28: 1.7181818181818183,
 29: 1.9894736842105263,
 30: 2.1913043478260867,
 31: 2.290909090909091,
 32: 2.4,
 33: 3.516279069767442,
 34: 2.749090909090909,
 35: 3.876923076923077,
 36: 3.6878048780487807,
 37: 4.0864864864864865,
 38: 4.447058823529412,
 39: 4.447058823529412,
 40: 4.2,
 41: 7.56,
 42: 8.894117647058824,
 43: 10.08,


In [96]:
from sklearn.utils import class_weight
""" class_weight = class_weight.compute_class_weight('balanced'
                                               ,np.unique(y_train_class)
                                               ,y_train_class)

filepath = "saved-model-{epoch:03d}-{val_acc:.4f}-2019.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
model_2018_stat = model_2018.fit(x_train_2018, y_train_2018, 
                              batch_size=128, 
                              epochs=10, 
                              validation_data=(x_test_2019, y_test_2019),
                              callbacks=[checkpoint],
                              shuffle=True,
                              class_weight=class_weight
                             )

 """
# Correctly compute class weights
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                  classes=np.unique(y_train_2018),
                                                  y=y_train_2018)

# Convert the class weights to a dictionary because Keras expects class weights in this format
class_weights_dict = dict(zip(np.unique(y_train_2018), class_weights))

class_weights_dict_int_keys = {int(key): value for key, value in class_weights_dict.items()}

# Detecting the highest class label
max_class_label = max(class_weights_dict_int_keys.keys())

# Ensuring continuity by replacing missing class with 0 as weight
adjusted_class_weights = {i: class_weights_dict_int_keys.get(i, 0) for i in range(max_class_label + 1)}

filepath = "saved-model-{epoch:03d}-{val_accuracy:.4f}-2019.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
model_2017_stat = model_2017.fit(x_train_2018, y_train_2018, 
                              batch_size=128, 
                              epochs=10, 
                              validation_data=(x_test_2019, y_test_2019),
                              callbacks=[checkpoint],
                              shuffle=True,
                              class_weight=adjusted_class_weights
                             )

Epoch 1/10
 1/79 [..............................] - ETA: 0s - loss: 773.3993 - accuracy: 0.1016

Epoch 1: val_accuracy improved from -inf to 0.02094, saving model to saved-model-001-0.0209-2019.hdf5
Epoch 2/10
Epoch 2: val_accuracy did not improve from 0.02094
Epoch 3/10
Epoch 3: val_accuracy did not improve from 0.02094
Epoch 4/10
Epoch 4: val_accuracy did not improve from 0.02094
Epoch 5/10
Epoch 5: val_accuracy did not improve from 0.02094
Epoch 6/10
Epoch 6: val_accuracy did not improve from 0.02094
Epoch 7/10
Epoch 7: val_accuracy did not improve from 0.02094
Epoch 8/10
Epoch 8: val_accuracy did not improve from 0.02094
Epoch 9/10
Epoch 9: val_accuracy did not improve from 0.02094
Epoch 10/10
Epoch 10: val_accuracy did not improve from 0.02094


In [110]:
BBB = load_model('ML/saved-model-001-0.0272-2018.hdf5')
CCC = load_model('ML/saved-model-001-0.0209-2019.hdf5')

In [111]:
t = 0
a = 0
b = 0
c = 0
d = 0
g = []
l = []
m = []
m1 = []
n = []
n1 = []
p = []
p1 = []
q = []
q1 = []
predictions = BBB.predict(x_test_2018)
for i in range(len(y_test_2018)):
    k = np.round(predictions)[i] - y_test_2018[i]
    if abs(k) < 11:
        d += 1
        g.append(y_test_2018[i])
        if abs(k) < 6:
            c += 1
            l.append(y_test_2018[i])
            if abs(k) < 4:
                b += 1
                m.append(y_test_2018[i])
                m1.append(i)
                if abs(k) < 2:
                    a += 1
                    n.append(y_test_2018[i])
                    n1.append(i)
                    if abs(k) == 0:
                        t += 1
                        p.append(y_test_2018[i])
                        p1.append(i)
    else:
        q.append(y_test_2018[i])
        q1.append(i)
print("correct answer：%.4f" % (t/len(y_test_2018)))
print("plus or minus 1：%.4f" % (a/len(y_test_2018)))
print("plus or minus 3：%.4f" % (b/len(y_test_2018)))
print("plus or minus 5：%.4f" % (c/len(y_test_2018)))
print("plus or minus 10：%.4f" % (d/len(y_test_2018)))

答對：0.0272
正負1：0.0978
正負3：0.1522
正負5：0.2391
正負10：0.4457


In [112]:
print("mean_absolute_error:",mean_absolute_error(predictions,y_test_2018))
print("mean_squared_error:",mean_squared_error(predictions,y_test_2018))
print("rmse:",sqrt(mean_squared_error(predictions,y_test_2018)))

mean_absolute_error: 13.293478260869565
mean_squared_error: 273.4673913043478
rmse: 16.53684949754178


In [113]:
Counter(sorted(p))

Counter({0.0: 5})

In [114]:
Counter(sorted(n))

Counter({1.0: 13, 0.0: 5})

In [115]:
Counter(sorted(m))

Counter({1.0: 13, 2.0: 6, 0.0: 5, 3.0: 4})

In [116]:
Counter(sorted(q))

Counter({11.0: 10,
         13.0: 9,
         15.0: 9,
         23.0: 9,
         14.0: 7,
         21.0: 7,
         12.0: 6,
         16.0: 6,
         20.0: 5,
         18.0: 3,
         19.0: 3,
         24.0: 3,
         25.0: 3,
         27.0: 3,
         17.0: 2,
         22.0: 2,
         36.0: 2,
         37.0: 2,
         38.0: 2,
         28.0: 1,
         29.0: 1,
         30.0: 1,
         32.0: 1,
         33.0: 1,
         34.0: 1,
         39.0: 1,
         43.0: 1,
         48.0: 1})

In [117]:
print('0:',p1)
print('1:',n1)
print('3:',m1)
print('NA:',q1)

0: [65, 77, 82, 101, 155]
1: [15, 27, 31, 36, 39, 49, 65, 77, 82, 101, 115, 137, 139, 142, 147, 155, 156, 171]
3: [2, 13, 15, 21, 27, 31, 36, 39, 47, 49, 55, 65, 77, 82, 85, 88, 93, 94, 101, 115, 136, 137, 139, 142, 147, 155, 156, 171]
NA: [0, 3, 4, 7, 10, 11, 12, 14, 16, 17, 19, 23, 25, 28, 29, 32, 34, 35, 37, 38, 40, 41, 42, 43, 45, 46, 50, 52, 53, 57, 58, 59, 60, 61, 62, 63, 64, 66, 67, 70, 71, 72, 74, 75, 76, 79, 80, 84, 87, 90, 92, 95, 97, 99, 100, 102, 103, 104, 106, 109, 110, 111, 112, 119, 122, 123, 124, 125, 126, 127, 128, 131, 133, 135, 138, 140, 143, 144, 146, 148, 149, 151, 153, 157, 158, 159, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 174, 175, 176, 179, 181, 183]


In [118]:
BB = testing_2018.copy()
BB['Prediction'] = np.round(predictions)
BB['Difference'] = BB.SixthYear_HR - BB.Prediction

In [119]:
t = 0
a = 0
b = 0
c = 0
d = 0
g = []
l = []
m = []
m1 = []
n = []
n1 = []
p = []
p1 = []
q = []
q1 = []
predictions = CCC.predict(x_test_2019)
for i in range(len(y_test_2019)):
    k = np.round(predictions)[i] - y_test_2019[i]
    if abs(k) < 11:
        d += 1
        g.append(y_test_2019[i])
        if abs(k) < 6:
            c += 1
            l.append(y_test_2019[i])
            if abs(k) < 4:
                b += 1
                m.append(y_test_2019[i])
                m1.append(i)
                if abs(k) < 2:
                    a += 1
                    n.append(y_test_2019[i])
                    n1.append(i)
                    if abs(k) == 0:
                        t += 1
                        p.append(y_test_2019[i])
                        p1.append(i)
    else:
        q.append(y_test_2019[i])
        q1.append(i)
print("correct answer：%.4f" % (t/len(y_test_2019)))
print("plus or minus 1：%.4f" % (a/len(y_test_2019)))
print("plus or minus 3：%.4f" % (b/len(y_test_2019)))
print("plus or minus 5：%.4f" % (c/len(y_test_2019)))
print("plus or minus 10：%.4f" % (d/len(y_test_2019)))

1/6 [====>.........................] - ETA: 0s

答對：0.0209
正負1：0.0733
正負3：0.1885
正負5：0.2356
正負10：0.3927


In [120]:
print("mean_absolute_error:",mean_absolute_error(predictions,y_test_2019))
print("mean_squared_error:",mean_squared_error(predictions,y_test_2019))
print("rmse:",sqrt(mean_squared_error(predictions,y_test_2019)))

mean_absolute_error: 15.178010471204189
mean_squared_error: 360.3193717277487
rmse: 18.982080279246233


In [121]:
Counter(sorted(p))

Counter({0.0: 4})

In [122]:
Counter(sorted(n))

Counter({1.0: 10, 0.0: 4})

In [123]:
Counter(sorted(m))

Counter({2.0: 14, 1.0: 10, 3.0: 8, 0.0: 4})

In [124]:
Counter(sorted(q))

Counter({12.0: 15,
         11.0: 8,
         17.0: 7,
         23.0: 7,
         13.0: 5,
         15.0: 5,
         20.0: 5,
         14.0: 4,
         16.0: 4,
         19.0: 4,
         21.0: 4,
         22.0: 4,
         24.0: 4,
         33.0: 4,
         34.0: 4,
         18.0: 3,
         27.0: 3,
         29.0: 3,
         35.0: 3,
         26.0: 2,
         28.0: 2,
         31.0: 2,
         32.0: 2,
         41.0: 2,
         25.0: 1,
         30.0: 1,
         36.0: 1,
         37.0: 1,
         38.0: 1,
         39.0: 1,
         44.0: 1,
         45.0: 1,
         48.0: 1,
         49.0: 1})

In [125]:
print('0:',p1)
print('1:',n1)
print('3:',m1)
print('NA:',q1)

0: [35, 82, 96, 175]
1: [4, 35, 42, 82, 84, 88, 96, 99, 138, 157, 169, 175, 178, 189]
3: [4, 32, 35, 42, 43, 66, 67, 70, 71, 74, 78, 81, 82, 84, 88, 91, 93, 96, 99, 119, 126, 128, 132, 134, 138, 142, 148, 151, 157, 161, 169, 171, 173, 175, 178, 189]
NA: [0, 1, 3, 6, 7, 8, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 22, 23, 24, 25, 26, 28, 29, 30, 31, 33, 34, 36, 37, 38, 39, 40, 41, 44, 45, 46, 47, 48, 52, 53, 54, 56, 58, 59, 60, 61, 62, 63, 64, 65, 68, 69, 72, 73, 75, 76, 77, 79, 83, 86, 87, 89, 90, 92, 97, 100, 101, 103, 106, 107, 109, 111, 112, 115, 116, 120, 121, 124, 127, 129, 130, 131, 133, 135, 139, 140, 141, 143, 144, 146, 149, 150, 152, 153, 154, 155, 158, 162, 163, 164, 165, 166, 168, 170, 172, 174, 176, 177, 179, 180, 181, 182, 183, 185, 186, 187]


### Analysis

In [126]:
CC = testing_2019.copy()
CC['Prediction'] = np.round(predictions)
CC['Difference'] = CC.SixthYear_HR - CC.Prediction

In [127]:
regr_2018_dataframe[regr_2018_dataframe.Prediction > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference


In [128]:
len(regr_2018_dataframe[regr_2018_dataframe.Difference < 0])

101

In [129]:
len(regr_2018_dataframe[regr_2018_dataframe.Difference == 0])

8

In [130]:
len(regr_2018_dataframe[regr_2018_dataframe.Difference > 0])

75

In [131]:
SVM_2018_dataframe[SVM_2018_dataframe.Prediction > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference
38,2013,56.0,136.0,27.0,38.0,10.0,0.0,11.0,27.0,3.0,...,7.0,16.0,Khris,Davis,Khristopher Adrian,205.0,71.0,31.0,43.0,5.0
111,2013,86.0,296.0,24.0,74.0,17.0,0.0,7.0,36.0,2.0,...,7.0,19.0,J. D.,Martinez,Julio Daniel,230.0,75.0,31.0,40.0,3.0


In [132]:
len(SVM_2018_dataframe[SVM_2018_dataframe.Difference < 0])

115

In [133]:
len(SVM_2018_dataframe[SVM_2018_dataframe.Difference == 0])

16

In [134]:
len(SVM_2018_dataframe[SVM_2018_dataframe.Difference > 0])

53

In [135]:
RF_2018_dataframe[RF_2018_dataframe.Prediction > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference


In [136]:
len(RF_2018_dataframe[RF_2018_dataframe.Difference < 0])

68

In [137]:
len(RF_2018_dataframe[RF_2018_dataframe.Difference == 0])

9

In [138]:
len(RF_2018_dataframe[RF_2018_dataframe.Difference > 0])

107

In [139]:
BB[BB.SixthYear_HR > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference
38,2013,56.0,136.0,27.0,38.0,10.0,0.0,11.0,27.0,3.0,...,7.0,16.0,Khris,Davis,Khristopher Adrian,205.0,71.0,31.0,0.0,48.0
111,2013,86.0,296.0,24.0,74.0,17.0,0.0,7.0,36.0,2.0,...,7.0,19.0,J. D.,Martinez,Julio Daniel,230.0,75.0,31.0,0.0,43.0


In [140]:
len(BB[BB.Difference < 0])

0

In [141]:
len(BB[BB.Difference == 0])

5

In [142]:
len(BB[BB.Difference > 0])

179

In [143]:
regr_2019_dataframe[regr_2019_dataframe.Prediction > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference


In [144]:
len(regr_2019_dataframe[regr_2019_dataframe.Difference < 0])

71

In [145]:
len(regr_2019_dataframe[regr_2019_dataframe.Difference == 0])

6

In [146]:
len(regr_2019_dataframe[regr_2019_dataframe.Difference > 0])

114

In [147]:
SVM_2019_dataframe[SVM_2019_dataframe.Prediction > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference
83,2014,100.0,352.0,41.0,96.0,10.0,2.0,13.0,32.0,2.0,...,4.0,10.0,Bryce,Harper,Bryce Aron Max,210.0,75.0,27.0,40.0,-5.0
116,2014,123.0,441.0,57.0,139.0,30.0,3.0,23.0,76.0,6.0,...,5.0,19.0,J. D.,Martinez,Julio Daniel,230.0,75.0,32.0,44.0,-8.0


In [148]:
len(SVM_2019_dataframe[SVM_2019_dataframe.Difference < 0])

99

In [149]:
len(SVM_2019_dataframe[SVM_2019_dataframe.Difference == 0])

15

In [150]:
len(SVM_2019_dataframe[SVM_2019_dataframe.Difference > 0])

77

In [151]:
RF_2019_dataframe[RF_2019_dataframe.Prediction > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference


In [152]:
len(RF_2019_dataframe[RF_2019_dataframe.Difference < 0])

43

In [153]:
len(RF_2019_dataframe[RF_2019_dataframe.Difference == 0])

7

In [154]:
len(RF_2019_dataframe[RF_2019_dataframe.Difference > 0])

141

In [155]:
CC[CC.SixthYear_HR > 39]

Unnamed: 0,FirstYear_yearID,FirstYear_G,FirstYear_AB,FirstYear_R,FirstYear_H,FirstYear_2B,FirstYear_3B,FirstYear_HR,FirstYear_RBI,FirstYear_SB,...,SixthYear_SF,SixthYear_GIDP,SixthYear_nameFirst,SixthYear_nameLast,SixthYear_nameGiven,SixthYear_weight,SixthYear_height,SixthYear_age,Prediction,Difference
8,2014,111.0,432.0,58.0,124.0,34.0,2.0,18.0,61.0,2.0,...,8.0,14.0,Nolan,Arenado,Nolan James,215.0,74.0,28.0,0.0,41.0
38,2014,159.0,613.0,87.0,166.0,32.0,2.0,40.0,108.0,4.0,...,3.0,14.0,Nelson,Cruz,Nelson Ramon,230.0,74.0,39.0,0.0,41.0
170,2014,24.0,89.0,11.0,26.0,8.0,1.0,5.0,20.0,1.0,...,4.0,16.0,Jorge,Soler,Jorge Carlos,235.0,76.0,27.0,0.0,48.0
174,2014,85.0,244.0,33.0,59.0,9.0,1.0,4.0,23.0,3.0,...,6.0,12.0,Eugenio,Suarez,Eugenio Alejandro,213.0,71.0,28.0,0.0,49.0
179,2014,157.0,602.0,115.0,173.0,39.0,9.0,36.0,111.0,16.0,...,4.0,5.0,Mike,Trout,Michael Nelson,235.0,74.0,28.0,0.0,45.0
187,2014,144.0,582.0,94.0,165.0,30.0,6.0,9.0,54.0,21.0,...,3.0,8.0,Christian,Yelich,Christian Stephen,195.0,75.0,28.0,0.0,44.0


In [156]:
len(CC[CC.Difference < 0])

0

In [157]:
len(CC[CC.Difference == 0])

4

In [158]:
len(CC[CC.Difference > 0])

187

### Save Models

In [161]:
import joblib

In [162]:
joblib.dump(regr, 'LR.pkl')

NameError: name 'regr' is not defined

In [163]:
joblib.dump(regr_2017, 'LR_2017.pkl')

['LR_2017.pkl']

In [164]:
joblib.dump(regr_2018, 'LR_2018.pkl')

['LR_2018.pkl']

In [165]:
joblib.dump(clf, 'SVM.pkl')

NameError: name 'clf' is not defined

In [166]:
joblib.dump(clf_2017, 'SVM_2017.pkl')

['SVM_2017.pkl']

In [167]:
joblib.dump(clf_2018, 'SVM_2018.pkl')

['SVM_2018.pkl']

In [168]:
joblib.dump(forest, 'RF.pkl')

NameError: name 'forest' is not defined

In [169]:
joblib.dump(forest_2017, 'RF_2017.pkl')

['RF_2017.pkl']

In [170]:
joblib.dump(forest_2018, 'RF_2018.pkl')

['RF_2018.pkl']