In [24]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import AdaBoostClassifier
import joblib
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

#Read in the data from nfl.xlsx
df = pd.read_excel('nfl.xlsx')

#Create x and y where y is the target and x is the data
x = df[df.columns[:-1]].to_numpy()
y = df[df.columns[-1]].to_numpy()

#split x and y into trianing and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=33) #33

from sklearn.model_selection import GridSearchCV

# hyperparameter grid
hp_grid = [{'criterion':['gini', 'entropy'], 'min_samples_split':[2, 3, 4], 'max_features':['auto', 'sqrt', 'log2']}]

# create the model
model = RandomForestClassifier()

# create the grid object
grid_search = GridSearchCV(model, hp_grid, cv=5, scoring='accuracy', return_train_score=False)

# grid search
grid_search.fit(X_train, y_train)

#store the results of the grid search in the variable results
results = grid_search.cv_results_

#print out the results of the grid search
for mean_score, params in zip(results['mean_test_score'], results['params']):
       print(mean_score, params)

#Store the best variable in the classifier the_best
the_best = grid_search.best_estimator_

#fit the_best and store it in the variable classifier
classifier = the_best.fit(X_train, y_train)

#y_hat represents the classifiers predicitons
y_hat = classifier.predict(X_test)

print("y_hat:", y_hat)
print("y_test:", y_test)
print(accuracy_score(y_hat, y_test))

#save the model
filename = 'RF_model.sav'
#joblib.dump(classifier, filename)

0.6108225108225108 {'criterion': 'gini', 'max_features': 'auto', 'min_samples_split': 2}
0.5839826839826839 {'criterion': 'gini', 'max_features': 'auto', 'min_samples_split': 3}
0.6112554112554112 {'criterion': 'gini', 'max_features': 'auto', 'min_samples_split': 4}
0.6294372294372295 {'criterion': 'gini', 'max_features': 'sqrt', 'min_samples_split': 2}
0.583116883116883 {'criterion': 'gini', 'max_features': 'sqrt', 'min_samples_split': 3}
0.6294372294372295 {'criterion': 'gini', 'max_features': 'sqrt', 'min_samples_split': 4}
0.6298701298701298 {'criterion': 'gini', 'max_features': 'log2', 'min_samples_split': 2}
0.6303030303030303 {'criterion': 'gini', 'max_features': 'log2', 'min_samples_split': 3}
0.5645021645021645 {'criterion': 'gini', 'max_features': 'log2', 'min_samples_split': 4}
0.6203463203463203 {'criterion': 'entropy', 'max_features': 'auto', 'min_samples_split': 2}
0.6203463203463203 {'criterion': 'entropy', 'max_features': 'auto', 'min_samples_split': 3}
0.58354978354978

['RF_model.sav']