In [1]:
# Dependencies
import pandas as pd

In [2]:
# Load data
df = pd.read_csv('../data/machineLearningDataSet.csv')

# Drop the null columns where all values are null -- nothing to drop
df = df.dropna(axis='columns', how='all')

# Drop the null rows -- nothing to drop
df = df.dropna()

# Drop Unnamed: 0
df = df.drop(columns=['Unnamed: 0'])

df

Unnamed: 0,gameID,homeTeamID,visitorTeamID,homeTeamHeightAverage,homeTeamWeightAverage,homeTeamAgeAverage,visitorTeamHeightAverage,visitorTeamWeightAverage,visitorTeamAgeAverage,homeTeamWin
0,1.0,2.0,23.0,199.390000,102.625190,26.416667,200.269231,96.440638,25.153846,1.0
1,2.0,10.0,21.0,201.441538,97.906089,28.000000,199.878462,97.696738,25.692308,1.0
2,3.0,4.0,17.0,201.050769,99.301756,26.846154,201.050769,100.941666,26.692308,0.0
3,4.0,9.0,3.0,198.966667,100.243832,27.466667,201.718333,98.656260,25.166667,1.0
4,5.0,12.0,15.0,199.683077,97.208255,26.461538,201.832308,99.511106,27.692308,1.0
...,...,...,...,...,...,...,...,...,...,...
6553,62481.0,28.0,10.0,198.901538,97.626955,27.923077,201.832308,99.476215,28.615385,0.0
6554,62482.0,10.0,28.0,202.418462,100.348507,28.692308,198.315385,97.975872,27.923077,0.0
6555,62483.0,10.0,28.0,201.832308,99.476215,28.615385,198.901538,98.952839,28.000000,0.0
6556,62484.0,28.0,10.0,198.901538,97.626955,27.923077,202.418462,100.522966,29.230769,0.0


In [3]:
# Assign X (data) and y (target)
drop_col = ['gameID', 'homeTeamID', 'visitorTeamID', 'homeTeamWin']
X = df.drop(drop_col, axis=1)
y = df['homeTeamWin']

y_names = ['Lose', 'Win']

print(X.shape, y.shape)

(6558, 6) (6558,)


In [4]:
# Split our data into training and testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from tensorflow.keras.utils import to_categorical

X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [6]:
# Support vector machine linear classifier
from sklearn.svm import SVC 

model = SVC(kernel='linear')

In [7]:
# Fit model using the training data
model.fit(X_train_scaled, y_train)

SVC(kernel='linear')

In [8]:
# Scoring model
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 0.5870272468483123
Testing Data Score: 0.5829268292682926


In [9]:
# Calculate classification report
from sklearn.metrics import classification_report

predictions = model.predict(X_test_scaled)
print(classification_report(y_test, predictions,
                            target_names=y_names))

              precision    recall  f1-score   support

        Lose       0.00      0.00      0.00       684
         Win       0.58      1.00      0.74       956

    accuracy                           0.58      1640
   macro avg       0.29      0.50      0.37      1640
weighted avg       0.34      0.58      0.43      1640



  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [11]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

print(grid.best_params_)
print(grid.best_score_)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.587, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.587, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.586, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.587, total=   0.2s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.587, total=   0.3s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.587, total=   0.3s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.587, total=   0.2s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.586, total=   0.3s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.587, total=   0.3s
[CV] C=1, gamma=0.001 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:   12.0s finished


{'C': 1, 'gamma': 0.0001}
0.5870272270881407


In [12]:
# Save the model
import joblib
filename = 'saved_SVC_game.sav'
joblib.dump(model, filename)

['saved_SVC_game.sav']