In [1]:
# Dependencies
import pandas as pd

In [2]:
# Load data
df = pd.read_csv('../data/machineLearningDataSet.csv')

# Drop the null columns where all values are null -- nothing to drop
df = df.dropna(axis='columns', how='all')

# Drop the null rows -- nothing to drop
df = df.dropna()

# Drop Unnamed: 0
df = df.drop(columns=['Unnamed: 0'])

df

Unnamed: 0,gameID,homeTeamID,visitorTeamID,homeTeamHeightAverage,homeTeamWeightAverage,homeTeamAgeAverage,visitorTeamHeightAverage,visitorTeamWeightAverage,visitorTeamAgeAverage,homeTeamWin
0,1.0,2.0,23.0,199.390000,102.625190,26.416667,200.269231,96.440638,25.153846,1.0
1,2.0,10.0,21.0,201.441538,97.906089,28.000000,199.878462,97.696738,25.692308,1.0
2,3.0,4.0,17.0,201.050769,99.301756,26.846154,201.050769,100.941666,26.692308,0.0
3,4.0,9.0,3.0,198.966667,100.243832,27.466667,201.718333,98.656260,25.166667,1.0
4,5.0,12.0,15.0,199.683077,97.208255,26.461538,201.832308,99.511106,27.692308,1.0
...,...,...,...,...,...,...,...,...,...,...
6553,62481.0,28.0,10.0,198.901538,97.626955,27.923077,201.832308,99.476215,28.615385,0.0
6554,62482.0,10.0,28.0,202.418462,100.348507,28.692308,198.315385,97.975872,27.923077,0.0
6555,62483.0,10.0,28.0,201.832308,99.476215,28.615385,198.901538,98.952839,28.000000,0.0
6556,62484.0,28.0,10.0,198.901538,97.626955,27.923077,202.418462,100.522966,29.230769,0.0


In [3]:
# Assign X (data) and y (target)
drop_col = ['gameID', 'homeTeamID', 'visitorTeamID', 'homeTeamWin']
X = df.drop(drop_col, axis=1)
y = df['homeTeamWin']

print(X.shape, y.shape)

(6558, 6) (6558,)


In [4]:
# Split our data into training and testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [5]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler

X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [6]:
# Create a Logistic Regression Model
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier

LogisticRegression()

In [7]:
# Fit (train) or model using the training data
classifier.fit(X_train, y_train)

LogisticRegression()

In [8]:
# Validate the model using the test data
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.6026840178934526
Testing Data Score: 0.5975609756097561


In [9]:
# Make predictions 
predictions = classifier.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [1. 1. 0. 1. 1. 1. 0. 1. 1. 1.]
First 10 Actual labels: [0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0]


In [10]:
# Save the model
import joblib
filename = 'saved_logistic_game.sav'
joblib.dump(classifier, filename)

['saved_logistic_game.sav']