In [1]:
# load dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Align Prediction Features and Model-creation Features

In [2]:
# import data
from mega import model_df, unplayed_games

df = model_df.copy()
unplayed_games_clean = unplayed_games[3:].copy()

In [3]:
# check imported file
df.head()

Unnamed: 0,Gm#,W/L,D/N,H/A,Opp,COL_at_bats,COL_ba,COL_hits,COL_hr,COL_kk,COL_obp,COL_walks,Opp_at_bats,Opp_ba,Opp_hits,OPP_HR_Column,OPP_kk,Opp_obp,Opp_walks
0,1,L,D,A,MIL,31,0.258065,8,1,8,0.351351,4,38,0.315789,12,3,11,0.333333,1
1,2,W,N,A,MIL,32,0.28125,9,1,5,0.342857,3,37,0.378378,14,2,11,0.390244,2
2,3,W,N,A,MIL,34,0.264706,9,1,5,0.305556,2,37,0.405405,15,3,8,0.463415,4
3,4,W,D,H,SDP,33,0.212121,7,0,8,0.235294,1,0,0.0,0,0,0,0.0,0
4,5,W,N,H,SDP,33,0.242424,8,0,4,0.305556,3,0,0.0,0,0,0,0.0,0


In [4]:
# check imported file
unplayed_games_clean.head()

Unnamed: 0,Gm#,W/L,D/N,H/A,Opp,COL_at_bats,COL_ba,COL_hits,COL_hr,COL_kk,COL_obp,COL_walks,Opp_at_bats,Opp_ba,Opp_hits,OPP_HR_Column,OPP_kk,Opp_obp,Opp_walks
1816,137,8:10 pm,Game Preview and Matchups,H,BAL,-1,-1.0,-1,-1,-1,-1.0,-1,29,0.310345,9,0,9,0.393939,4
1817,138,3:10 pm,Game Preview and Matchups,H,BAL,-1,-1.0,-1,-1,-1,-1.0,-1,30,0.2,6,0,11,0.25,2
1818,139,7:20 pm,Game Preview and Matchups,A,ATL,-1,-1.0,-1,-1,-1,-1.0,-1,31,0.193548,-1,0,-1,0.21875,-1
1819,140,7:20 pm,Game Preview and Matchups,A,ATL,-1,-1.0,-1,-1,-1,-1.0,-1,-1,-1.0,-1,-1,-1,-1.0,-1
1820,141,7:20 pm,Game Preview and Matchups,A,ATL,-1,-1.0,-1,-1,-1,-1.0,-1,-1,-1.0,-1,-1,-1,-1.0,-1


In [5]:
# drop the "Opp" column, so that we dont have to encode all the team names. 
# Not all teams that are in the model df "df" exist in the prediction df "unplayed_games_clean"
# this ensures that the same number of features are used between creating the RF model and predicting

df = df.copy().drop(columns="Opp")


# drop Opp later in "unplayed_games_clean" in the code, after this column is no longer needed for season calcuations

In [6]:
print(f"Model df columns = {df.columns}")
print()
print(f"Prediction df columns = {unplayed_games_clean.columns}")

Model df columns = Index(['Gm#', 'W/L', 'D/N', 'H/A', 'COL_at_bats', 'COL_ba', 'COL_hits',
       'COL_hr', 'COL_kk', 'COL_obp', 'COL_walks', 'Opp_at_bats', 'Opp_ba',
       'Opp_hits', 'OPP_HR_Column', 'OPP_kk', 'Opp_obp', 'Opp_walks'],
      dtype='object')

Prediction df columns = Index(['Gm#', 'W/L', 'D/N', 'H/A', 'Opp', 'COL_at_bats', 'COL_ba', 'COL_hits',
       'COL_hr', 'COL_kk', 'COL_obp', 'COL_walks', 'Opp_at_bats', 'Opp_ba',
       'Opp_hits', 'OPP_HR_Column', 'OPP_kk', 'Opp_obp', 'Opp_walks'],
      dtype='object')


# Prep Data for ML

In [7]:
# check data types

df.dtypes

Gm#                int64
W/L               object
D/N               object
H/A               object
COL_at_bats        int64
COL_ba           float64
COL_hits           int64
COL_hr             int64
COL_kk             int64
COL_obp          float64
COL_walks          int64
Opp_at_bats        int64
Opp_ba           float64
Opp_hits          object
OPP_HR_Column      int64
OPP_kk            object
Opp_obp          float64
Opp_walks          int64
dtype: object

In [8]:
df = df.rename(columns={"OPP_HR_Column": "Opp_hr"})

In [9]:
# confirm there are only 2 values in the win/loss column
uniquevalues = df["W/L"].unique()
print(uniquevalues)

['L' 'W']


In [10]:
# encode object columns as needed
# intialize encoders
le = LabelEncoder()

In [11]:
# encode columns with only 2 unique values
df["D/N"] = le.fit_transform(df["D/N"])
df["H/A"] = le.fit_transform(df["H/A"])
df["W/L"] = le.fit_transform(df["W/L"])
df

Unnamed: 0,Gm#,W/L,D/N,H/A,COL_at_bats,COL_ba,COL_hits,COL_hr,COL_kk,COL_obp,COL_walks,Opp_at_bats,Opp_ba,Opp_hits,Opp_hr,OPP_kk,Opp_obp,Opp_walks
0,1,0,0,0,31,0.258065,8,1,8,0.351351,4,38,0.315789,12,3,11,0.333333,1
1,2,1,1,0,32,0.281250,9,1,5,0.342857,3,37,0.378378,14,2,11,0.390244,2
2,3,1,1,0,34,0.264706,9,1,5,0.305556,2,37,0.405405,15,3,8,0.463415,4
3,4,1,0,1,33,0.212121,7,0,8,0.235294,1,0,0.000000,0,0,0,0.000000,0
4,5,1,1,1,33,0.242424,8,0,4,0.305556,3,0,0.000000,0,0,0,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1808,129,0,1,0,28,0.178571,5,2,5,0.233333,2,31,0.129032,4,0,9,0.205882,3
1809,130,1,0,0,29,0.172414,5,1,9,0.294118,4,39,0.333333,13,2,14,0.386364,4
1810,131,0,0,0,35,0.342857,12,5,6,0.410256,3,29,0.206897,6,0,8,0.323529,5
1811,132,1,1,1,34,0.264706,9,0,8,0.324324,3,28,0.214286,6,1,11,0.312500,4


In [12]:
# debug: Locate the positions of "error"
error_positions = df.isin(['error'])

# get the indices of "error" values
error_indices = [(row, col) for row, col in zip(*error_positions.to_numpy().nonzero())]

# Display the indices
print("\nIndices of 'error' values:")
print(error_indices)


Indices of 'error' values:
[(37, 15), (1118, 13)]


In [13]:
# apply new values to those data cells
df.iloc[37, 15] = 0
df.iloc[1118, 13] = 0


In [14]:
# ensure column values are numerical
df['OPP_kk'] = pd.to_numeric(df['OPP_kk'], errors='coerce')
df['Opp_hits'] = pd.to_numeric(df['Opp_hits'], errors='coerce')

In [15]:
# check dtypes for all columns again
df.dtypes

Gm#              int64
W/L              int32
D/N              int32
H/A              int32
COL_at_bats      int64
COL_ba         float64
COL_hits         int64
COL_hr           int64
COL_kk           int64
COL_obp        float64
COL_walks        int64
Opp_at_bats      int64
Opp_ba         float64
Opp_hits         int64
Opp_hr           int64
OPP_kk           int64
Opp_obp        float64
Opp_walks        int64
dtype: object

In [16]:
# Get target variable ("win/loss" column) - y 
y = df["W/L"]
len(y)

1751

In [17]:
# Get the features - X
X = df.copy()
X =X.drop(columns = "W/L", axis =1)
X

Unnamed: 0,Gm#,D/N,H/A,COL_at_bats,COL_ba,COL_hits,COL_hr,COL_kk,COL_obp,COL_walks,Opp_at_bats,Opp_ba,Opp_hits,Opp_hr,OPP_kk,Opp_obp,Opp_walks
0,1,0,0,31,0.258065,8,1,8,0.351351,4,38,0.315789,12,3,11,0.333333,1
1,2,1,0,32,0.281250,9,1,5,0.342857,3,37,0.378378,14,2,11,0.390244,2
2,3,1,0,34,0.264706,9,1,5,0.305556,2,37,0.405405,15,3,8,0.463415,4
3,4,0,1,33,0.212121,7,0,8,0.235294,1,0,0.000000,0,0,0,0.000000,0
4,5,1,1,33,0.242424,8,0,4,0.305556,3,0,0.000000,0,0,0,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1808,129,1,0,28,0.178571,5,2,5,0.233333,2,31,0.129032,4,0,9,0.205882,3
1809,130,0,0,29,0.172414,5,1,9,0.294118,4,39,0.333333,13,2,14,0.386364,4
1810,131,0,0,35,0.342857,12,5,6,0.410256,3,29,0.206897,6,0,8,0.323529,5
1811,132,1,1,34,0.264706,9,0,8,0.324324,3,28,0.214286,6,1,11,0.312500,4


In [18]:
X.columns

Index(['Gm#', 'D/N', 'H/A', 'COL_at_bats', 'COL_ba', 'COL_hits', 'COL_hr',
       'COL_kk', 'COL_obp', 'COL_walks', 'Opp_at_bats', 'Opp_ba', 'Opp_hits',
       'Opp_hr', 'OPP_kk', 'Opp_obp', 'Opp_walks'],
      dtype='object')

### split and scale data

In [19]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)
# 9, 

In [20]:
# scale the data!
# Initialize the scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Fit and transform the training data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test data using the same parameters
X_test_scaled = scaler.transform(X_test)



# Deep Learning model

In [21]:
import tensorflow as tf

In [22]:
number_input_features = len(X_train.columns)
hidden_nodes_layer1 =  8
hidden_nodes_layer2 = 8
hidden_nodes_layer3 = 8

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# fourth hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [23]:
# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=200)

Epoch 1/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 907us/step - accuracy: 0.4088 - loss: 0.7972
Epoch 2/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 720us/step - accuracy: 0.4647 - loss: 0.7119
Epoch 3/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 732us/step - accuracy: 0.6000 - loss: 0.6844
Epoch 4/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 720us/step - accuracy: 0.6700 - loss: 0.6686
Epoch 5/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 710us/step - accuracy: 0.7027 - loss: 0.6514
Epoch 6/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 712us/step - accuracy: 0.6723 - loss: 0.6411
Epoch 7/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6974 - loss: 0.6157 
Epoch 8/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 857us/step - accuracy: 0.7077 - loss: 0.5871
Epoch 9/200
[1m42/42[0m [32m━━

### hypertune model 

In [24]:
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','sigmoid'])

    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=10,
        step=2), activation=activation, input_dim=len(X_train.columns)))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=10,
            step=2),
            activation=activation))

    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])

    return nn_model

In [25]:
# Import the kerastuner library
import keras_tuner as kt

tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=100,
    hyperband_iterations=2)

In [None]:
# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled,y_train,epochs=100,validation_data=(X_test_scaled,y_test))

In [None]:
# Get best model hyperparameters
best_hyper = tuner.get_best_hyperparameters(1)[0]
best_hyper.values

In [None]:
# Evaluate best model against full test data
best_model = tuner.get_best_models(1)[0]
model_loss, model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

# Load unplayed game csv

In [24]:
unplayed_games_final = pd.read_csv('../Project 2/randomforest_support_csv_files/unplayed_games_populated.csv')
unplayed = unplayed_games[3:].copy()
gm_col = unplayed['Gm#'].reset_index(drop=True)
unplayed_games_topred = pd.concat([unplayed_games_final, gm_col], axis=1)
pop_gm = unplayed_games_topred.pop('Gm#')
unplayed_games_topred.insert(0, "Gm#", pop_gm)
unplayed_games_topred

Unnamed: 0,Gm#,W/L,D/N,H/A,COL_at_bats,COL_ba,COL_hits,COL_hr,COL_kk,COL_obp,COL_walks,Opp_at_bats,Opp_ba,Opp_hits,OPP_kk,Opp_obp,Opp_walks,Opp_hr
0,137,,1,1,33.011422,0.278016,9.374643,1.205026,7.342661,0.344419,3.214163,34.173611,0.253,8.548611,8.256944,0.316,3.020833,1.451389
1,138,,0,1,33.011422,0.278016,9.374643,1.205026,7.342661,0.344419,3.214163,34.173611,0.253,8.548611,8.256944,0.316,3.020833,1.451389
2,139,,1,0,33.011422,0.278016,9.374643,1.205026,7.342661,0.344419,3.214163,33.272727,0.245,8.020979,8.993007,0.306,2.937063,1.265734
3,140,,1,0,33.011422,0.278016,9.374643,1.205026,7.342661,0.344419,3.214163,33.272727,0.245,8.020979,8.993007,0.306,2.937063,1.265734
4,141,,1,0,33.011422,0.278016,9.374643,1.205026,7.342661,0.344419,3.214163,33.272727,0.245,8.020979,8.993007,0.306,2.937063,1.265734
5,142,,1,0,33.011422,0.278016,9.374643,1.205026,7.342661,0.344419,3.214163,33.832168,0.278,8.454545,8.853147,0.328,3.741259,1.118881
6,143,,1,0,33.011422,0.278016,9.374643,1.205026,7.342661,0.344419,3.214163,33.832168,0.278,8.454545,8.853147,0.328,3.741259,1.118881
7,144,,0,0,33.011422,0.278016,9.374643,1.205026,7.342661,0.344419,3.214163,33.832168,0.278,8.454545,8.853147,0.328,3.741259,1.118881
8,145,,1,0,33.011422,0.278016,9.374643,1.205026,7.342661,0.344419,3.214163,33.361111,0.191,7.8125,8.805556,0.298,2.8125,1.006944
9,146,,1,0,33.011422,0.278016,9.374643,1.205026,7.342661,0.344419,3.214163,33.361111,0.191,7.8125,8.805556,0.298,2.8125,1.006944


### Separating Prediction Target Variable from Prediction Features

In [25]:
# get target and features
y_to_predict = unplayed_games_topred["W/L"].copy()
X_to_predict = unplayed_games_topred.drop(columns=["W/L"]).copy()

# Casting Predictions on Prediction Data

In [26]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Fit and transform the feature data
X_prediction_data = scaler.fit_transform(X_to_predict)


In [27]:
# make predictions

y_pred = nn.predict(X_prediction_data)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step


In [28]:
y_round = y_pred.round()

display(y_round)
#len(y_round)


array([[0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]], dtype=float32)

# Create Mini Dataframe to show Game#, Opponent, and Prediction for W/L

In [29]:
# grab the date from original dataframe
from mega import mega_concat_df

mega_concat_df.columns
dates = mega_concat_df['Formatted_Date'].tail(26)
opponents = mega_concat_df["Opp"].tail(26)

In [30]:
# equalize all index values for all separate pieces
dates = dates.reset_index(drop=True)
opponents = opponents.reset_index(drop=True)
winloss = pd.DataFrame(y_round)

In [31]:
# concat and rename columns for new minidataframe
predicted_winLoss_df = pd.concat([dates, opponents,winloss], axis=1, )
predicted_winLoss_df = predicted_winLoss_df.rename(columns = {0: "Win/Loss"})
predicted_winLoss_df['Win/Loss'] = predicted_winLoss_df['Win/Loss'].replace({0:"loss", 1:"win"})
predicted_winLoss_df

Unnamed: 0,Formatted_Date,Opp,Win/Loss
0,2024-08-31,BAL,loss
1,2024-09-01,BAL,loss
2,2024-09-03,ATL,win
3,2024-09-04,ATL,win
4,2024-09-05,ATL,win
5,2024-09-06,MIL,win
6,2024-09-07,MIL,win
7,2024-09-08,MIL,win
8,2024-09-10,DET,win
9,2024-09-11,DET,win


# Visualizations

In [None]:
# plot randomforest tree as a diagram
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt


# grab list of features from test data 
feature_list = X.columns.tolist()

# Visualize one of the trees from the forest
plt.figure(figsize=(240, 80))
plot_tree(nn.estimators_[0], feature_names=feature_list, class_names=['Lose[0]', 'Win[1]'], filled=True)
plt.title("Decision Tree from Random Forest")
plt.show()

In [None]:
# plot feature importance chart
import matplotlib.pyplot as plt
import pandas as pd



# Extract feature importances from randomforest model
importances = rf_model.feature_importances_

# Get feature names from the training data
feature_names = X_train.columns

# Create a DataFrame using feature names and feature importances 
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
})

# Sort the DataFrame by importance, descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
top_10_features = feature_importance_df.head(10)

# Plot the feature importances as a bar chart
plt.figure(figsize=(10, 10))
plt.barh(top_10_features['Feature'], top_10_features['Importance'])
plt.xlabel('Importance')
plt.title('Top 10 Feature Importances')
plt.gca().invert_yaxis()  # Invert y-axis to have the most important feature at the top
plt.show()