In [1]:
# Imports
import pandas as pd
from pathlib import Path
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [2]:
# Read the applicants_data.csv file from the Resources folder into a Pandas DataFrame
data = Path("GAME_LEVEL.csv")
df = pd.read_csv(data)
df.head()



Unnamed: 0,dt,Y,M,D,away_team,home_team,away_score,home_score,away_pitcher_id,home_pitcher_id,...,home_1_id,home_2_id,home_3_id,home_4_id,home_5_id,home_6_id,home_7_id,home_8_id,home_9_id,outcome
0,2020-07-23,2020,7,23,SFN,LAN,1,8,cuetj001,may-d003,...,muncm001,bettm001,bellc002,turnj001,seagc001,herne001,pedej001,polla001,barna001,1
1,2020-07-23,2020,7,23,NYA,WAS,4,1,coleg001,schem001,...,turnt001,eatoa002,casts001,kendh001,thame001,suzuk001,cabra002,steva001,roblv001,0
2,2020-07-24,2020,7,24,COL,TEX,0,1,marqg001,lynnl001,...,choos001,andre001,santd001,gallj002,odorr001,frazt001,chirr001,solan001,kinei001,1
3,2020-07-24,2020,7,24,TOR,TBA,6,4,ryu-h001,mortc002,...,diazy001,renfh001,tsuty001,martj008,margm001,brosm001,adamw002,kierk001,zunim001,0
4,2020-07-24,2020,7,24,LAA,OAK,3,7,heLAA001,montf001,...,semim001,laurr001,chapm001,davik003,olsom001,pindc001,canhm001,piscs001,murps001,1


In [3]:
# Review the data types associated with the columns
print(df.dtypes)

dt                 object
Y                   int64
M                   int64
D                   int64
away_team          object
home_team          object
away_score          int64
home_score          int64
away_pitcher_id    object
home_pitcher_id    object
away_1_id          object
away_2_id          object
away_3_id          object
away_4_id          object
away_5_id          object
away_6_id          object
away_7_id          object
away_8_id          object
away_9_id          object
home_1_id          object
home_2_id          object
home_3_id          object
home_4_id          object
home_5_id          object
home_6_id          object
home_7_id          object
home_8_id          object
home_9_id          object
outcome             int64
dtype: object


In [4]:
# Drop the 'EIN' and 'NAME' columns from the DataFrame
df2 = df.drop(columns = ['dt','Y','M','D'])

# Review the DataFrame
df2.head()


Unnamed: 0,away_team,home_team,away_score,home_score,away_pitcher_id,home_pitcher_id,away_1_id,away_2_id,away_3_id,away_4_id,...,home_1_id,home_2_id,home_3_id,home_4_id,home_5_id,home_6_id,home_7_id,home_8_id,home_9_id,outcome
0,SFN,LAN,1,8,cuetj001,may-d003,yastm001,florw001,sandp001,dicka001,...,muncm001,bettm001,bellc002,turnj001,seagc001,herne001,pedej001,polla001,barna001,1
1,NYA,WAS,4,1,coleg001,schem001,hicka001,judga001,torrg001,stanm004,...,turnt001,eatoa002,casts001,kendh001,thame001,suzuk001,cabra002,steva001,roblv001,0
2,COL,TEX,0,1,marqg001,lynnl001,dahld001,stort001,blacc001,arenn001,...,choos001,andre001,santd001,gallj002,odorr001,frazt001,chirr001,solan001,kinei001,1
3,TOR,TBA,6,4,ryu-h001,mortc002,bichb001,biggc002,guerv002,shawt001,...,diazy001,renfh001,tsuty001,martj008,margm001,brosm001,adamw002,kierk001,zunim001,0
4,LAA,OAK,3,7,heLAA001,montf001,fletd002,troum001,ohtas001,uptoj001,...,semim001,laurr001,chapm001,davik003,olsom001,pindc001,canhm001,piscs001,murps001,1


In [5]:
# # Create a list of categorical variables 
# categorical_variables = df2.drop(columns = ['STATUS','ASK_AMT','IS_SUCCESSFUL'])
# categorical_names = ['APPLICATION_TYPE','AFFILIATION','CLASSIFICATION','USE_CASE','ORGANIZATION','INCOME_AMT','SPECIAL_CONSIDERATIONS']
# # Display the categorical variables list
# categorical_variables.head()

categorical_variables = df2[['away_team','home_team','away_pitcher_id','home_pitcher_id','home_1_id','home_2_id','home_3_id','home_4_id','home_5_id','home_6_id','home_7_id','home_8_id','home_9_id','away_1_id','away_2_id','away_3_id','away_4_id','away_5_id','away_6_id','away_7_id','away_8_id','away_9_id']]
df2 = df2.drop(columns = ['away_team','home_team','away_pitcher_id','home_pitcher_id','home_1_id','home_2_id','home_3_id','home_4_id','home_5_id','home_6_id','home_7_id','home_8_id','home_9_id','away_1_id','away_2_id','away_3_id','away_4_id','away_5_id','away_6_id','away_7_id','away_8_id','away_9_id'])

In [6]:
# Create a OneHotEncoder instance
enc =  OneHotEncoder()


In [7]:
# Encode the categorcal variables using OneHotEncoder
encoded_data = enc.fit_transform(categorical_variables)
print(encoded_data.shape)
# print(categorical_variables)



(5746, 9984)


In [8]:
# Create a DataFrame with the encoded variables
encoded_df = pd.DataFrame(
    encoded_data.toarray(),
    columns = enc.get_feature_names(['away_team','home_team','away_pitcher_id','home_pitcher_id','home_1_id','home_2_id','home_3_id','home_4_id','home_5_id','home_6_id','home_7_id','home_8_id','home_9_id','away_1_id','away_2_id','away_3_id','away_4_id','away_5_id','away_6_id','away_7_id','away_8_id','away_9_id'])
)

# Display  data
encoded_df.head()



Unnamed: 0,away_team_ARI,away_team_ATL,away_team_BAL,away_team_BOS,away_team_CHA,away_team_CHN,away_team_CIN,away_team_CLE,away_team_COL,away_team_DET,...,away_9_id_wynna001,away_9_id_yamaj001,away_9_id_yarbr001,away_9_id_yastm001,away_9_id_ynoah001,away_9_id_youna002,away_9_id_zavas001,away_9_id_zimmb001,away_9_id_zimmb002,away_9_id_zunim001
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Add the numerical variables from the original DataFrame to the one-hot encoding DataFrame
encoded_df = pd.concat([encoded_df,df2],axis =1)

# Review the Dataframe
encoded_df.head()


Unnamed: 0,away_team_ARI,away_team_ATL,away_team_BAL,away_team_BOS,away_team_CHA,away_team_CHN,away_team_CIN,away_team_CLE,away_team_COL,away_team_DET,...,away_9_id_yastm001,away_9_id_ynoah001,away_9_id_youna002,away_9_id_zavas001,away_9_id_zimmb001,away_9_id_zimmb002,away_9_id_zunim001,away_score,home_score,outcome
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,8,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,1,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,4,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,7,1


In [10]:

y = encoded_df['outcome']

# Display a sample of y
y.head()


0    1
1    0
2    1
3    0
4    1
Name: outcome, dtype: int64

In [11]:
# Define features set X by selecting all columns but IS_SUCCESSFUL
X = encoded_df.drop(columns = 'outcome')

# Review the features DataFrame
X.head()


Unnamed: 0,away_team_ARI,away_team_ATL,away_team_BAL,away_team_BOS,away_team_CHA,away_team_CHN,away_team_CIN,away_team_CLE,away_team_COL,away_team_DET,...,away_9_id_yarbr001,away_9_id_yastm001,away_9_id_ynoah001,away_9_id_youna002,away_9_id_zavas001,away_9_id_zimmb001,away_9_id_zimmb002,away_9_id_zunim001,away_score,home_score
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,8
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,4
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,7


In [12]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=50)


In [13]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler to the features training dataset
scaler.fit(X_train)

# Scale both the training and testing data from the features dataset
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [14]:
# Define the the number of inputs (features) to the model
number_inputs = 9986

# Review the number of features
number_inputs

9986

In [15]:
# Define the number of neurons in the output layer
number_outputs = 1

In [16]:
# Define the number of hidden nodes for the first hidden layer
hidden_nodes_layer1 = 40

# Review the number hidden nodes in the first layer
hidden_nodes_layer1


40

In [17]:
# Define the number of hidden nodes for the second hidden layer
hidden_nodes_layer2 =  13

# Review the number hidden nodes in the second layer
hidden_nodes_layer2


13

In [18]:
# Create the Sequential model instance
nn = Sequential()


In [19]:
# Add the first hidden layer
nn.add(Dense(units=hidden_nodes_layer1, input_dim=number_inputs, activation="relu"))


In [20]:
# Add the second hidden layer
nn.add(Dense(units=hidden_nodes_layer2, activation="relu"))


In [21]:
# Add the output layer to the model specifying the number of output neurons and activation function
nn.add(Dense(number_outputs, activation="relu"))


In [22]:
# Display the Sequential model summary
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 40)                399480    
                                                                 
 dense_1 (Dense)             (None, 13)                533       
                                                                 
 dense_2 (Dense)             (None, 1)                 14        
                                                                 
Total params: 400,027
Trainable params: 400,027
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Compile the Sequential model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [24]:
# Fit the model using 50 epochs and the training data
nn.fit(X_train_scaled,y_train, 
                    epochs=1,
                    batch_size=100,
                    shuffle=True)




<keras.callbacks.History at 0x26f906aba48>

In [25]:
# Evaluate the model loss and accuracy metrics using the evaluate method and the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
# Display the model loss and accuracy results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

45/45 - 0s - loss: 4.8298 - accuracy: 0.5372 - 297ms/epoch - 7ms/step
Loss: 4.829827785491943, Accuracy: 0.5372303128242493


In [27]:
# Define the the number of inputs (features) to the model
number_input_features = len(X_train.iloc[0])

# Review the number of features
number_input_features

9986

In [28]:
# Define the number of neurons in the output layer
number_output_neurons_A1 = 1

In [29]:
# Define the number of hidden nodes for the first hidden layer
hidden_nodes_layer1_A1 = 20

# Review the number of hidden nodes in the first layer
hidden_nodes_layer1_A1

20

In [30]:
# Create the Sequential model instance
nn_A1 = Sequential()

In [31]:
# First hidden layer
nn_A1.add(Dense(units=hidden_nodes_layer1_A1, input_dim=number_inputs, activation="relu"))


# Output layer
nn_A1.add(Dense(number_output_neurons_A1, activation="relu"))


# Check the structure of the model
nn_A1.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 20)                199740    
                                                                 
 dense_4 (Dense)             (None, 1)                 21        
                                                                 
Total params: 199,761
Trainable params: 199,761
Non-trainable params: 0
_________________________________________________________________


In [32]:
# Compile the Sequential model
nn_A1.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [33]:
# Fit the model using 50 epochs and the training data
fit_model_A1 = nn_A1.fit(X_train_scaled,y_train, 
                    epochs=1,
                    batch_size=100,
                    shuffle=True)





In [34]:
# Define the the number of inputs (features) to the model
number_input_features = len(X_train.iloc[0])

# Review the number of features
number_input_features

9986

In [35]:
# Define the number of neurons in the output layer
number_output_neurons_A2 = 1

In [36]:
# Define the number of hidden nodes for the first hidden layer
hidden_nodes_layer1_A2 = 80
hidden_nodes_layer2_A2 = 45
hidden_nodes_layer3_A2 = 12
# Review the number of hidden nodes in the first layer



In [37]:
# Create the Sequential model instance
nn_A2 = Sequential()

In [38]:
# First hidden layer
nn_A2.add(Dense(units=hidden_nodes_layer1_A2, input_dim=number_inputs, activation="relu"))
nn_A2.add(Dense(units=hidden_nodes_layer2_A2, input_dim=number_inputs, activation="relu"))
nn_A2.add(Dense(units=hidden_nodes_layer3_A2, input_dim=number_inputs, activation="relu"))


# Output layer
nn_A2.add(Dense(number_output_neurons_A2, activation="relu"))


# Check the structure of the model
nn_A2.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 80)                798960    
                                                                 
 dense_6 (Dense)             (None, 45)                3645      
                                                                 
 dense_7 (Dense)             (None, 12)                552       
                                                                 
 dense_8 (Dense)             (None, 1)                 13        
                                                                 
Total params: 803,170
Trainable params: 803,170
Non-trainable params: 0
_________________________________________________________________


In [39]:
# Compile the model
nn_A2.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [40]:
# Fit the model
fit_model_A2 = nn_A1.fit(X_train_scaled,y_train, 
                    epochs=1,
                    batch_size=100,
                    shuffle=True)




In [41]:
print("Original Model Results")

# Evaluate the model loss and accuracy metrics using the evaluate method and the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)

# Display the model loss and accuracy results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Original Model Results
45/45 - 0s - loss: 4.8298 - accuracy: 0.5372 - 186ms/epoch - 4ms/step
Loss: 4.829827785491943, Accuracy: 0.5372303128242493


In [42]:
print("Alternative Model 1 Results")

model_loss, model_accuracy = nn_A1.evaluate(X_test_scaled, y_test, verbose=2)

# Display the model loss and accuracy results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Alternative Model 1 Results
45/45 - 0s - loss: 6.6134 - accuracy: 0.5094 - 259ms/epoch - 6ms/step
Loss: 6.613367080688477, Accuracy: 0.5093945860862732


In [43]:
print("Alternative Model 2 Results")

# Evaluate the model loss and accuracy metrics using the evaluate method and the test data
model_loss, model_accuracy = nn_A2.evaluate(X_test_scaled, y_test, verbose=2)

# Display the model loss and accuracy results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Alternative Model 2 Results
45/45 - 0s - loss: 6.8021 - accuracy: 0.4656 - 379ms/epoch - 8ms/step
Loss: 6.802117824554443, Accuracy: 0.4655532240867615


In [44]:
# Set the file path for the first alternative model
file_path_A1 = Path("AlphabetSoup2.h5")

# Export your model to a HDF5 file
nn_A1.save(file_path_A1)

In [45]:
# Set the file path for the second alternative model
file_path_A2 = Path("AlphabetSoup3.h5")

# Export your model to a HDF5 file
nn_A2.save(file_path_A2)

In [47]:
#XG Boost

In [48]:
import math
from sklearn.linear_model import LogisticRegressionCV


In [50]:
import xgboost as xgb
# create an XGBoost model object
xgb_model = xgb.XGBClassifier()

# fit the model to the training data
xgb_model.fit(X_train_scaled, y_train)

# make predictions on the test data
xgb_pred = xgb_model.predict(X_test_scaled)


In [51]:
from sklearn import metrics

# calculate accuracy on the test data
accuracy = metrics.accuracy_score(y_test, xgb_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))

# get a summary of the performance
report = metrics.classification_report(y_test, xgb_pred)
print("Classification Report:\n", report)


Accuracy: 100.00%
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       653
           1       1.00      1.00      1.00       784

    accuracy                           1.00      1437
   macro avg       1.00      1.00      1.00      1437
weighted avg       1.00      1.00      1.00      1437



In [52]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# create a GBM model object
gbm_model = GradientBoostingClassifier()

# fit the model to the training data
gbm_model.fit(X_train, y_train)

# make predictions on the test data
gbc_pred = gbm_model.predict(X_test_scaled)

# evaluate the accuracy of the model
accuracy = accuracy_score(y_test, gbc_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9951287404314544


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=50)

# Define the parameter grid for the grid search
param_grid = {
    'learning_rate': [0.05, 0.1],
    'n_estimators': [100, 500],
    'max_depth': [3, 5],
    'subsample': [0.8, 1.0],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Create the GBM model object
gbm_model = GradientBoostingClassifier()

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(gbm_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Train the model with the best hyperparameters
gbm_model = grid_search.best_estimator_
gbm_model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = gbm_model.predict(X_test)

# Print the accuracy score
print('Accuracy score: {:.4f}'.format(accuracy_score(y_test, y_pred)))
