In [1]:
# Import all packages and dependencies
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import sys, warnings, os
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from IPython.core.display import HTML, display

In [2]:
#Setting columns and rows to display all the results
pd.set_option("display.max_columns", None, "display.max_rows", None)

In [3]:
def load_notebook_config(width=True):
    """
    Loads all neccesary configuration for the notebook's style:
     - plots styling.
     - pandas table sizes and limiting amount of float decimals.
     - adjust the notebook cells width
    """
    pd.options.display.max_columns = 0
    pd.set_option('display.float_format', lambda x: '%.4f' % x)
    pd.options.mode.chained_assignment = None

    if width:
        display(HTML("""<link href='https://fonts.googleapis.com/css?family=Montserrat' rel='stylesheet'>
                        <style> div.text_cell_render{font-family: 'Montserrat';}
                                .container { width:95% !important;}
                        </style>"""))
load_notebook_config()

In [4]:
#Ignore warnings when validating scores
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"

In [5]:
# Read in the dataset
df = pd.read_csv("../Resources/data_preprocessed.csv")
df.head()

Unnamed: 0,index,Winner,title_bout,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,draw_diff,avg_sig_str_pct_diff,avg_TD_pct_diff,win_by_Decision_Majority_diff,win_by_Decision_Split_diff,win_by_Decision_Unanimous_diff,win_by_TKO_Doctor_Stoppage_diff,odds_diff,ev_diff,kd_bout_diff,sig_str_landed_bout_diff,sig_str_attempted_bout_diff,sig_str_pct_bout_diff,tot_str_landed_bout_diff,tot_str_attempted_bout_diff,td_landed_bout_diff,td_attempted_bout_diff,td_pct_bout_diff,sub_attempts_bout_diff,pass_bout_diff,rev_bout_diff,Stance_diff,better_rank_enc,level_0
0,0,0,0,0,-1,-7,-27,-13,-57,-1,-17,-8,7.62,0.0,-8,1.05,-0.6,-0.75,0,0.0,0.0,-1,1,-2,0,-332,-95.0549,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,-1,0
1,1,1,0,0,0,0,12,7,80,9,2,1,-12.7,-5.08,11,-3.18,-0.2,1.21,1,0.0,0.0,0,0,9,0,700,275.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,-1,1
2,2,1,0,-1,0,-3,-6,-3,-21,0,-2,-2,0.0,2.54,-3,-4.2,-1.1,-1.08,0,0.0,0.0,0,-1,-1,0,225,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1,-1,2
3,3,0,0,0,-1,-1,5,2,15,0,0,3,2.54,-5.08,-5,-1.09,0.1,0.71,1,0.0,0.0,0,1,1,0,230,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1,-1,3
4,4,0,0,-1,0,0,6,3,29,0,-2,4,-7.62,-7.62,5,-1.82,0.6,2.86,0,0.0,0.0,0,2,2,0,400,130.5556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,4


In [6]:
# Index is only needed for SQL join so we must drop it here so it doesnt effect outcomes
index = ['index']
df.drop(index, axis=1, inplace = True)

In [7]:
# Establish X variables and y target
y = df.Winner
X = df.drop(['Winner'], axis=1)

In [8]:
# Split dataset into train test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size = 0.25)

In [9]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
rf_model = RandomForestClassifier(random_state=42)

In [11]:
# Fit the model w the training data
rf_model.fit(X_train_scaled, y_train)

RandomForestClassifier(random_state=42)

In [12]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
acc_score = accuracy_score(y_test, predictions)

In [13]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,317,141
Actual 1,109,575


In [14]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,317,141
Actual 1,109,575


Accuracy Score : 0.7810858143607706
Classification Report
              precision    recall  f1-score   support

           0       0.74      0.69      0.72       458
           1       0.80      0.84      0.82       684

    accuracy                           0.78      1142
   macro avg       0.77      0.77      0.77      1142
weighted avg       0.78      0.78      0.78      1142



In [15]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.00315874, 0.01436497, 0.01284411, 0.01631189, 0.01902831,
       0.0200489 , 0.02850978, 0.00887626, 0.0171429 , 0.01572298,
       0.02388869, 0.02672079, 0.0302397 , 0.03748107, 0.03015492,
       0.0361326 , 0.00231787, 0.02903994, 0.        , 0.0030225 ,
       0.01043875, 0.01678938, 0.00322808, 0.07019336, 0.06287763,
       0.02350985, 0.08988226, 0.02906603, 0.04085422, 0.06145082,
       0.03920009, 0.01999072, 0.01282278, 0.021901  , 0.01894976,
       0.03949163, 0.00330994, 0.0138942 , 0.00555775, 0.04158479])

In [16]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.08988226485680968, 'sig_str_landed_bout_diff'),
 (0.07019336378797492, 'odds_diff'),
 (0.06287762737813922, 'ev_diff'),
 (0.061450822029868235, 'tot_str_landed_bout_diff'),
 (0.04158479237543058, 'level_0'),
 (0.04085421649194397, 'sig_str_pct_bout_diff'),
 (0.039491626364137826, 'pass_bout_diff'),
 (0.0392000945946951, 'tot_str_attempted_bout_diff'),
 (0.03748106506197009, 'sig_str_dif'),
 (0.036132600046866, 'avg_td_dif'),
 (0.03023970423646616, 'age_dif'),
 (0.030154920750003925, 'avg_sub_att_dif'),
 (0.02906603424329276, 'sig_str_attempted_bout_diff'),
 (0.029039944581263653, 'avg_sig_str_pct_diff'),
 (0.028509775288449033, 'total_round_dif'),
 (0.02672079322686344, 'reach_dif'),
 (0.023888694257797553, 'height_dif'),
 (0.023509850672159452, 'kd_bout_diff'),
 (0.021901003359226463, 'td_pct_bout_diff'),
 (0.020048901423625778, 'loss_dif'),
 (0.019990722837906493, 'td_landed_bout_diff'),
 (0.019028313967902583, 'win_dif'),
 (0.01894976181761797, 'sub_attempts_bout_diff'),
 (0.017

In [17]:
# Define the model - deep neural net
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 =  25
hidden_nodes_layer2 = 12

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))


# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=25)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
36/36 - 0s - loss: 0.4440 - accuracy: 0.7793
Loss: 0.44398924708366394, Accuracy: 0.7793344855308533
