In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np 

import pandas as pd 

import matplotlib.pyplot as plt

import missingno as msno

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics 
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import the data

In [None]:
# Load the three CSV files of the "ultimate-ufc-dataset" Kaggle input.
# Only master_df is used by the visible cells below; recent_df and upcoming_df
# are loaded but not referenced again in this part of the notebook.
recent_df = pd.read_csv("../input/ultimate-ufc-dataset/most-recent-event.csv")
master_df = pd.read_csv("../input/ultimate-ufc-dataset/ufc-master.csv")
upcoming_df = pd.read_csv("../input/ultimate-ufc-dataset/upcoming-event.csv")

# Inspect and assess the data

In [None]:
# Preview the first rows of the master table.
master_df.head()

In [None]:
# Summary of the 'Winner' column, which is later encoded as the target (Red -> 1, otherwise 0).
master_df['Winner'].describe()

In [None]:
# missingno nullity matrix over every column of the raw master table.
msno.matrix(master_df)

In [None]:
# Descriptive statistics of the master table (pandas default: numeric columns).
master_df.describe()

In [None]:
# Print every column name as a plain list so that none is truncated.
print(master_df.columns.tolist())

In [None]:
# Columns removed from the working copy: the two match-rank columns, the finish
# details and the per-bout strike / takedown / submission counters.
# NOTE(review): the "_bout" names suggest post-fight statistics, presumably dropped
# so the outcome does not leak into the features -- confirm against the dataset docs.
bout_cols_to_drop = [
    'B_match_weightclass_rank', 'R_match_weightclass_rank',
    'finish', 'finish_details', 'finish_round', 'finish_round_time',
    'total_fight_time_secs',
    'R_kd_bout', 'B_kd_bout',
    'R_sig_str_landed_bout', 'B_sig_str_landed_bout',
    'R_sig_str_attempted_bout', 'B_sig_str_attempted_bout',
    'R_sig_str_pct_bout', 'B_sig_str_pct_bout',
    'R_tot_str_landed_bout', 'B_tot_str_landed_bout',
    'R_tot_str_attempted_bout', 'B_tot_str_attempted_bout',
    'R_td_landed_bout', 'B_td_landed_bout',
    'R_td_attempted_bout', 'B_td_attempted_bout',
    'R_td_pct_bout', 'B_td_pct_bout',
    'R_sub_attempts_bout', 'B_sub_attempts_bout',
    'R_pass_bout', 'B_pass_bout',
    'R_rev_bout', 'B_rev_bout',
]
master_df1 = master_df.drop(columns=bout_cols_to_drop)

In [None]:
# Per-division ranking columns for the red corner, then the blue corner.
# All of them are removed from the working copy.
rank_cols_to_drop = [
    "R_Women's Flyweight_rank", "R_Women's Featherweight_rank",
    "R_Women's Strawweight_rank", "R_Women's Bantamweight_rank",
    'R_Heavyweight_rank', 'R_Light Heavyweight_rank', 'R_Middleweight_rank',
    'R_Welterweight_rank', 'R_Lightweight_rank', 'R_Featherweight_rank',
    'R_Bantamweight_rank', 'R_Flyweight_rank', 'R_Pound-for-Pound_rank',
    "B_Women's Flyweight_rank", "B_Women's Featherweight_rank",
    "B_Women's Strawweight_rank", "B_Women's Bantamweight_rank",
    'B_Heavyweight_rank', 'B_Light Heavyweight_rank', 'B_Middleweight_rank',
    'B_Welterweight_rank', 'B_Lightweight_rank', 'B_Featherweight_rank',
    'B_Bantamweight_rank', 'B_Flyweight_rank', 'B_Pound-for-Pound_rank',
]
master_df1 = master_df1.drop(columns=rank_cols_to_drop)

In [None]:
# Non-null count per remaining column.
master_df1.count()

In [None]:
# 1-based fight id; it is the key used later to re-pair the two corners of a fight.
fight_ids = range(1, len(master_df1) + 1)
master_df1 = master_df1.assign(index=fight_ids)

In [None]:
# Preview the reduced table, now including the 'index' fight id.
master_df1.head()

In [None]:
# Nullity matrix for columns 0-19 (plotted in slices of 20 columns).
msno.matrix(master_df1.iloc[:,0:20])

In [None]:
# Nullity matrix for columns 20-39.
msno.matrix(master_df1.iloc[:,20:40])

In [None]:
# Nullity matrix for columns 40-59.
msno.matrix(master_df1.iloc[:,40:60])

In [None]:
# Nullity matrix for columns 60-80 (the final slice).
msno.matrix(master_df1.iloc[:,60:81])

# Create fighter dataframe to impute missing data points

In [None]:
# Print the remaining column names as a plain list.
print(master_df1.columns.tolist())

In [None]:
# Red-corner view of every fight: date, fighter name, betting columns, record,
# rolling averages, physical attributes and the shared 'index' fight id.
red_corner_cols = [
    'date', 'R_fighter', 'R_odds', 'R_ev',
    'R_current_lose_streak', 'R_current_win_streak', 'R_draw',
    'R_avg_SIG_STR_landed', 'R_avg_SIG_STR_pct', 'R_avg_SUB_ATT',
    'R_avg_TD_landed', 'R_avg_TD_pct',
    'R_longest_win_streak', 'R_losses', 'R_total_rounds_fought', 'R_total_title_bouts',
    'R_win_by_Decision_Majority', 'R_win_by_Decision_Split', 'R_win_by_Decision_Unanimous',
    'R_win_by_KO/TKO', 'R_win_by_Submission', 'R_win_by_TKO_Doctor_Stoppage',
    'R_wins', 'R_Stance', 'R_Height_cms', 'R_Reach_cms', 'R_Weight_lbs', 'R_age',
    'index',
]
fighter_df_R = master_df1[red_corner_cols]

In [None]:
# Blue-corner view of every fight, with the same columns in the same order as the
# red-corner selection so the two frames can be relabelled and stacked.
blue_corner_cols = [
    'date', 'B_fighter', 'B_odds', 'B_ev',
    'B_current_lose_streak', 'B_current_win_streak', 'B_draw',
    'B_avg_SIG_STR_landed', 'B_avg_SIG_STR_pct', 'B_avg_SUB_ATT',
    'B_avg_TD_landed', 'B_avg_TD_pct',
    'B_longest_win_streak', 'B_losses', 'B_total_rounds_fought', 'B_total_title_bouts',
    'B_win_by_Decision_Majority', 'B_win_by_Decision_Split', 'B_win_by_Decision_Unanimous',
    'B_win_by_KO/TKO', 'B_win_by_Submission', 'B_win_by_TKO_Doctor_Stoppage',
    'B_wins', 'B_Stance', 'B_Height_cms', 'B_Reach_cms', 'B_Weight_lbs', 'B_age',
    'index',
]
fighter_df_B = master_df1[blue_corner_cols]

In [None]:
# Display the red-corner frame.
fighter_df_R

In [None]:
# Display the blue-corner frame.
fighter_df_B

In [None]:
# Remember the original blue-corner labels before they are overwritten.
fighter_df_B_columns = fighter_df_B.columns

# Give the blue frame the red-corner labels so both corners share one schema,
# then stack them: blue rows first, red rows second, with a fresh 0..2n-1 index.
shared_labels = fighter_df_R.columns
fighter_df_B.columns = shared_labels
fighter_df = pd.concat(
    objs=[fighter_df_B, fighter_df_R],
    axis='index',
    ignore_index=True,
)

In [None]:
# 1-based position in the stacked frame (blue rows occupy the first half).
# It is used later to restore this order and to split the corners apart again.
stacked_positions = range(1, len(fighter_df) + 1)
fighter_df = fighter_df.assign(index_1=stacked_positions)

In [None]:
# Parse the 'date' column to datetime in both the stacked fighter frame and the
# fight-level frame.
for frame in (fighter_df, master_df1):
    frame['date'] = pd.to_datetime(frame['date'])

In [None]:
# Order rows chronologically within each fighter; the per-fighter interpolate and
# diff steps that follow operate on rows in this order.
fighter_df = fighter_df.sort_values(by=['R_fighter','date'])

In [None]:
# Career-average columns whose gaps are filled per fighter.
avg_stat_cols = ['R_avg_SIG_STR_landed', 'R_avg_SIG_STR_pct', 'R_avg_SUB_ATT',
                 'R_avg_TD_landed', 'R_avg_TD_pct']

# Fill each fighter's missing values by interpolating along that fighter's own rows
# (limit_direction='both' also fills leading and trailing gaps).
# transform() always returns a result indexed like fighter_df, so the assignment
# aligns row-for-row. groupby(...).apply() can instead return a group-keyed
# MultiIndex on newer pandas, which no longer aligns.
for value in avg_stat_cols:
    fighter_df[value] = (
        fighter_df.groupby(['R_fighter'], sort=False)[value]
        .transform(lambda x: x.fillna(x.interpolate(limit_direction='both')))
    )

In [None]:
# Descriptive statistics after the per-fighter interpolation.
fighter_df.describe()

In [None]:
# Nullity matrix of the stacked fighter frame after interpolation.
msno.matrix(fighter_df)

# Feature engineering: Create differencing columns

In [None]:
# Column labels of the stacked fighter frame (all corners use the R_ names here).
fighter_df.columns

In [None]:
# Columns for which a fight-over-fight change is computed per fighter.
# The order matters: each new '<col>_diff' column is appended in this order, and a
# later cell relabels the columns positionally.
diff_source_cols = [
    'R_odds', 'R_ev',
    'R_current_lose_streak', 'R_current_win_streak',
    'R_avg_SIG_STR_landed', 'R_avg_SIG_STR_pct',
    'R_avg_SUB_ATT', 'R_avg_TD_landed', 'R_avg_TD_pct',
    'R_longest_win_streak', 'R_losses',
    'R_total_rounds_fought', 'R_total_title_bouts',
    'R_win_by_Decision_Majority', 'R_win_by_Decision_Split',
    'R_win_by_Decision_Unanimous', 'R_win_by_KO/TKO',
    'R_win_by_Submission', 'R_win_by_TKO_Doctor_Stoppage',
    'R_wins', 'R_Weight_lbs', 'R_age',
]

# Difference between consecutive rows of the same fighter (NaN on each fighter's
# first row). The previous copy-pasted version was missing the call parentheses on
# the 'R_current_win_streak' line, so that column held the bound method itself
# rather than the differences.
for col in diff_source_cols:
    fighter_df[col + '_diff'] = fighter_df.groupby(['R_fighter'])[col].diff()

# Time elapsed since the same fighter's previous row (a timedelta).
fighter_df['R_time_since_last_fight'] = fighter_df.groupby(['R_fighter'])['date'].diff()

In [None]:
# Descriptive statistics including the new per-fighter diff columns.
fighter_df.describe()

In [None]:
# Engineered change columns; each fighter's first row is NaN after diff().
diff_list = ['R_odds_diff', 'R_ev_diff',
       'R_current_lose_streak_diff', 'R_current_win_streak_diff',
       'R_avg_SIG_STR_landed_diff', 'R_avg_SIG_STR_pct_diff',
       'R_avg_SUB_ATT_diff', 'R_avg_TD_landed_diff', 'R_avg_TD_pct_diff',
       'R_longest_win_streak_diff', 'R_losses_diff',
       'R_total_rounds_fought_diff', 'R_total_title_bouts_diff',
       'R_win_by_Decision_Majority_diff', 'R_win_by_Decision_Split_diff',
       'R_win_by_Decision_Unanimous_diff', 'R_win_by_KO/TKO_diff',
       'R_win_by_Submission_diff', 'R_win_by_TKO_Doctor_Stoppage_diff',
       'R_wins_diff', 'R_Weight_lbs_diff', 'R_age_diff',
       'R_time_since_last_fight']

# Fill the gaps per fighter by interpolating in both directions.
# transform() keeps the result indexed like fighter_df, so the assignment aligns
# row-for-row on every pandas version. groupby(...).apply() can instead return a
# group-keyed MultiIndex on newer pandas.
for value in diff_list:
    fighter_df[value] = (
        fighter_df.groupby(['R_fighter'], sort=False)[value]
        .transform(lambda x: x.fillna(x.interpolate(limit_direction='both')))
    )

In [None]:
# Fill any value that is still missing with that fighter's own column mean.
# NOTE(review): x.mean() is called on a mixed-dtype frame (strings, datetimes,
# timedeltas). How non-numeric columns are handled depends on the pandas version:
# older releases skip them, newer ones may raise. Confirm on the target runtime.
# NOTE(review): depending on the pandas version the result may carry 'R_fighter' as
# an extra index level; later cells call reset_index(drop=True) on the split frames.
fighter_df = fighter_df.groupby(['R_fighter']).apply(lambda x: x.fillna(x.mean()))

In [None]:
# Restore the original stacked order (blue rows first, then red rows).
fighter_df = fighter_df.sort_values(['index_1'])

In [None]:
# fighter_df was built by stacking the blue-corner rows first and the red-corner
# rows second, and index_1 numbers that stack from 1. The boundary between the two
# corners is therefore the number of fights. It is taken from the data rather than
# hard-coded, so the split stays correct when the dataset gains rows.
n_fights = len(master_df1)

split_B = fighter_df[fighter_df['index_1'] <= n_fights]
split_R = fighter_df[fighter_df['index_1'] > n_fights]

In [None]:
# Column labels of the blue-corner half (still carrying the R_ names at this point).
split_B.columns

In [None]:
# Give the blue-corner half its B_ names back. Every label starting with 'R_' gets
# the 'B_' prefix instead; the shared labels ('date', 'index', 'index_1') are left
# alone. Deriving the names avoids a hand-maintained positional list that must be
# kept in sync with the feature-engineering cells.
# Only the leading prefix is replaced: names such as 'R_avg_SIG_STR_landed' also
# contain 'R_' in the middle.
split_B.columns = [
    'B_' + label[2:] if label.startswith('R_') else label
    for label in split_B.columns
]

split_B

In [None]:
# Display the red-corner half.
split_R

In [None]:
# Discard the existing index and renumber the red-corner rows from 0.
split_R = split_R.reset_index(drop=True)

In [None]:
# Discard the existing index and renumber the blue-corner rows from 0.
split_B = split_B.reset_index(drop=True)

In [None]:
# Re-pair the two corners of each fight on the shared 'index' fight id.
# Labels present in both halves ('date', 'index_1') receive pandas' default
# suffixes: _x for the left (blue) frame and _y for the right (red) frame.
df = split_B.merge(split_R, on="index", how = 'inner')

In [None]:
# Print the merged column names, including the suffixed duplicates.
print(df.columns.tolist())

In [None]:
# Keep a single 'date' column: rename the left-hand copy and drop the right-hand one.
df = (
    df.rename(columns={"date_x": "date"})
      .drop(columns=['date_y'])
)
df

In [None]:
# Nullity matrix of the merged per-fight frame.
msno.matrix(df)

In [None]:
# Nullity matrix of the fight-level table, for comparison with the merged frame.
msno.matrix(master_df1)

In [None]:
# List the fight-level columns available to copy into the merged frame.
print(master_df1.columns.tolist())

In [None]:
# Fight-level columns (not tied to one corner) to carry over from master_df1.
cols = ['location', 'country', 'Winner', 'title_bout', 'weight_class', 'gender', 'no_of_rounds','lose_streak_dif', 'win_streak_dif', 
   'longest_win_streak_dif', 'win_dif', 'loss_dif', 'total_round_dif', 'total_title_bout_dif', 'ko_dif', 'sub_dif', 'height_dif', 
   'reach_dif', 'age_dif', 'sig_str_dif', 'avg_sub_att_dif', 'avg_td_dif', 'empty_arena', 'constant_1', 'better_rank'] 

# Look the values up by fight id instead of copying them positionally. A plain
# `.values` copy silently mislabels rows if df and master_df1 ever differ in order,
# and fails if they differ in length. `.loc` raises a KeyError for an unknown
# fight id rather than producing misaligned data.
fight_level = master_df1.set_index('index').loc[df['index'].to_numpy(), cols]

for value in cols:
    df[value] = fight_level[value].to_numpy()

In [None]:
# Nullity matrix after attaching the fight-level columns.
msno.matrix(df)

In [None]:
# Descriptive statistics of the assembled modelling frame.
df.describe()

In [None]:
# Remove every fight (row) that still has at least one missing value, then
# re-summarise what is left.
df = df.dropna(axis='index', how='any')
df.describe()

In [None]:
# Print the columns of the cleaned frame.
print(df.columns.tolist())

In [None]:
# Binary target: 1 when the red corner won, 0 for every other value.
# Re-running this cell on the already-encoded column maps everything to 0.
df['Winner'] = (df['Winner'] == 'Red').astype(int)

In [None]:
# Partition the column names by dtype: object-typed columns are treated as
# categorical, everything else as numeric.
cat_col = df.select_dtypes(include=['object']).columns.tolist()
categorical_names = set(cat_col)
num_col = [name for name in df.columns if name not in categorical_names]

In [None]:
# Label encoding (integer codes per category, not one-hot) of every object-typed
# column. The same encoder instance is refitted for each column.
enc = LabelEncoder()
for col_name in cat_col:
    # Cast to str first so columns mixing floats and strings do not raise.
    as_text = df[col_name].astype(str)
    df[col_name] = enc.fit_transform(as_text)

In [None]:
# Replace each timestamp by its proleptic Gregorian ordinal (an integer day number)
# so the date can be used as a numeric feature.
df['date'] = df['date'].map(pd.Timestamp.toordinal)

# Set feature/target variables, and split into training and test sets



In [None]:
# Model inputs: the fight date, the blue-corner stats and their per-fighter diffs,
# the same for the red corner, the fight-level columns, and finally the betting
# columns (odds and ev for both corners).
# Identifier-like columns (fighter names, location, country, index, index_1) and
# the time-since-last-fight columns are not included.
feature_cols = [ 'date', 'B_current_lose_streak', 'B_current_win_streak', 'B_draw', 
                'B_avg_SIG_STR_landed', 'B_avg_SIG_STR_pct', 'B_avg_SUB_ATT', 'B_avg_TD_landed', 'B_avg_TD_pct', 
                'B_longest_win_streak', 'B_losses', 'B_total_rounds_fought', 'B_total_title_bouts', 'B_win_by_Decision_Majority', 
                'B_win_by_Decision_Split', 'B_win_by_Decision_Unanimous', 'B_win_by_KO/TKO', 'B_win_by_Submission', 
                'B_win_by_TKO_Doctor_Stoppage', 'B_wins', 'B_Stance', 'B_Height_cms', 'B_Reach_cms', 'B_Weight_lbs', 
                'B_age', 'B_odds_diff', 'B_ev_diff', 'B_current_lose_streak_diff', 
                'B_current_win_streak_diff', 'B_avg_SIG_STR_landed_diff', 'B_avg_SIG_STR_pct_diff', 'B_avg_SUB_ATT_diff', 
                'B_avg_TD_landed_diff', 'B_avg_TD_pct_diff', 'B_longest_win_streak_diff', 'B_losses_diff', 
                'B_total_rounds_fought_diff', 'B_total_title_bouts_diff', 'B_win_by_Decision_Majority_diff', 
                'B_win_by_Decision_Split_diff', 'B_win_by_Decision_Unanimous_diff', 'B_win_by_KO/TKO_diff', 
                'B_win_by_Submission_diff', 'B_win_by_TKO_Doctor_Stoppage_diff', 'B_wins_diff', 'B_Weight_lbs_diff', 
                'B_age_diff', 'R_current_lose_streak', 
                'R_current_win_streak', 'R_draw', 'R_avg_SIG_STR_landed', 'R_avg_SIG_STR_pct', 'R_avg_SUB_ATT', 
                'R_avg_TD_landed', 'R_avg_TD_pct', 'R_longest_win_streak', 'R_losses', 'R_total_rounds_fought', 
                'R_total_title_bouts', 'R_win_by_Decision_Majority', 'R_win_by_Decision_Split', 'R_win_by_Decision_Unanimous', 
                'R_win_by_KO/TKO', 'R_win_by_Submission', 'R_win_by_TKO_Doctor_Stoppage', 'R_wins', 'R_Stance', 'R_Height_cms', 
                'R_Reach_cms', 'R_Weight_lbs', 'R_age', 'R_odds_diff', 'R_ev_diff', 'R_current_lose_streak_diff', 
                'R_current_win_streak_diff', 'R_avg_SIG_STR_landed_diff', 'R_avg_SIG_STR_pct_diff', 'R_avg_SUB_ATT_diff', 
                'R_avg_TD_landed_diff', 'R_avg_TD_pct_diff', 'R_longest_win_streak_diff', 'R_losses_diff', 
                'R_total_rounds_fought_diff', 'R_total_title_bouts_diff', 'R_win_by_Decision_Majority_diff', 
                'R_win_by_Decision_Split_diff', 'R_win_by_Decision_Unanimous_diff', 'R_win_by_KO/TKO_diff', 
                'R_win_by_Submission_diff', 'R_win_by_TKO_Doctor_Stoppage_diff', 'R_wins_diff', 'R_Weight_lbs_diff', 
                'R_age_diff', 'title_bout', 'weight_class', 
                'gender', 'no_of_rounds', 'lose_streak_dif', 'win_streak_dif', 'longest_win_streak_dif', 'win_dif', 
                'loss_dif', 'total_round_dif', 'total_title_bout_dif', 'ko_dif', 'sub_dif', 'height_dif', 'reach_dif', 
                'age_dif', 'sig_str_dif', 'avg_sub_att_dif', 'avg_td_dif', 'empty_arena',  'better_rank','B_odds', 'B_ev','R_odds', 'R_ev']



# The betting columns 'B_odds', 'B_ev', 'R_odds', 'R_ev' are currently included at
# the end of feature_cols; remove them there to train without betting information.
target_cols = ['Winner']             

X = df[feature_cols] # Features

y = df[target_cols] # Target variable (single-column DataFrame; ravel() it for sklearn)

In [None]:
# Split dataset into training set and test set.
# shuffle=False keeps the current row order: the first 70% of the rows become the
# training set and the last 30% the test set. random_state has no effect when
# shuffle is False.
# NOTE(review): if the rows are ordered newest-first, the model is trained on later
# fights and tested on earlier ones -- confirm the row order of df.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42,shuffle = False) #, stratify=y.values.ravel()) 

# Model Building

In [None]:
# keras/tf
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Scale every feature to [0, 1]. The scaler is fitted on the training split only
# and then applied to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"X_train_scaled shape: {X_train_scaled.shape} | X_val_scaled shape: {X_test_scaled.shape} | y_train shape: {y_train.shape} | y_val shape: {y_test.shape}")

# model: a small fully-connected binary classifier
dnnClf = Sequential()

# first hidden layer; the input width is read from the data so the network keeps
# working when feature_cols is edited (it was hard-coded to 118 before)
dnnClf.add(Dense(units=20, input_dim=X_train_scaled.shape[1], activation='relu'))
dnnClf.add(Dropout(0.5)) # drops 50% of this layer's outputs at random during training

# second hidden layer
dnnClf.add(Dense(units=10, activation='relu'))
dnnClf.add(Dropout(0.5)) # drops 50% of this layer's outputs at random during training

# output layer: a single sigmoid unit giving the probability of class 1
dnnClf.add(Dense(units=1, activation='sigmoid'))

dnnClf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

from tensorflow.keras.callbacks import EarlyStopping # prevent divergence of loss & val_loss
# Stop training once val_loss has not improved for 16 consecutive epochs.
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=16)

In [None]:
# Train for up to 400 epochs with early stopping, then plot the per-epoch metrics.
# The test split doubles as the validation set here, so early stopping is tuned on
# the same data that is later used for evaluation.
fit_history = dnnClf.fit(
    x=X_train_scaled,
    y=y_train,
    validation_data=(X_test_scaled, y_test),
    epochs=400,
    callbacks=[early_stop],
    verbose=1,
)
model_loss = pd.DataFrame(fit_history.history)
model_loss.plot()

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

# Predicted probability of class 1 for every test row, flattened to 1-D.
# The already-scaled test matrix is reused instead of re-scaling X_test.
dnnProbs = dnnClf.predict(X_test_scaled).ravel()
# Hard labels at the 0.5 threshold, used only for the classification report.
dnnPreds = (dnnProbs > 0.5).astype(int)
target_names = ['class 0', 'class 1']
print("DNN Validation Performance on UNBALANCED(!):\n------------------\n",classification_report(y_test, dnnPreds , target_names=target_names))
# ROC AUC is a ranking metric and must be computed from the continuous scores.
# Feeding it the thresholded 0/1 labels collapses the curve to a single point.
print("AUC: ",roc_auc_score(y_test, dnnProbs))

In [None]:
# Train a baseline random forest classifier.
# random_state is fixed so the reported accuracy, ROC curve and confusion matrix
# are reproducible across runs.

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train,y_train.values.ravel())

y_pred_rf = rf.predict(X_test)

In [None]:
# Share of test fights whose winner the baseline forest predicted correctly.
print("Random Forest Accuracy:",metrics.accuracy_score(y_test, y_pred_rf))

In [None]:
# ROC curve of the baseline forest, built from the positive-class probabilities.
probs = rf.predict_proba(X_test)
preds = probs[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# Draw the curve on an explicit Axes, together with the chance diagonal.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
ax.plot([0, 1], [0, 1], 'r--')
ax.set_title('Receiver Operating Characteristic: RF')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend(loc='lower right')
plt.show()

In [None]:
# Plot the confusion matrix of the baseline forest on the test split.
# NOTE(review): plot_confusion_matrix was deprecated and later removed from
# scikit-learn in favour of ConfusionMatrixDisplay.from_estimator -- confirm the
# installed version before upgrading the environment.

plot_confusion_matrix(rf, X_test, y_test) 
plt.title("Random Forest Confusion Matrix")
plt.show() 

In [None]:
# Accuracy on the train and test splits as a function of tree depth
# (100 trees, fixed seed).
depth_grid = list(range(1, 16)) + [20, 30, 40, 50]
train_scores = []
test_scores = []

for depth in depth_grid:
    forest = RandomForestClassifier(max_depth=depth, n_estimators=100, random_state=42)
    forest.fit(X_train, y_train.values.ravel())

    # For a classifier, score() is the mean accuracy of predict() on the given data.
    train_scores.append(forest.score(X_train, y_train.values.ravel()))
    test_scores.append(forest.score(X_test, y_test.values.ravel()))

plt.plot(depth_grid, train_scores, label="train accuracy")
plt.plot(depth_grid, test_scores, label="test accuracy")
plt.xlabel('Max depth')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Accuracy on the train and test splits as a function of forest size
# (max_depth fixed at 13, fixed seed).
size_grid = [10, 20, 40, 60, 80, 100, 120, 1000]
train_scores = []
test_scores = []

for n_trees in size_grid:
    forest = RandomForestClassifier(max_depth=13, n_estimators=n_trees, random_state=42)
    forest.fit(X_train, y_train.values.ravel())

    # For a classifier, score() is the mean accuracy of predict() on the given data.
    train_scores.append(forest.score(X_train, y_train.values.ravel()))
    test_scores.append(forest.score(X_test, y_test.values.ravel()))

plt.plot(size_grid, train_scores, label="train accuracy")
plt.plot(size_grid, test_scores, label="test accuracy")
plt.xlabel('n_estimators')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Initialize a random forest intended for the hyperparameter search.
# NOTE: this instance is never fitted -- `rf` is reassigned in the search cell
# below before it is used.

rf = RandomForestClassifier(random_state = 42)

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 1000, num = 10)]

# Number of features to consider at every split.
# 'auto' was replaced with 'log2': for classifiers 'auto' is only an alias of
# 'sqrt' (and is deprecated), so listing both searched the same setting twice.
max_features = ['sqrt', 'log2']

# Maximum number of levels in tree (None = grow until leaves are pure / minimal)
max_depth = [int(x) for x in np.linspace(2, 50, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 20]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 6, 8]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

In [None]:
# Use the random grid to search for best hyperparameters.
# First create the base model to tune. It is seeded so that the cross-validation
# scores, and therefore the selected parameters, are reproducible. Seeding only the
# search fixes which candidates are sampled, not how each forest is grown.
rf = RandomForestClassifier(random_state=42)

# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, y_train.values.ravel())

In [None]:
# Get the best random forest hyperparameters found by the randomized search.
rf_random.best_params_

In [None]:
# Tuned forest with hand-copied hyperparameters.
# NOTE(review): these values are presumably taken from an earlier run of the
# randomized search -- confirm they still match rf_random.best_params_.
# max_features='sqrt' is what the deprecated 'auto' alias meant for classifiers.
# random_state is fixed so the reported metrics are reproducible.
rf1 = RandomForestClassifier(n_estimators=450,
 min_samples_split= 5,
 min_samples_leaf= 6,
 max_features= 'sqrt',
 max_depth= 21,
 bootstrap= False,
 random_state=42)
rf1.fit(X_train,y_train.values.ravel())

y_pred_rf = rf1.predict(X_test)

In [None]:
# Test accuracy of the tuned forest (y_pred_rf now holds rf1's predictions).
print("Random Forest Accuracy:",metrics.accuracy_score(y_test, y_pred_rf))

In [None]:
# ROC curve of the tuned forest, built from the positive-class probabilities.
probs = rf1.predict_proba(X_test)
preds = probs[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# Draw the curve on an explicit Axes, together with the chance diagonal.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
ax.plot([0, 1], [0, 1], 'r--')
ax.set_title('Receiver Operating Characteristic: RF')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend(loc='lower right')
plt.show()

In [None]:
# Confusion matrix of the tuned forest on the test split.
# NOTE(review): plot_confusion_matrix was deprecated and later removed from
# scikit-learn in favour of ConfusionMatrixDisplay.from_estimator -- confirm the
# installed version before upgrading the environment.
plot_confusion_matrix(rf1, X_test, y_test)  
plt.title("Tuned Random Forest Confusion Matrix")
plt.show()  

In [None]:
# Gradient-boosted trees with a reduced learning rate and a fixed seed.
xgb = XGBClassifier(
    booster='gbtree',
    learning_rate=0.05,
    random_state=42,
)
xgb.fit(X_train, y_train.values.ravel())
# Mean accuracy on the test split. This is not the cell's last expression, so the
# value is not displayed.
xgb.score(X_test, y_test)

y_pred_xgb = xgb.predict(X_test)

In [None]:
# Share of test fights whose winner the XGBoost model predicted correctly.
print("XGB Accuracy:",metrics.accuracy_score(y_test, y_pred_xgb))

In [None]:
# ROC curve of the XGBoost model, built from the positive-class probabilities.
probs = xgb.predict_proba(X_test)
preds = probs[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# Draw the curve on an explicit Axes, together with the chance diagonal.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
ax.plot([0, 1], [0, 1], 'r--')
ax.set_title('Receiver Operating Characteristic: XGB')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend(loc='lower right')
plt.show()

In [None]:
# Confusion matrix of the XGBoost model on the test split.
# NOTE(review): plot_confusion_matrix was deprecated and later removed from
# scikit-learn in favour of ConfusionMatrixDisplay.from_estimator -- confirm the
# installed version before upgrading the environment.
plot_confusion_matrix(xgb, X_test, y_test)  
plt.title("XGB Confusion Matrix")
plt.show()  