In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
from xgboost import XGBClassifier
import re
from sklearn.metrics import classification_report
from xgboost import plot_importance
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
# from skopt import BayesSearchCV
import warnings
warnings.filterwarnings('ignore')
import pickle

# Data reading and preprocessing

In [2]:
df = pd.read_csv('man_odi_data_2.csv')

In [3]:
df.head(3)

Unnamed: 0,Match Date,Team1 Name,Team1 Captain,Team2 Name,Team2 Captain,Match Venue (Stadium),Match Venue (City),Match Venue (Country),Toss Winner,Toss Winner Choice,Match Winner,Team1 Playing 11,Team2 Playing 11,Debut Players
0,07-01-1988,Australia,1572,New Zealand,1698,Melbourne Cricket Ground,Melbourne,Australia,Australia,bat,Australia,"['1767', '1793', '1754', '1572', '1871', '1795...","['1550', '1863', '1861', '1669', '1698', '1846...",[]
1,12-01-1988,New Zealand,1698,Sri Lanka,1664,Bellerive Oval,Hobart,Australia,Sri Lanka,bowl,Sri Lanka,"['1777', '1550', '1698', '1669', '1861', '1846...","['1810', '1864', '1789', '1762', '1666', '1664...",[]
2,17-01-1988,New Zealand,1698,Australia,1572,Brisbane Cricket Ground,Brisbane,Australia,Australia,bowl,Australia,"['1550', '1863', '1698', '1669', '1407', '1790...","['1793', '1767', '1773', '1754', '1871', '1795...",['1790']


In [4]:
df.columns

Index(['Match Date', 'Team1 Name', 'Team1 Captain', 'Team2 Name',
       'Team2 Captain', 'Match Venue (Stadium)', 'Match Venue (City)',
       'Match Venue (Country)', 'Toss Winner', 'Toss Winner Choice',
       'Match Winner', 'Team1 Playing 11', 'Team2 Playing 11',
       'Debut Players'],
      dtype='object')

In [5]:
player_list = set([])
for cell in df['Team1 Playing 11']:
    lst = cell.strip("[]").replace("'", "").replace(' ', '').split(',')
    lst = [int(x) for x in lst]
    for player in lst:
        player_list.add(player)
player_list = list(player_list)

In [6]:
len(player_list)

1256

In [7]:
player_list = set([])
for cell in df['Team2 Playing 11']:
    lst = cell.strip("[]").replace("'", "").replace(' ', '').split(',')
    lst = [int(x) for x in lst]
    for player in lst:
        player_list.add(player)
player_list = list(player_list)
len(player_list)

2179

In [8]:
df.isnull().sum()

Match Date                 0
Team1 Name                 0
Team1 Captain              0
Team2 Name                 0
Team2 Captain              0
Match Venue (Stadium)      0
Match Venue (City)         0
Match Venue (Country)      0
Toss Winner                0
Toss Winner Choice         0
Match Winner             133
Team1 Playing 11           0
Team2 Playing 11           0
Debut Players              0
dtype: int64

In [9]:
df.head()

Unnamed: 0,Match Date,Team1 Name,Team1 Captain,Team2 Name,Team2 Captain,Match Venue (Stadium),Match Venue (City),Match Venue (Country),Toss Winner,Toss Winner Choice,Match Winner,Team1 Playing 11,Team2 Playing 11,Debut Players
0,07-01-1988,Australia,1572,New Zealand,1698,Melbourne Cricket Ground,Melbourne,Australia,Australia,bat,Australia,"['1767', '1793', '1754', '1572', '1871', '1795...","['1550', '1863', '1861', '1669', '1698', '1846...",[]
1,12-01-1988,New Zealand,1698,Sri Lanka,1664,Bellerive Oval,Hobart,Australia,Sri Lanka,bowl,Sri Lanka,"['1777', '1550', '1698', '1669', '1861', '1846...","['1810', '1864', '1789', '1762', '1666', '1664...",[]
2,17-01-1988,New Zealand,1698,Australia,1572,Brisbane Cricket Ground,Brisbane,Australia,Australia,bowl,Australia,"['1550', '1863', '1698', '1669', '1407', '1790...","['1793', '1767', '1773', '1754', '1871', '1795...",['1790']
3,19-01-1988,Sri Lanka,1664,Australia,1572,Sydney Cricket Ground,Sydney,Australia,Australia,bowl,Australia,"['1810', '1864', '1789', '1753', '1762', '1666...","['1767', '1793', '1754', '1871', '1795', '1572...",['1753']
4,21-10-1988,India,1491,West Indies,1433,Sharjah Cricket Association Stadium,Sharjah,United Arab Emirates,India,bat,West Indies,"['1653', '1639', '1353', '1491', '1774', '1568...","['1433', '1553', '1744', '1873', '1697', '1654...",[]


In [10]:
team_names = [
    'Australia',
    'New Zealand',
    'Sri Lanka',
    'England',
    'India',
    'Pakistan',
    'Bangladesh',
    'South Africa',
    'Netherlands',
    'Afghanistan'
]
df1 = pd.DataFrame()

for team in team_names:
    team_data = df[df['Team1 Name'] == team]
    df1 = pd.concat([df1, team_data], ignore_index=True)

In [11]:
team_names = [
    'Australia',
    'New Zealand',
    'Sri Lanka',
    'England',
    'India',
    'Pakistan',
    'Bangladesh',
    'South Africa',
    'Netherlands',
    'Afghanistan'
]

df2 = pd.DataFrame()

for team in team_names:
    team_data = df1[df1['Team2 Name'] == team]
    df2 = pd.concat([df2, team_data], ignore_index=True)

In [12]:
df2.columns

Index(['Match Date', 'Team1 Name', 'Team1 Captain', 'Team2 Name',
       'Team2 Captain', 'Match Venue (Stadium)', 'Match Venue (City)',
       'Match Venue (Country)', 'Toss Winner', 'Toss Winner Choice',
       'Match Winner', 'Team1 Playing 11', 'Team2 Playing 11',
       'Debut Players'],
      dtype='object')

In [13]:
# winner = []
# for match_win, team_one in zip(df['Match Winner'], df['Team1 Name']):
#     if match_win == team_one:
#         winner.append(0)
#     else:
#         winner.append(1)

In [14]:
# df['match_winner'] = winner

In [15]:
df2 = df2[['Team1 Name','Team2 Name','Match Venue (Stadium)',
       'Match Venue (City)', 'Match Venue (Country)',
       'Match Winner','Team1 Playing 11','Team2 Playing 11']]

In [16]:
df2.isnull().sum()

Team1 Name                 0
Team2 Name                 0
Match Venue (Stadium)      0
Match Venue (City)         0
Match Venue (Country)      0
Match Winner             109
Team1 Playing 11           0
Team2 Playing 11           0
dtype: int64

In [17]:
team_1_players_df = pd.DataFrame(columns=['team1_P1','team1_P2','team1_P3','team1_P4','team1_P5','team1_P6','team1_P7','team1_P8','team1_P9','team1_P10','team1_P11'])
for x in df2['Team1 Playing 11']:
    arr = np.array(x.replace('[', '').replace(']', '').replace("'", "").split(', '))
    arr = arr.astype('int')
    team_1_players_df.loc[len(team_1_players_df)] = arr

In [18]:
team_2_players_df = pd.DataFrame(columns=['team2_P1','team2_P2','team2_P3','team2_P4','team2_P5','team2_P6','team2_P7','team2_P8','team2_P9','team2_P10','team2_P11'])
for x in df2['Team2 Playing 11']:
    arr = np.array(x.replace('[', '').replace(']', '').replace("'", "").split(', '))
    arr = arr.astype('int')
    team_2_players_df.loc[len(team_2_players_df)] = arr
    

In [19]:
df2 = pd.concat([df2, team_1_players_df, team_2_players_df], axis=1)

In [20]:
#df = df.drop(columns=['Match Winner', 'Toss Winner'], axis=1)

In [21]:
#df = df.drop(columns=['Debut Players', 'Team1 Playing 11', 'Team2 Playing 11'], axis=1)

In [22]:
df2.columns

Index(['Team1 Name', 'Team2 Name', 'Match Venue (Stadium)',
       'Match Venue (City)', 'Match Venue (Country)', 'Match Winner',
       'Team1 Playing 11', 'Team2 Playing 11', 'team1_P1', 'team1_P2',
       'team1_P3', 'team1_P4', 'team1_P5', 'team1_P6', 'team1_P7', 'team1_P8',
       'team1_P9', 'team1_P10', 'team1_P11', 'team2_P1', 'team2_P2',
       'team2_P3', 'team2_P4', 'team2_P5', 'team2_P6', 'team2_P7', 'team2_P8',
       'team2_P9', 'team2_P10', 'team2_P11'],
      dtype='object')

In [23]:
df2.head()

Unnamed: 0,Team1 Name,Team2 Name,Match Venue (Stadium),Match Venue (City),Match Venue (Country),Match Winner,Team1 Playing 11,Team2 Playing 11,team1_P1,team1_P2,...,team2_P2,team2_P3,team2_P4,team2_P5,team2_P6,team2_P7,team2_P8,team2_P9,team2_P10,team2_P11
0,New Zealand,Australia,Brisbane Cricket Ground,Brisbane,Australia,Australia,"['1550', '1863', '1698', '1669', '1407', '1790...","['1793', '1767', '1773', '1754', '1871', '1795...",1550,1863,...,1767,1773,1754,1871,1795,1572,1854,1859,1875,1797
1,New Zealand,Australia,Carisbrook,Dunedin,New Zealand,Australia,"['1456', '1328', '1251', '1326', '1311', '1259...","['1215', '1278', '1243', '1364', '1418', '1277...",1456,1328,...,1278,1243,1364,1418,1277,1361,1402,1419,1528,1423
2,New Zealand,Australia,Melbourne Cricket Ground,Melbourne,Australia,Australia,"['1566', '1550', '1421', '1625', '1407', '1311...","['1541', '1572', '1364', '1530', '1277', '1646...",1566,1550,...,1572,1364,1530,1277,1646,1547,1361,1369,1617,1525
3,New Zealand,Australia,Sydney Cricket Ground,Sydney,Australia,New Zealand,"['1550', '1566', '1444', '1408', '1311', '1421...","['1572', '1547', '1364', '1530', '1277', '1646...",1550,1566,...,1547,1364,1530,1277,1646,1361,1617,1369,1569,1525
4,New Zealand,Australia,Sydney Cricket Ground,Sydney,Australia,New Zealand,"['1550', '1566', '1444', '1421', '1311', '1407...","['1547', '1647', '1361', '1364', '1530', '1572...",1550,1566,...,1647,1361,1364,1530,1572,1277,1369,1402,1569,1525


In [24]:
df2['Team1 Name'] = df2['Team1 Name'].replace({'Australia': 1})
df2['Team1 Name'] = df2['Team1 Name'].replace({'New Zealand': 2})
df2['Team1 Name'] = df2['Team1 Name'].replace({'Sri Lanka': 3})
df2['Team1 Name'] = df2['Team1 Name'].replace({'England': 4})
df2['Team1 Name'] = df2['Team1 Name'].replace({'India': 5})
df2['Team1 Name'] = df2['Team1 Name'].replace({'Pakistan': 6})
df2['Team1 Name'] = df2['Team1 Name'].replace({'Bangladesh': 7})
df2['Team1 Name'] = df2['Team1 Name'].replace({'South Africa': 8})
df2['Team1 Name'] = df2['Team1 Name'].replace({'Netherlands': 9})
df2['Team1 Name'] = df2['Team1 Name'].replace({'Afghanistan': 10})

df2['Team2 Name'] = df2['Team2 Name'].replace({'Australia': 1})
df2['Team2 Name'] = df2['Team2 Name'].replace({'New Zealand': 2})
df2['Team2 Name'] = df2['Team2 Name'].replace({'Sri Lanka': 3})
df2['Team2 Name'] = df2['Team2 Name'].replace({'England': 4})
df2['Team2 Name'] = df2['Team2 Name'].replace({'India': 5})
df2['Team2 Name'] = df2['Team2 Name'].replace({'Pakistan': 6})
df2['Team2 Name'] = df2['Team2 Name'].replace({'Bangladesh': 7})
df2['Team2 Name'] = df2['Team2 Name'].replace({'South Africa': 8})
df2['Team2 Name'] = df2['Team2 Name'].replace({'Netherlands': 9})
df2['Team2 Name'] = df2['Team2 Name'].replace({'Afghanistan': 10})

df2['Match Winner'] = df2['Match Winner'].replace({'Australia': 1})
df2['Match Winner'] = df2['Match Winner'].replace({'New Zealand': 2})
df2['Match Winner'] = df2['Match Winner'].replace({'Sri Lanka': 3})
df2['Match Winner'] = df2['Match Winner'].replace({'England': 4})
df2['Match Winner'] = df2['Match Winner'].replace({'India': 5})
df2['Match Winner'] = df2['Match Winner'].replace({'Pakistan': 6})
df2['Match Winner'] = df2['Match Winner'].replace({'Bangladesh': 7})
df2['Match Winner'] = df2['Match Winner'].replace({'South Africa': 8})
df2['Match Winner'] = df2['Match Winner'].replace({'Netherlands': 9})
df2['Match Winner'] = df2['Match Winner'].replace({'Afghanistan': 10})

In [25]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

column_name = [
    'Match Venue (Stadium)',
    'Match Venue (City)',
    'Match Venue (Country)',
]
for name in column_name:
    df2[name] = label_encoder.fit_transform(df2[name])
    le_name_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
    print(le_name_mapping)
    print("\n")

{'AMI Stadium': 0, 'Adelaide Oval': 1, 'Andhra Cricket Association-Visakhapatnam District Cricket Association Stadium': 2, 'Arbab Niaz Stadium': 3, 'Arun Jaitley Stadium': 4, 'Asgiriya Stadium': 5, 'Ayub National Stadium': 6, 'Bangabandhu National Stadium': 7, 'Barabati Stadium': 8, 'Barsapara Cricket Stadium': 9, 'Basin Reserve': 10, 'Bay Oval': 11, 'Beausejour Stadium': 12, 'Bellerive Oval': 13, 'Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium': 14, 'Boland Bank Park': 15, 'Boland Park': 16, 'Brabourne Stadium': 17, 'Brisbane Cricket Ground': 18, 'Buffalo Park': 19, 'Captain Roop Singh Stadium': 20, 'Carisbrook': 21, "Cazaly's Stadium": 22, 'Centurion Park': 23, 'Chevrolet Park': 24, 'Chittagong Divisional Stadium': 25, 'Chittagong Stadium': 26, 'Civil Service Cricket Club': 27, 'Clontarf Cricket Club Ground': 28, 'County Ground': 29, 'Crusaders Ground': 30, 'Davies Park': 31, 'De Beers Diamond Oval': 32, 'Docklands Stadium': 33, 'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cric

In [26]:
df2 = df2.dropna()

In [29]:
df2['Match Winner'] = df2['Match Winner'].astype(int)

In [30]:
df2['Match Winner'].unique()

array([ 1,  2,  3,  5,  6,  8, 10,  4,  7,  9])

In [32]:
import pandas as pd

df2['Match Winner Encoded'] = pd.NA

df2.loc[df2['Match Winner'] == df2['Team1 Name'], 'Match Winner Encoded'] = 0
df2.loc[df2['Match Winner'] == df2['Team2 Name'], 'Match Winner Encoded'] = 1

In [33]:
df2.head(3)

Unnamed: 0,Team1 Name,Team2 Name,Match Venue (Stadium),Match Venue (City),Match Venue (Country),Match Winner,Team1 Playing 11,Team2 Playing 11,team1_P1,team1_P2,...,team2_P3,team2_P4,team2_P5,team2_P6,team2_P7,team2_P8,team2_P9,team2_P10,team2_P11,Match Winner Encoded
0,2,1,18,15,0,1,"['1550', '1863', '1698', '1669', '1407', '1790...","['1793', '1767', '1773', '1754', '1871', '1795...",1550,1863,...,1773,1754,1871,1795,1572,1854,1859,1875,1797,1
1,2,1,21,40,10,1,"['1456', '1328', '1251', '1326', '1311', '1259...","['1215', '1278', '1243', '1364', '1418', '1277...",1456,1328,...,1243,1364,1418,1277,1361,1402,1419,1528,1423,1
2,2,1,88,78,0,1,"['1566', '1550', '1421', '1625', '1407', '1311...","['1541', '1572', '1364', '1530', '1277', '1646...",1566,1550,...,1364,1530,1277,1646,1547,1361,1369,1617,1525,1


In [35]:
df2 = df2.drop(columns=['Match Venue (Stadium)', 'Match Venue (Country)','Team1 Playing 11','Team2 Playing 11'], axis=1)

In [36]:
df2.head()

Unnamed: 0,Team1 Name,Team2 Name,Match Venue (City),Match Winner,team1_P1,team1_P2,team1_P3,team1_P4,team1_P5,team1_P6,...,team2_P3,team2_P4,team2_P5,team2_P6,team2_P7,team2_P8,team2_P9,team2_P10,team2_P11,Match Winner Encoded
0,2,1,15,1,1550,1863,1698,1669,1407,1790,...,1773,1754,1871,1795,1572,1854,1859,1875,1797,1
1,2,1,40,1,1456,1328,1251,1326,1311,1259,...,1243,1364,1418,1277,1361,1402,1419,1528,1423,1
2,2,1,78,1,1566,1550,1421,1625,1407,1311,...,1364,1530,1277,1646,1547,1361,1369,1617,1525,1
3,2,1,113,2,1550,1566,1444,1408,1311,1421,...,1364,1530,1277,1646,1361,1617,1369,1569,1525,0
4,2,1,113,2,1550,1566,1444,1421,1311,1407,...,1361,1364,1530,1572,1277,1369,1402,1569,1525,0


In [37]:
X = df2.drop(columns=['Match Winner','Match Winner Encoded'], axis=1)
y = df2['Match Winner Encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [38]:
X_train.shape

(1569, 25)

In [39]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(units=64, activation='relu', input_shape=(25, )))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                1664      
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 16)                528       
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 4289 (16.75 KB)
Trainable params: 4289 (16.75 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [42]:
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [44]:
X_train = np.array(X_train, dtype=np.float32)
y_train = np.array(y_train, dtype=np.float32)

# Convert your testing data to float32
X_test = np.array(X_test, dtype=np.float32)

# Fit the model
model.fit(X_train, y_train, epochs=50, batch_size=32)

# Predict probabilities (scores) for each class
predictions = model.predict(X_test)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [48]:
y_test = np.array(y_test, dtype=np.float32)
accuracy = model.evaluate(X_test, y_test)[1]
print(f'Test Accuracy: {accuracy}')

Test Accuracy: 0.49236640334129333


In [23]:
df = pd.get_dummies(data=df, columns=['Team1 Name', 'Team1 Captain', 'Team2 Name', 'Team2 Captain', 'Match Venue (Stadium)', 'Match Venue (City)',
                                       'Match Venue (Country)', 'Toss Winner Choice'])
le = LabelEncoder()
lst = ['team1_P1','team1_P2','team1_P3','team1_P4','team1_P5','team1_P6','team1_P7','team1_P8','team1_P9','team1_P10','team1_P11','team2_P1','team2_P2','team2_P3','team2_P4','team2_P5','team2_P6','team2_P7','team2_P8','team2_P9','team2_P10','team2_P11']
for i in lst:
    df[i] = le.fit_transform(df[i])

ValueError: y should be a 1d array, got an array of shape (2839, 2) instead.

In [None]:
df.columns

In [None]:
df.head()

In [None]:
x = df.drop(columns=['match_winner'], axis=1)
y = df['match_winner']

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [None]:
df.dtypes

# Model-1 with default params and GridSearch

In [None]:

# model0 = XGBClassifier(
#     objective='binary:logistic',
#     booster='gbtree',
#     eval_metric='auc',
#     tree_method='hist',
#     # device='cuda',
#     grow_policy='lossguide',
#     use_label_encoder=False
# )
# model0.fit(x_train, y_train)

In [None]:
default_params = {}
dparams = model0.get_params()

for key in dparams.keys():
    gp = dparams[key]
    default_params[key] = [gp]

In [None]:
# clf0 = GridSearchCV(estimator=model0, scoring='accuracy', param_grid=default_params, verbose=3, cv=10, refit=True)
# clf0.fit(x_train, y_train)
# predictions = clf0.predict(x_test)
# print(classification_report(predictions, y_test))

In [None]:
# Best Params of Model-1
bp = clf0.best_params_

# Model-2 with Grid Search Parameter Tuning

In [None]:
param_grid = {'gamma': [12.8,25.6,51.2,102.4, 200],
              'learning_rate': [0.01, 0.03, 0.06, 0.1, 0.15, 0.2, 0.25, 0.300000012, 0.4, 0.5, 0.6, 0.7],
              'max_depth': [5,6,7,8,9,10,11,12,13,14],
              'n_estimators': [50,65,80,100,115,130,150],
              'reg_alpha': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4,200],
              'reg_lambda': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4,200]}

In [None]:
# model0 = XGBClassifier(
#     objective='binary:logistic',
#     booster='gbtree',
#     eval_metric='auc',
#     tree_method='hist',
#     device='cuda',
#     grow_policy='lossguide',
#     use_label_encoder=False
# )

# clf = RandomizedSearchCV(n_iter=500, estimator=model0, param_distributions=param_grid, scoring='accuracy', verbose=3, cv=10, refit=True)
# clf.fit(x_train, y_train)

In [None]:
# predictions = clf.predict(x_test)
# print(classification_report(predictions, y_test))

# Final Outcome

In [53]:
file_name = 'xgb_base.pkl'
xgb_bayesian = pickle.load(open(file_name, 'rb'))
predictions = xgb_bayesian.predict(x_test)
print(classification_report(predictions, y_test))
print(accuracy_score(predictions, y_test))

              precision    recall  f1-score   support

           0       0.65      0.69      0.67       363
           1       0.65      0.61      0.63       347

    accuracy                           0.65       710
   macro avg       0.65      0.65      0.65       710
weighted avg       0.65      0.65      0.65       710

0.6492957746478873


In [57]:
file_name = 'xgb_random_tuned.pkl'
xgb_bayesian = pickle.load(open(file_name, 'rb'))
predictions = xgb_bayesian.predict(x_test)
print(classification_report(predictions, y_test))
print(accuracy_score(predictions, y_test))

              precision    recall  f1-score   support

           0       0.44      0.69      0.54       248
           1       0.76      0.53      0.63       462

    accuracy                           0.59       710
   macro avg       0.60      0.61      0.58       710
weighted avg       0.65      0.59      0.59       710

0.5859154929577465


In [55]:
file_name = 'xgb_bayesian_tuned.pkl'
xgb_bayesian = pickle.load(open(file_name, 'rb'))
predictions = xgb_bayesian.predict(x_test)
print(classification_report(predictions, y_test))
print(accuracy_score(predictions, y_test))

              precision    recall  f1-score   support

           0       0.66      0.68      0.67       373
           1       0.63      0.61      0.62       337

    accuracy                           0.64       710
   macro avg       0.64      0.64      0.64       710
weighted avg       0.64      0.64      0.64       710

0.643661971830986


# Accuracy after Tuning:

Parameters that are tuned: 

- gamma
- learning_rate
- max_depth
- n_estimators
- reg_alpha
- reg_lambda

- Accuracy with default parameters:  0.6492957746478873
- Accuracy with RandomSearch tuning: 0.5859154929577465
- Accuracy with Bayesian tuning:     0.643661971830986