In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
from xgboost import XGBClassifier
import re
from sklearn.metrics import classification_report
from xgboost import plot_importance
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
from skopt import BayesSearchCV
import warnings
warnings.filterwarnings('ignore')
import pickle

# Data reading and preprocessing

In [None]:
path = 'H:/work_projects/College/DM_Projects/course_project_3/odi_Matches_Data/man_odi_data_2.csv'

In [None]:
df = pd.read_csv(path)

In [None]:
df.head(3)

In [None]:
df.columns

In [None]:
player_list = set([])
for cell in df['Team1 Playing 11']:
    lst = cell.strip("[]").replace("'", "").replace(' ', '').split(',')
    lst = [int(x) for x in lst]
    for player in lst:
        player_list.add(player)
player_list = list(player_list)

In [None]:
len(player_list)

In [None]:
player_list = set([])
for cell in df['Team2 Playing 11']:
    lst = cell.strip("[]").replace("'", "").replace(' ', '').split(',')
    lst = [int(x) for x in lst]
    for player in lst:
        player_list.add(player)
player_list = list(player_list)
len(player_list)

In [None]:
team_1_players_df = pd.DataFrame(columns=['team1_P1','team1_P2','team1_P3','team1_P4','team1_P5','team1_P6','team1_P7','team1_P8','team1_P9','team1_P10','team1_P11'])
for x in df['Team1 Playing 11']:
    arr = np.array(x.replace('[', '').replace(']', '').replace("'", "").split(', '))
    arr = arr.astype('int')
    team_1_players_df.loc[len(team_1_players_df)] = arr

In [None]:
team_2_players_df = pd.DataFrame(columns=['team2_P1','team2_P2','team2_P3','team2_P4','team2_P5','team2_P6','team2_P7','team2_P8','team2_P9','team2_P10','team2_P11'])
for x in df['Team2 Playing 11']:
    arr = np.array(x.replace('[', '').replace(']', '').replace("'", "").split(', '))
    arr = arr.astype('int')
    team_2_players_df.loc[len(team_2_players_df)] = arr
    

In [None]:
df = pd.concat([df, team_1_players_df, team_2_players_df], axis=1)

In [None]:
winner = []
for match_win, team_one in zip(df['Match Winner'], df['Team1 Name']):
    if match_win == team_one:
        winner.append(0)
    else:
        winner.append(1)

In [None]:
df['match_winner'] = winner

In [None]:
df = df.drop(columns=['Match Winner', 'Toss Winner'], axis=1)

In [None]:
df = df.drop(columns=['Debut Players', 'Team1 Playing 11', 'Team2 Playing 11'], axis=1)

In [None]:
df = df.drop(columns=['Match Date'], axis=1)

In [None]:
df = pd.get_dummies(data=df, columns=['Team1 Name', 'Team1 Captain', 'Team2 Name', 'Team2 Captain', 'Match Venue (Stadium)', 'Match Venue (City)',
                                       'Match Venue (Country)', 'Toss Winner Choice'])
le = LabelEncoder()
lst = ['team1_P1','team1_P2','team1_P3','team1_P4','team1_P5','team1_P6','team1_P7','team1_P8','team1_P9','team1_P10','team1_P11','team2_P1','team2_P2','team2_P3','team2_P4','team2_P5','team2_P6','team2_P7','team2_P8','team2_P9','team2_P10','team2_P11']
for i in lst:
    df[i] = le.fit_transform(df[i])

In [None]:
df.columns

In [None]:
df.head()

In [None]:
x = df.drop(columns=['match_winner'], axis=1)
y = df['match_winner']

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [None]:
df.dtypes

# Model-1 with default params and GridSearch

In [None]:

# model0 = XGBClassifier(
#     objective='binary:logistic',
#     booster='gbtree',
#     eval_metric='auc',
#     tree_method='hist',
#     # device='cuda',
#     grow_policy='lossguide',
#     use_label_encoder=False
# )
# model0.fit(x_train, y_train)

In [None]:
default_params = {}
dparams = model0.get_params()

for key in dparams.keys():
    gp = dparams[key]
    default_params[key] = [gp]

In [None]:
# clf0 = GridSearchCV(estimator=model0, scoring='accuracy', param_grid=default_params, verbose=3, cv=10, refit=True)
# clf0.fit(x_train, y_train)
# predictions = clf0.predict(x_test)
# print(classification_report(predictions, y_test))

In [None]:
# Best Params of Model-1
bp = clf0.best_params_

# Model-2 with Grid Search Parameter Tuning

In [None]:
param_grid = {'gamma': [12.8,25.6,51.2,102.4, 200],
              'learning_rate': [0.01, 0.03, 0.06, 0.1, 0.15, 0.2, 0.25, 0.300000012, 0.4, 0.5, 0.6, 0.7],
              'max_depth': [5,6,7,8,9,10,11,12,13,14],
              'n_estimators': [50,65,80,100,115,130,150],
              'reg_alpha': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4,200],
              'reg_lambda': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4,200]}

In [None]:
# model0 = XGBClassifier(
#     objective='binary:logistic',
#     booster='gbtree',
#     eval_metric='auc',
#     tree_method='hist',
#     device='cuda',
#     grow_policy='lossguide',
#     use_label_encoder=False
# )

# clf = RandomizedSearchCV(n_iter=500, estimator=model0, param_distributions=param_grid, scoring='accuracy', verbose=3, cv=10, refit=True)
# clf.fit(x_train, y_train)

In [None]:
# predictions = clf.predict(x_test)
# print(classification_report(predictions, y_test))

# Final Outcome

In [53]:
file_name = 'xgb_base.pkl'
xgb_bayesian = pickle.load(open(file_name, 'rb'))
predictions = xgb_bayesian.predict(x_test)
print(classification_report(predictions, y_test))
print(accuracy_score(predictions, y_test))

              precision    recall  f1-score   support

           0       0.65      0.69      0.67       363
           1       0.65      0.61      0.63       347

    accuracy                           0.65       710
   macro avg       0.65      0.65      0.65       710
weighted avg       0.65      0.65      0.65       710

0.6492957746478873


In [57]:
file_name = 'xgb_random_tuned.pkl'
xgb_bayesian = pickle.load(open(file_name, 'rb'))
predictions = xgb_bayesian.predict(x_test)
print(classification_report(predictions, y_test))
print(accuracy_score(predictions, y_test))

              precision    recall  f1-score   support

           0       0.44      0.69      0.54       248
           1       0.76      0.53      0.63       462

    accuracy                           0.59       710
   macro avg       0.60      0.61      0.58       710
weighted avg       0.65      0.59      0.59       710

0.5859154929577465


In [55]:
file_name = 'xgb_bayesian_tuned.pkl'
xgb_bayesian = pickle.load(open(file_name, 'rb'))
predictions = xgb_bayesian.predict(x_test)
print(classification_report(predictions, y_test))
print(accuracy_score(predictions, y_test))

              precision    recall  f1-score   support

           0       0.66      0.68      0.67       373
           1       0.63      0.61      0.62       337

    accuracy                           0.64       710
   macro avg       0.64      0.64      0.64       710
weighted avg       0.64      0.64      0.64       710

0.643661971830986


# Accuracy after Tuning:

Parameters that are tuned: 

- gamma
- learning_rate
- max_depth
- n_estimators
- reg_alpha
- reg_lambda

- Accuracy with default parameters:  0.6492957746478873
- Accuracy with RandomSearch tuning: 0.5859154929577465
- Accuracy with Bayesian tuning:     0.643661971830986