In [1]:
#----------------standard packages-------------------
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

#-----------------caio's modules---------------------
import sys, os #add code folder to sys.path
sys.path.append(os.path.abspath(os.path.join('../../data_processing')))

from cricsheet_read import cricsheet_read #data processing

#-------------------ML packages---------------------
from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier as xgb
#import xgboost as xgb

from sklearn.neural_network import MLPClassifier

from sklearn.metrics import classification_report, accuracy_score, roc_curve, roc_auc_score, auc

match_data = pd.read_csv('../../../data/player_data.csv', dtype = {'match_id':str}).set_index('match_id')
results = cricsheet_read()[1].drop('toss_winner', axis = 1)

In [13]:
def cric_model(match_data, results, algorithm = 'forest'):
#-----------------preprocessing---------------------
    
    X = match_data.drop(['set_team_win'], axis = 1).reset_index()
    y = match_data['set_team_win']
    non_enconding = ['match_id', 'set_team_toss', 'days_since_match']
    
    encoder = OneHotEncoder()
    X = X[non_enconding].join(pd.DataFrame(encoder.fit_transform(X.drop(non_enconding, axis = 1)).toarray()))
    X.columns = X.columns.map(str)
    
    #sampler = RandomUnderSampler(sampling_strategy = 'majority')
    #X, y = sampler.fit_resample(X, y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = False)
    
    test_data = results[results.index.isin(y_test.index)].copy()
    
    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns).drop('match_id', axis = 1)
    X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns).drop('match_id', axis = 1)

#------------------algorithms-----------------------      
    if algorithm == 'xgb':
        clf = xgb.XGBClassifier(use_label_encoder = False, eval_metric = 'logloss')
        clf.fit(X_train,y_train)
        xgb.plot_importance(clf,importance_type='weight')
        plt.show()
        print(pd.DataFrame.from_dict(clf.get_booster().get_fscore(),
                             orient = 'index').sort_values(0, ascending = False).head(10))  
    else:
        clf = RandomForestClassifier(n_estimators = 200, min_samples_split = 2, min_samples_leaf = 1,
                                     max_features = 'auto', max_depth = 100, bootstrap = True)
        clf.fit(X_train, y_train)
    
    test_data['prob_set'] = clf.predict_proba(X_test)[:,1]
    test_data['prob_chase'] = clf.predict_proba(X_test)[:,0]
    y_pred = clf.predict(X_test)

#-------------------metrics-------------------------    
    accuracy = accuracy_score(y_test,y_pred)
    print(classification_report(y_test,y_pred))

    false_auc = roc_auc_score(y_test, y_test.clip(upper = False))
    model_auc = roc_auc_score(y_test, test_data['prob_set'])
    
    print('chasing team always wins: roc auc = %.3f' % (false_auc))
    print('model prediction: roc auc = %.3f' % (model_auc))
    
    false_fpr, false_tpr, _ = roc_curve(y_test, y_test.clip(upper = False))
    model_fpr, model_tpr, _ = roc_curve(y_test, test_data['prob_set'])

    plt.plot(false_fpr, false_tpr, linestyle = '--', label = 'chasing team always wins')
    plt.plot(model_fpr, model_tpr, marker = '.', label = 'model prediction')

    plt.xlabel('false positive rate')
    plt.ylabel('true positive rate')
    plt.legend()
    plt.show()

    return test_data, clf, accuracy
#---------------------------------------------------
test_data, clf, accuracy = cric_model(match_data, results, algorithm = 'forest')

KeyError: 'start_date'

In [18]:
test_data

Unnamed: 0_level_0,start_date,league,venue,match_name,set_team,chase_team,result,prob_set,prob_chase
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1207702,2020-08-27,Vitality Blast,Riverside Ground,Lancashire v Durham,Lancashire,Durham,Lancashire,0.374496,0.625504
1207707,2020-08-29,Vitality Blast,Riverside Ground,Durham v Nottinghamshire,Durham,Nottinghamshire,Nottinghamshire,0.378173,0.621827
1207710,2020-08-29,Vitality Blast,"County Ground, New Road",Worcestershire v Northamptonshire,Worcestershire,Northamptonshire,Northamptonshire,0.333463,0.666537
1207711,2020-08-29,Vitality Blast,County Ground,Glamorgan v Gloucestershire,Glamorgan,Gloucestershire,Glamorgan,0.202870,0.797130
1207712,2020-08-30,Vitality Blast,County Ground,Hampshire v Sussex,Hampshire,Sussex,Sussex,0.684440,0.315560
...,...,...,...,...,...,...,...,...,...
1233955,2020-11-29,T20 Internationals,Bay Oval,New Zealand v West Indies,New Zealand,West Indies,New Zealand,0.405495,0.594505
1277976,2021-11-22,T20 Internationals,"Shere Bangla National Stadium, Mirpur",Bangladesh v Pakistan,Bangladesh,Pakistan,Pakistan,0.649743,0.350257
1287773,2021-12-13,T20 Internationals,"National Stadium, Karachi",Pakistan v West Indies,Pakistan,West Indies,Pakistan,0.244649,0.755351
1287774,2021-12-14,T20 Internationals,"National Stadium, Karachi",Pakistan v West Indies,Pakistan,West Indies,Pakistan,0.230129,0.769871


In [109]:
X, y = match_data.drop(['set_team_win'], axis = 1), match_data['set_team_win']
non_players = ['days_since_match', 'set_team_toss']

encoder = OneHotEncoder()
X = X[non_players].join(pd.DataFrame(encoder.fit_transform(X.drop(non_players, axis = 1)).toarray()))
X.columns = X.columns.map(str)

#sampler = RandomUnderSampler(sampling_strategy = 'majority')
#X, y = sampler.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = False)

test_data = X_test.join(y_test).copy()

scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)

In [111]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

rf_random.best_params_

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}
Fitting 3 folds for each of 100 candidates, totalling 300 fits


{'n_estimators': 1400,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 100,
 'bootstrap': True}

In [112]:
def evaluate(model, X_test, y_test):
    
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print('Model Performance')
    print('Accuracy = {:0.2f}%.'.format(100 * accuracy))
    
    return accuracy

base_model = RandomForestClassifier(random_state = 42)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

Model Performance
Accuracy = 68.65%.
Model Performance
Accuracy = 70.18%.
Improvement of 2.23%.


In [None]:
team_h = ['RG Sharma', 'Babar Azam', 'SPD Smith', 'KS Williamson', 'Q de Kock',
          'M Labuschagne', 'JO Holder', 'R Ashwin', 'TA Boult', 'Shaheen Shah Afridi', 'PJ Cummins']

team_c = ['DA Warner', 'V Kohli', 'BA Stokes', 'C de Grandhomme', 'MS Dhoni',
          'JE Root', 'JDS Neesham', 'MA Starc', 'Rashid Khan', 'JJ Bumrah', 'SL Malinga']

team_harsha = ['JC Buttler', 'Mohammad Rizwan', 'MR Marsh', 'MM Ali', 'GJ Maxwell',
               'AD Russell', 'SP Narine', 'Rashid Khan', 'Shaheen Shah Afridi', 'A Nortje', 'JJ Bumrah']

team_boss = ['EJG Morgan']

data = match_data.head(1)
data[data.columns] = 0
data[team_h] = 1
data[team_boss] = 2
data['days_since_match'] = 0
data.drop(['match_id', 'set_team_win'], axis = 1, inplace = True)

clf.predict(data)

print('set_prob',clf.predict_proba(data)[:,1])
print('chase_prob',clf.predict_proba(data)[:,0])
clf.predict(data)
