In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import xgboost as xgb

import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from scipy.stats import skew
from scipy.stats.stats import pearsonr
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [2]:
matches = pd.read_csv("originalDataset.csv")
matches = matches.iloc[:997]

In [3]:
teams = ["England", "South Africa", "India" , "Australia", "New Zealand" ,
         "Bangladesh" , "Pakistan", "Sri Lanka" , "Afghanistan", "West Indies"]
notTeams = []
for i in range(0,997):
    if(not (matches.loc[i,"Team1"] in teams) or (not (matches.loc[i,"Team2"] in teams))):
        notTeams.append(i)
wcMatches = matches.drop(notTeams)

In [4]:
#Calculation of Team v Team win rates
winRates = np.zeros((10,10))
for i in range(0,10):
    #print(teams[i] + " Win Rates:")
    for j in range(0,10):
        if(not (i == j)):
            iCount = 0
            jCount = 0
            for index, row in wcMatches.iterrows():
                if((row['Team1'] == teams[i] and row['Team2'] == teams[j]) or 
                   (row["Team2"] == teams[i] and row["Team1"] == teams[j])):
                    if(row["Winner"] == teams[i]):
                        iCount = iCount + 1
                    if(row["Winner"] == teams[j]):
                        jCount = jCount + 1
            if(iCount + jCount > 0):
                winRates[i][j] = round(100 * iCount/(iCount + jCount),2)
                #print("Win Rate of " + teams[i] + " vs " + teams[j] +  ": " + str(round(100 * iCount/(iCount + jCount) ,2)) + "%")
            else:
                winRates[i][j] = -10.00
                #print("Win Rate of " + teams[i] + " vs " + teams[j] +  ": 50.0%" )
        else:
            winRates[i][j] = -10.00
winRatesDF = pd.DataFrame(winRates, index = teams, columns=teams)
winRatesDF

Unnamed: 0,England,South Africa,India,Australia,New Zealand,Bangladesh,Pakistan,Sri Lanka,Afghanistan,West Indies
England,-10.0,57.14,39.13,40.0,57.14,66.67,73.68,43.48,100.0,92.31
South Africa,42.86,-10.0,52.63,55.56,61.11,71.43,57.14,65.0,-10.0,80.0
India,60.87,47.37,-10.0,47.83,57.89,76.92,63.64,69.44,100.0,72.0
Australia,60.0,44.44,52.17,-10.0,56.25,100.0,83.33,53.85,100.0,82.35
New Zealand,42.86,38.89,42.11,43.75,-10.0,47.06,66.67,50.0,100.0,40.0
Bangladesh,33.33,28.57,23.08,0.0,52.94,-10.0,30.0,23.08,60.0,33.33
Pakistan,26.32,42.86,36.36,16.67,33.33,70.0,-10.0,60.61,100.0,66.67
Sri Lanka,56.52,35.0,30.56,46.15,50.0,76.92,39.39,-10.0,100.0,77.78
Afghanistan,0.0,-10.0,0.0,0.0,0.0,40.0,0.0,0.0,-10.0,50.0
West Indies,7.69,20.0,28.0,17.65,60.0,66.67,33.33,22.22,50.0,-10.0


In [5]:
winRatesDF = winRatesDF.replace(-10,'-')
winRatesDFStrings = winRatesDF.replace(100, "100.00")
winRatesDFStrings = winRatesDFStrings.replace(40, "40.00")
winRatesDFStrings = winRatesDFStrings.replace(0, "0.00")
winRatesDFStrings = winRatesDFStrings.replace(50, "50.00")
winRatesDFStrings = winRatesDFStrings.replace(60, "60.00")
winRatesDFStrings = winRatesDFStrings.replace(65, "65.00")
winRatesDFStrings = winRatesDFStrings.replace(72, "72.00")
winRatesDFStrings = winRatesDFStrings.replace(20, "20.00")
winRatesDFStrings = winRatesDFStrings.replace(35, "35.00")
winRatesDFStrings = winRatesDFStrings.replace(30, "30.00")
winRatesDFStrings = winRatesDFStrings.replace(70, "70.00")
winRatesDFStrings = winRatesDFStrings.replace(80, "80.00")
winRatesDFStrings = winRatesDFStrings.replace(28, "28.00")



winRatesDFStrings

Unnamed: 0,England,South Africa,India,Australia,New Zealand,Bangladesh,Pakistan,Sri Lanka,Afghanistan,West Indies
England,-,57.14,39.13,40.00,57.14,66.67,73.68,43.48,100.00,92.31
South Africa,42.86,-,52.63,55.56,61.11,71.43,57.14,65.00,-,80.00
India,60.87,47.37,-,47.83,57.89,76.92,63.64,69.44,100.00,72.00
Australia,60.00,44.44,52.17,-,56.25,100.00,83.33,53.85,100.00,82.35
New Zealand,42.86,38.89,42.11,43.75,-,47.06,66.67,50.00,100.00,40.00
Bangladesh,33.33,28.57,23.08,0.00,52.94,-,30.00,23.08,60.00,33.33
Pakistan,26.32,42.86,36.36,16.67,33.33,70.00,-,60.61,100.00,66.67
Sri Lanka,56.52,35.00,30.56,46.15,50.00,76.92,39.39,-,100.00,77.78
Afghanistan,0.00,-,0.00,0.00,0.00,40.00,0.00,0.00,-,50.00
West Indies,7.69,20.00,28.00,17.65,60.00,66.67,33.33,22.22,50.00,-


In [6]:
winRatesDFStrings.to_csv("WCWinRates.csv")

In [6]:
wcMatches
i = 1
for index, row in wcMatches.iterrows():
    row["MatchID"] = i
    i+=1

In [7]:
#wcMatches.to_csv("godhelp.csv")

In [8]:
realMatches = pd.read_csv("godhelp.csv")

In [9]:
realMatches

Unnamed: 0,Winner,MatchID,Team1,Team2,Margin,Year
0,Sri Lanka,0,Bangladesh,Sri Lanka,7 wickets,2010
1,Sri Lanka,1,India,Sri Lanka,5 wickets,2010
2,India,2,Bangladesh,India,6 wickets,2010
3,Sri Lanka,3,Bangladesh,Sri Lanka,9 wickets,2010
4,India,4,India,Sri Lanka,8 wickets,2010
5,India,5,Bangladesh,India,6 wickets,2010
6,Sri Lanka,6,India,Sri Lanka,4 wickets,2010
7,Australia,7,Australia,Pakistan,5 wickets,2010
8,Australia,8,Australia,Pakistan,140 runs,2010
9,Australia,9,Australia,Pakistan,40 runs,2010


In [10]:
winrates = []
for index, row in realMatches.iterrows():
    Team1 = row["Team1"]
    Team2 = row["Team2"]
    count1 = 0
    count2 = 0
    j=1
    for i in range(0,row["MatchID"]):
        if((realMatches["Team1"][i] == Team1 and realMatches["Team2"][i] == Team2)
          or (realMatches["Team1"][i] == Team2 and realMatches["Team2"][i] == Team1)):
            if(realMatches["Winner"][i] == Team1):
                count1 += realMatches["Year"][i] - 2009
            else:
                count2 += realMatches["Year"][i] - 2009
            j*=1.5
    if(count1 + count2 > 0):
        #print(round(count1/(count1+count2),2))
        winrates.append(round(count1/(count1+count2),2))
    else:
        winrates.append(0.50)
realMatches["WinRateVsTeam"] = winrates

In [11]:
winratesT1 = []
for index, row in realMatches.iterrows():
    Team1 = row["Team1"]
    Team2 = row["Team2"]
    count1 = 0
    count2 = 0
    j=1
    for i in range(0,row["MatchID"]):
        if(realMatches["Team1"][i] == Team1  or realMatches["Team2"][i] == Team1):
            if(realMatches["Winner"][i] == Team1):
                count1 += realMatches["Year"][i] - 2009
            else:
                count2 += realMatches["Year"][i] - 2009
    if(count1 + count2 > 0):
        #print(round(count1/(count1+count2),2))
        winratesT1.append(round(count1/(count1+count2),2))
    else:
        winratesT1.append(0.50)
realMatches["T1WinRate"] = winratesT1

In [12]:
winratesT2 = []
for index, row in realMatches.iterrows():
    Team1 = row["Team1"]
    Team2 = row["Team2"]
    count1 = 0
    count2 = 0
    j=1
    for i in range(0,row["MatchID"]):
        if(realMatches["Team1"][i] == Team2  or realMatches["Team2"][i] == Team2):
            if(realMatches["Winner"][i] == Team2):
                count1 += realMatches["Year"][i] - 2009
            else:
                count2 += realMatches["Year"][i] - 2009
    if(count1 + count2 > 0):
        #print(round(count1/(count1+count2),2))
        winratesT2.append(round(count1/(count1+count2),2))
    else:
        winratesT2.append(0.50)
realMatches["T2WinRate"] = winratesT2

In [13]:
t2Win = []
for index,row in realMatches.iterrows():
    if(isinstance(row["Margin"],float)):
        t2Win.append(0)
    elif(row["Margin"].split()[-1] == "wickets"):
        t2Win.append(1)
    else:
        t2Win.append(0)
realMatches["Team2Wins"] = t2Win

In [14]:
realMatches = realMatches.rename({"Team2Wins" : "WinByWickets"},axis="columns")

In [63]:
margin = []
for index,row in realMatches.iterrows():
    if(not isinstance(row["Margin"],float)):
        margin.append(row["Margin"].split()[0])
    else:
        margin.append(0)
realMatches["Margin"] = margin

In [74]:
xgMatches = realMatches.copy()

In [75]:
xgMatches

Unnamed: 0,Winner,MatchID,Team1,Team2,Margin,Year,WinRateVsTeam,T1WinRate,T2WinRate,WinByWickets
0,Sri Lanka,0,Bangladesh,Sri Lanka,7,2010,0.50,0.50,0.50,1
1,Sri Lanka,1,India,Sri Lanka,5,2010,0.50,0.50,1.00,1
2,India,2,Bangladesh,India,6,2010,0.50,0.00,0.00,1
3,Sri Lanka,3,Bangladesh,Sri Lanka,9,2010,0.00,0.00,1.00,1
4,India,4,India,Sri Lanka,8,2010,0.00,0.50,1.00,1
5,India,5,Bangladesh,India,6,2010,0.00,0.00,0.67,1
6,Sri Lanka,6,India,Sri Lanka,4,2010,0.50,0.75,0.75,1
7,Australia,7,Australia,Pakistan,5,2010,0.50,0.50,0.50,1
8,Australia,8,Australia,Pakistan,140,2010,1.00,1.00,0.00,0
9,Australia,9,Australia,Pakistan,40,2010,1.00,1.00,0.00,0


In [76]:
i=0
for index,row in xgMatches.iterrows():
    if(row["Winner"] == row["Team1"]):
        xgMatches["Winner"][i] = 0
    elif(row["Winner"] == row["Team2"]):
        xgMatches["Winner"][i] = 1
    i+=1
#xgMatches = xgMatches.drop('ifor', axis = 'columns')
xgMatches

Unnamed: 0,Winner,MatchID,Team1,Team2,Margin,Year,WinRateVsTeam,T1WinRate,T2WinRate,WinByWickets
0,1,0,Bangladesh,Sri Lanka,7,2010,0.50,0.50,0.50,1
1,1,1,India,Sri Lanka,5,2010,0.50,0.50,1.00,1
2,1,2,Bangladesh,India,6,2010,0.50,0.00,0.00,1
3,1,3,Bangladesh,Sri Lanka,9,2010,0.00,0.00,1.00,1
4,0,4,India,Sri Lanka,8,2010,0.00,0.50,1.00,1
5,1,5,Bangladesh,India,6,2010,0.00,0.00,0.67,1
6,1,6,India,Sri Lanka,4,2010,0.50,0.75,0.75,1
7,0,7,Australia,Pakistan,5,2010,0.50,0.50,0.50,1
8,0,8,Australia,Pakistan,140,2010,1.00,1.00,0.00,0
9,0,9,Australia,Pakistan,40,2010,1.00,1.00,0.00,0


In [77]:
xgMatches=xgMatches[:651]
xgMatches = xgMatches.drop("Margin",axis="columns")


In [78]:
xgMatches=xgMatches.drop("WinByWickets",axis="columns")

In [328]:
teamData = pd.read_csv("teamStatsAdvanced.csv")
teamData = teamData.rename({"Unnamed: 0" : "Team"}, axis="columns")
teamData

Unnamed: 0,Team,Matches played,Innings batted,Not outs,Highest inns score,Batting average,Batting strike rate,Stumpings made,Runs scored/Innings,Balls faced/Innings,...,Boundary fours/Innings,Boundary sixes/Innings,Catches/Innings,Innings bowled,Bowling average,Economy rate,Bowling strike rate,Balls bowled/Innings,Runs conceded/Innings,Wickets taken/Innings
0,India,1573,110.0,23.1,128.8,40.327,91.667,12.7,324.720405,356.848083,...,29.455556,6.409462,6.599264,59.111111,37.175556,5.11,42.777778,411.372639,347.582162,11.746992
1,Pakistan,1012,69.909091,10.818182,114.636364,39.55,85.589091,2.090909,357.770459,419.182396,...,30.65695,4.431474,5.589405,51.375,35.865,5.32375,40.6,375.817792,335.690898,10.001532
2,England,1234,75.0,11.090909,114.636364,34.098182,91.858182,2.818182,324.464611,348.959827,...,31.030437,7.15361,5.975101,61.5,38.58625,5.93,39.275,366.68424,358.623145,9.681133
3,Afghanistan,848,59.666667,5.555556,104.444444,29.146667,76.084444,2.777778,238.367626,320.249531,...,20.356499,5.242254,3.357003,50.444444,27.538889,4.755556,34.377778,390.091477,299.877785,12.169411
4,Australia,814,67.625,5.25,131.75,39.115,91.815,0.375,283.659426,315.172296,...,26.578815,5.116816,4.135243,39.375,33.33375,5.34875,37.45,402.16452,358.059542,11.630773
5,Bangladesh,1343,82.181818,12.090909,102.363636,31.041818,81.051818,4.090909,286.695282,355.424325,...,27.163847,4.703703,4.966758,89.75,33.1775,5.1075,39.55,377.296617,321.141103,10.680032
6,New Zealand,1157,86.222222,13.0,125.111111,35.52,92.472222,0.777778,264.394001,295.061498,...,23.527136,5.528133,4.456428,55.25,33.105,5.42625,36.8,378.76124,341.118306,10.906928
7,South Africa,1143,101.714286,15.857143,138.714286,48.001429,88.97,1.142857,258.147887,293.820842,...,23.532812,3.85968,4.598364,57.111111,28.64,5.077778,33.611111,413.527999,348.316086,13.122426
8,Sri Lanka,1106,58.818182,8.909091,94.909091,28.145455,88.938182,0.181818,259.55349,296.375485,...,23.95183,5.356121,3.922753,72.8,44.219,5.53,47.34,378.584401,351.577051,8.984971
9,West Indies,878,71.444444,7.444444,117.0,29.236667,81.802222,1.0,233.745902,270.228733,...,19.792667,7.141778,4.722404,55.1,50.865,5.693,53.28,437.073041,413.788636,11.59179


In [329]:
stats = ["MatchCt","TotInnings","NotOuts", "MaxInnScore", "BatAvg","BatStrRate","Stumpings","Run/Inning","BallsFaced/Inning","Hundreds","Fifties","4/Inning","6/Inning",
         "Catch/Inning","BowlInns", "BowlAvg","EconRate","BowlStrRate","BallBowl/Inning","RunConc/Inning","WickTake/Inning"]
teamIndices = ["India" , "Pakistan" , "England", "Afghanistan", "Australia", "Bangladesh", "New Zealand", "South Africa" ,
               "Sri Lanka", "West Indies"]
#teamData = teamData.drop(["Hundreds/Innings","Fifties/Innings"],axis = 'columns')
i = 1
for stat in stats:
    arr = []
    for index,row in xgMatches.iterrows():
        arr.append(teamData.loc[teamIndices.index(row["Team2"])][i])
    i+=1
    xgMatches[stat + '2'] = arr
    
i = 1
for stat in stats:
    arr = []
    for index,row in xgMatches.iterrows():
        arr.append(teamData.loc[teamIndices.index(row["Team1"])][i])
    i+=1
    xgMatches[stat + '1'] = arr

In [330]:
#basicXGMatches = xgMatches.drop(["Team1","Team2","MatchID"],axis = 'columns')
basicXGMatches = xgMatches.drop("MatchID",axis = 'columns')
T1 = []
T2 = []
for index,row in basicXGMatches.iterrows():
    T1.append(teamIndices.index(row["Team1"]))
    T2.append(teamIndices.index(row["Team2"]))
basicXGMatches["Team1"] = T1
basicXGMatches["Team2"] = T2

basicXGMatches = basicXGMatches.astype("float64")

In [94]:
basicXGMatches

Unnamed: 0,Winner,Team1,Team2,Year,WinRateVsTeam,T1WinRate,T2WinRate,BatAvg2,BatStrRate2,Run/Inning2,...,Fifties1,4/Inning1,6/Inning1,Catch/Inning1,BowlAvg1,EconRate1,BowlStrRate1,BallBowl/Inning1,RunConc/Inning1,WickTake/Inning1
0,1.0,5.0,8.0,2010.0,0.50,0.50,0.50,309.60,978.32,259.553490,...,1.731382,27.163847,4.703703,4.966758,265.42,40.86,316.4,377.296617,321.141103,10.680032
1,1.0,0.0,8.0,2010.0,0.50,0.50,1.00,309.60,978.32,259.553490,...,1.519688,29.455556,6.409462,6.599264,334.58,45.99,385.0,411.372639,347.582162,11.746992
2,1.0,5.0,0.0,2010.0,0.50,0.00,0.00,403.27,916.67,324.720405,...,1.731382,27.163847,4.703703,4.966758,265.42,40.86,316.4,377.296617,321.141103,10.680032
3,1.0,5.0,8.0,2010.0,0.00,0.00,1.00,309.60,978.32,259.553490,...,1.731382,27.163847,4.703703,4.966758,265.42,40.86,316.4,377.296617,321.141103,10.680032
4,0.0,0.0,8.0,2010.0,0.00,0.50,1.00,309.60,978.32,259.553490,...,1.519688,29.455556,6.409462,6.599264,334.58,45.99,385.0,411.372639,347.582162,11.746992
5,1.0,5.0,0.0,2010.0,0.00,0.00,0.67,403.27,916.67,324.720405,...,1.731382,27.163847,4.703703,4.966758,265.42,40.86,316.4,377.296617,321.141103,10.680032
6,1.0,0.0,8.0,2010.0,0.50,0.75,0.75,309.60,978.32,259.553490,...,1.519688,29.455556,6.409462,6.599264,334.58,45.99,385.0,411.372639,347.582162,11.746992
7,0.0,4.0,1.0,2010.0,0.50,0.50,0.50,435.05,941.48,357.770459,...,1.576691,26.578815,5.116816,4.135243,266.67,42.79,299.6,402.164520,358.059542,11.630773
8,0.0,4.0,1.0,2010.0,1.00,1.00,0.00,435.05,941.48,357.770459,...,1.576691,26.578815,5.116816,4.135243,266.67,42.79,299.6,402.164520,358.059542,11.630773
9,0.0,4.0,1.0,2010.0,1.00,1.00,0.00,435.05,941.48,357.770459,...,1.576691,26.578815,5.116816,4.135243,266.67,42.79,299.6,402.164520,358.059542,11.630773


In [331]:
basicTrain = basicXGMatches[0:500]
basicTest = basicXGMatches[500:651]

In [332]:
basicTrainX = basicTrain.loc[:,"Team1": "WickTake/Inning1"]
basicTrainY = basicTrain.loc[:,"Winner"]
basicTestX = basicTest.loc[:,"Team1" : "WickTake/Inning1"]
basicTestY = basicTest.loc[:,"Winner"]
np.mean(basicTrainY)

0.406

In [365]:
xg_reg1 = xgb.XGBRegressor(objective ='binary:logistic', colsample_bytree = 0.25, learning_rate = 0.007, max_depth = 12, 
                          alpha = 0, n_estimators = 95, subsample = 1, min_child_weight = 2.15, gamma = 0.19)

xg_reg1.fit(basicTrainX,basicTrainY)
preds = xg_reg1.predict(basicTestX)
error = roc_auc_score(basicTestY,preds)

j=0
for i in range(0,len(preds)):
    if (preds[i]>0.5):
        preds[i]=1
    else:
        preds[i]=0
    if(preds[i] != basicTestY.loc[i+500]):
        j+=1
print(error)
print(j)

0.6981188118811882
39


In [180]:
param_test1 = {
    'max_depth':range(5,10,1),
    'n_estimators': range(80,120,5)
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( 
    learning_rate =0.0075, 
    n_estimators=90, 
    max_depth=12,
    min_child_weight=2.29,
    gamma=0.29, 
    subsample=1, 
    colsample_bytree=0.4,
    objective= 'binary:logistic', 
    nthread=16, 
    scale_pos_weight=1, 
    seed=42,
    reg_lambda=0
), 
param_grid = param_test1, scoring='roc_auc',n_jobs=-1,iid=False, cv=10)
gsearch1.fit(basicXGMatches.loc[:,"Team1": "WickTake/Inning1"],basicXGMatches.loc[:,'Winner'])
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

KeyboardInterrupt: 

In [273]:
for i in range(0,101):
    print(xgMatches["Team1"][i+550] + " " + xgMatches["Team2"][i+550] + " " + str(preds[i]) + " " + str(basicTestY.loc[i+550]))

South Africa Australia 0.0 0.0
Bangladesh Afghanistan 0.0 0.0
South Africa Australia 0.0 0.0
Pakistan West Indies 0.0 0.0
Pakistan West Indies 0.0 0.0
South Africa Australia 0.0 0.0
Bangladesh England 0.0 1.0
South Africa Australia 0.0 0.0
Bangladesh England 0.0 0.0
Bangladesh England 0.0 1.0
South Africa Australia 0.0 0.0
India New Zealand 0.0 0.0
India New Zealand 0.0 1.0
India New Zealand 0.0 0.0
India New Zealand 0.0 1.0
India New Zealand 0.0 0.0
Sri Lanka West Indies 0.0 1.0
Sri Lanka West Indies 0.0 0.0
Australia New Zealand 0.0 0.0
Australia New Zealand 0.0 0.0
Australia New Zealand 0.0 0.0
New Zealand Bangladesh 0.0 0.0
New Zealand Bangladesh 0.0 0.0
New Zealand Bangladesh 0.0 0.0
Australia Pakistan 0.0 0.0
Australia Pakistan 1.0 1.0
India England 1.0 0.0
Australia Pakistan 1.0 0.0
India England 0.0 0.0
Australia Pakistan 1.0 0.0
India England 1.0 1.0
Australia Pakistan 1.0 0.0
South Africa Sri Lanka 0.0 0.0
New Zealand Australia 1.0 0.0
South Africa Sri Lanka 1.0 0.0
South Afr

In [34]:
mirrorData = xgMatches.copy()
mirrorData["Team1"] = xgMatches["Team2"]
mirrorData["Team2"] = xgMatches["Team1"]
mirrorData["Winner"] = 1 - xgMatches["Winner"]
mirrorData["WinRateVsTeam"] = 1 - xgMatches["WinRateVsTeam"]
mirrorData["T1WinRate"] = xgMatches["T2WinRate"]
mirrorData["T2WinRate"] = xgMatches["T1WinRate"]
mirrorData

Unnamed: 0,Winner,MatchID,Team1,Team2,Year,WinRateVsTeam,T1WinRate,T2WinRate,BatAvg2,BatStrRate2,...,Fifties1,4/Inning1,6/Inning1,Catch/Inning1,BowlAvg1,EconRate1,BowlStrRate1,BallBowl/Inning1,RunConc/Inning1,WickTake/Inning1
0,0,0,Sri Lanka,Bangladesh,2010,0.50,0.50,0.50,309.60,978.32,...,1.731382,27.163847,4.703703,4.966758,265.42,40.86,316.4,377.296617,321.141103,10.680032
1,0,1,Sri Lanka,India,2010,0.50,1.00,0.50,309.60,978.32,...,1.519688,29.455556,6.409462,6.599264,334.58,45.99,385.0,411.372639,347.582162,11.746992
2,0,2,India,Bangladesh,2010,0.50,0.00,0.00,403.27,916.67,...,1.731382,27.163847,4.703703,4.966758,265.42,40.86,316.4,377.296617,321.141103,10.680032
3,0,3,Sri Lanka,Bangladesh,2010,1.00,1.00,0.00,309.60,978.32,...,1.731382,27.163847,4.703703,4.966758,265.42,40.86,316.4,377.296617,321.141103,10.680032
4,1,4,Sri Lanka,India,2010,1.00,1.00,0.50,309.60,978.32,...,1.519688,29.455556,6.409462,6.599264,334.58,45.99,385.0,411.372639,347.582162,11.746992
5,0,5,India,Bangladesh,2010,1.00,0.67,0.00,403.27,916.67,...,1.731382,27.163847,4.703703,4.966758,265.42,40.86,316.4,377.296617,321.141103,10.680032
6,0,6,Sri Lanka,India,2010,0.50,0.75,0.75,309.60,978.32,...,1.519688,29.455556,6.409462,6.599264,334.58,45.99,385.0,411.372639,347.582162,11.746992
7,1,7,Pakistan,Australia,2010,0.50,0.50,0.50,435.05,941.48,...,1.576691,26.578815,5.116816,4.135243,266.67,42.79,299.6,402.164520,358.059542,11.630773
8,1,8,Pakistan,Australia,2010,0.00,0.00,1.00,435.05,941.48,...,1.576691,26.578815,5.116816,4.135243,266.67,42.79,299.6,402.164520,358.059542,11.630773
9,1,9,Pakistan,Australia,2010,0.00,0.00,1.00,435.05,941.48,...,1.576691,26.578815,5.116816,4.135243,266.67,42.79,299.6,402.164520,358.059542,11.630773


In [35]:
i = 1
for stat in stats:
    arr = []
    for index,row in mirrorData.iterrows():
        arr.append(teamData.loc[teamIndices.index(row["Team2"])][i])
    i+=1
    mirrorData[stat + '2'] = arr
    
i = 1
for stat in stats:
    arr = []
    for index,row in mirrorData.iterrows():
        arr.append(teamData.loc[teamIndices.index(row["Team1"])][i])
    i+=1
    mirrorData[stat + '1'] = arr

In [36]:
mirrorData

Unnamed: 0,Winner,MatchID,Team1,Team2,Year,WinRateVsTeam,T1WinRate,T2WinRate,BatAvg2,BatStrRate2,...,Fifties1,4/Inning1,6/Inning1,Catch/Inning1,BowlAvg1,EconRate1,BowlStrRate1,BallBowl/Inning1,RunConc/Inning1,WickTake/Inning1
0,0,0,Sri Lanka,Bangladesh,2010,0.50,0.50,0.50,341.46,891.57,...,1.568280,23.951830,5.356121,3.922753,442.19,55.30,473.4,378.584401,351.577051,8.984971
1,0,1,Sri Lanka,India,2010,0.50,1.00,0.50,403.27,916.67,...,1.568280,23.951830,5.356121,3.922753,442.19,55.30,473.4,378.584401,351.577051,8.984971
2,0,2,India,Bangladesh,2010,0.50,0.00,0.00,341.46,891.57,...,1.519688,29.455556,6.409462,6.599264,334.58,45.99,385.0,411.372639,347.582162,11.746992
3,0,3,Sri Lanka,Bangladesh,2010,1.00,1.00,0.00,341.46,891.57,...,1.568280,23.951830,5.356121,3.922753,442.19,55.30,473.4,378.584401,351.577051,8.984971
4,1,4,Sri Lanka,India,2010,1.00,1.00,0.50,403.27,916.67,...,1.568280,23.951830,5.356121,3.922753,442.19,55.30,473.4,378.584401,351.577051,8.984971
5,0,5,India,Bangladesh,2010,1.00,0.67,0.00,341.46,891.57,...,1.519688,29.455556,6.409462,6.599264,334.58,45.99,385.0,411.372639,347.582162,11.746992
6,0,6,Sri Lanka,India,2010,0.50,0.75,0.75,403.27,916.67,...,1.568280,23.951830,5.356121,3.922753,442.19,55.30,473.4,378.584401,351.577051,8.984971
7,1,7,Pakistan,Australia,2010,0.50,0.50,0.50,312.92,734.52,...,1.767377,30.656950,4.431474,5.589405,286.92,42.59,324.8,375.817792,335.690898,10.001532
8,1,8,Pakistan,Australia,2010,0.00,0.00,1.00,312.92,734.52,...,1.767377,30.656950,4.431474,5.589405,286.92,42.59,324.8,375.817792,335.690898,10.001532
9,1,9,Pakistan,Australia,2010,0.00,0.00,1.00,312.92,734.52,...,1.767377,30.656950,4.431474,5.589405,286.92,42.59,324.8,375.817792,335.690898,10.001532


In [52]:
xgMatchesFull = xgMatches.append(mirrorData)
#xgMatchesFull.to_csv("help.csv")

In [37]:
xgMatchesFull = pd.read_csv("help.csv")


In [38]:
basicXGFullMatches = xgMatchesFull.drop(["Team1","Team2","MatchID"],axis = 'columns')
basicXGFullMatches = basicXGFullMatches.astype("float64")

In [80]:
trainX = basicXGFullMatches.loc[0:500].append(basicXGFullMatches.loc[651:1151]).loc[:,"Year":"WickTake/Inning1"]
trainY = basicXGFullMatches.loc[0:500].append(basicXGFullMatches.loc[651:1151]).loc[:,"Winner"]
testX = basicXGFullMatches.loc[500:651].append(basicXGFullMatches.loc[1151:]).loc[:,"Year":"WickTake/Inning1"]
testY = basicXGFullMatches.loc[500:651].append(basicXGFullMatches.loc[1151:]).loc[:,"Winner"]
testY

500     0.0
501     0.0
502     1.0
503     0.0
504     0.0
505     0.0
506     0.0
507     0.0
508     1.0
509     0.0
510     0.0
511     0.0
512     0.0
513     0.0
514     1.0
515     0.0
516     0.0
517     0.0
518     1.0
519     1.0
520     1.0
521     0.0
522     0.0
523     0.0
524     0.0
525     0.0
526     1.0
527     1.0
528     0.0
529     0.0
       ... 
1272    0.0
1273    0.0
1274    0.0
1275    1.0
1276    0.0
1277    0.0
1278    0.0
1279    0.0
1280    0.0
1281    0.0
1282    1.0
1283    1.0
1284    1.0
1285    1.0
1286    1.0
1287    1.0
1288    0.0
1289    1.0
1290    1.0
1291    1.0
1292    1.0
1293    1.0
1294    1.0
1295    1.0
1296    1.0
1297    0.0
1298    1.0
1299    1.0
1300    1.0
1301    1.0
Name: Winner, Length: 303, dtype: float64

In [164]:
xg_reg2 = xgb.XGBRegressor(objective ='binary:logistic', colsample_bytree = 0.3, learning_rate = 0.005, max_depth = 5, 
                          alpha = 0, n_estimators = 111, subsample = 1, min_child_weight = 2.5, gamma = 0.25)

xg_reg2.fit(trainX,trainY)
preds2 = xg_reg2.predict(testX)
error = roc_auc_score(testY,preds2)

j=0
for i in range(0,len(preds2)):
    if (preds2[i]>0.5):
        preds2[i]=1
    else:
        preds2[i]=0
    if(i<=151):
        if(preds2[i] != testY.loc[i+500]):
            j+=1
    else:
        if(preds2[i] != testY.loc[i+999]):
            j+=1
print(error)
print(j)
print(len(preds2))

0.6618377483443708
115
303


In [None]:
for i in range(0,len(preds2)):

In [309]:
worldCupData = pd.read_csv("WorldCupMatches.csv")

In [316]:
T1 = []
T2 = []
for index,row in worldCupData.iterrows():
    T1.append(teamIndices.index(row["Team1"]))
    T2.append(teamIndices.index(row["Team2"]))
worldCupData["Team1"] = T1
worldCupData["Team2"] = T2

worldCupData = worldCupData.astype("float64")

In [310]:
worldCupData["Year"]=2019
worldCupData

Unnamed: 0,Team1,Team2,Year
0,England,South Africa,2019
1,Pakistan,West Indies,2019
2,New Zealand,Sri Lanka,2019
3,Australia,Afghanistan,2019
4,Bangladesh,South Africa,2019
5,England,Pakistan,2019
6,Afghanistan,Sri Lanka,2019
7,India,South Africa,2019
8,Bangladesh,New Zealand,2019
9,Australia,West Indies,2019


In [311]:
winrates = []
for index, row in worldCupData.iterrows():
    Team1 = row["Team1"]
    Team2 = row["Team2"]
    count1 = 0
    count2 = 0
    j=1
    for i in range(0,651):
        if((realMatches["Team1"][i] == Team1 and realMatches["Team2"][i] == Team2)
          or (realMatches["Team1"][i] == Team2 and realMatches["Team2"][i] == Team1)):
            if(realMatches["Winner"][i] == Team1):
                count1 += realMatches["Year"][i] - 2009
            else:
                count2 += realMatches["Year"][i] - 2009
            j*=1.5
    if(count1 + count2 > 0):
        #print(round(count1/(count1+count2),2))
        winrates.append(round(count1/(count1+count2),2))
    else:
        winrates.append(0.50)
worldCupData["WinRateVsTeam"] = winrates

In [312]:
winratesT1 = []
for index, row in worldCupData.iterrows():
    Team1 = row["Team1"]
    Team2 = row["Team2"]
    count1 = 0
    count2 = 0
    j=1
    for i in range(0,651):
        if(realMatches["Team1"][i] == Team1  or realMatches["Team2"][i] == Team1):
            if(realMatches["Winner"][i] == Team1):
                count1 += realMatches["Year"][i] - 2009
            else:
                count2 += realMatches["Year"][i] - 2009
    if(count1 + count2 > 0):
        #print(round(count1/(count1+count2),2))
        winratesT1.append(round(count1/(count1+count2),2))
    else:
        winratesT1.append(0.50)
worldCupData["T1WinRate"] = winratesT1

In [313]:
winratesT2 = []
for index, row in worldCupData.iterrows():
    Team1 = row["Team1"]
    Team2 = row["Team2"]
    count1 = 0
    count2 = 0
    j=1
    for i in range(0,651):
        if(realMatches["Team1"][i] == Team2  or realMatches["Team2"][i] == Team2):
            if(realMatches["Winner"][i] == Team2):
                count1 += realMatches["Year"][i] - 2009
            else:
                count2 += realMatches["Year"][i] - 2009
    if(count1 + count2 > 0):
        #print(round(count1/(count1+count2),2))
        winratesT2.append(round(count1/(count1+count2),2))
    else:
        winratesT2.append(0.50)
worldCupData["T2WinRate"] = winratesT2

In [314]:
worldCupData

Unnamed: 0,Team1,Team2,Year,WinRateVsTeam,T1WinRate,T2WinRate
0,England,South Africa,2019,0.55,0.56,0.61
1,Pakistan,West Indies,2019,0.69,0.46,0.3
2,New Zealand,Sri Lanka,2019,0.61,0.52,0.41
3,Australia,Afghanistan,2019,1.0,0.61,0.22
4,Bangladesh,South Africa,2019,0.27,0.37,0.61
5,England,Pakistan,2019,0.73,0.56,0.46
6,Afghanistan,Sri Lanka,2019,0.0,0.22,0.41
7,India,South Africa,2019,0.51,0.62,0.61
8,Bangladesh,New Zealand,2019,0.46,0.37,0.52
9,Australia,West Indies,2019,0.8,0.61,0.3


In [315]:
i = 1
for stat in stats:
    arr = []
    for index,row in worldCupData.iterrows():
        arr.append(teamData.loc[teamIndices.index(row["Team2"])][i])
    i+=1
    worldCupData[stat + '2'] = arr
    
i = 1
for stat in stats:
    arr = []
    for index,row in worldCupData.iterrows():
        arr.append(teamData.loc[teamIndices.index(row["Team1"])][i])
    i+=1
    worldCupData[stat + '1'] = arr

In [None]:
worldCupData

In [366]:
xg_reg1.fit(basicXGMatches.loc[:,"Team1":"WickTake/Inning1"],basicXGMatches.loc[:,"Winner"])
wcPreds = xg_reg1.predict(worldCupData)
wcPreds

array([0.43535936, 0.423692  , 0.42313468, 0.40903378, 0.47071308,
       0.4466241 , 0.47198913, 0.4605532 , 0.4403514 , 0.39453873,
       0.4150568 , 0.4067349 , 0.4371676 , 0.40107104, 0.41476774,
       0.45256132, 0.40588567, 0.42037633, 0.40611824, 0.40660873,
       0.50145173, 0.42619914, 0.441254  , 0.43418586, 0.46834868,
       0.39782646, 0.4320345 , 0.5013491 , 0.40287715, 0.46987233,
       0.45206147, 0.41105247, 0.4315536 , 0.428414  , 0.42098755,
       0.48014688, 0.38973352, 0.43425095, 0.43213344, 0.49610913,
       0.42227527, 0.4617461 , 0.47602654, 0.43510532, 0.43348396],
      dtype=float32)

In [367]:
i=0
worldCupGames = pd.read_csv("WorldCupMatches.csv")
for index,row in worldCupGames.iterrows():
    if(wcPreds[i]<0.45):
        print(row["Team1"] + " vs " + row["Team2"] + ": Winner = " + row["Team1"])
    else:
        print(row["Team1"] + " vs " + row["Team2"] + ": Winner = " + row["Team2"])
    i+=1
    


England vs South Africa: Winner = England
Pakistan vs West Indies: Winner = Pakistan
New Zealand vs Sri Lanka: Winner = New Zealand
Australia vs Afghanistan: Winner = Australia
Bangladesh vs South Africa: Winner = South Africa
England vs Pakistan: Winner = England
Afghanistan vs Sri Lanka: Winner = Sri Lanka
India vs South Africa: Winner = South Africa
Bangladesh vs New Zealand: Winner = Bangladesh
Australia vs West Indies: Winner = Australia
Pakistan vs Sri Lanka: Winner = Pakistan
England vs Bangladesh: Winner = England
Afghanistan vs New Zealand: Winner = Afghanistan
Australia vs India: Winner = Australia
South Africa vs West Indies: Winner = South Africa
Bangladesh vs Sri Lanka: Winner = Sri Lanka
Australia vs Pakistan: Winner = Australia
India vs New Zealand: Winner = India
England vs West Indies: Winner = England
Australia vs Sri Lanka: Winner = Australia
Afghanistan vs South Africa: Winner = South Africa
India vs Pakistan: Winner = India
Bangladesh vs West Indies: Winner = Bangl