In [None]:
import operator
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Data Importing

In [None]:
# Read the match-level table and the deliveries table from the working directory.
match_df, score_df = (
    pd.read_csv(csv_name) for csv_name in ("matches.csv", "deliveries.csv")
)

## Data Preparing

In [None]:
# runs and wickets per over #
# Attach the match-level columns (season, toss, winner, teams, venue) to every
# delivery row.
score_df = pd.merge(score_df, match_df[['id','season', 'toss_winner','winner', 'result', 'dl_applied', 'team1', 'team2','venue']], left_on='match_id', right_on='id')

# Keep only the venues analysed in this notebook. `.copy()` gives an independent
# frame so the column assignment below does not write into a view of the merge.
venues = [
    'Eden Gardens',
    'Wankhede Stadium',
    'Rajiv Gandhi International Stadium, Uppal',
    'Maharashtra Cricket Association Stadium',
    'Saurashtra Cricket Association Stadium',
    'Holkar Cricket Stadium',
    'M Chinnaswamy Stadium',
    'Feroz Shah Kotla',
    'Punjab Cricket Association IS Bindra Stadium, Mohali',
    'Green Park',
]
score_df = score_df.loc[score_df['venue'].isin(venues)].copy()

# Turn `player_dismissed` into a 0/1 wicket flag: missing value -> 0, anything
# else -> 1. Done as a single column assignment instead of an inplace fillna
# followed by a chained `.loc` write, which pandas does not guarantee to
# propagate back to the frame.
score_df['player_dismissed'] = score_df['player_dismissed'].notna().astype(int)

# One row per (match, inning, over) with the runs and wickets of that over.
train_df = (
    score_df
    .groupby(['match_id', 'inning', 'over', 'team1', 'team2', 'venue','batting_team','toss_winner', 'winner'])[['total_runs', 'player_dismissed']]
    .sum()
    .reset_index()
)

# innings score and wickets #
# Running totals within each innings, in the over order produced by the groupby.
train_df['innings_wickets'] = train_df.groupby(['match_id', 'inning'])['player_dismissed'].cumsum()
train_df['innings_score'] = train_df.groupby(['match_id', 'inning'])['total_runs'].cumsum()

# Get the target column #
# The inning-1 total of each match is re-keyed to inning 2 so that the left
# merge puts it on the second-innings rows; every other row gets -1.
innings_totals = train_df.groupby(['match_id', 'inning'])['total_runs'].sum().reset_index()
temp_df = innings_totals.loc[innings_totals['inning'] == 1].copy()
temp_df['inning'] = 2
temp_df.columns = ['match_id', 'inning', 'score_target']
train_df = train_df.merge(temp_df, how='left', on = ['match_id', 'inning'])
train_df['score_target'] = train_df['score_target'].fillna(-1)

# get the remaining target #
def get_remaining_target(row):
    """Return the runs still needed for one over-level row.

    `row` is indexed by column name. When `score_target` holds the -1
    placeholder the result is -1; otherwise it is `score_target` minus
    `innings_score`.
    """
    chase_total = row['score_target']
    return -1 if chase_total == -1. else chase_total - row['innings_score']

# Row-wise remaining target (the helper is passed directly to apply).
train_df['remaining_target'] = train_df.apply(get_remaining_target, axis=1)

# get the run rate #
# Cumulative innings score divided by the over number of the row.
train_df['run_rate'] = train_df['innings_score'].div(train_df['over'])

# get the remaining run rate #
def get_required_rr(row):
    """Return the required run rate for one over-level row.

    Returns -1.0 when `remaining_target` is -1, 99 when `over` is 20, and
    otherwise `remaining_target / (20 - over)`.

    NOTE(review): `remaining_target` can also be -1 as a genuine difference
    (`score_target - innings_score`), and such rows get -1.0 here as well.
    Confirm that is intended.
    """
    runs_left = row['remaining_target']
    if runs_left == -1:
        return -1.
    overs_left = 20 - row['over']
    if overs_left == 0:
        # No overs left to divide by: fixed placeholder value.
        return 99
    return runs_left / overs_left

# Row-wise required run rate (the helper is passed directly to apply).
train_df['required_run_rate'] = train_df.apply(get_required_rr, axis=1)

def get_rr_diff(row):
    """Return `run_rate - required_run_rate` for a row, or -1 when `inning` is 1."""
    if row['inning'] != 1:
        return row['run_rate'] - row['required_run_rate']
    return -1

train_df['runrate_diff'] = train_df.apply(get_rr_diff, axis=1)

# 0/1 flags comparing team1 with the toss winner, the batting team and the winner.
train_df['is_toss_winner_bat_first'] = (train_df['team1'] == train_df['toss_winner']).astype('int')
train_df['is_batting_team'] = (train_df['team1'] == train_df['batting_team']).astype('int')
train_df['target'] = (train_df['team1'] == train_df['winner']).astype('int')

# Get the score_total column for the final score after 1st inning #
# Inning-1 total per match, merged back on (match_id, inning) so only the
# inning-1 rows receive it; every other row gets -1. `.copy()` makes the slice
# an independent frame, and the fill is assigned back to the column rather than
# done through an inplace call on a column selection.
innings_totals = train_df.groupby(['match_id', 'inning'])['total_runs'].sum().reset_index()
temp_df = innings_totals.loc[innings_totals['inning'] == 1].copy()
temp_df.columns = ['match_id', 'inning', 'score_total']
train_df = train_df.merge(temp_df, how='left', on = ['match_id', 'inning'])
train_df['score_total'] = train_df['score_total'].fillna(-1)

## Overall Correlation Matrix Heatmap

In [None]:
# Correlation heatmap of the numeric columns of train_df. The frame also holds
# string columns (teams, venue, winner), so the numeric columns are selected
# explicitly; recent pandas raises on .corr() when non-numeric columns are present.
plt.figure(figsize=(20, 20))
sns.heatmap(train_df.select_dtypes(include='number').corr(),linewidths=0.25,vmax=1.0,square=True,cmap="YlGnBu",linecolor='w',annot=True)

# Regression Analysis to find the Final Score of the 1st Inning after every second Over

In [None]:
# Select the 1st-innings rows by the `inning` column itself. `is_batting_team`
# only says whether team1 is batting, which is a proxy for the innings, while
# `score_total` (the regression target) is filled for inning-1 rows only.
train_1st=train_df.loc[train_df.inning==1,:] #selecting only the 1st innings

## Correlation Matrix and Heatmap

In [None]:
# Correlation heatmap of the numeric columns of train_1st. String columns are
# excluded explicitly; recent pandas raises on .corr() when they are present.
plt.figure(figsize=(20, 20))
sns.heatmap(train_1st.select_dtypes(include='number').corr(),linewidths=0.25,vmax=1.0,square=True,cmap="YlGnBu",linecolor='w',annot=True)

## Multiple Linear Regression

In [None]:
print('Over--------rmse----------Accuracy Percentage---------Predicted Score--------Actual Score')
import random
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Random position used to pick one test match to follow over by over.
# The Random Forest and SVR regression cells reuse this same `n`.
n=random.randint(0,57)
PercentageList=[]
OverList=[]
y_pred_list=[]
y_test_list=[]
rmse_List=[]

# Positional column selection; the second selected column (position 5) is the
# categorical 'venue' column that gets one-hot encoded.
feature_cols = train_1st.columns[[2,5,9,10,11,12,15,18,20]]
venue_col = feature_cols[1]
target_col = train_1st.columns[21]

test_df = train_1st.loc[train_1st.match_id<=59,:] #testing dataset
train_train_df = train_1st.loc[train_1st.match_id>59,:] #training dataset

# One-hot encode the venue. The test frame is re-indexed to the TRAINING columns
# so both matrices share the same layout even when a venue is missing from (or
# only present in) the test rows.
x_train_df = pd.get_dummies(train_train_df[feature_cols], columns=[venue_col]).astype(float)
x_test_df = (
    pd.get_dummies(test_df[feature_cols], columns=[venue_col])
    .reindex(columns=x_train_df.columns, fill_value=0)
    .astype(float)
)

# Scaling and model fitting do not depend on the over, so they are done once.
sc = StandardScaler()
x_train = sc.fit_transform(x_train_df.values)
y_train = train_train_df[target_col].values

regressor = LinearRegression()
regressor.fit(x_train, y_train)

# Follow one fixed test match (chosen through `n`) instead of a raw row
# position, which can be out of range and can point at different matches from
# one over to the next.
test_match_ids = test_df.match_id.unique()
if len(test_match_ids) == 0:
    raise ValueError('no test matches (match_id <= 59) available')
tracked_match = test_match_ids[n % len(test_match_ids)]

for i in range(2,19,2):
    over_rows = (test_df.over == i).values
    if not over_rows.any():
        continue  # no test innings has a row for this over

    x_test = sc.transform(x_test_df.values[over_rows])
    y_test = test_df[target_col].values[over_rows]

    # Predicting the Test set results
    y_pred = regressor.predict(x_test)

    p=(np.mean(1-abs(y_pred-y_test)/y_test)*100) #average accuracy percentage
    rmse = np.around(np.sqrt(np.mean((y_test - y_pred)**2))) #average rmse

    tracked = np.flatnonzero(test_df.match_id.values[over_rows] == tracked_match)
    if tracked.size:
        y_p = y_pred[tracked[0]] #Predicted final score of the tracked match
        y_t = y_test[tracked[0]] #Actual final score of the tracked match
    else:
        y_p = y_t = np.nan  # the tracked match has no row for this over

    PercentageList.append(p)
    rmse_List.append(rmse)
    OverList.append(i)
    print(i,'------',rmse,'-------',p, '----------',y_p,'------------', y_t)
    y_pred_list.append(y_p)
    y_test_list.append(y_t)
   

In [None]:
import altair as alt
from vega_datasets import data
alt.renderers.enable('notebook')

# Accuracy percentage per over (bar chart)
Over_df = (
    pd.Series(dict(zip(OverList, PercentageList)), name='Percentage')
    .rename_axis('Over')
    .reset_index()
)
a = (
    alt.Chart(Over_df)
    .mark_bar()
    .encode(x='Over', y='Percentage')
    .properties(title='Overwise average accuracy to predict average final score after 20th over in the 1st innings using MLR')
)

# Predicted final score of the selected match per over (line)
y_pred_df = (
    pd.Series(dict(zip(OverList, y_pred_list)), name='Predicted Value')
    .rename_axis('Over')
    .reset_index()
)
b = alt.Chart(y_pred_df).mark_line().encode(x='Over', y='Predicted Value')

# Actual final score of the selected match per over (red line)
y_test_df = (
    pd.Series(dict(zip(OverList, y_test_list)), name='Actual Value')
    .rename_axis('Over')
    .reset_index()
)
c = alt.Chart(y_test_df).mark_line(color='red').encode(x='Over', y='Actual Value')

# Both lines layered into one chart
d = (b + c).properties(
    title='Overwise predicted score(Blue) using MLR and the actual score(Red) after 20th Over of a randomly selected match'
)

# RMSE per over (line chart)
rmse_df = (
    pd.Series(dict(zip(OverList, rmse_List)), name='rmse')
    .rename_axis('Over')
    .reset_index()
)
e = (
    alt.Chart(rmse_df)
    .mark_line()
    .encode(x='Over', y='rmse')
    .properties(title='Overwise average rmse deviation from the predicted and actual final score of MLR')
)

# Show the three charts side by side
a|d|e

In [None]:
# Display the layered predicted-vs-actual line chart `d` on its own.
d

## Random Forest Regression

In [None]:
print('Over--------rmse----------Accuracy Percentage---------Predicted Score--------Actual Score')
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

PercentageList=[]
OverList=[]
y_pred_list=[]
y_test_list=[]
rmse_List=[]

# Positional column selection; the second selected column (position 5) is the
# categorical 'venue' column that gets one-hot encoded.
feature_cols = train_1st.columns[[2,5,9,10,11,12,15,18,20]]
venue_col = feature_cols[1]
target_col = train_1st.columns[21]

test_df = train_1st.loc[train_1st.match_id<=59,:] #testing dataset
train_train_df = train_1st.loc[train_1st.match_id>59,:] #training dataset

# One-hot encode the venue. The test frame is re-indexed to the TRAINING columns
# so both matrices share the same layout even when a venue is missing from (or
# only present in) the test rows.
x_train_df = pd.get_dummies(train_train_df[feature_cols], columns=[venue_col]).astype(float)
x_test_df = (
    pd.get_dummies(test_df[feature_cols], columns=[venue_col])
    .reindex(columns=x_train_df.columns, fill_value=0)
    .astype(float)
)

# Scaling and model fitting do not depend on the over, so they are done once
# (the forest uses a fixed random_state, so refitting per over gave the same model).
sc = StandardScaler()
x_train = sc.fit_transform(x_train_df.values)
y_train = train_train_df[target_col].values

regressor = RandomForestRegressor(n_estimators=250,random_state=0)
regressor.fit(x_train, y_train)

# `n` is the random position drawn in the Multiple Linear Regression cell.
# It selects one fixed test match to follow, instead of a raw row position that
# can be out of range and can point at different matches from over to over.
test_match_ids = test_df.match_id.unique()
if len(test_match_ids) == 0:
    raise ValueError('no test matches (match_id <= 59) available')
tracked_match = test_match_ids[n % len(test_match_ids)]

for i in range(2,19,2):
    over_rows = (test_df.over == i).values
    if not over_rows.any():
        continue  # no test innings has a row for this over

    x_test = sc.transform(x_test_df.values[over_rows])
    y_test = test_df[target_col].values[over_rows]

    # Predicting the Test set results
    y_pred = regressor.predict(x_test)

    p=(np.mean(1-abs(y_pred-y_test)/y_test)*100) #average accuracy percentage
    rmse = np.around(np.sqrt(np.mean((y_test - y_pred)**2))) #average rmse

    tracked = np.flatnonzero(test_df.match_id.values[over_rows] == tracked_match)
    if tracked.size:
        y_p = y_pred[tracked[0]] #predicted final score of the tracked match
        y_t = y_test[tracked[0]] #actual final score of the tracked match
    else:
        y_p = y_t = np.nan  # the tracked match has no row for this over

    # Printed only after y_p / y_t are set for THIS over, so the line never
    # shows values left over from the previous iteration or cell.
    print(i,'------',rmse,'-------',p, '----------',y_p,'------------', y_t)
    PercentageList.append(p)
    rmse_List.append(rmse)
    OverList.append(i)
    y_pred_list.append(y_p)
    y_test_list.append(y_t)
   

In [None]:
import altair as alt
from vega_datasets import data
alt.renderers.enable('notebook')

# Accuracy percentage per over (bar chart)
Over_df = (
    pd.Series(dict(zip(OverList, PercentageList)), name='Percentage')
    .rename_axis('Over')
    .reset_index()
)
a = (
    alt.Chart(Over_df)
    .mark_bar()
    .encode(x='Over', y='Percentage')
    .properties(title='Overwise average accuracy to predict average final score after 20th over in the 1st innings using RFR')
)

# Predicted final score of the selected match per over (line)
y_pred_df = (
    pd.Series(dict(zip(OverList, y_pred_list)), name='Predicted Value')
    .rename_axis('Over')
    .reset_index()
)
b = alt.Chart(y_pred_df).mark_line().encode(x='Over', y='Predicted Value')

# Actual final score of the selected match per over (red line)
y_test_df = (
    pd.Series(dict(zip(OverList, y_test_list)), name='Actual Value')
    .rename_axis('Over')
    .reset_index()
)
c = alt.Chart(y_test_df).mark_line(color='red').encode(x='Over', y='Actual Value')

# Both lines layered into one chart
d = (b + c).properties(
    title='Overwise predicted score(Blue) using RFR and the actual score(Red) after 20th Over of a randomly selected match'
)

# RMSE per over (line chart)
rmse_df = (
    pd.Series(dict(zip(OverList, rmse_List)), name='rmse')
    .rename_axis('Over')
    .reset_index()
)
e = (
    alt.Chart(rmse_df)
    .mark_line()
    .encode(x='Over', y='rmse')
    .properties(title='Overwise average rmse deviation from the predicted and actual final score of RFR')
)

# Show the three charts side by side
a|d|e

In [None]:
# Display the layered predicted-vs-actual line chart `d` on its own.
d

## Support Vector Regression

In [None]:
print('Over--------rmse----------Accuracy Percentage---------Predicted Score--------Actual Score')
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

PercentageList=[]
OverList=[]
y_pred_list=[]
y_test_list=[]
rmse_List=[]

# Positional column selection; the second selected column (position 5) is the
# categorical 'venue' column that gets one-hot encoded.
feature_cols = train_1st.columns[[2,5,9,10,11,12,15,18,20]]
venue_col = feature_cols[1]
target_col = train_1st.columns[21]

test_df = train_1st.loc[train_1st.match_id<=59,:] #testing dataset
train_train_df = train_1st.loc[train_1st.match_id>59,:] #training dataset

# One-hot encode the venue. The test frame is re-indexed to the TRAINING columns
# so both matrices share the same layout even when a venue is missing from (or
# only present in) the test rows.
x_train_df = pd.get_dummies(train_train_df[feature_cols], columns=[venue_col]).astype(float)
x_test_df = (
    pd.get_dummies(test_df[feature_cols], columns=[venue_col])
    .reindex(columns=x_train_df.columns, fill_value=0)
    .astype(float)
)

# Scaling and model fitting do not depend on the over, so they are done once.
sc = StandardScaler()
x_train = sc.fit_transform(x_train_df.values)
y_train = train_train_df[target_col].values

regressor=SVR(kernel='linear')
regressor.fit(x_train, y_train)

# `n` is the random position drawn in the Multiple Linear Regression cell.
# It selects one fixed test match to follow, instead of a raw row position that
# can be out of range and can point at different matches from over to over.
test_match_ids = test_df.match_id.unique()
if len(test_match_ids) == 0:
    raise ValueError('no test matches (match_id <= 59) available')
tracked_match = test_match_ids[n % len(test_match_ids)]

for i in range(2,19,2):
    over_rows = (test_df.over == i).values
    if not over_rows.any():
        continue  # no test innings has a row for this over

    x_test = sc.transform(x_test_df.values[over_rows])
    y_test = test_df[target_col].values[over_rows]

    # Predicting the Test set results
    y_pred = regressor.predict(x_test)

    p=(np.mean(1-abs(y_pred-y_test)/y_test)*100) #average accuracy percentage
    rmse = np.around(np.sqrt(np.mean((y_test - y_pred)**2))) #average rmse

    tracked = np.flatnonzero(test_df.match_id.values[over_rows] == tracked_match)
    if tracked.size:
        y_p = y_pred[tracked[0]] #predicted final score of the tracked match
        y_t = y_test[tracked[0]] #actual final score of the tracked match
    else:
        y_p = y_t = np.nan  # the tracked match has no row for this over

    # Printed only after y_p / y_t are set for THIS over, so the line never
    # shows values left over from the previous iteration or cell.
    print(i,'------',rmse,'-------',p, '----------',y_p,'------------', y_t)
    PercentageList.append(p)
    rmse_List.append(rmse)
    OverList.append(i)
    y_pred_list.append(y_p)
    y_test_list.append(y_t)
   

In [None]:
import altair as alt
from vega_datasets import data
alt.renderers.enable('notebook')

# Accuracy percentage per over (bar chart)
Over_df = (
    pd.Series(dict(zip(OverList, PercentageList)), name='Percentage')
    .rename_axis('Over')
    .reset_index()
)
a = (
    alt.Chart(Over_df)
    .mark_bar()
    .encode(x='Over', y='Percentage')
    .properties(title='Overwise average accuracy to predict average final score after 20th over in the 1st innings using SVR')
)

# Predicted final score of the selected match per over (line)
y_pred_df = (
    pd.Series(dict(zip(OverList, y_pred_list)), name='Predicted Value')
    .rename_axis('Over')
    .reset_index()
)
b = alt.Chart(y_pred_df).mark_line().encode(x='Over', y='Predicted Value')

# Actual final score of the selected match per over (red line)
y_test_df = (
    pd.Series(dict(zip(OverList, y_test_list)), name='Actual Value')
    .rename_axis('Over')
    .reset_index()
)
c = alt.Chart(y_test_df).mark_line(color='red').encode(x='Over', y='Actual Value')

# Both lines layered into one chart
d = (b + c).properties(
    title='Overwise predicted score(Blue) using SVR and the actual score(Red) after 20th Over of any randomly selected match'
)

# RMSE per over (line chart)
rmse_df = (
    pd.Series(dict(zip(OverList, rmse_List)), name='rmse')
    .rename_axis('Over')
    .reset_index()
)
e = (
    alt.Chart(rmse_df)
    .mark_line()
    .encode(x='Over', y='rmse')
    .properties(title='Overwise average rmse deviation from the predicted and actual final score of SVR')
)

# Show the three charts side by side
a|d|e

In [None]:
# Display the RMSE line chart `e` on its own.
e

## Classification for match winner prediction after every second over of 2nd inning

In [None]:
# Select the 2nd-innings rows by the `inning` column itself. `is_batting_team`
# only says whether team1 is batting, which is a proxy for the innings, while
# `score_target` and the required run rate are only filled for inning-2 rows.
# The name `train_1st` is kept because the classification cells read it.
train_1st=train_df.loc[train_df.inning==2,:] #selecting only the second innings

## Correlation Matrix and Heatmap

In [None]:
# Correlation heatmap of the numeric columns of train_1st. String columns are
# excluded explicitly; recent pandas raises on .corr() when they are present.
plt.figure(figsize=(20, 20))
sns.heatmap(train_1st.select_dtypes(include='number').corr(),linewidths=0.25,vmax=1.0,square=True,cmap="YlGnBu",linecolor='w',annot=True)

## Random Forest Classifier

In [None]:
print('Over-------------Precision')
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

PrecisionList=[]
OverList=[]

# Positional column selection; the second selected column (position 5) is the
# categorical 'venue' column that gets one-hot encoded.
feature_cols = train_1st.columns[[2,5,9,10,11,12,13,15,16,18]]
venue_col = feature_cols[1]
target_col = train_1st.columns[20]

# Train and test splits must not share matches: the training rows previously
# started at match_id > 50 and so overlapped the test rows (match_id <= 59).
test_df = train_1st.loc[train_1st.match_id<=59,:] #Testing Dataset
train_train_df = train_1st.loc[train_1st.match_id>59,:] #Training Dataset

# One-hot encode the venue. The test frame is re-indexed to the TRAINING columns
# so both matrices share the same layout even when a venue is missing from (or
# only present in) the test rows.
x_train_df = pd.get_dummies(train_train_df[feature_cols], columns=[venue_col]).astype(float)
x_test_df = (
    pd.get_dummies(test_df[feature_cols], columns=[venue_col])
    .reindex(columns=x_train_df.columns, fill_value=0)
    .astype(float)
)

# Scaling and model fitting do not depend on the over, so they are done once
# (the forest uses a fixed random_state, so refitting per over gave the same model).
sc = StandardScaler()
x_train = sc.fit_transform(x_train_df.values)
y_train = train_train_df[target_col].values

classifier = RandomForestClassifier(n_estimators=250,criterion='entropy',random_state=0)
classifier.fit(x_train, y_train)

for i in range(2,19,2):
    over_rows = (test_df.over == i).values
    if not over_rows.any():
        continue  # no test innings has a row for this over

    x_test = sc.transform(x_test_df.values[over_rows])
    y_test = test_df[target_col].values[over_rows]

    # Predicting the Test set results
    y_pred = classifier.predict(x_test)

    # labels=[0, 1] keeps the matrix 2x2 even if only one class occurs for this over.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    # Share of correct predictions in percent: (cm[0,0] + cm[1,1]) / total. This
    # is overall accuracy; the 'Precision' names are kept because the chart cell
    # reads them.
    p = np.trace(cm) / cm.sum() * 100
    print(i,'-------------',p)
    PrecisionList.append(p)
    OverList.append(i)

In [None]:
#Precision Percentage bar graph
OverPerDict=dict(zip(OverList,PrecisionList))
OverPerDic_df=pd.Series(OverPerDict, name='Precision')
OverPerDic_df.index.name = 'Over'
Over_df=OverPerDic_df.reset_index()
import altair as alt
from vega_datasets import data
alt.renderers.enable('notebook')
a=alt.Chart(Over_df).mark_bar().encode(
    x='Over',
    y='Precision'
).properties(
    title='Overwise average precision to predict the winnerin the 2nd innings using RFC'
)

a

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

def rocAucCurve(classifier, features=None, labels=None):
    """Plot the ROC curve of a fitted classifier, save it as 'Log_ROC' and show it.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict_proba``.
    features : feature matrix to score; defaults to the notebook-level ``x_test``.
    labels : true 0/1 labels; defaults to the notebook-level ``y_test``.

    Returns
    -------
    None. The figure is written with ``plt.savefig('Log_ROC')`` and displayed.

    The area shown in the legend is computed from the same class-1
    probabilities that draw the curve. Computing it from hard ``predict()``
    labels gives the area of a single-threshold curve, not of the plotted one.

    NOTE(review): the other ROC cells in this notebook also save to 'Log_ROC',
    so each run overwrites the previous image.
    """
    features = x_test if features is None else features
    labels = y_test if labels is None else labels

    positive_scores = classifier.predict_proba(features)[:, 1]
    roc_auc = roc_auc_score(labels, positive_scores)
    fpr, tpr, _ = roc_curve(labels, positive_scores)

    plt.figure()
    plt.plot(fpr, tpr, label='Random Forest Classification (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1],'r--')  # chance diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Match winning prediction charactaristic using RFC')
    plt.legend(loc="lower right")
    plt.savefig('Log_ROC')
    plt.show()

# The function returns nothing, so it is called for its plot only.
rocAucCurve(classifier)

## Support Vector Machine 

In [None]:
print('Over-------------Precision')
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

PrecisionList=[]
OverList=[]

# Positional column selection; the second selected column (position 5) is the
# categorical 'venue' column that gets one-hot encoded.
feature_cols = train_1st.columns[[2,5,9,10,11,12,13,15,16,18]]
venue_col = feature_cols[1]
target_col = train_1st.columns[20]

# Train and test splits must not share matches: the training rows previously
# started at match_id > 50 and so overlapped the test rows (match_id <= 59).
test_df = train_1st.loc[train_1st.match_id<=59,:] #Testing Dataset
train_train_df = train_1st.loc[train_1st.match_id>59,:] #Training Dataset

# One-hot encode the venue. The test frame is re-indexed to the TRAINING columns
# so both matrices share the same layout even when a venue is missing from (or
# only present in) the test rows.
x_train_df = pd.get_dummies(train_train_df[feature_cols], columns=[venue_col]).astype(float)
x_test_df = (
    pd.get_dummies(test_df[feature_cols], columns=[venue_col])
    .reindex(columns=x_train_df.columns, fill_value=0)
    .astype(float)
)

# Scaling and model fitting do not depend on the over, so they are done once
# rather than refitting the SVC on identical data for every over.
sc = StandardScaler()
x_train = sc.fit_transform(x_train_df.values)
y_train = train_train_df[target_col].values

classifier = SVC(kernel = 'poly', random_state = 0, probability=True)
classifier.fit(x_train, y_train)

for i in range(2,19,2):
    over_rows = (test_df.over == i).values
    if not over_rows.any():
        continue  # no test innings has a row for this over

    x_test = sc.transform(x_test_df.values[over_rows])
    y_test = test_df[target_col].values[over_rows]

    # Predicting the Test set results
    y_pred = classifier.predict(x_test)

    # labels=[0, 1] keeps the matrix 2x2 even if only one class occurs for this over.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    # Share of correct predictions in percent: (cm[0,0] + cm[1,1]) / total. This
    # is overall accuracy; the 'Precision' names are kept because the chart cell
    # reads them.
    p = np.trace(cm) / cm.sum() * 100
    print(i,'-------------',p)
    PrecisionList.append(p)
    OverList.append(i)

In [None]:
#Precision Percentage bar graph
OverPerDict=dict(zip(OverList,PrecisionList))
OverPerDic_df=pd.Series(OverPerDict, name='Precision')
OverPerDic_df.index.name = 'Over'
Over_df=OverPerDic_df.reset_index()
import altair as alt
from vega_datasets import data
alt.renderers.enable('notebook')
a=alt.Chart(Over_df).mark_bar().encode(
    x='Over',
    y='Precision'
).properties(
    title='Overwise average precision to predict the winnerin the 2nd innings using SVM'
)

a

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

def rocAucCurve(classifier, features=None, labels=None):
    """Plot the ROC curve of a fitted classifier, save it as 'Log_ROC' and show it.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict_proba``.
    features : feature matrix to score; defaults to the notebook-level ``x_test``.
    labels : true 0/1 labels; defaults to the notebook-level ``y_test``.

    Returns
    -------
    None. The figure is written with ``plt.savefig('Log_ROC')`` and displayed.

    The area shown in the legend is computed from the same class-1
    probabilities that draw the curve. Computing it from hard ``predict()``
    labels gives the area of a single-threshold curve, not of the plotted one.

    NOTE(review): the other ROC cells in this notebook also save to 'Log_ROC',
    so each run overwrites the previous image.
    """
    features = x_test if features is None else features
    labels = y_test if labels is None else labels

    positive_scores = classifier.predict_proba(features)[:, 1]
    roc_auc = roc_auc_score(labels, positive_scores)
    fpr, tpr, _ = roc_curve(labels, positive_scores)

    plt.figure()
    plt.plot(fpr, tpr, label='Support Vector Machine (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1],'r--')  # chance diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Match winning prediction charactaristic with SVM')
    plt.legend(loc="lower right")
    plt.savefig('Log_ROC')
    plt.show()

# The function returns nothing, so it is called for its plot only.
rocAucCurve(classifier)

## K-Nearest Neighbour

In [None]:
print('Over-------------Precision')
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

PrecisionList=[]
OverList=[]

# Positional column selection; the second selected column (position 5) is the
# categorical 'venue' column that gets one-hot encoded.
feature_cols = train_1st.columns[[2,5,9,10,11,12,13,15,16,18]]
venue_col = feature_cols[1]
target_col = train_1st.columns[20]

# Train and test splits must not share matches: the training rows previously
# started at match_id > 50 and so overlapped the test rows (match_id <= 59).
test_df = train_1st.loc[train_1st.match_id<=59,:] #Testing Dataset
train_train_df = train_1st.loc[train_1st.match_id>59,:] #Training Dataset

# One-hot encode the venue. The test frame is re-indexed to the TRAINING columns
# so both matrices share the same layout even when a venue is missing from (or
# only present in) the test rows.
x_train_df = pd.get_dummies(train_train_df[feature_cols], columns=[venue_col]).astype(float)
x_test_df = (
    pd.get_dummies(test_df[feature_cols], columns=[venue_col])
    .reindex(columns=x_train_df.columns, fill_value=0)
    .astype(float)
)

# Scaling and model fitting do not depend on the over, so they are done once.
sc = StandardScaler()
x_train = sc.fit_transform(x_train_df.values)
y_train = train_train_df[target_col].values

classifier = KNeighborsClassifier(n_neighbors=6,metric='minkowski', p=2)
classifier.fit(x_train, y_train)

for i in range(2,19,2):
    over_rows = (test_df.over == i).values
    if not over_rows.any():
        continue  # no test innings has a row for this over

    x_test = sc.transform(x_test_df.values[over_rows])
    y_test = test_df[target_col].values[over_rows]

    # Predicting the Test set results
    y_pred = classifier.predict(x_test)

    # labels=[0, 1] keeps the matrix 2x2 even if only one class occurs for this over.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    # Share of correct predictions in percent: (cm[0,0] + cm[1,1]) / total. This
    # is overall accuracy; the 'Precision' names are kept because the chart cell
    # reads them.
    p = np.trace(cm) / cm.sum() * 100
    print(i,'-------------',p)
    PrecisionList.append(p)
    OverList.append(i)

In [None]:
#Precision Percentage bar graph
OverPerDict=dict(zip(OverList,PrecisionList))
OverPerDic_df=pd.Series(OverPerDict, name='Precision')
OverPerDic_df.index.name = 'Over'
Over_df=OverPerDic_df.reset_index()
import altair as alt
from vega_datasets import data
alt.renderers.enable('notebook')
a=alt.Chart(Over_df).mark_bar().encode(
    x='Over',
    y='Precision'
).properties(
    title='Overwise average precision to predict the winnerin the 2nd innings using KNN'
)

a

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

def rocAucCurve(classifier, features=None, labels=None):
    """Plot the ROC curve of a fitted classifier, save it as 'Log_ROC' and show it.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict_proba``.
    features : feature matrix to score; defaults to the notebook-level ``x_test``.
    labels : true 0/1 labels; defaults to the notebook-level ``y_test``.

    Returns
    -------
    None. The figure is written with ``plt.savefig('Log_ROC')`` and displayed.

    The area shown in the legend is computed from the same class-1
    probabilities that draw the curve. Computing it from hard ``predict()``
    labels gives the area of a single-threshold curve, not of the plotted one.

    NOTE(review): the other ROC cells in this notebook also save to 'Log_ROC',
    so each run overwrites the previous image.
    """
    features = x_test if features is None else features
    labels = y_test if labels is None else labels

    positive_scores = classifier.predict_proba(features)[:, 1]
    roc_auc = roc_auc_score(labels, positive_scores)
    fpr, tpr, _ = roc_curve(labels, positive_scores)

    plt.figure()
    plt.plot(fpr, tpr, label='K-Nearest Neighbor (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1],'r--')  # chance diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Match winning prediction charactaristic with KNN')
    plt.legend(loc="lower right")
    plt.savefig('Log_ROC')
    plt.show()

# The function returns nothing, so it is called for its plot only.
rocAucCurve(classifier)