In [36]:
# Importing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from keras.utils import to_categorical
import keras
from keras.layers import Dense
from keras import Sequential

In [2]:
data = pd.read_csv("matches.csv")

In [3]:
data.head()

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2,umpire3
0,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",AY Dandekar,NJ Llong,
1,2,2017,Pune,2017-04-06,Mumbai Indians,Rising Pune Supergiant,Rising Pune Supergiant,field,normal,0,Rising Pune Supergiant,0,7,SPD Smith,Maharashtra Cricket Association Stadium,A Nand Kishore,S Ravi,
2,3,2017,Rajkot,2017-04-07,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,normal,0,Kolkata Knight Riders,0,10,CA Lynn,Saurashtra Cricket Association Stadium,Nitin Menon,CK Nandan,
3,4,2017,Indore,2017-04-08,Rising Pune Supergiant,Kings XI Punjab,Kings XI Punjab,field,normal,0,Kings XI Punjab,0,6,GJ Maxwell,Holkar Cricket Stadium,AK Chaudhary,C Shamshuddin,
4,5,2017,Bangalore,2017-04-08,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore,bat,normal,0,Royal Challengers Bangalore,15,0,KM Jadhav,M Chinnaswamy Stadium,,,


In [4]:
data.toss_decision = data.toss_decision.map({'bat':1, 'field':0})

In [5]:
data.result = data.result.map({'normal':1, 'tie':2, 'no result':0})

In [6]:
data.drop(columns=['venue', 'player_of_match', 'dl_applied','umpire1','umpire2','umpire3','date','city','season','id'], inplace=True)

In [7]:
r = len(data.team2.unique())
teams = data.team1.unique()
mapping = {}

In [8]:
for i in range(14): # There are 14 teams.
    mapping[teams[i]] = i

In [9]:
# Encoding data in numeric form
data.team1 = data.team1.map(mapping)
data.team2 = data.team2.map(mapping)

In [10]:
data.toss_winner = data.toss_winner.map(mapping)

In [11]:
data.winner = data.winner.map(mapping)

In [12]:
data.winner.fillna(0, axis=0, inplace=True)

In [13]:
data.winner = data.winner.astype(int)

In [14]:
for i in range(len(data['win_by_runs'].values)):
    if (data['win_by_runs'][i]) >= 20: # Strong team
        data['win_by_runs'][i] = 1
    else:
        data['win_by_runs'][i] = 0

In [15]:
for i in range(len(data['win_by_wickets'].values)):
    if data['win_by_wickets'][i] >= 7: # Strong team
        data['win_by_wickets'][i] = 1
    else:
        data['win_by_wickets'][i] = 0

In [16]:
data.head()

Unnamed: 0,team1,team2,toss_winner,toss_decision,result,winner,win_by_runs,win_by_wickets
0,0,4,4,0,1,0,1,0
1,1,3,3,0,1,3,0,1
2,2,5,5,0,1,5,0,1
3,3,7,7,0,1,7,0,0
4,4,6,4,1,1,4,0,0


In [17]:
# Extracting features and labels
labels = data.winner.values
features = data.drop(columns=["winner"]).values

In [18]:
# For neural network
ndim = features.shape[1]

In [19]:
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, random_state=3, shuffle=True)

In [20]:
len(features_train)

477

In [22]:
len(data.columns)

8

In [23]:
features.shape[1]

7

In [25]:
# Viewing labels
labels

array([ 0,  3,  5,  7,  4,  0,  1,  7,  6,  1,  5,  1,  2,  5,  6,  1,  3,
        5,  0,  4,  0,  1,  2,  1,  3,  7,  5,  3,  5,  2,  5,  0,  3,  1,
        7,  0,  1,  3,  6,  3,  6,  7,  3,  1,  5,  2,  0,  7,  6,  7,  6,
        0,  1,  3,  4,  3,  5,  1,  1,  5,  8,  6,  4,  5,  9,  6,  8,  9,
        7,  9,  8, 10,  7,  8,  1,  6,  7,  9,  6,  4,  7,  1,  9,  7, 10,
        1,  8,  5,  9,  1,  8,  5,  9,  7,  5,  1,  7,  6,  1,  7,  9,  1,
        8,  6,  9,  7,  4,  7,  6,  9,  4,  5,  9,  8,  9,  8,  9,  1,  4,
        6, 10,  8,  5, 10,  6,  9,  7, 10,  6,  7, 10,  1,  9,  4,  7,  6,
        8,  1,  4,  9,  8,  7,  4,  8,  9,  6, 10,  9,  8,  6,  7,  8,  1,
        6, 10,  4,  1,  6,  4,  9,  7,  8, 10,  7,  6,  5,  4,  5,  8,  6,
        4, 10,  4, 10,  5,  1,  6,  5, 10,  6,  4,  8,  1,  4,  8, 10,  9,
        4, 10,  7,  1,  4,  9,  1,  9,  5,  6,  9,  1,  6,  1,  8,  6,  5,
        4,  8,  1,  7,  6,  9,  8,  9,  5, 10,  7, 10,  4,  7,  1, 10,  1,
        8,  4,  6, 10,  1

In [26]:
# As the labels are multi class, we perform one hot encoding.

In [27]:
labels_copy = to_categorical(labels)

In [40]:
labels_copy.shape
features_copy_train, features_copy_test,labels_copy_train,labels_copy_test = train_test_split(features, labels_copy, random_state=3, shuffle=True)

In [35]:
# Building models
clf1 = SVC(C=100)
clf1.fit(features_train, labels_train)
clf1.score(features_test, labels_test)

0.61006289308176098

In [33]:
clf2 = RandomForestClassifier(n_estimators=100, min_samples_split=10)
clf2.fit(features_train, labels_train)
clf2.score(features_test, labels_test)

0.62893081761006286

In [105]:
# Building neural network
model = Sequential()
model.add(Dense(100, input_dim=ndim, activation="relu"))
model.add(Dense(100,activation="relu"))
model.add(Dense(75, activation="relu"))
model.add(Dense(14, activation="softmax"))
model.compile(optimizer="adam", loss=keras.losses.mse, metrics=["accuracy"])

In [106]:
model.fit(features_copy_train, labels_copy_train,epochs=200, batch_size=100, validation_data=(features_copy_test, labels_copy_test) )

Train on 477 samples, validate on 159 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
E

Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 

<keras.callbacks.History at 0x126db4d68>

## The model is overfitting.

In [46]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_features_train = scaler.fit_transform(features_copy_train)



In [47]:
scaled_features_train

array([[ 0.40455895, -1.44032478, -1.397596  , ..., -0.04583492,
        -0.57977104, -0.58299883],
       [-0.51436781,  1.08902605,  1.04323985, ..., -0.04583492,
        -0.57977104, -0.58299883],
       [ 0.40455895, -1.44032478,  0.43303089, ..., -0.04583492,
        -0.57977104, -0.58299883],
       ..., 
       [ 0.40455895,  0.14051949,  0.12792641, ..., -0.04583492,
        -0.57977104,  1.7152693 ],
       [-1.43329456,  0.7728572 ,  0.73813537, ..., -0.04583492,
        -0.57977104, -0.58299883],
       [ 1.93610354,  0.14051949,  0.12792641, ..., -0.04583492,
        -0.57977104, -0.58299883]])

In [48]:
model.fit(scaled_features_train, labels_copy_train, epochs=100, batch_size=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x1240c05f8>

In [56]:
model.evaluate(features_copy_test, labels_copy_test)



[15.366900869885331, 0.037735848869167782]

In [51]:
scaled_features_test = scaler.fit_transform(features_copy_test)



In [58]:
from keras import backend as K

In [59]:
def new_relu(x):
    return K.relu(x, max_value=13)
    

In [61]:
nn = Sequential()
nn.add(Dense(100, activation="relu", input_dim=features_train.shape[1]))
nn.add(Dense(100, activation="relu"))
nn.add(Dense(75, activation="relu"))
nn.add(Dense(1, activation=new_relu))

In [62]:
nn.compile(optimizer="adam", loss=keras.losses.mse, metrics=["accuracy"])


In [65]:
nn.fit(features_train, labels_train, epochs=150)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<keras.callbacks.History at 0x125dbdb38>

In [66]:
nn.evaluate(features_test, labels_test)



[3.9925836886999742, 0.49056603511174518]

In [77]:
pred = np.round(nn.predict(features_test))

In [74]:
features_test[7]

array([9, 7, 7, 0, 1, 1, 0])

In [78]:
pred[7]

array([ 9.], dtype=float32)

## By using a custom activation function, we are getting an accuracy of 50% on the test data, hence preventing overfitting.

In [79]:
nn

<keras.models.Sequential at 0x12468bd68>

In [101]:
model.evaluate(features_copy_test, labels_copy_test)



[1.6023147436057996, 0.50314465127651042]

In [84]:
pred = np.argmax(model.predict(features_copy_test), axis=1)

In [85]:
pred

array([ 5,  5,  6,  7,  9,  7,  8,  9,  9, 10,  1,  1,  9,  6,  9,  1,  1,
        4,  2,  1,  5,  1,  2,  4,  9,  8, 12,  4,  0,  4,  2,  6,  7,  9,
        4,  8,  7,  9,  5,  8,  2,  1, 10,  9, 10,  8,  0,  6,  4,  9,  7,
        9,  1,  9,  0,  6,  4,  7,  0,  8,  4,  1,  5,  1,  9,  4,  9,  4,
        1,  6,  8,  2, 12,  9,  4,  9,  1,  9,  8, 10,  9,  0,  8,  4,  7,
        6,  1,  5,  6,  4,  5,  6,  1,  1,  8,  4,  4,  1,  9,  9, 10,  5,
        9,  5,  8,  9, 12,  9,  8,  8,  4,  8,  1, 10,  0,  5,  9, 10,  8,
        1,  1,  5,  8,  1,  9, 12,  5,  4,  0, 10,  1,  9,  1,  8,  4,  5,
        6, 10,  8, 10,  6,  9,  1,  3,  6,  6, 10,  9,  7,  4, 12,  8,  0,
        5,  7,  4, 12,  7,  4])

In [88]:
features_copy_test[0]


array([7, 5, 7, 1, 1, 0, 0])

In [90]:
pred[0]

5

In [97]:
np.argmax(labels_copy_test[1], axis=0)

5

# With nn, we get 49% and with model we get 55% using one hot encoding.

In [104]:
nn.save("nn-matches-model-with-49%.hdf5")
model.save('model-matches-model-with-55%.hdf5')

In [107]:
model.evaluate(features_copy_test, labels_copy_test)



[0.048274208164814883, 0.52201257561737635]

In [112]:
from sklearn.ensemble import ExtraTreesClassifier, BaggingClassifier, GradientBoostingClassifier

In [113]:
clf = ExtraTreesClassifier(n_estimators=100)
clf.fit(features_train, labels_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [114]:
clf.score(features_test, labels_test)

0.61635220125786161

In [116]:
p = clf.predict(features_test)

In [117]:
features_test[0]

array([7, 5, 7, 1, 1, 0, 0])

In [118]:
a = np.array([5, 7, 5, 1, 1, 0, 0])

In [119]:
clf.predict(a.reshape(1,-1))

array([7])

In [130]:
np.round(model.predict(a.reshape(1,-1))

SyntaxError: unexpected EOF while parsing (<ipython-input-130-05bf4e0d4c51>, line 1)

In [132]:
c = model.predict(a.reshape(1,-1))

In [135]:
np.argmax(c, axis=1)

array([7])

In [138]:
nn.predict(a.reshape(1,-1)).astype(int)

array([[5]])

In [148]:
clf.predict(features_test)

array([ 5,  5,  6,  7,  8,  7,  8,  9,  1,  9,  1,  1,  9,  6,  7,  1,  1,
        4,  2,  1,  5,  1,  2,  4,  0,  9, 12,  4,  0,  4,  2,  6,  6,  6,
        4,  8,  7,  8,  5,  8,  3,  1, 10,  8, 10,  8,  0,  9,  4,  7,  7,
        8,  1,  7,  0,  6,  4,  7,  0,  5,  4,  1,  5,  1, 10,  3,  9,  4,
        1,  5,  7,  1, 12,  9,  4,  8,  1,  8,  8,  9,  6,  0,  8,  4,  7,
        6,  1,  5,  6,  4,  5,  9,  1,  1,  8,  4,  4,  1,  8,  9, 10,  5,
       10,  6,  8,  9,  9,  8,  8,  8,  4,  7,  1,  9,  9,  5,  8, 10,  8,
        1,  1,  5,  7,  1,  9, 12,  5,  4,  1, 10,  1,  7,  1,  8,  5,  5,
        7, 11,  8,  8,  6,  9,  1,  3,  7,  6, 11,  9,  7,  4, 12,  8,  0,
        5,  7,  5, 12,  8,  4])

In [149]:
labels_test

array([ 7,  5,  6,  7,  9,  1,  8,  9,  1,  9,  9,  1, 13,  6, 10,  1,  1,
        4,  2,  1,  5,  6,  6,  4,  9,  7, 10,  4,  0,  4,  2,  4,  1,  6,
        4,  8,  6, 10,  5,  8,  3,  1, 10,  8,  9,  6,  0, 12,  4,  9,  6,
        9,  1,  9,  0,  6,  4,  7,  0,  9,  4,  1,  5,  1,  7,  3,  8,  4,
        1,  5,  8,  4, 12,  9,  6,  8,  6,  9,  9,  9,  9,  0,  8,  4,  7,
        6,  7,  5,  6,  0,  0,  9,  9,  1,  9,  8,  0,  1,  8,  9, 10,  5,
       10,  6,  8,  9, 12,  8,  8,  8,  4,  7,  1, 12, 13,  5,  9,  1,  4,
        1,  3,  5,  8,  1,  9, 12,  7,  5,  0, 10,  4, 10,  6,  8,  4,  5,
        7, 11,  8, 10,  6,  9,  1,  3,  6,  6, 11,  5,  1,  4, 10,  8,  0,
        5, 10,  4, 12,  8,  4])

In [150]:
labels_test

array([ 7,  5,  6,  7,  9,  1,  8,  9,  1,  9,  9,  1, 13,  6, 10,  1,  1,
        4,  2,  1,  5,  6,  6,  4,  9,  7, 10,  4,  0,  4,  2,  4,  1,  6,
        4,  8,  6, 10,  5,  8,  3,  1, 10,  8,  9,  6,  0, 12,  4,  9,  6,
        9,  1,  9,  0,  6,  4,  7,  0,  9,  4,  1,  5,  1,  7,  3,  8,  4,
        1,  5,  8,  4, 12,  9,  6,  8,  6,  9,  9,  9,  9,  0,  8,  4,  7,
        6,  7,  5,  6,  0,  0,  9,  9,  1,  9,  8,  0,  1,  8,  9, 10,  5,
       10,  6,  8,  9, 12,  8,  8,  8,  4,  7,  1, 12, 13,  5,  9,  1,  4,
        1,  3,  5,  8,  1,  9, 12,  7,  5,  0, 10,  4, 10,  6,  8,  4,  5,
        7, 11,  8, 10,  6,  9,  1,  3,  6,  6, 11,  5,  1,  4, 10,  8,  0,
        5, 10,  4, 12,  8,  4])

In [152]:
features[0]

array([0, 4, 4, 0, 1, 1, 0])

In [153]:
labels[0]

0

In [155]:
a = model.predict(features[0].reshape(1,-1))

In [156]:
a

array([[  9.81611907e-01,   8.42660107e-03,   9.77830013e-08,
          8.47694930e-03,   1.00995225e-04,   1.28758221e-03,
          5.53907121e-05,   3.99190758e-05,   4.88160254e-07,
          3.05353232e-09,   1.57500253e-08,   9.04296105e-09,
          5.01230311e-08,   2.48539900e-09]], dtype=float32)

In [157]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 100)               800       
_________________________________________________________________
dense_14 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_15 (Dense)             (None, 75)                7575      
_________________________________________________________________
dense_16 (Dense)             (None, 14)                1064      
Total params: 19,539
Trainable params: 19,539
Non-trainable params: 0
_________________________________________________________________


In [158]:
np.argmax(a, axis=1)

array([0])