# ETHICAL - SATTA
## Predicting first innings cricket score

### Importing important libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
%matplotlib inline

### 'ipl.csv' contains ball-by-ball details of all the IPL matches from 2008 till 2017

In [2]:
# Load the ball-by-ball data from 'ipl.csv' (path is relative to the working directory)
df = pd.read_csv('ipl.csv')
# Preview the first rows to check the columns that were read
df.head()

Unnamed: 0,mid,date,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0,222
1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,0,0,222
2,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,0,0,222
3,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,0,0,222
4,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,0,0,222


In [4]:
# Distinct venue names that occur in the dataset
df['venue'].unique()

array(['M Chinnaswamy Stadium',
       'Punjab Cricket Association Stadium, Mohali', 'Feroz Shah Kotla',
       'Wankhede Stadium', 'Eden Gardens', 'Sawai Mansingh Stadium',
       'Rajiv Gandhi International Stadium, Uppal',
       'MA Chidambaram Stadium, Chepauk', 'Dr DY Patil Sports Academy',
       'Newlands', "St George's Park", 'Kingsmead', 'SuperSport Park',
       'Buffalo Park', 'New Wanderers Stadium', 'De Beers Diamond Oval',
       'OUTsurance Oval', 'Brabourne Stadium',
       'Sardar Patel Stadium, Motera', 'Barabati Stadium',
       'Vidarbha Cricket Association Stadium, Jamtha',
       'Himachal Pradesh Cricket Association Stadium', 'Nehru Stadium',
       'Holkar Cricket Stadium',
       'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium',
       'Subrata Roy Sahara Stadium',
       'Shaheed Veer Narayan Singh International Stadium',
       'JSCA International Stadium Complex', 'Sheikh Zayed Stadium',
       'Sharjah Cricket Stadium', 'Dubai International Cricket St

In [None]:
# (number of rows, number of columns) of the loaded data
df.shape

We have 76014 rows; the second value of the shape shown above is the number of columns.
The data holds the ball-by-ball details of every match.

In [None]:
# List all column names
df.columns

In [None]:
# A few columns have little impact on the prediction; dropping them keeps
# the model simple and robust.
less_useful_columns = [
    'mid',
    'venue',
    'batsman',
    'bowler',
    'striker',
    'non-striker',
]
# Remove the listed columns from df in place (KeyError if any is missing,
# e.g. when this cell is run a second time).
df.drop(columns=less_useful_columns, inplace=True)

In [None]:
# The less important columns have now been removed; preview the result
df.head()

### We will consider only those teams which are regular part of the IPL

In [None]:
# Every batting team that appears in the data, before restricting to regular teams
df['bat_team'].unique()

In [None]:
# Teams treated as regular IPL participants; rows involving any other
# team are filtered out further down.
regular_playing_teams = [
    'Kolkata Knight Riders',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Mumbai Indians',
    'Kings XI Punjab',
    'Royal Challengers Bangalore',
    'Delhi Daredevils',
    'Sunrisers Hyderabad',
]

In [None]:
# Display the list to confirm its contents
regular_playing_teams

### Keep only rows where both the batting and the bowling team are regular playing teams

In [None]:
# Keep a row only when both the batting side and the bowling side are in
# regular_playing_teams.
df = df.loc[
    df['bat_team'].isin(regular_playing_teams)
    & df['bowl_team'].isin(regular_playing_teams)
]

In [None]:
# The row count should be smaller now that rows with non-regular teams are gone
df.shape

In [None]:
# Preview the rows that remain after the team filter
df.head()

## To predict the score we must know the score of at least the first 5 overs


In [None]:
# Filter out the rows whose over count is below 5.
# .copy() detaches the result from the frame it was sliced from, so the
# column assignment done later on df['date'] does not trigger pandas'
# SettingWithCopyWarning.
df = df[df['overs'] >= 5.0].copy()
df.shape

In [None]:
# Preview the rows that remain after the overs filter
df.head()

In [None]:
# Batting teams still present after filtering
df['bat_team'].unique()

In [None]:
# Bowling teams still present after filtering
df['bowl_team'].unique()

## The data is time-series data: every row carries the date of its match
### date will play key role in splitting the dataset

In [None]:
# Only the year is needed to divide the data.
# 'yyyy-mm-dd' => int(yyyy)
# Casting to str first keeps this cell re-runnable: once 'date' already holds
# integer years, splitting on '-' leaves the value unchanged instead of
# raising on a non-string.
df['date'] = df['date'].astype(str).str.split('-').str[0].astype('int64')

In [None]:
# Check that 'date' now holds integer years
df['date'].head()

In [None]:
# Preview the frame with the converted 'date' column
df.head()

## Onehot Encoding
### Categorical features must be encoded because ML algorithms operate on numeric inputs


In [None]:
# One-hot encode the batting and bowling team columns.
# drop_first=True omits the first category of each encoded column.
df = pd.get_dummies(
    df,
    columns=['bat_team', 'bowl_team'],
    drop_first=True,
)

In [None]:
# Preview the frame after one-hot encoding
df.head()

In [None]:
# Column names after encoding, including the generated dummy columns
df.columns

In [None]:
# (rows, columns) after encoding
df.shape


The number of columns has increased.

## Splitting the data 
### Training set: seasons up to and including 2016; Test set: seasons from 2017 onward

In [None]:
# Feature matrices: pick the rows of each period, then remove the
# dependent variable 'total'. Years <= 2016 go to training, >= 2017 to test.
X_train = df.loc[df['date'] <= 2016].drop(columns='total')
X_test = df.loc[df['date'] >= 2017].drop(columns='total')

In [None]:
# Preview the test-set features
X_test.head()

In [None]:
# Dependent variable: the 'total' column as a NumPy array, split by the
# same year boundaries as the features.
Y_train = df.loc[df['date'] <= 2016, 'total'].to_numpy()
Y_test = df.loc[df['date'] >= 2017, 'total'].to_numpy()

In [None]:
# Display the 'total' values of the test split
Y_test

In [None]:
# 'date' was only needed for the split and plays no role in the
# prediction, so remove it from both feature sets (in place).
X_train.drop(columns='date', inplace=True)
X_test.drop(columns='date', inplace=True)

In [None]:
# Preview the training features after removing 'date'
X_train.head()

In [None]:
# Final feature names (and their order) that the model is trained on
X_train.columns

## Prediction using Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split (fit returns the estimator
# itself), then predict the totals for the test split.
lin_reg = LinearRegression().fit(X_train, Y_train)
prediction = lin_reg.predict(X_test)

In [None]:
# Distribution of the test-set residuals (actual total minus predicted total).
# sns.distplot is deprecated; histplot with a KDE overlay and density
# normalisation draws the equivalent histogram-plus-curve figure.
sns.histplot(Y_test - prediction, kde=True, stat='density');

The results seem pretty good: the residuals look like a left-skewed Gaussian distribution.

## Performance measurement

In [None]:
from sklearn import metrics

# Error metrics of the test-set predictions.
mae = metrics.mean_absolute_error(Y_test, prediction)
mse = metrics.mean_squared_error(Y_test, prediction)
print("MAE : ", mae)
print("MSE : ", mse)
# Root mean squared error is the square root of the MSE (not of the MAE).
print("RMSE : ", mse ** 0.5)
print('R-squared : ', metrics.r2_score(Y_test, prediction))

In [None]:
# Feature order that a hand-built input row has to follow
X_test.columns

In [None]:
# One hand-built feature row (shape (1, 19)) for a single prediction.
# NOTE(review): the values must follow the column order of X_test; the
# grouping below (5 numeric values, then 7 + 7 team dummy flags) is an
# assumption - confirm against X_test.columns.
newdata = np.array([[
    55, 1, 5.0, 55, 1,
    0, 0, 0, 1, 0, 0, 0,
    0, 0, 0, 0, 0, 1, 0,
]])
# The second dimension of both shapes has to match
print(X_test.shape, newdata.shape)

In [None]:
# Predict the total for the single hand-built feature row.
newPred = lin_reg.predict(newdata)
# predict returns a one-element 1-D array here; take the element before
# converting, because int() on an array with ndim > 0 is deprecated in
# NumPy >= 1.25.
int(newPred[0])

## Using Linear Regression we are getting R-squared score of 0.7522 which is pretty good