In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the per-match table and the ball-by-ball table (same order as the file names).
match, delivery = (
    pd.read_csv(csv_path)
    for csv_path in ('IPL Matches 2008-2020.csv', 'IPL Ball-by-Ball 2008-2020.csv')
)

In [None]:
match.head()

In [None]:
match.shape

In [None]:
delivery

In [None]:
delivery.shape

In [None]:
# Total runs per (match id, innings). 'total_runs' is selected *before* summing so
# that the other columns of `delivery` (including text ones) are never aggregated.
total_score_df = delivery.groupby(['id','inning'])['total_runs'].sum().reset_index()


In [None]:
# Keep only the first-innings totals: that is the score the side batting second has to chase.
total_score_df = total_score_df.loc[total_score_df['inning'].eq(1)]

In [None]:
# Attach the first-innings total to each match row; `match` alone carries no
# per-innings run totals, so they are joined in from total_score_df on the match id.
match_df = pd.merge(match, total_score_df[['id','total_runs']], on='id')

In [None]:
match_df

In [None]:
match_df['team1'].unique()

In [None]:
# Franchises kept for modelling. The spellings must match the values in the
# team1/team2 columns exactly, because the rows are filtered with `isin(teams)`.
teams = [
    'Sunrisers Hyderabad',
    'Mumbai Indians',
    'Royal Challengers Bangalore',
    'Kolkata Knight Riders',
    'Kings XI Punjab',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Delhi Capitals'
]

In [None]:
# Replace the two old franchise names with their replacements in both team columns,
# so each franchise appears under a single label.
for team_col in ('team1', 'team2'):
    renamed = match_df[team_col].str.replace('Delhi Daredevils','Delhi Capitals')
    match_df[team_col] = renamed.str.replace('Deccan Chargers','Sunrisers Hyderabad')


In [None]:
# Keep only the matches in which both sides are in `teams`.
match_df = match_df[match_df['team1'].isin(teams) & match_df['team2'].isin(teams)]

In [None]:
match_df.shape

In [None]:
# Only these match-level columns are needed from here on.
match_df = match_df.loc[:, ['id','city','winner','total_runs']]

In [None]:
# One row per delivery, carrying the match-level columns. Both frames have a
# 'total_runs' column, so the merge suffixes them: _x from match_df, _y from delivery.
delivery_df = pd.merge(match_df, delivery, on='id')

In [None]:
# Keep only the deliveries of the second innings.
delivery_df = delivery_df.loc[delivery_df['inning'].eq(2)]

In [None]:
delivery_df.shape

In [None]:
# Running score within each match after every delivery. The column is selected
# *before* cumsum so that the text columns of the frame are never accumulated
# (a whole-frame groupby cumsum raises on object columns in recent pandas).
delivery_df['current_score'] = delivery_df.groupby('id')['total_runs_y'].cumsum()

In [None]:
# Runs still needed: first-innings total minus the running second-innings score.
delivery_df['runs_left'] = delivery_df['total_runs_x'].sub(delivery_df['current_score'])

In [None]:
delivery_df

In [None]:
# Balls remaining after this delivery.
# NOTE(review): the constant 126 only corresponds to a 120-ball innings if `over`
# is numbered from 1. If the data numbers overs from 0 this should be 120 —
# confirm with delivery['over'].min() before trusting balls_left / crr / rrr.
delivery_df['balls_left'] = 126 - delivery_df['over'].mul(6).add(delivery_df['ball'])

In [None]:
delivery_df

In [None]:
# Wickets in hand after each delivery.
# A missing `player_dismissed` is treated as "no wicket on this ball"; any other
# value counts as one dismissal, so the column is reduced to a 0/1 flag.
# NOTE: this overwrites the column, so run the cell once on freshly built data.
delivery_df['player_dismissed'] = delivery_df['player_dismissed'].notna().astype('int')
# Select the flag column before cumsum so that no other column is accumulated.
wickets = delivery_df.groupby('id')['player_dismissed'].cumsum().values
delivery_df['wickets'] = 10 - wickets
delivery_df.head()

In [None]:
delivery_df.tail()

In [None]:
# Current run rate = runs scored per over so far; (120 - balls_left) is used as the balls bowled.
delivery_df['crr'] = delivery_df['current_score'].mul(6).div(120 - delivery_df['balls_left'])

In [None]:
# Required run rate = runs still needed per remaining over.
delivery_df['rrr'] = delivery_df['runs_left'].mul(6).div(delivery_df['balls_left'])

In [None]:
# Label for a row: 1 when the batting side is the match winner, otherwise 0.
def result(row):
    """Return 1 if ``row['batting_team']`` equals ``row['winner']``, else 0."""
    batting_side_won = row['batting_team'] == row['winner']
    return int(bool(batting_side_won))

In [None]:
# Evaluate `result` once per row to build the 0/1 target column.
delivery_df['result'] = delivery_df.apply(result, axis='columns')

In [None]:
# Model table: the feature columns in training order, with the label ('result') last.
final_df = delivery_df.loc[:, [
    'batting_team', 'bowling_team', 'city',
    'runs_left', 'balls_left', 'wickets', 'total_runs_x',
    'crr', 'rrr',
    'result',
]]

In [None]:
# Shuffle all rows. The seed is fixed so the row order — and therefore the
# seeded train/test split built from it — is the same on every run.
final_df = final_df.sample(final_df.shape[0], random_state=1)

In [None]:
final_df.sample()

In [None]:
final_df.isnull().sum()

In [None]:
# Discard every row that has a missing value in any column.
final_df = final_df.dropna()

In [None]:
# rrr divides by balls_left, so rows with balls_left == 0 are removed.
final_df = final_df.loc[final_df['balls_left'].ne(0)]

In [None]:
# Separate the features from the label, then hold out 20% of the rows for testing.
from sklearn.model_selection import train_test_split

X = final_df.drop(columns='result')  # every column except the label
y = final_df['result']               # the label column only
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [None]:
X_train

In [None]:
# The three text columns (batting_team, bowling_team, city) are one-hot encoded;
# every other column is passed through unchanged.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4.
# Try the current name first and fall back for older installations.
try:
    encoder = OneHotEncoder(sparse_output=False,drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False,drop='first')

trf = ColumnTransformer([
    ('trf',encoder,['batting_team','bowling_team','city'])
]
,remainder='passthrough')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [None]:
# Two-stage pipeline: column transformer first, then a liblinear logistic regression.
classifier = LogisticRegression(solver='liblinear')
pipe = Pipeline(steps=[('step1', trf), ('step2', classifier)])

In [None]:
# Fit both pipeline steps on the training split.
pipe.fit(X_train,y_train)

In [None]:
X_train.describe()

In [None]:
# Predicted label for every row of the held-out split.
y_pred = pipe.predict(X_test)

In [None]:
# Compare the predicted labels with the true labels of the held-out split.
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

In [None]:
# Class probabilities for the test row at position 10 (one value per class label).
pipe.predict_proba(X_test)[10] # to find the probability of a team to win a match

In [None]:
def match_progression(x_df,id,pipe):
    """Summarise one match over by over together with the model's win/lose odds.

    Parameters
    ----------
    x_df : pandas.DataFrame
        Ball-by-ball frame. Must contain 'id', 'ball' and every feature column
        the pipeline was fitted on (see ``feature_cols`` in the body).
    id : scalar
        Match identifier looked up in ``x_df['id']``.
    pipe : object exposing ``predict_proba``
        Fitted classifier. Column 0 of its output is reported as 'lose' and
        column 1 as 'win' (both as percentages rounded to one decimal).

    Returns
    -------
    (pandas.DataFrame, scalar)
        A frame with one row per selected over and the columns 'end_of_over',
        'runs_after_over' (runs scored in that over), 'wickets_in_over',
        'lose' and 'win'; and the target read from 'total_runs_x'.

    Raises
    ------
    ValueError
        If no usable end-of-over row exists for the requested match.
    """
    # Same columns, in the same order, as the features the pipeline is trained on.
    feature_cols = ['batting_team','bowling_team','city','runs_left','balls_left',
                    'wickets','total_runs_x','crr','rrr']

    # Rows of the requested match whose ball number is 6 stand for the end of an over.
    over_ends = x_df[(x_df['id'] == id) & (x_df['ball'] == 6)]
    # Rows with missing values cannot be scored; balls_left == 0 is excluded as in training.
    temp_df = over_ends[feature_cols].dropna()
    # Work on a copy so that adding columns below never writes into a slice of x_df.
    temp_df = temp_df[temp_df['balls_left'] != 0].copy()
    if temp_df.empty:
        raise ValueError("no end-of-over rows found for match id {!r}".format(id))

    proba = pipe.predict_proba(temp_df)
    temp_df['lose'] = np.round(proba.T[0]*100,1)
    temp_df['win'] = np.round(proba.T[1]*100,1)
    temp_df['end_of_over'] = range(1,temp_df.shape[0]+1)

    target = temp_df['total_runs_x'].values[0]

    # Runs in an over = runs left before the over - runs left after it.
    # Before the first selected over the whole target is still left.
    runs_left = list(temp_df['runs_left'].values)
    runs_before = [target] + runs_left[:-1]
    temp_df['runs_after_over'] = np.array(runs_before) - np.array(runs_left)

    # Wickets in an over = wickets in hand before - wickets in hand after,
    # starting from 10 wickets in hand.
    wickets_left = list(temp_df['wickets'].values)
    wickets_before = [10] + wickets_left[:-1]
    temp_df['wickets_in_over'] = np.array(wickets_before) - np.array(wickets_left)

    print("Target-",target)
    temp_df = temp_df[['end_of_over','runs_after_over','wickets_in_over','lose','win']]
    return temp_df,target

    

In [None]:
# Summary of the first match present in delivery_df. The id is read from the data
# rather than hard-coded, so the cell works whatever numbering the dataset uses.
first_match_id = delivery_df['id'].iloc[0]
temp_df,target = match_progression(delivery_df,first_match_id,pipe)
temp_df

In [None]:
import matplotlib.pyplot as plt
# Over-by-over chart: wickets per over (yellow line), win % (green line),
# lose % (red line) and runs per over (bars).
plt.figure(figsize=(18,8))
plt.plot(temp_df['end_of_over'],temp_df['wickets_in_over'],color='yellow',linewidth=3)
plt.plot(temp_df['end_of_over'],temp_df['win'],color='#00a65a',linewidth=4)
plt.plot(temp_df['end_of_over'],temp_df['lose'],color='red',linewidth=4)
# The bars go through the pyplot alias `plt`, like every other call in this cell.
plt.bar(temp_df['end_of_over'],temp_df['runs_after_over'])
plt.title('Target-' + str(target))

In [None]:
teams

In [None]:
#city names
delivery_df['city'].unique()

In [None]:
import pickle
# Persist the fitted pipeline. The context manager flushes and closes the file
# even if pickling fails part-way.
with open('pipe.pkl','wb') as model_file:
    pickle.dump(pipe,model_file)