In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px

In [None]:
# Load the ball-by-ball deliveries table and show its (rows, columns) shape.
balls = pd.read_csv(
    filepath_or_buffer='../../static/data/dataset_3/IPL_Ball_by_Ball_2008_2022.csv'
)
balls.shape

In [None]:
balls.head()

In [None]:
balls['BattingTeam'].unique()

In [None]:
# Map historical franchise names onto the names used in the rest of the notebook,
# so one team is not split across several labels.
TEAM_RENAMES = {
    'Delhi Daredevils': 'Delhi Capitals',
    'Kings XI Punjab': 'Punjab Kings',
    'Deccan Chargers': 'Sunrisers Hyderabad',
    'Rising Pune Supergiants': 'Rising Pune Supergiant',
}
for old_name, new_name in TEAM_RENAMES.items():
    # regex=False: the names are literal text, and being explicit keeps the result
    # independent of the pandas version (the default for `regex` changed in 2.0).
    balls['BattingTeam'] = balls['BattingTeam'].str.replace(old_name, new_name, regex=False)


In [None]:
balls['BattingTeam'].unique()

In [None]:
balls.describe()

In [None]:
balls.info()

In [None]:
# Load the per-match summary table and show its (rows, columns) shape.
matches = pd.read_csv(
    filepath_or_buffer='../../static/data/dataset_3//IPL_Matches_2008_2022.csv'
)
matches.shape

In [None]:
matches.head()

In [None]:
matches.describe()

In [None]:
matches.info()

In [None]:
matches['City'].unique()

In [None]:
# The same city appears under two spellings; store it under 'Bangalore' only.
city_names = matches['City'].str.replace('Bengaluru', 'Bangalore')
matches['City'] = city_names
matches['City'].unique()

In [None]:
matches['Team1'].unique()

In [None]:
# Map historical franchise names onto the names used in the rest of the notebook.
# One mapping applied to every team-valued column replaces sixteen copy-pasted
# lines, so the four columns cannot drift apart.
TEAM_RENAMES = {
    'Delhi Daredevils': 'Delhi Capitals',
    'Kings XI Punjab': 'Punjab Kings',
    'Deccan Chargers': 'Sunrisers Hyderabad',
    'Rising Pune Supergiants': 'Rising Pune Supergiant',
}
TEAM_COLUMNS = ['Team1', 'Team2', 'TossWinner', 'WinningTeam']

for column in TEAM_COLUMNS:
    for old_name, new_name in TEAM_RENAMES.items():
        # regex=False: the names are literal text, and being explicit keeps the
        # result independent of the pandas version (default changed in 2.0).
        matches[column] = matches[column].str.replace(old_name, new_name, regex=False)



In [None]:
matches['Team1'].unique()

##### Data Visualization

In [None]:
# Horizontal bar chart of matches won per team, ordered by number of wins.
teams_by_wins = matches['WinningTeam'].value_counts().index
plt.figure(figsize=(10, 6))
sns.countplot(data=matches, y='WinningTeam', order=teams_by_wins)
plt.xlabel('Wins')
plt.ylabel('Team')
plt.title('Number of  IPL  matches won by each team')

##### EDA and Feature Engineering

In [None]:
# Total runs scored in each innings of each match.
# 'total_run' is selected *before* aggregating: calling .sum() on the whole
# grouped frame adds up every column first, which is wasted work and, on
# pandas >= 2.0 (numeric_only defaults to False), also tries to add up the
# text columns.
total_score = balls.groupby(['ID', 'innings'])['total_run'].sum().reset_index()
total_score.head()

In [None]:
# The model predicts the winner of a chase, and the chase target comes from
# the first innings, so only innings 1 totals are kept.
is_first_innings = total_score['innings'] == 1
total_score = total_score[is_first_innings]
total_score.head()

In [None]:
# Distribution of the innings totals held in total_score.
# Only the 'total_run' column is histogrammed: passing the whole frame to
# plt.hist also binned the 'ID' and 'innings' columns on the same axis, which
# drowned out the run totals.
fig, ax = plt.subplots()
ax.hist(total_score['total_run'], bins=30)
ax.set_xlabel('Innings total (runs)')
ax.set_ylabel('Number of innings')
ax.set_title('Distribution of innings totals')
plt.show()


In [None]:
# The target is one run more than the innings total.
# assign() returns a new frame instead of writing a column into a filtered
# slice, which avoids pandas' SettingWithCopyWarning and makes the cell safe
# to re-run.
total_score = total_score.assign(target=total_score['total_run'] + 1)
total_score.head()

In [None]:
# Attach each match's target to its row of the matches table (join on ID).
targets = total_score[['ID', 'target']]
match_df = pd.merge(matches, targets, on='ID')
match_df.head()

In [None]:
# Teams treated as "current" IPL teams; rows involving any other team are
# filtered out further down.
current_teams = [
    'Rajasthan Royals',
    'Royal Challengers Bangalore',
    'Sunrisers Hyderabad',
    'Delhi Capitals',
    'Chennai Super Kings',
    'Gujarat Titans',
    'Lucknow Super Giants',
    'Kolkata Knight Riders',
    'Punjab Kings',
    'Mumbai Indians',
]


In [None]:
# Keep a match only when both sides, the toss winner and the winner are all
# in current_teams (rows with a missing value in any of these are dropped too).
team_columns = ['Team1', 'Team2', 'TossWinner', 'WinningTeam']
all_current = match_df[team_columns].isin(current_teams).all(axis=1)
match_df = match_df[all_current]
match_df.shape

In [None]:
match_df.columns

In [None]:
# fig = px.histogram(match_df, x='WinningTeam')
# fig.show()

In [None]:
# We want only the matches where the D/L method was not applied,
# i.e. remove all matches affected by rain.
match_df['method'].unique()

In [None]:
# Count how many remaining matches carry each non-null 'method' value.
res = match_df.loc[:, 'method'].value_counts()
res

In [None]:
# Keep only matches whose 'method' field is empty.
method_missing = match_df['method'].isnull()
match_df = match_df[method_missing]
match_df.shape

In [None]:
# Keep only the columns used downstream, then drop rows with any missing value.
kept_columns = ['ID','City','Team1','Team2','WinningTeam','target']
match_df = match_df.loc[:, kept_columns].dropna()

In [None]:
match_df.head()

In [None]:
# Checking for null values
match_df.isna().sum()

In [None]:
# Join the per-match columns onto every delivery of that match (join on ID).
balls_df = pd.merge(match_df, balls, on='ID')
balls_df.head()

In [None]:
balls_df['BattingTeam'].value_counts()

In [None]:
# fig = px.bar(balls_df['BattingTeam'].value_counts())
# fig.show()

In [None]:
# Only keep deliveries from the 2nd innings.
# .copy() makes balls_df an independent frame: several later cells add columns
# to it, and writing into a filtered slice raises pandas' SettingWithCopyWarning.
balls_df = balls_df[balls_df['innings'] == 2].copy()
balls_df.shape

In [None]:
balls_df.head()

In [None]:
# Running total of runs within each match, in row order, after every delivery.
runs_by_match = balls_df.groupby('ID')['total_run']
balls_df['current_score'] = runs_by_match.cumsum()
balls_df.head()

In [None]:
# Derive the per-delivery state features; negative remainders are floored at 0.
# NOTE(review): 120 is presumably 20 overs x 6 balls, and the balls_left formula
# presumably assumes 'overs' counts from 0 — confirm against the data.
runs_needed = balls_df['target'] - balls_df['current_score']
balls_df['runs_left'] = np.where(runs_needed >= 0, runs_needed, 0)

balls_remaining = 120 - balls_df['overs']*6 - balls_df['ballnumber']
balls_df['balls_left'] = np.where(balls_remaining >= 0, balls_remaining, 0)

wickets_fallen = balls_df.groupby('ID')['isWicketDelivery'].cumsum()
balls_df['wickets_left'] = 10 - wickets_fallen

balls_bowled = 120 - balls_df['balls_left']
balls_df['current_run_rate'] = (balls_df['current_score']*6)/balls_bowled

# Required rate is defined as 0 once no balls remain.
balls_df['required_run_rate'] = np.where(balls_df['balls_left']>0, balls_df['runs_left']*6/balls_df['balls_left'], 0)

balls_df.head()

In [None]:
def result(row):
    """Return 1 if the row's 'BattingTeam' equals its 'WinningTeam', else 0.

    `row` is any mapping-like object indexable by those two keys
    (e.g. a DataFrame row passed by `DataFrame.apply(..., axis=1)`).
    """
    if row['BattingTeam'] == row['WinningTeam']:
        return 1
    return 0

In [None]:
# Label each delivery: 1 when the batting side is the match winner, else 0.
# The vectorised comparison yields the same labels as apply(result, axis=1)
# without a Python-level call for every row.
balls_df['result'] = (balls_df['BattingTeam'] == balls_df['WinningTeam']).astype('int64')

In [None]:
balls_df.head()

In [None]:
# Row labels where Team2 is batting (index1) and where Team1 is batting (index2).
team2_batting = balls_df['Team2'] == balls_df['BattingTeam']
team1_batting = balls_df['Team1'] == balls_df['BattingTeam']
index1 = balls_df.index[team2_batting]
index2 = balls_df.index[team1_batting]


In [None]:
# The bowling side is whichever of Team1/Team2 is not batting:
# index1 rows take Team1, index2 rows take Team2.
for bowling_rows, bowling_side in ((index1, 'Team1'), (index2, 'Team2')):
    balls_df.loc[bowling_rows, 'BowlingTeam'] = balls_df.loc[bowling_rows, bowling_side]

In [None]:
# Model table: categorical context, chase-state features, and the label ('result').
model_columns = [
    'BattingTeam', 'BowlingTeam', 'City',
    'runs_left', 'balls_left', 'wickets_left',
    'current_run_rate', 'required_run_rate',
    'target', 'result',
]
final_data = balls_df[model_columns]
final_data.head()

In [None]:
# Bar chart: number of rows in final_data per city.
city_counts = final_data['City'].value_counts()
fig = px.bar(city_counts)
fig.show()


In [None]:
# Bar chart: number of rows in final_data per batting team.
batting_counts = final_data['BattingTeam'].value_counts()
fig = px.bar(batting_counts)
fig.show()

In [None]:
# Bar chart: number of rows in final_data per bowling team.
bowling_counts = final_data['BowlingTeam'].value_counts()
fig = px.bar(bowling_counts)
fig.show()

In [None]:
# Histogram of 'runs_left', restricted to rows where it is still positive.
positive_runs_left = final_data.loc[final_data['runs_left'] > 0, 'runs_left']
fig = px.histogram(positive_runs_left, nbins=30)
fig.show()

In [None]:
# Bar chart: number of rows in final_data per 'wickets_left' value.
wicket_counts = final_data['wickets_left'].value_counts()
fig = px.bar(wicket_counts)
fig.show()

In [None]:
# Histogram of the 'target' column across all rows of final_data.
target_values = final_data['target']
fig = px.histogram(target_values, nbins=30)
fig.show()

In [None]:
final_data.isna().sum()

In [None]:
final_data.sample(final_data.shape[0])

In [None]:
# Randomly shuffle all the rows.
# sample() with no arguments draws a single row and the result was discarded,
# so nothing was shuffled. frac=1 returns every row in random order, the result
# is assigned back, and random_state makes the order reproducible.
# Row labels are preserved, so rows can still be traced back to balls_df.
final_data = final_data.sample(frac=1, random_state=42)
final_data.head()

##### One hot encoding

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the three categorical columns (dropping the first level of
# each); all remaining columns pass through unchanged.
# Dense output is requested through ColumnTransformer's sparse_threshold=0
# rather than OneHotEncoder(sparse=False): that keyword was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, whereas
# sparse_threshold works across versions.
transformer = ColumnTransformer(
    [
        ('transformer', OneHotEncoder(drop='first'), ['BattingTeam', 'BowlingTeam', 'City']),
    ],
    remainder='passthrough',
    sparse_threshold=0,
)

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the label; the label is 'result'.
y = final_data['result']
X = final_data.drop(columns='result')
X.shape, y.shape

In [None]:
# Split data into training data and testing data, keeping each match whole.
# Every row is one delivery, so a plain row-wise split puts deliveries of the
# same match (same teams, city, target and result) on both sides of the split
# and lets the model "recognise" test matches, inflating the reported scores.
# Splitting by match ID holds out entire matches instead.
from sklearn.model_selection import GroupShuffleSplit

# X keeps balls_df's row labels, so the match ID of each row can be looked up.
match_ids = balls_df.loc[X.index, 'ID']
# test_size is now the fraction of *matches* held out (same 0.01 as before).
splitter = GroupShuffleSplit(n_splits=1, test_size=0.01, random_state=42)
train_rows, test_rows = next(splitter.split(X, y, groups=match_ids))

X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.pipeline import Pipeline

In [None]:
# Random-forest pipeline: encode the categorical columns, then classify.
# random_state pins the forest's bootstrap/feature sampling so that re-running
# the notebook reproduces the same model and metrics.
pipe = Pipeline(steps=[
    ('step1', transformer),
    ('step2', RandomForestClassifier(random_state=42)),
])

# Train the model
pipe.fit(X_train, y_train)

In [None]:
# Decision-tree classifier pipeline: encode the categorical columns, then classify.
# random_state fixes the tree's tie-breaking between equally good splits so
# re-runs give the same model.
pipe_2 = Pipeline(steps=[
    ('step1', transformer),
    ('step2', DecisionTreeClassifier(random_state=42)),
])

# Train the model
pipe_2.fit(X_train, y_train)

In [None]:
# Decision-tree *regressor* pipeline fitted on the 0/1 label.
# NOTE(review): its predictions are continuous leaf means, not class labels;
# they must be thresholded before being scored with classification metrics.
# random_state makes the fitted tree reproducible across runs.
pipe_3 = Pipeline(steps=[
    ('step1', transformer),
    ('step2', DecisionTreeRegressor(random_state=42)),
])

# Train the model
pipe_3.fit(X_train, y_train)

In [None]:
# Logistic-regression pipeline: encode the categorical columns, then classify.
# The numeric features are passed through unscaled, so the solver is given a
# larger iteration budget than the default 100 to reduce the chance of
# stopping before convergence.
pipe_4 = Pipeline(steps=[
    ('step1', transformer),
    ('step2', LogisticRegression(max_iter=1000)),
])

# Train the model
pipe_4.fit(X_train, y_train)

In [None]:
X_test.head()

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
# Score the random-forest pipeline on the held-out rows.
y_predictions = pipe.predict(X_test)
# Arguments follow scikit-learn's (y_true, y_pred) order; accuracy is symmetric,
# so the value is the same either way.
accuracy = accuracy_score(y_test, y_predictions)
print('Accuracy of the model: ', accuracy)
cm = confusion_matrix(y_true=y_test, y_pred=y_predictions)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm).plot()


In [None]:
# Score the decision-tree classifier pipeline on the held-out rows.
y_predictions = pipe_2.predict(X_test)
# Arguments follow scikit-learn's (y_true, y_pred) order; accuracy is symmetric,
# so the value is the same either way.
accuracy = accuracy_score(y_test, y_predictions)
print('Accuracy of the model: ', accuracy)
cm = confusion_matrix(y_true=y_test, y_pred=y_predictions)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm).plot()

In [None]:
# Score the decision-tree regressor pipeline on the held-out rows.
# A regressor returns continuous values (leaf means), while accuracy_score and
# confusion_matrix need class labels and raise on a binary/continuous mix, so
# the predictions are thresholded at 0.5 into 0/1 first.
y_predictions = (pipe_3.predict(X_test) >= 0.5).astype(int)
accuracy = accuracy_score(y_test, y_predictions)
print('Accuracy of the model: ', accuracy)
cm = confusion_matrix(y_test, y_predictions)
cm_display = ConfusionMatrixDisplay(cm).plot()

In [None]:
# Score the logistic-regression pipeline on the held-out rows.
y_predictions = pipe_4.predict(X_test)
# Arguments follow scikit-learn's (y_true, y_pred) order; accuracy is symmetric,
# so the value is the same either way.
accuracy = accuracy_score(y_test, y_predictions)
print('Accuracy of the model: ', accuracy)
cm = confusion_matrix(y_true=y_test, y_pred=y_predictions)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm).plot()

In [None]:
# Per-class probabilities from the random-forest pipeline; show the first ten rows.
probability = pipe.predict_proba(X=X_test)
probability[0:10]

In [None]:
# Saving the IPL Team Win Predictor Model
# import pickle
# file_name = '../../static/models/ipl_match_win_predict_model.pkl'
# pickle.dump(pipe, open(file_name,'wb'))