In [None]:
import pandas as pd
# The plotting API (`plt.plot`, `plt.subplots`, ...) lives in the `pyplot`
# submodule; aliasing the top-level `matplotlib` package as `plt` does not expose it.
import matplotlib.pyplot as plt

In [None]:
# Load the ball-by-ball CSV (path is relative to the notebook's working
# directory) and preview the first rows.
data = pd.read_csv('../../static/data/dataset_3/IPL_Ball_by_Ball_2008_2022.csv')
data.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the frame's columns.
data.describe()

In [None]:
# Column dtypes, non-null counts and memory usage.
data.info()

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Drop the unnecessary features.
# features_to_remove = ['fielders_involved', 'kind', 'player_out', 'isWicketDelivery', 'non_boundary', 'extras_run', 'batsman_run', 'extra_type']
# data.drop(labels=features_to_remove, axis=1, inplace=True)
# data

In [None]:
# The raw `overs` column becomes `over`, which frees the name `overs` for the
# derived feature that is added further down.
data = data.rename({'overs': 'over'}, axis='columns')
data.head(n=10)

In [None]:
def overs(row):
    """Combine a row's ``over`` and ``ballnumber`` into a single float.

    A delivery whose ball number is below 6 is written as ``<over>.<ball>``
    (e.g. over 3, ball 4 -> 3.4). Any ball number of 6 or more is reported as
    the next whole over (e.g. over 3, ball 6 -> 4.0).

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'over'`` and ``'ballnumber'`` (a DataFrame row
        when used with ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    float
    """
    completed = str(row['over'])
    delivery = str(row['ballnumber'])
    if int(delivery) < 6:
        return float(completed + '.' + delivery)
    return float(int(completed) + 1)

In [None]:
# Derive the float `overs` feature from `over` and `ballnumber`, one row at a time.
data['overs'] = data.apply(overs, axis='columns')
data.head()

In [None]:
# Now we do not need over and ballnumber features, drop them
# data.drop(labels=['over', 'ballnumber'], axis=1, inplace=True)
# data

In [None]:
# current_score: running total of `total_run` within each (match ID, innings) group.
data['current_score'] = (
    data
    .groupby(['ID', 'innings'])['total_run']
    .cumsum()
)
data.head()

In [None]:
# wickets: running total of `isWicketDelivery` within each (match ID, innings) group.
data['wickets'] = (
    data
    .groupby(['ID', 'innings'])['isWicketDelivery']
    .cumsum()
)
data.head()

In [None]:
# Helper function for calculating runs in last 5 overs
def runs_rolling_sum_(df, window=30):
    """Add a trailing-window run total to one group of deliveries.

    Parameters
    ----------
    df : pandas.DataFrame
        Deliveries of a single group (the notebook groups by match ID and
        innings); must contain a ``total_run`` column.
    window : int, default 30
        Number of consecutive rows (current row included) that are summed.
        The default of 30 corresponds to five overs of six deliveries.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with a ``runs_in_prev_5`` column added in
        place. Rows that do not yet have ``window`` deliveries behind them
        hold NaN.
    """
    df['runs_in_prev_5'] = df['total_run'].rolling(window=window).sum()
    return df

In [None]:
# Add new feature runs_in_prev_5: `runs_rolling_sum_` is run separately on every
# (match ID, innings) group, then the index is reset to a plain 0..n-1 range.
data = (
    data
    .groupby(['ID', 'innings'])
    .apply(runs_rolling_sum_)
    .reset_index(drop=True)
)
data.head(31)

In [None]:
# Helper function for calculating wickets in last 5 overs
def wickets_rolling_sum_(df, window=30):
    """Add a trailing-window wicket count to one group of deliveries.

    Parameters
    ----------
    df : pandas.DataFrame
        Deliveries of a single group (the notebook groups by match ID and
        innings); must contain an ``isWicketDelivery`` column.
    window : int, default 30
        Number of consecutive rows (current row included) that are summed.
        The default of 30 corresponds to five overs of six deliveries.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with a ``wickets_in_prev_5`` column added in
        place. Rows that do not yet have ``window`` deliveries behind them
        hold NaN.
    """
    df['wickets_in_prev_5'] = df['isWicketDelivery'].rolling(window=window).sum()
    return df

In [None]:
# Add new feature wickets_in_prev_5: `wickets_rolling_sum_` is run separately on
# every (match ID, innings) group, then the index is reset to a plain 0..n-1 range.
data = (
    data
    .groupby(['ID', 'innings'])
    .apply(wickets_rolling_sum_)
    .reset_index(drop=True)
)
data.head(31)

In [None]:
# Keep only the deliveries whose `over` value is 5 or more. The 30-row rolling
# features are NaN until 30 deliveries have been bowled in an innings, so the
# early rows cannot be used (assumes every innings has at least 30 deliveries
# before `over` reaches 5 -- TODO confirm against the data).
# The explicit .copy() makes the result an independent frame, so the column
# assignments that follow do not raise SettingWithCopyWarning.
data = data[data['over'] >= 5].copy()
data.head(31)

In [None]:
# runs_in_prev_5 and wickets_in_prev_5 hold decimal (float) values, so they are
# converted to integers.
def decimal_to_int_for_runs(row):
    """Return the row's ``runs_in_prev_5`` value converted with ``int()``."""
    runs_value = row['runs_in_prev_5']
    return int(runs_value)
    
def decimal_to_int_for_wickets(row):
    """Return the row's ``wickets_in_prev_5`` value converted with ``int()``."""
    wickets_value = row['wickets_in_prev_5']
    return int(wickets_value)

# Cast both rolling features to integers with one vectorised operation per
# column instead of calling int() row by row through DataFrame.apply.
# As with int(), the cast truncates and raises if a NaN is still present.
data['runs_in_prev_5'] = data['runs_in_prev_5'].astype('int64')
data['wickets_in_prev_5'] = data['wickets_in_prev_5'].astype('int64')

data.head(31)

In [None]:
# Finding total score of the innings.
# `data` has already been cut down to over >= 5, so summing `total_run` here
# would leave out every run scored in the dropped overs. `current_score` was
# accumulated before that filter, so its largest value per (ID, innings) is the
# full innings total. Selecting the single column also avoids aggregating the
# non-numeric columns of the frame.
# The result column keeps the name `total_run` because the next cell renames it.
total_score = (
    data
    .groupby(['ID', 'innings'])['current_score']
    .max()
    .rename('total_run')
    .reset_index()
)
total_score.head()

In [None]:
# The aggregated column is still called `total_run`; give it its final name.
total_score = total_score.rename({'total_run': 'total_score'}, axis='columns')
total_score.head()

In [None]:
# Attach the per-innings `total_score` to every delivery of that innings.
data = data.merge(
    total_score[['ID', 'innings', 'total_score']],
    on=['ID', 'innings'],
)
data.head()

In [None]:
# Now we do not need total_run feature, drop it
# data.drop(labels=['total_run'], axis=1, inplace=True)

In [None]:
# Load the match-level CSV and preview the first rows.
# NOTE(review): the path contains a doubled slash ("dataset_3//"); it is kept
# as written here -- confirm whether it should be normalised.
matches = pd.read_csv('../../static/data/dataset_3//IPL_Matches_2008_2022.csv')
matches.head()

In [None]:
# Bring the two team names of each match onto its deliveries (inner join on ID).
data = data.merge(
    matches[['ID', 'Team1', 'Team2']],
    on='ID',
    how='inner',
)
data.head()

In [None]:
# index1: row labels where Team2 is batting; index2: row labels where Team1 is batting.
index1 = data.loc[data['Team2'] == data['BattingTeam'], 'Team1'].index
index2 = data.loc[data['Team1'] == data['BattingTeam'], 'Team2'].index

In [None]:
# The bowling side is whichever of Team1 / Team2 is not batting on that row.
data.loc[index1, 'BowlingTeam'] = data['Team1'].loc[index1]
data.loc[index2, 'BowlingTeam'] = data['Team2'].loc[index2]

In [None]:
# Preview the frame now that `BowlingTeam` has been filled in.
data.head()

In [None]:
# Distinct team names that appear in `BattingTeam`.
data['BattingTeam'].unique()

In [None]:
# Rename old team names to new names; the same four substitutions are applied,
# in the same order, to both team columns.
for team_column in ('BattingTeam', 'BowlingTeam'):
    for old_name, new_name in (
        ('Delhi Daredevils', 'Delhi Capitals'),
        ('Kings XI Punjab', 'Punjab Kings'),
        ('Deccan Chargers', 'Sunrisers Hyderabad'),
        ('Rising Pune Supergiants', 'Rising Pune Supergiant'),
    ):
        data[team_column] = data[team_column].str.replace(old_name, new_name)

# Only the last expression of a cell is displayed, so just the shape is shown.
data.head()
data.shape

In [None]:
# Current teams in the IPL; used to restrict the data to these franchises.
current_teams = [
    'Rajasthan Royals',
    'Royal Challengers Bangalore',
    'Sunrisers Hyderabad',
    'Delhi Capitals',
    'Chennai Super Kings',
    'Gujarat Titans',
    'Lucknow Super Giants',
    'Kolkata Knight Riders',
    'Punjab Kings',
    'Mumbai Indians',
]

In [None]:
# Keep a delivery only when both the batting and the bowling side are current teams.
data = data[
    data['BattingTeam'].isin(current_teams)
    & data['BowlingTeam'].isin(current_teams)
]
data.shape

In [None]:
# Preview the frame after restricting it to the current teams.
data.head()

### Feature Engineering

In [None]:
# All column names available for feature selection.
data.columns

In [None]:
# Columns kept for modelling: seven predictors plus the `total_score` target.
features_to_set = [
    'BattingTeam',
    'BowlingTeam',
    'overs',
    'current_score',
    'total_score',
    'wickets',
    'runs_in_prev_5',
    'wickets_in_prev_5',
]
final_data = data[features_to_set]
final_data.head()

##### One hot encoding

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the two team columns (first category dropped, dense output);
# every other column is passed through unchanged.
transformer = ColumnTransformer(
    transformers=[
        (
            'transformer',
            OneHotEncoder(sparse_output=False, drop='first'),
            ['BattingTeam', 'BowlingTeam'],
        ),
    ],
    remainder='passthrough',
)

In [None]:
from sklearn.model_selection import train_test_split

# Target is `total_score`; every remaining column of `final_data` is a predictor.
y = final_data['total_score']
X = final_data.drop(columns='total_score')
X.shape, y.shape

In [None]:
# Split data into training data and testing data: 1% of the rows are held out
# for testing, and random_state=42 makes the shuffle repeatable.
# NOTE(review): the split is row-level, while `total_score` is identical for
# every delivery of an innings, so deliveries of one innings can land on both
# sides -- the test metrics may be optimistic; consider splitting by match.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

# Two-step pipeline: encode the team columns, then fit a random forest.
# The forest is seeded (same seed as the train/test split) so that repeated
# runs of the notebook produce the same model and the same metrics.
pipe = Pipeline(steps=[
    ('step1', transformer),
    ('step2', RandomForestRegressor(random_state=42)),
])

In [None]:
# Train the model: fits the encoding step and the regressor on the training split.
pipe.fit(X_train, y_train)

In [None]:
# Make predictions for the held-out rows and display them.
y_predictions = pipe.predict(X_test)
y_predictions

In [None]:
import seaborn as sns
from sklearn.metrics import r2_score

# Coefficient of determination on the held-out rows.
r2 = r2_score(y_test, y_predictions)
print('R^2 score: ', r2)

# Distribution of the residuals (actual - predicted). `sns.distplot` is
# deprecated; `histplot` with a KDE overlay on a density scale is its
# documented replacement.
sns.histplot(y_test - y_predictions, kde=True, stat='density')

In [None]:
# Error metrics on the held-out rows.
from sklearn import metrics
import numpy as np

mae = metrics.mean_absolute_error(y_test , y_predictions)
mse = metrics.mean_squared_error(y_test, y_predictions)

# Mean Absolute Error
print('MAE: ', mae)

# Mean Squared Error
print('MSE: ', mse)

# Root Mean Squared Error (square root of the MSE above)
print('RMSE: ', np.sqrt(mse))

In [None]:
# Saving the IPL Score Predictor Model
# import pickle
# file_name = '../../static/models/ipl_score_predict_model.pkl'
# pickle.dump(pipe , open(file_name,'wb'))