In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
import numpy as np
import pandas as pd
import warnings
import os
# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')
from sklearn.pipeline import Pipeline
# Read the dataset into a DataFrame and take a first look at it.
# NOTE(review): the path points at the filesystem root, not the mounted
# Drive folder -- confirm this is where the CSV actually lives.
csv_path = '/information.csv'
df = pd.read_csv(csv_path)
df.head()
df.shape
VISUALIZATION
import plotly.express as px

# Mean of 'runs_x' per batting team, using only rows where pressure == 1.
pressure_rows = df.loc[df['pressure'] == 1]
average_runs_under_pressure = (
    pressure_rows
    .groupby('batting_team')['runs_x']
    .mean()
    .reset_index()
)

# One bar per team, each coloured by its own team name.
fig = px.bar(
    average_runs_under_pressure,
    x='batting_team',
    y='runs_x',
    color='batting_team',
    template='plotly_dark',
    title='Average Runs Scored by Batting Team Under Pressure',
)
fig.update_layout(xaxis_title='Batting Team', yaxis_title='Average Runs')
fig.show()
# Mean of 'runs_x' per batting team, using only rows where Death_Overs == 1.
death_rows = df.loc[df['Death_Overs'] == 1]
death_overs_runs = (
    death_rows
    .groupby('batting_team')['runs_x']
    .mean()
    .reset_index()
)

# One bar per team, each coloured by its own team name.
fig = px.bar(
    death_overs_runs,
    x='batting_team',
    y='runs_x',
    color='batting_team',
    template='plotly_dark',
    title='Average Runs Scored by Batting Team In Death Overs',
)
fig.update_layout(xaxis_title='Batting Team', yaxis_title='Average Runs')
fig.show()
# Number of rows per batting team, shown as shares of a pie chart.
team_counts = df['batting_team'].value_counts()
fig = px.pie(
    names=team_counts.index,
    values=team_counts.values,
    title='Distribution of Batting Teams',
)
fig.show()
# Mean 'crr' per batting team, ordered from highest to lowest.
# NOTE(review): the chart title says "in Powerplay", but no powerplay filter
# is applied here -- the mean is taken over every row. Confirm the intent.
average_crr_pp = (
    df.groupby('batting_team')['crr']
    .mean()
    .reset_index()
    .sort_values('crr', ascending=False)
    .reset_index(drop=True)
)

fig = px.bar(
    average_crr_pp,
    x='batting_team',
    y='crr',
    title='Average CRR in Powerplay by Batting Team',
)
fig.show()
# Pie chart of how often each 'wickets_left' value occurs in the data.
fig = px.pie(
    df,
    names='wickets_left',
    title='Distribution of Wickets Left',
)
fig.show()
DATA CLEANING
This is the dataset we have. We need to create some columns and extract a few others to get the desired data. Eventually we want our data to have the following columns:

batting team

bowling team

city

current_score

balls left

wickets_left

current_run_rate

last five
We already have a few of these columns in the form we want: the batting team and bowling team are available as they are. We also have a city column, but it has some null values that we need to handle. For all the rest we need to do some manipulation.

Now we will start our feature extraction from the city column. To fill the empty values we will use venue column.

# Venues of the rows whose city is missing, with how often each one occurs.
df.loc[df['city'].isnull(), 'venue'].value_counts()
Here we are checking the values in the ‘venue’ column for the rows where the city column is null. If we look carefully, the first word of the venue is actually the name of the city where the venue is located, e.g. Dubai in Dubai International Cricket Stadium or Melbourne in Melbourne Cricket Ground.
# Fill missing cities with the first word of the venue name.
# `.str[0]` yields NaN for a missing or empty venue instead of raising, so a
# single bad venue value no longer aborts the whole cell.
venue_first_word = df['venue'].str.split().str[0]
cities = df['city'].fillna(venue_first_word)
df['city'] = cities
df.isnull().sum()
Wherever the city is missing we take the first word of the venue, store the completed city values in a variable named cities, and assign it back to the city column. Now there are no null values in our dataset. But there is still one more thing left. Our dataset is a ball-by-ball dataset, which means that if there are 63000 rows, that many balls have been bowled and played.
# Remove the leftover CSV index column and the venue column.
df.drop(columns=['Unnamed: 0', 'venue'], inplace=True)
df
# Keep only the cities that appear in more than 600 rows.
city_counts = df['city'].value_counts()
eligible_cities = city_counts[city_counts > 600].index.tolist()
This shows that there are certain cities where very few deliveries have been played. So we can ignore those cities and only consider the ones which have more than 600 deliveries.
# Keep only rows from eligible cities. Take an explicit copy so that the
# column assignments made on `df` afterwards write to an independent frame
# rather than to a slice of the unfiltered data.
df = df[df['city'].isin(eligible_cities)].copy()
Now our city column is complete. Next comes the current_score column, which is very easy to derive from the runs column: a simple cumsum() (cumulative sum) within each match will do the work for us.
# Running total of 'runs' within each match.
# Select the column before cumsum(): accumulating every column of the frame
# is wasted work and raises on non-numeric columns in recent pandas versions.
df['current_score'] = df.groupby('match_id')['runs'].cumsum()
df
Now our next target is to create a ‘balls_left’ column, for which we first create two new columns: ‘over’ and ‘ball_no’, which tell us how many overs have been completed and how many balls of the current over have been bowled, respectively. The code for that is very simple.
# Split the text form of each 'ball' value on "." into its two parts:
# the part before the dot goes to 'over', the part after it to 'ball_no'.
ball_parts = [str(value).split(".") for value in df['ball']]
df['over'] = [parts[0] for parts in ball_parts]
df['ball_no'] = [parts[1] for parts in ball_parts]
df
Now by using a simple formula we can create a ‘balls_bowled’ column that is how many balls have been bowled. Formula would be

balls_bowled = (overs * 6) + balls
# balls_bowled = over * 6 + ball_no, after casting both text columns to int.
overs_done = df['over'].astype('int')
balls_in_over = df['ball_no'].astype('int')
df['balls_bowled'] = overs_done * 6 + balls_in_over
df
And now finally we can create our desired column ‘balls_left’ by subtracting balls_bowled from 120, because there are 120 balls in total in an innings. Sometimes, because of extras (wide, no ball, …), the ball count exceeds 120; in such cases we simply set the value to 0.
# Balls remaining out of 120; negative results are floored at 0.
remaining = 120 - df['balls_bowled']
df['balls_left'] = remaining.clip(lower=0)
df
Now if we look at the ‘player_dismissed’ column, it has either the value 0 or the name of the player who got out on that particular ball. First we replace all the names with 1 and then apply the cumsum() function within each match to get the total wickets fallen, and we subtract that from 10 to get the ‘wickets_left’ column.
# Turn 'player_dismissed' into a 0/1 flag: the string '0' becomes 0 and
# every other value becomes 1.
df['player_dismissed'] = (df['player_dismissed'] != '0').astype('int')
# Running count of flagged rows within each match. Select the column before
# cumsum() so the accumulation does not run over (and fail on) the
# non-numeric columns of the frame.
df['player_dismissed'] = df.groupby('match_id')['player_dismissed'].cumsum()
df['wickets_left'] = 10 - df['player_dismissed']
df
# crr = current_score * 6 / balls_bowled.
df['crr'] = (df['current_score'] * 6) / df['balls_bowled']
df
Now we need a column that has the total runs scored in the last five overs (the last 30 deliveries). Naturally, this column will have null values for the first 5 overs of each match.
# Sum of 'runs' over the most recent 30 rows of each match; the first 29 rows
# of every match have no full window and therefore get NaN.
# transform() keeps the result aligned with df's index, so this does not rely
# on each match's rows being stored contiguously, and restricting the rolling
# sum to 'runs' avoids aggregating the non-numeric columns.
groups = df.groupby('match_id')
df['last_five'] = groups['runs'].transform(
    lambda match_runs: match_runs.rolling(window=30).sum()
)
df
Now we have to create one last column, which will be our target column: the total runs scored in that innings.
# Total 'runs' per match, merged back onto every row of that match. Because
# both sides of the merge have a 'runs' column, the per-match total ends up
# as 'runs_x'.
# Select 'runs' before sum(): summing the whole frame also aggregates every
# other column (string columns included), which is slow and unnecessary.
match_totals = df.groupby('match_id')['runs'].sum().reset_index()
final_df = match_totals.merge(df, on='match_id')

# Keep only the model features and the target column.
final_df = final_df[[
    'batting_team',
    'bowling_team',
    'city',
    'current_score',
    'balls_left',
    'wickets_left',
    'crr',
    'last_five',
    'runs_x',
]]
final_df
We have now dropped all the columns we don't want for our model and kept those we just created. Next we remove the rows with null values and shuffle the data to avoid any kind of bias.
# Null counts before and after removing incomplete rows.
final_df.isnull().sum()
# Rebind instead of dropping in place: final_df was derived by column
# selection, and mutating such a frame in place is what pandas warns about.
final_df = final_df.dropna()
final_df.isnull().sum()
# Shuffle all rows. The seed makes the shuffle repeatable; without it the
# fixed random_state passed to train_test_split further down cannot give a
# reproducible split, because its input order changes on every run.
final_df = final_df.sample(frac=1, random_state=1)
final_df
TRAIN AND TEST DATASETS
With this we end the feature extraction part of our project. So after a lot of work we finally have exactly the data we wanted at the start.

So let's now begin the model-building process. First we will divide our dataset into a training set and a testing set using the train_test_split function of the sklearn library.
# Train/test split: 'runs_x' is the target, every other column is a feature.
from sklearn.model_selection import train_test_split

target_column = 'runs_x'
X = final_df.drop(columns=[target_column])
y = final_df[target_column]

# Hold out 20% of the rows for testing, with a fixed seed for the split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)
X_train
CREATING MODEL
Some preprocessing steps are required here. We will apply one-hot encoding to the categorical features (batting_team, bowling_team and city), then we will create a pipeline that contains our ML model. We will also apply scaling to our data so that all values fall in a similar range.

Here, for our model, I will be using the XGBoost algorithm.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score,mean_absolute_error
import pickle

# Dense one-hot encoder that drops the first category of each feature.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the current name first and fall back for old versions.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first')

# Encode the three categorical columns; pass all other columns through as-is.
trf = ColumnTransformer(
    [('trf', encoder, ['batting_team', 'bowling_team', 'city'])],
    remainder='passthrough',
)

# Encode -> scale -> gradient-boosted regressor.
pipe = Pipeline(steps=[
    ('step1', trf),
    ('step2', StandardScaler()),
    ('step3', XGBRegressor(n_estimators=300, learning_rate=0.2, max_depth=12, random_state=1)),
])
pipe.fit(X_train, Y_train)

# Evaluate on the held-out rows: R^2 and mean absolute error.
Y_pred = pipe.predict(X_test)
print(r2_score(Y_test, Y_pred))
print(mean_absolute_error(Y_test, Y_pred))

# Persist the fitted pipeline; the context manager closes the file handle
# even if pickling fails.
with open('pipe.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)
eligible_cities