# IPL Runs Prediction Model 

## 1. Import Necessary Libraries

In [1]:
#import the required libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [3]:
import xgboost as xgb

## 2. Load the Dataset

In [4]:

# Load the dataset 
df = pd.read_csv("D:/intership/sample_dataset.csv")

In [5]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,season,match_id,home_team,away_team,current_innings,over,ball,wkt_batsman_runs,isRetiredHurt,cumulative_runs,wickets_lost,overs_bowled,run_rate,powerplay
0,0,2023.0,1359475,GT,CSK,CSK,1,1,0.0,False,0.0,0,1.166667,0.0,True
1,1,2023.0,1359475,GT,CSK,CSK,1,2,0.0,False,0.0,0,1.333333,0.0,True
2,2,2023.0,1359475,GT,CSK,CSK,1,3,0.0,False,0.0,0,1.5,0.0,True
3,3,2023.0,1359475,GT,CSK,CSK,1,4,0.0,False,0.0,0,1.666667,0.0,True
4,4,2023.0,1359475,GT,CSK,CSK,1,5,0.0,False,0.0,0,1.833333,0.0,True


In [6]:
df.columns

Index(['Unnamed: 0', 'season', 'match_id', 'home_team', 'away_team',
       'current_innings', 'over', 'ball', 'wkt_batsman_runs', 'isRetiredHurt',
       'cumulative_runs', 'wickets_lost', 'overs_bowled', 'run_rate',
       'powerplay'],
      dtype='object')

## 4. Model Training

In [7]:
# Select features and target variable
X = df[['overs_bowled', 'wickets_lost', 'run_rate', 'powerplay']]
y = df['cumulative_runs']

In [8]:
X

Unnamed: 0,overs_bowled,wickets_lost,run_rate,powerplay
0,1.166667,0,0.000000,True
1,1.333333,0,0.000000,True
2,1.500000,0,0.000000,True
3,1.666667,0,0.000000,True
4,1.833333,0,0.000000,True
...,...,...,...,...
242545,20.333333,2,3.786885,False
242546,20.500000,3,3.756098,False
242547,20.666667,3,3.725806,False
242548,20.666667,3,3.725806,False


In [9]:
y

0          0.0
1          0.0
2          0.0
3          0.0
4          0.0
          ... 
242545    77.0
242546    77.0
242547    77.0
242548    77.0
242549    77.0
Name: cumulative_runs, Length: 242550, dtype: float64

In [12]:
split_ratio = 0.8 
split_index = int(len(df) * split_ratio)

In [13]:
data=df.sample(frac=1,random_state=42).reset_index(drop=True)
data

Unnamed: 0.1,Unnamed: 0,season,match_id,home_team,away_team,current_innings,over,ball,wkt_batsman_runs,isRetiredHurt,cumulative_runs,wickets_lost,overs_bowled,run_rate,powerplay
0,122566,2015.0,829721,RR,MI,MI,10,4,0.0,False,26.0,0,10.666667,2.437500,False
1,147656,2014.0,734045,MI,CSK,MI,5,1,0.0,False,0.0,1,5.166667,0.000000,True
2,27099,2022.0,1304086,GT,SRH,SRH,20,5,0.0,False,148.0,6,20.833333,7.104000,False
3,42493,2021.0,1254104,CSK,MI,CSK,2,4,0.0,False,0.0,0,2.666667,0.000000,True
4,154359,2013.0,598021,SRH,KXIP,SRH,18,6,0.0,False,91.0,2,19.000000,4.789474,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242545,119879,2016.0,981015,KKR,SRH,SRH,10,5,28.0,False,69.0,2,10.833333,6.369231,False
242546,103694,2017.0,1082638,SRH,MI,SRH,12,3,0.0,False,6.0,2,12.500000,0.480000,False
242547,131932,2015.0,829803,KXIP,RCB,RCB,5,6,0.0,False,36.0,3,6.000000,6.000000,True
242548,146867,2014.0,734037,KKR,SRH,SRH,18,5,0.0,False,89.0,2,18.833333,4.725664,False


In [14]:
train_df = data[:split_index]  # Training data
test_df = data[split_index:]  

In [15]:
train_df

Unnamed: 0.1,Unnamed: 0,season,match_id,home_team,away_team,current_innings,over,ball,wkt_batsman_runs,isRetiredHurt,cumulative_runs,wickets_lost,overs_bowled,run_rate,powerplay
0,122566,2015.0,829721,RR,MI,MI,10,4,0.0,False,26.0,0,10.666667,2.437500,False
1,147656,2014.0,734045,MI,CSK,MI,5,1,0.0,False,0.0,1,5.166667,0.000000,True
2,27099,2022.0,1304086,GT,SRH,SRH,20,5,0.0,False,148.0,6,20.833333,7.104000,False
3,42493,2021.0,1254104,CSK,MI,CSK,2,4,0.0,False,0.0,0,2.666667,0.000000,True
4,154359,2013.0,598021,SRH,KXIP,SRH,18,6,0.0,False,91.0,2,19.000000,4.789474,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194035,193987,2011.0,501240,CSK,RR,RR,11,1,32.0,False,32.0,1,11.166667,2.865672,False
194036,156784,2013.0,598032,RR,SRH,SRH,7,1,4.0,False,20.0,0,7.166667,2.790698,False
194037,17807,2022.0,1304047,CSK,KKR,KKR,12,5,0.0,False,81.0,1,12.833333,6.311688,False
194038,180660,2012.0,548367,RCB,MI,RCB,14,3,0.0,False,43.0,0,14.500000,2.965517,False


In [16]:
test_df

Unnamed: 0.1,Unnamed: 0,season,match_id,home_team,away_team,current_innings,over,ball,wkt_batsman_runs,isRetiredHurt,cumulative_runs,wickets_lost,overs_bowled,run_rate,powerplay
194040,168336,2012.0,548314,SRH,MI,DC,4,3,0.0,False,2.0,3,4.500000,0.444444,True
194041,118913,2016.0,981007,GL,MI,MI,9,4,0.0,False,41.0,1,9.666667,4.241379,False
194042,136722,2014.0,729299,RCB,KKR,RCB,16,2,0.0,False,61.0,0,16.333333,3.734694,False
194043,60930,2020.0,1216524,SRH,DC,SRH,12,2,0.0,False,66.0,3,12.333333,5.351351,False
194044,124937,2015.0,829743,SRH,KKR,SRH,2,4,0.0,False,0.0,2,2.666667,0.000000,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242545,119879,2016.0,981015,KKR,SRH,SRH,10,5,28.0,False,69.0,2,10.833333,6.369231,False
242546,103694,2017.0,1082638,SRH,MI,SRH,12,3,0.0,False,6.0,2,12.500000,0.480000,False
242547,131932,2015.0,829803,KXIP,RCB,RCB,5,6,0.0,False,36.0,3,6.000000,6.000000,True
242548,146867,2014.0,734037,KKR,SRH,SRH,18,5,0.0,False,89.0,2,18.833333,4.725664,False


In [17]:
# Split data into training and test sets (80% train, 20% test)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
x_train

Unnamed: 0,overs_bowled,wickets_lost,run_rate,powerplay
199032,12.666667,2,2.605263,False
237647,17.166667,0,4.601942,False
10462,1.833333,1,0.000000,True
39475,8.500000,6,0.000000,False
137928,9.166667,4,0.763636,False
...,...,...,...,...
119879,10.833333,2,6.369231,False
103694,12.500000,2,0.480000,False
131932,6.000000,3,6.000000,True
146867,18.833333,2,4.725664,False


In [19]:
x_test

Unnamed: 0,overs_bowled,wickets_lost,run_rate,powerplay
122566,10.666667,0,2.437500,False
147656,5.166667,1,0.000000,True
27099,20.833333,6,7.104000,False
42493,2.666667,0,0.000000,True
154359,19.000000,2,4.789474,False
...,...,...,...,...
19877,17.666667,0,2.547170,False
217199,14.333333,3,5.441860,False
188029,3.000000,1,0.000000,True
220380,15.000000,7,0.466667,False


In [20]:
# --- Random Forest Model ---
rf_model = RandomForestRegressor(n_estimators=200, random_state=42)
rf_model.fit(x_train, y_train)
y_rf_pred = rf_model.predict(x_test)

In [21]:
y_rf_pred

array([ 26.   ,   0.   , 148.   , ...,   0.   ,   7.005,   0.   ])

In [22]:
# Evaluate Random Forest Model
mse_rf = mean_squared_error(y_test, y_rf_pred)
r2_rf = r2_score(y_test, y_rf_pred)


In [23]:
print(f'Random Forest - MSE: {mse_rf}, R2: {r2_rf}')

Random Forest - MSE: 0.15980264739229028, R2: 0.9999045513977854


In [24]:
# --- Linear Regression Model ---
lr_model = LinearRegression()
lr_model.fit(x_train, y_train)
y_lr_pred = lr_model.predict(x_test)

In [25]:
# Evaluate Linear Regression Model
mse_lr = mean_squared_error(y_test, y_lr_pred)
r2_lr = r2_score(y_test, y_lr_pred)

In [26]:
print(f'Linear Regression - MSE: {mse_lr}, R2: {r2_lr}')

Linear Regression - MSE: 307.86088931830994, R2: 0.816117617314475


In [28]:
# --- XGBoost Model ---
xgb_model = xgb.XGBRegressor(n_estimators=200, random_state=42)
xgb_model.fit(x_train, y_train)
y_xgb_pred = xgb_model.predict(x_test)  


In [29]:
y_xgb_pred

array([ 2.6217096e+01,  6.9227414e-03,  1.4850606e+02, ...,
       -1.6005549e-03,  7.0604496e+00,  8.6766239e-03], dtype=float32)

In [30]:
# Evaluate XGBoost Model
mse_xgb = mean_squared_error(y_test, y_xgb_pred)
r2_xgb = r2_score(y_test, y_xgb_pred)

In [31]:
print(f'XGBoost - MSE: {mse_xgb}, R2: {r2_xgb}')

XGBoost - MSE: 2.4218121772559837, R2: 0.998553474608104


In [32]:
# --- New Data Prediction ---
new_data = pd.DataFrame({
    'overs_bowled': [10.2],   # Example: 10 overs and 2 balls bowled
    'wickets_lost': [2],      # Example: 2 wickets lost
    'run_rate': [7.5],        # Example: current run rate is 7.5
    'powerplay': [0]          # Example: not in powerplay (after 6 overs)
})


In [33]:
new_data

Unnamed: 0,overs_bowled,wickets_lost,run_rate,powerplay
0,10.2,2,7.5,0


In [34]:
# Predict using Random Forest
run_predict_rf = rf_model.predict(new_data)
print(f'Random Forest Prediction: {run_predict_rf}')

Random Forest Prediction: [76.015]


In [35]:
# Predict using Linear Regression
run_predict_lr = lr_model.predict(new_data)
print(f'Linear Regression Prediction: {run_predict_lr}')

Linear Regression Prediction: [62.35215359]


In [36]:
# Predict using XGBoost
run_predict_xgb = xgb_model.predict(new_data)
print(f'XGBoost Prediction: {run_predict_xgb}')

XGBoost Prediction: [76.712395]
