#Import Libraries & Load Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Load the dataset
df = pd.read_csv("ipl_colab.csv")

In [None]:
# Display basic info
print("Shape:", df.shape)
df.head()

Shape: (76014, 15)


Unnamed: 0,mid,date,venue,batting_team,bowling_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0,222
1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,0,0,222
2,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,0,0,222
3,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,0,0,222
4,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,0,0,222


In [None]:
print("Columns in dataset:")
print(df.columns.tolist())

Columns in dataset:
['mid', 'date', 'venue', 'batting_team', 'bowling_team', 'batsman', 'bowler', 'runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5', 'striker', 'non-striker', 'total']


#Data Cleaning and Feature Extraction

In [None]:
df.columns = df.columns.str.strip()  # Strip any whitespace from column names
df = df.drop(columns=["mid", "date", "batsman", "bowler", "striker", "non-striker"])

In [None]:
# Step 4: One-hot encode categorical columns
categorical_cols = ["venue", "batting_team", "bowling_team"]
df_encoded = pd.get_dummies(df, columns=categorical_cols)

In [None]:
# Step 5: Define features and target
X = df_encoded.drop(columns=["total"])
y = df_encoded["total"]

#Training a model

In [None]:
# Step 6: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Step 7: Train a regression model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

#Evaluation

In [None]:
# Step 8: Make predictions
y_pred = model.predict(X_test)

In [None]:
# Step 9: Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

In [None]:
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

MAE: 4.02
RMSE: 8.20
R² Score: 0.92


#Predicting a new data

In [None]:
new_data = pd.DataFrame({
    'venue': ['M Chinnaswamy Stadium'],
    'batting_team': ['Kolkata Knight Riders'],
    'bowling_team': ['Royal Challengers Bangalore'],
    'runs': [50],
    'wickets': [1],
    'overs': [5.2],
    'runs_last_5': [40],
    'wickets_last_5': [1]
})

In [None]:
# Add missing dummy columns to match training data
new_data_encoded = pd.get_dummies(new_data)

# Align with training columns
new_data_encoded = new_data_encoded.reindex(columns=X_train.columns, fill_value=0)

In [None]:
prediction = model.predict(new_data_encoded)
print("Predicted Total Score:", prediction[0])

Predicted Total Score: 142.59
