Import Libraries

In [6]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import os

# Create data folder
os.makedirs("data", exist_ok=True)

# Load raw IPL datasets (already uploaded to Colab)
matches = pd.read_csv("matches.csv")
deliveries = pd.read_csv("deliveries.csv")

# Merge
df = deliveries.merge(matches, how="left", left_on="match_id", right_on="id")

# Cleaning
df["venue"] = df["venue"].fillna("Unknown")
df["winner"] = df["winner"].fillna("No Result")
df["player_dismissed"] = df["player_dismissed"].fillna("None")
df = df.fillna(0)

# Date formatting
df["date"] = pd.to_datetime(df["date"], errors="coerce")

# Normalization
scaler = MinMaxScaler()
df[["total_runs", "batsman_runs", "extra_runs"]] = scaler.fit_transform(
    df[["total_runs", "batsman_runs", "extra_runs"]]
)

# Save cleaned file INSIDE COLAB
df.to_csv("data/ipl_cleaned.csv", index=False)

print("✅ ipl_cleaned.csv created in Colab")



✅ ipl_cleaned.csv created in Colab


Load Cleaned Data (from Week 1–2)

In [7]:
df = pd.read_csv("data/ipl_cleaned.csv")

df["date"] = pd.to_datetime(df["date"])
df.head()




  df = pd.read_csv("data/ipl_cleaned.csv")


Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,...,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2
0,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.0,1.0,SC Ganguly,P Kumar,BB McCullum,0.0,...,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,0,Asad Rauf,RE Koertzen
1,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.0,2.0,BB McCullum,P Kumar,SC Ganguly,0.0,...,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,0,Asad Rauf,RE Koertzen
2,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.0,3.0,BB McCullum,P Kumar,SC Ganguly,0.0,...,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,0,Asad Rauf,RE Koertzen
3,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.0,4.0,BB McCullum,P Kumar,SC Ganguly,0.0,...,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,0,Asad Rauf,RE Koertzen
4,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.0,5.0,BB McCullum,P Kumar,SC Ganguly,0.0,...,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,0,Asad Rauf,RE Koertzen


In [8]:
df = df.rename(columns={
    "batter": "batsman"
})
df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,bowler,non_striker,batsman_runs,...,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2
0,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.0,1.0,SC Ganguly,P Kumar,BB McCullum,0.0,...,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,0,Asad Rauf,RE Koertzen
1,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.0,2.0,BB McCullum,P Kumar,SC Ganguly,0.0,...,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,0,Asad Rauf,RE Koertzen
2,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.0,3.0,BB McCullum,P Kumar,SC Ganguly,0.0,...,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,0,Asad Rauf,RE Koertzen
3,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.0,4.0,BB McCullum,P Kumar,SC Ganguly,0.0,...,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,0,Asad Rauf,RE Koertzen
4,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0.0,5.0,BB McCullum,P Kumar,SC Ganguly,0.0,...,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,0,Asad Rauf,RE Koertzen


Sort for Time-Series Safety

In [9]:
df = df.sort_values(by=["batsman", "date"])


AGGREGATION (Ball-by-Ball → Player-Match)

 Player-Match Level Aggregation

In [10]:
player_match = df.groupby(
    ["match_id", "date", "batsman", "batting_team", "bowling_team", "venue"]
).agg(
    runs_scored=("batsman_runs", "sum"),
    balls_faced=("ball", "count"),
    wickets_taken=("is_wicket", "sum")
).reset_index()

player_match.head()


Unnamed: 0,match_id,date,batsman,batting_team,bowling_team,venue,runs_scored,balls_faced,wickets_taken
0,335982,2008-04-18,AA Noffke,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,1.5,12,1.0
1,335982,2008-04-18,B Akhil,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,0.0,2,1.0
2,335982,2008-04-18,BB McCullum,Kolkata Knight Riders,Royal Challengers Bangalore,M Chinnaswamy Stadium,26.333333,77,0.0
3,335982,2008-04-18,CL White,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,1.0,10,1.0
4,335982,2008-04-18,DJ Hussey,Kolkata Knight Riders,Royal Challengers Bangalore,M Chinnaswamy Stadium,2.0,12,1.0


TARGET VARIABLE (Next Match Performance)

 Create Training Label

In [11]:
player_match["target_runs_next_match"] = (
player_match.groupby("batsman")["runs_scored"].shift(-1) )
player_match.dropna(inplace=True)


    #shift(-1)

#For each batsman:

#Takes the runs scored in the NEXT match

#Stores it as the target for the CURRENT match

#Example:

#Match 1 → predict Match 2 runs
#Match 2 → predict Match 3 runs
#Last match → NO next match → NaN





FEATURE ENGINEERING


 Rolling Averages (Recent Form)

In [12]:
player_match["avg_runs_last_5"] = (
    player_match.groupby("batsman")["runs_scored"]
    .rolling(5)
    .mean()
    .reset_index(level=0, drop=True)
)

player_match["avg_runs_last_10"] = (
    player_match.groupby("batsman")["runs_scored"]
    .rolling(10)
    .mean()
    .reset_index(level=0, drop=True)
)

player_match.fillna(0, inplace=True)


Venue Average Performance

In [13]:
player_match["venue_avg_runs"] = (
    player_match.groupby(["batsman", "venue"])["runs_scored"]
    .transform("mean")
)


Opponent-Specific Stats (Player vs Team)

In [14]:
player_match["pvt_avg_runs"] = (
    player_match.groupby(["batsman", "bowling_team"])["runs_scored"]
    .transform("mean")
)


Career Statistics

In [15]:
player_match["career_avg_runs"] = (
    player_match.groupby("batsman")["runs_scored"]
    .transform("mean")
)


FEATURE SELECTION


 Define Features & Target

In [16]:

features = [
    "runs_scored",
    "balls_faced",
    "wickets_taken",
    "avg_runs_last_5",
    "avg_runs_last_10",
    "venue_avg_runs",
    "pvt_avg_runs",
    "career_avg_runs",
    "batting_team",
    "bowling_team",
    "venue"
]


target = "target_runs_next_match"


TRAIN–TEST SPLIT (TIME-SERIES AWARE)


 Split by Date

In [17]:
split_date = player_match["date"].quantile(0.8)

train_df = player_match[player_match["date"] <= split_date]
test_df = player_match[player_match["date"] > split_date]

X_train = train_df[features]
y_train = train_df[target]

X_test = test_df[features]
y_test = test_df[target]


PREPROCESSING PIPELINE


Build Pipeline

In [26]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

numeric_features = [
    "avg_runs_last_5",
    "avg_runs_last_10",
    "venue_avg_runs",
    "pvt_avg_runs",
    "career_avg_runs"
]

categorical_features = ["batting_team", "bowling_team", "venue"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)



Fit & Save Pipeline

In [28]:
import joblib

preprocessor.fit(X_train)

joblib.dump(preprocessor, "feature_pipeline.pkl")
print("✅ feature_pipeline.pkl saved")


✅ feature_pipeline.pkl saved


In [29]:
player_match.columns


Index(['match_id', 'date', 'batsman', 'batting_team', 'bowling_team', 'venue',
       'runs_scored', 'balls_faced', 'wickets_taken', 'target_runs_next_match',
       'avg_runs_last_5', 'avg_runs_last_10', 'venue_avg_runs', 'pvt_avg_runs',
       'career_avg_runs'],
      dtype='object')

FINAL DATASET


 Save Feature-Engineered Dataset

In [30]:
import os

# Force create folder
os.makedirs("data", exist_ok=True)

features = [
    "runs_scored",
    "balls_faced",
    "wickets_taken",
    "avg_runs_last_5",
    "avg_runs_last_10",
    "venue_avg_runs",
    "pvt_avg_runs",
    "career_avg_runs",
    "batting_team",
    "bowling_team",
    "venue"
]

target = "target_runs_next_match"

final_dataset = player_match[features + [target]]

final_dataset.to_csv("data/dataset.csv", index=False)

print("✅ dataset.csv SAVED")


✅ dataset.csv SAVED
