02_FeatureEngineering

Objective
The goal of this notebook is to convert ball-by-ball IPL data into
player–match level datasets for machine learning.

In this step:
Each row will represent one player in one match
Separate datasets will be created for batsmen and bowlers
Rolling averages will be added to capture recent form



In [3]:
import pandas as pd

# Load cleaned ball-by-ball data
df = pd.read_csv("clean_data.csv", low_memory=False)

# Convert date column back to datetime (CSV does not preserve dtype)
df["date"] = pd.to_datetime(df["date"])

print("Dataset shape:", df.shape)
df.head()


Dataset shape: (260920, 36)


Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,...,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,year,month,day
0,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,SC Ganguly,P Kumar,BB McCullum,0,...,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,2008,4,18
1,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,BB McCullum,P Kumar,SC Ganguly,0,...,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,2008,4,18
2,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,BB McCullum,P Kumar,SC Ganguly,0,...,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,2008,4,18
3,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,BB McCullum,P Kumar,SC Ganguly,0,...,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,2008,4,18
4,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,5,BB McCullum,P Kumar,SC Ganguly,0,...,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,2008,4,18


In [None]:
# Columns required for   feature engineering 
required_columns = [
    "match_id",
    "date",
    "venue",
    "batting_team",
    "bowling_team",
    "batter",
    "bowler",
    "batsman_runs",
    "total_runs",
    "is_wicket"
]

# Check if any required column is  missing
missing_columns = [col for col in required_columns if col not in df.columns]

print("Missing columns:", missing_columns)


Missing columns: []


In [None]:
# Create batsman  performance    per match
batsman_per_match = (
    df.groupby(
        ["match_id", "date", "venue", "batting_team", "bowling_team", "batter"]
    )
    .agg(
        runs=("batsman_runs", "sum"),
        balls_faced=("ball", "count"),
        fours=("batsman_runs", lambda x: (x == 4).sum()),
        sixes=("batsman_runs", lambda x: (x == 6).sum())
    )
    .reset_index()
)

batsman_per_match.head()


Unnamed: 0,match_id,date,venue,batting_team,bowling_team,batter,runs,balls_faced,fours,sixes
0,335982,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,158,77,10,13
1,335982,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,DJ Hussey,12,12,1,0
2,335982,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,Mohammad Hafeez,5,3,1,0
3,335982,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,RT Ponting,20,20,1,1
4,335982,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,10,12,2,0


In [None]:
#  Add strike rate  feature
batsman_per_match["strike_rate"] = (
    batsman_per_match["runs"] / batsman_per_match["balls_faced"]
) * 100

batsman_per_match.head()


Unnamed: 0,match_id,date,venue,batting_team,bowling_team,batter,runs,balls_faced,fours,sixes,strike_rate
0,335982,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,158,77,10,13,205.194805
1,335982,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,DJ Hussey,12,12,1,0,100.0
2,335982,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,Mohammad Hafeez,5,3,1,0,166.666667
3,335982,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,RT Ponting,20,20,1,1,100.0
4,335982,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,10,12,2,0,83.333333


In [7]:
# Sort batsman data by player and match date
batsman_per_match = batsman_per_match.sort_values(
    by=["batter", "date"]
)

batsman_per_match.head()


Unnamed: 0,match_id,date,venue,batting_team,bowling_team,batter,runs,balls_faced,fours,sixes,strike_rate
4299,548346,2012-04-29,Wankhede Stadium,Deccan Chargers,Mumbai Indians,A Ashish Reddy,10,10,0,1,100.0
4398,548352,2012-05-04,"MA Chidambaram Stadium, Chepauk",Deccan Chargers,Chennai Super Kings,A Ashish Reddy,3,3,0,0,100.0
4496,548359,2012-05-08,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Kings XI Punjab,A Ashish Reddy,8,8,1,0,100.0
4699,548373,2012-05-18,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Rajasthan Royals,A Ashish Reddy,10,4,2,0,250.0
4747,548376,2012-05-20,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Royal Challengers Bangalore,A Ashish Reddy,4,5,0,0,80.0


In [None]:
# Rolling average of runs for last 5 matches 
batsman_per_match["runs_last_5"] = (
    batsman_per_match
    .groupby("batter")["runs"]
    .rolling(window=5, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)

batsman_per_match[
    ["batter", "date", "runs", "runs_last_5"]
].head(10)


Unnamed: 0,batter,date,runs,runs_last_5
4299,A Ashish Reddy,2012-04-29,10,10.0
4398,A Ashish Reddy,2012-05-04,3,6.5
4496,A Ashish Reddy,2012-05-08,8,7.0
4699,A Ashish Reddy,2012-05-18,10,7.75
4747,A Ashish Reddy,2012-05-20,4,7.0
4877,A Ashish Reddy,2013-04-05,7,6.4
4943,A Ashish Reddy,2013-04-07,14,8.6
5602,A Ashish Reddy,2013-04-09,3,7.6
5037,A Ashish Reddy,2013-04-12,16,8.8
5081,A Ashish Reddy,2013-04-14,4,8.8


In [9]:
# Rolling average of strike rate for last 5 matches
batsman_per_match["strike_rate_last_5"] = (
    batsman_per_match
    .groupby("batter")["strike_rate"]
    .rolling(window=5, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)

batsman_per_match[
    ["batter", "date", "strike_rate", "strike_rate_last_5"]
].head(10)


Unnamed: 0,batter,date,strike_rate,strike_rate_last_5
4299,A Ashish Reddy,2012-04-29,100.0,100.0
4398,A Ashish Reddy,2012-05-04,100.0,100.0
4496,A Ashish Reddy,2012-05-08,100.0,100.0
4699,A Ashish Reddy,2012-05-18,250.0,137.5
4747,A Ashish Reddy,2012-05-20,80.0,126.0
4877,A Ashish Reddy,2013-04-05,175.0,141.0
4943,A Ashish Reddy,2013-04-07,116.666667,144.333333
5602,A Ashish Reddy,2013-04-09,75.0,139.333333
5037,A Ashish Reddy,2013-04-12,177.777778,124.888889
5081,A Ashish Reddy,2013-04-14,80.0,124.888889


In [None]:
# Rolling average of runs for last 10  matches
batsman_per_match["runs_last_10"] = (
    batsman_per_match
    .groupby("batter")["runs"]
    .rolling(window=10, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)

batsman_per_match[
    ["batter", "date", "runs", "runs_last_10"]
].head(10)


Unnamed: 0,batter,date,runs,runs_last_10
4299,A Ashish Reddy,2012-04-29,10,10.0
4398,A Ashish Reddy,2012-05-04,3,6.5
4496,A Ashish Reddy,2012-05-08,8,7.0
4699,A Ashish Reddy,2012-05-18,10,7.75
4747,A Ashish Reddy,2012-05-20,4,7.0
4877,A Ashish Reddy,2013-04-05,7,7.0
4943,A Ashish Reddy,2013-04-07,14,8.0
5602,A Ashish Reddy,2013-04-09,3,7.375
5037,A Ashish Reddy,2013-04-12,16,8.333333
5081,A Ashish Reddy,2013-04-14,4,7.9


In [11]:
# Rolling average of strike rate for last 10 matches
batsman_per_match["strike_rate_last_10"] = (
    batsman_per_match
    .groupby("batter")["strike_rate"]
    .rolling(window=10, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)

batsman_per_match[
    ["batter", "date", "strike_rate", "strike_rate_last_10"]
].head(10)


Unnamed: 0,batter,date,strike_rate,strike_rate_last_10
4299,A Ashish Reddy,2012-04-29,100.0,100.0
4398,A Ashish Reddy,2012-05-04,100.0,100.0
4496,A Ashish Reddy,2012-05-08,100.0,100.0
4699,A Ashish Reddy,2012-05-18,250.0,137.5
4747,A Ashish Reddy,2012-05-20,80.0,126.0
4877,A Ashish Reddy,2013-04-05,175.0,134.166667
4943,A Ashish Reddy,2013-04-07,116.666667,131.666667
5602,A Ashish Reddy,2013-04-09,75.0,124.583333
5037,A Ashish Reddy,2013-04-12,177.777778,130.493827
5081,A Ashish Reddy,2013-04-14,80.0,125.444444


In [12]:
# #Create bowler performance per match
bowler_per_match = (
    df.groupby(
        ["match_id", "date", "venue", "batting_team", "bowling_team", "bowler"]
    )
    .agg(
        wickets=("is_wicket", "sum"),
        balls_bowled=("ball", "count"),
        runs_conceded=("total_runs", "sum")
    )
    .reset_index()
)

# Calculate overs and economy
bowler_per_match["overs"] = bowler_per_match["balls_bowled"] / 6
bowler_per_match["economy"] = (
    bowler_per_match["runs_conceded"] / bowler_per_match["overs"]
)

bowler_per_match.head()


Unnamed: 0,match_id,date,venue,batting_team,bowling_team,bowler,wickets,balls_bowled,runs_conceded,overs,economy
0,335982,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,AA Noffke,1,25,41,4.166667,9.84
1,335982,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,CL White,0,7,24,1.166667,20.571429
2,335982,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,JH Kallis,1,25,52,4.166667,12.48
3,335982,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,P Kumar,0,25,41,4.166667,9.84
4,335982,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SB Joshi,0,18,26,3.0,8.666667


In [None]:
#  Sort bowler data by player and  match date
bowler_per_match = bowler_per_match.sort_values(
    by=["bowler", "date"]
)

bowler_per_match.head()


Unnamed: 0,match_id,date,venue,batting_team,bowling_team,bowler,wickets,balls_bowled,runs_conceded,overs,economy
3389,548341,2012-04-26,Subrata Roy Sahara Stadium,Pune Warriors,Deccan Chargers,A Ashish Reddy,2,24,32,4.0,8.0
3448,548346,2012-04-29,Wankhede Stadium,Mumbai Indians,Deccan Chargers,A Ashish Reddy,1,14,11,2.333333,4.714286
3474,548348,2012-05-01,Barabati Stadium,Pune Warriors,Deccan Chargers,A Ashish Reddy,1,19,32,3.166667,10.105263
3518,548352,2012-05-04,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,Deccan Chargers,A Ashish Reddy,1,13,16,2.166667,7.384615
3575,548356,2012-05-06,M Chinnaswamy Stadium,Royal Challengers Bangalore,Deccan Chargers,A Ashish Reddy,1,25,36,4.166667,8.64


In [None]:
# Rolling average of wickets  for last 5 matches
bowler_per_match["wickets_last_5"] = (
    bowler_per_match
    .groupby("bowler")["wickets"]
    .rolling(window=5, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)

bowler_per_match[
    ["bowler", "date", "wickets", "wickets_last_5"]
].head(10)


Unnamed: 0,bowler,date,wickets,wickets_last_5
3389,A Ashish Reddy,2012-04-26,2,2.0
3448,A Ashish Reddy,2012-04-29,1,1.5
3474,A Ashish Reddy,2012-05-01,1,1.333333
3518,A Ashish Reddy,2012-05-04,1,1.25
3575,A Ashish Reddy,2012-05-06,1,1.2
3610,A Ashish Reddy,2012-05-08,2,1.2
3272,A Ashish Reddy,2012-05-10,0,1.0
3771,A Ashish Reddy,2012-05-18,0,0.8
3805,A Ashish Reddy,2012-05-20,3,1.2
3893,A Ashish Reddy,2013-04-05,1,1.2


In [None]:
# Rolling average of economy  for last 5 matches
bowler_per_match["economy_last_5"] = (
    bowler_per_match
    .groupby("bowler")["economy"]
    .rolling(window=5, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)

bowler_per_match[
    ["bowler", "date", "economy", "economy_last_5"]
].head(10)


Unnamed: 0,bowler,date,economy,economy_last_5
3389,A Ashish Reddy,2012-04-26,8.0,8.0
3448,A Ashish Reddy,2012-04-29,4.714286,6.357143
3474,A Ashish Reddy,2012-05-01,10.105263,7.606516
3518,A Ashish Reddy,2012-05-04,7.384615,7.551041
3575,A Ashish Reddy,2012-05-06,8.64,7.768833
3610,A Ashish Reddy,2012-05-08,9.6,8.088833
3272,A Ashish Reddy,2012-05-10,14.5,10.045976
3771,A Ashish Reddy,2012-05-18,8.5,9.724923
3805,A Ashish Reddy,2012-05-20,6.0,9.448
3893,A Ashish Reddy,2013-04-05,10.5,9.82


In [None]:
# Rolling average of wickets for  last 10 matches
bowler_per_match["wickets_last_10"] = (
    bowler_per_match
    .groupby("bowler")["wickets"]
    .rolling(window=10, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)

bowler_per_match[
    ["bowler", "date", "wickets", "wickets_last_10"]
].head(10)


Unnamed: 0,bowler,date,wickets,wickets_last_10
3389,A Ashish Reddy,2012-04-26,2,2.0
3448,A Ashish Reddy,2012-04-29,1,1.5
3474,A Ashish Reddy,2012-05-01,1,1.333333
3518,A Ashish Reddy,2012-05-04,1,1.25
3575,A Ashish Reddy,2012-05-06,1,1.2
3610,A Ashish Reddy,2012-05-08,2,1.333333
3272,A Ashish Reddy,2012-05-10,0,1.142857
3771,A Ashish Reddy,2012-05-18,0,1.0
3805,A Ashish Reddy,2012-05-20,3,1.222222
3893,A Ashish Reddy,2013-04-05,1,1.2


In [None]:
# Rolling average of economy for last  10  matches
bowler_per_match["economy_last_10"] = (
    bowler_per_match
    .groupby("bowler")["economy"]
    .rolling(window=10, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)

bowler_per_match[
    ["bowler", "date", "economy", "economy_last_10"]
].head(10)


Unnamed: 0,bowler,date,economy,economy_last_10
3389,A Ashish Reddy,2012-04-26,8.0,8.0
3448,A Ashish Reddy,2012-04-29,4.714286,6.357143
3474,A Ashish Reddy,2012-05-01,10.105263,7.606516
3518,A Ashish Reddy,2012-05-04,7.384615,7.551041
3575,A Ashish Reddy,2012-05-06,8.64,7.768833
3610,A Ashish Reddy,2012-05-08,9.6,8.074027
3272,A Ashish Reddy,2012-05-10,14.5,8.992023
3771,A Ashish Reddy,2012-05-18,8.5,8.930521
3805,A Ashish Reddy,2012-05-20,6.0,8.604907
3893,A Ashish Reddy,2013-04-05,10.5,8.794416



 Batsman features

Runs, Balls, fours, sixes

strike rate

Runs_last_5, Runs_last_10

Strike_rate_last_5,  Strike_rate_last_10"""

Bowler features

wickets, overs,  economy

wickets_last_5, wickets_last_10

economy_last_5, economy_last_10...

In [18]:
# Venue-wise batsman performance
batsman_venue_avg = (
    batsman_per_match
    .groupby(["batter", "venue"])
    .agg(
        avg_runs_venue=("runs", "mean"),
        avg_sr_venue=("strike_rate", "mean")
    )
    .reset_index()
)

batsman_venue_avg.head()


Unnamed: 0,batter,venue,avg_runs_venue,avg_sr_venue
0,A Ashish Reddy,Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket St...,14.0,165.972222
1,A Ashish Reddy,Eden Gardens,4.0,80.0
2,A Ashish Reddy,Feroz Shah Kotla,16.0,177.777778
3,A Ashish Reddy,M Chinnaswamy Stadium,17.5,126.388889
4,A Ashish Reddy,"MA Chidambaram Stadium, Chepauk",19.5,162.5


In [19]:
# Merge venue averages back into batsman_per_match
batsman_per_match = batsman_per_match.merge(
    batsman_venue_avg,
    on=["batter", "venue"],
    how="left"
)

batsman_per_match[
    ["batter", "venue", "runs", "avg_runs_venue", "strike_rate", "avg_sr_venue"]
].head()


Unnamed: 0,batter,venue,runs,avg_runs_venue,strike_rate,avg_sr_venue
0,A Ashish Reddy,Wankhede Stadium,10,10.0,100.0,100.0
1,A Ashish Reddy,"MA Chidambaram Stadium, Chepauk",3,19.5,100.0,162.5
2,A Ashish Reddy,"Rajiv Gandhi International Stadium, Uppal",8,8.454545,100.0,119.776335
3,A Ashish Reddy,"Rajiv Gandhi International Stadium, Uppal",10,8.454545,250.0,119.776335
4,A Ashish Reddy,"Rajiv Gandhi International Stadium, Uppal",4,8.454545,80.0,119.776335


In [20]:
# Venue-wise bowler performance
bowler_venue_avg = (
    bowler_per_match
    .groupby(["bowler", "venue"])
    .agg(
        avg_wickets_venue=("wickets", "mean"),
        avg_economy_venue=("economy", "mean")
    )
    .reset_index()
)

bowler_venue_avg.head()


Unnamed: 0,bowler,venue,avg_wickets_venue,avg_economy_venue
0,A Ashish Reddy,Barabati Stadium,1.0,10.105263
1,A Ashish Reddy,Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket St...,0.5,6.192308
2,A Ashish Reddy,Eden Gardens,1.0,7.5
3,A Ashish Reddy,M Chinnaswamy Stadium,0.5,11.705455
4,A Ashish Reddy,"MA Chidambaram Stadium, Chepauk",0.5,12.692308


In [None]:
# Merge venue  averages  back   into    bowler_per_match
bowler_per_match = bowler_per_match.merge(
    bowler_venue_avg,
    on=["bowler", "venue"],
    how="left"
)

bowler_per_match[
    ["bowler", "venue", "wickets", "avg_wickets_venue", "economy", "avg_economy_venue"]
].head() 


Unnamed: 0,bowler,venue,wickets,avg_wickets_venue,economy,avg_economy_venue
0,A Ashish Reddy,Subrata Roy Sahara Stadium,2,2.0,8.0,8.0
1,A Ashish Reddy,Wankhede Stadium,1,1.0,4.714286,4.714286
2,A Ashish Reddy,Barabati Stadium,1,1.0,10.105263,10.105263
3,A Ashish Reddy,"MA Chidambaram Stadium, Chepauk",1,0.5,7.384615,12.692308
4,A Ashish Reddy,M Chinnaswamy Stadium,1,0.5,8.64,11.705455


#venue features batsman and  bowler are done
opponent specific stats (PvT, PvP)

In [22]:
# Batsman vs Bowling Team (Player vs Team)
batsman_vs_team = (
    batsman_per_match
    .groupby(["batter", "bowling_team"])
    .agg(
        avg_runs_vs_team=("runs", "mean"),
        avg_sr_vs_team=("strike_rate", "mean")
    )
    .reset_index()
)

batsman_vs_team.head()


Unnamed: 0,batter,bowling_team,avg_runs_vs_team,avg_sr_vs_team
0,A Ashish Reddy,Chennai Super Kings,15.0,141.666667
1,A Ashish Reddy,Delhi Daredevils,12.0,145.568783
2,A Ashish Reddy,Kings XI Punjab,12.333333,158.333333
3,A Ashish Reddy,Kolkata Knight Riders,8.5,112.222222
4,A Ashish Reddy,Mumbai Indians,13.5,106.666667


In [None]:
# Merge b atsman vs t eam stats  back  into  batsman_per_match
batsman_per_match = batsman_per_match.merge(
    batsman_vs_team,
    on=["batter", "bowling_team"],
    how="left"
)

batsman_per_match[
    ["batter", "bowling_team", "runs", "avg_runs_vs_team", "strike_rate", "avg_sr_vs_team"]
].head()


Unnamed: 0,batter,bowling_team,runs,avg_runs_vs_team,strike_rate,avg_sr_vs_team
0,A Ashish Reddy,Mumbai Indians,10,13.5,100.0,106.666667
1,A Ashish Reddy,Chennai Super Kings,3,15.0,100.0,141.666667
2,A Ashish Reddy,Kings XI Punjab,8,12.333333,100.0,158.333333
3,A Ashish Reddy,Rajasthan Royals,10,12.333333,250.0,173.905724
4,A Ashish Reddy,Royal Challengers Bangalore,4,11.0,80.0,103.222222


In [None]:
# Bowler vs Batting Team Player vs Team
bowler_vs_team = (
    bowler_per_match
    .groupby(["bowler", "batting_team"])
    .agg(
        avg_wickets_vs_team=("wickets", "mean"),
        avg_economy_vs_team=("economy", "mean")
    )
    .reset_index()
)

bowler_vs_team.head()


Unnamed: 0,bowler,batting_team,avg_wickets_vs_team,avg_economy_vs_team
0,A Ashish Reddy,Chennai Super Kings,1.0,11.628205
1,A Ashish Reddy,Delhi Daredevils,0.5,10.942308
2,A Ashish Reddy,Kings XI Punjab,2.0,9.6
3,A Ashish Reddy,Kolkata Knight Riders,1.0,7.25
4,A Ashish Reddy,Mumbai Indians,1.0,4.714286


In [25]:
# Merge bowler vs team stats back into bowler_per_match
bowler_per_match = bowler_per_match.merge(
    bowler_vs_team,
    on=["bowler", "batting_team"],
    how="left"
)

bowler_per_match[
    ["bowler", "batting_team", "wickets", "avg_wickets_vs_team",
     "economy", "avg_economy_vs_team"]
].head()


Unnamed: 0,bowler,batting_team,wickets,avg_wickets_vs_team,economy,avg_economy_vs_team
0,A Ashish Reddy,Pune Warriors,2,1.333333,8.0,9.535088
1,A Ashish Reddy,Mumbai Indians,1,1.0,4.714286,4.714286
2,A Ashish Reddy,Pune Warriors,1,1.333333,10.105263,9.535088
3,A Ashish Reddy,Chennai Super Kings,1,1.0,7.384615,11.628205
4,A Ashish Reddy,Royal Challengers Bangalore,1,1.0,8.64,9.970303


opponent-specific stats (PvT) done
Next  career-level features

In [None]:
# Career level  stats for  batsmen
batsman_career_stats = (
    batsman_per_match
    .groupby("batter")
    .agg(
        career_runs=("runs", "sum"),
        career_matches=("match_id", "nunique"),
        career_avg_runs=("runs", "mean"),
        career_avg_sr=("strike_rate", "mean")
    )
    .reset_index()
)

batsman_career_stats.head()


Unnamed: 0,batter,career_runs,career_matches,career_avg_runs,career_avg_sr
0,A Ashish Reddy,280,23,12.173913,135.390395
1,A Badoni,634,35,18.114286,112.427011
2,A Chandila,4,2,2.0,33.333333
3,A Chopra,53,6,8.833333,59.148148
4,A Choudhary,25,3,8.333333,163.888889


In [None]:
# Merge  career stats back into batsman_per_match
batsman_per_match = batsman_per_match.merge(
    batsman_career_stats,
    on="batter",
    how="left"
)

batsman_per_match[
    ["batter", "runs", "career_avg_runs", "strike_rate", "career_avg_sr"]
].head()


Unnamed: 0,batter,runs,career_avg_runs,strike_rate,career_avg_sr
0,A Ashish Reddy,10,12.173913,100.0,135.390395
1,A Ashish Reddy,3,12.173913,100.0,135.390395
2,A Ashish Reddy,8,12.173913,100.0,135.390395
3,A Ashish Reddy,10,12.173913,250.0,135.390395
4,A Ashish Reddy,4,12.173913,80.0,135.390395


In [None]:
# Career  level stats for  bowlers
bowler_career_stats = (
    bowler_per_match
    .groupby("bowler")
    .agg(
        career_wickets=("wickets", "sum"),
        career_matches=("match_id", "nunique"),
        career_avg_wickets=("wickets", "mean"),
        career_avg_economy=("economy", "mean")
    )
    .reset_index()
)

bowler_career_stats.head()


Unnamed: 0,bowler,career_wickets,career_matches,career_avg_wickets,career_avg_economy
0,A Ashish Reddy,19,20,0.95,9.37553
1,A Badoni,2,5,0.4,7.4
2,A Chandila,11,12,0.916667,6.548611
3,A Choudhary,5,5,1.0,7.884319
4,A Dananjaya,0,1,0.0,11.28


In [None]:
# Merge career  stats back into bowler_per_match
bowler_per_match = bowler_per_match.merge(
    bowler_career_stats,
    on="bowler",
    how="left"
)

bowler_per_match[
    ["bowler", "wickets", "career_avg_wickets",
     "economy", "career_avg_economy"]
].head()


Unnamed: 0,bowler,wickets,career_avg_wickets,economy,career_avg_economy
0,A Ashish Reddy,2,0.95,8.0,9.37553
1,A Ashish Reddy,1,0.95,4.714286,9.37553
2,A Ashish Reddy,1,0.95,10.105263,9.37553
3,A Ashish Reddy,1,0.95,7.384615,9.37553
4,A Ashish Reddy,1,0.95,8.64,9.37553


In [None]:
# Sort batsman  data 
batsman_per_match = batsman_per_match.sort_values(
    by=["batter", "date"]
)

# Create next match runs label
batsman_per_match["next_match_runs"] = (
    batsman_per_match
    .groupby("batter")["runs"]
    .shift(-1)
)

batsman_per_match[
    ["batter", "date", "runs", "next_match_runs"]
].head(10)


Unnamed: 0,batter,date,runs,next_match_runs
0,A Ashish Reddy,2012-04-29,10,3.0
1,A Ashish Reddy,2012-05-04,3,8.0
2,A Ashish Reddy,2012-05-08,8,10.0
3,A Ashish Reddy,2012-05-18,10,4.0
4,A Ashish Reddy,2012-05-20,4,7.0
5,A Ashish Reddy,2013-04-05,7,14.0
6,A Ashish Reddy,2013-04-07,14,3.0
7,A Ashish Reddy,2013-04-09,3,16.0
8,A Ashish Reddy,2013-04-12,16,4.0
9,A Ashish Reddy,2013-04-14,4,19.0


In [None]:
# Ensure corrrec t order 
bowler_per_match = bowler_per_match.sort_values(
    by=["bowler", "date"]
)

# Create next  match wickets label
bowler_per_match["next_match_wickets"] = (
    bowler_per_match
    .groupby("bowler")["wickets"]
    .shift(-1)
)

bowler_per_match[
    ["bowler", "date", "wickets", "next_match_wickets"]
].head(10)


Unnamed: 0,bowler,date,wickets,next_match_wickets
0,A Ashish Reddy,2012-04-26,2,1.0
1,A Ashish Reddy,2012-04-29,1,1.0
2,A Ashish Reddy,2012-05-01,1,1.0
3,A Ashish Reddy,2012-05-04,1,1.0
4,A Ashish Reddy,2012-05-06,1,2.0
5,A Ashish Reddy,2012-05-08,2,0.0
6,A Ashish Reddy,2012-05-10,0,0.0
7,A Ashish Reddy,2012-05-18,0,3.0
8,A Ashish Reddy,2012-05-20,3,1.0
9,A Ashish Reddy,2013-04-05,1,1.0


In [None]:
# Drop rows where next  match runs are missing
batsman_final = batsman_per_match.dropna(
    subset=["next_match_runs"]
)

print("Batsman dataset shape:", batsman_final.shape)


Batsman dataset shape: (15842, 24)


In [None]:
# Drop rows where next  match wickets are missing
bowler_final = bowler_per_match.dropna(
    subset=["next_match_wickets"]
)

print("Bowler dataset shape:", bowler_final.shape)


Bowler dataset shape: (12448, 24)


In [34]:
# Select final batsman features
batsman_features = [
    "match_id", "date", "venue", "batting_team", "bowling_team",
    "batter",
    "balls_faced", "fours", "sixes",
    "strike_rate",
    "runs_last_5", "runs_last_10",
    "strike_rate_last_5", "strike_rate_last_10",
    "avg_runs_venue", "avg_sr_venue",
    "avg_runs_vs_team", "avg_sr_vs_team",
    "career_avg_runs", "career_avg_sr",
    "next_match_runs"
]

batsman_dataset = batsman_final[batsman_features]

batsman_dataset.head()


Unnamed: 0,match_id,date,venue,batting_team,bowling_team,batter,balls_faced,fours,sixes,strike_rate,...,runs_last_10,strike_rate_last_5,strike_rate_last_10,avg_runs_venue,avg_sr_venue,avg_runs_vs_team,avg_sr_vs_team,career_avg_runs,career_avg_sr,next_match_runs
0,548346,2012-04-29,Wankhede Stadium,Deccan Chargers,Mumbai Indians,A Ashish Reddy,10,0,1,100.0,...,10.0,100.0,100.0,10.0,100.0,13.5,106.666667,12.173913,135.390395,3.0
1,548352,2012-05-04,"MA Chidambaram Stadium, Chepauk",Deccan Chargers,Chennai Super Kings,A Ashish Reddy,3,0,0,100.0,...,6.5,100.0,100.0,19.5,162.5,15.0,141.666667,12.173913,135.390395,8.0
2,548359,2012-05-08,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Kings XI Punjab,A Ashish Reddy,8,1,0,100.0,...,7.0,100.0,100.0,8.454545,119.776335,12.333333,158.333333,12.173913,135.390395,10.0
3,548373,2012-05-18,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Rajasthan Royals,A Ashish Reddy,4,2,0,250.0,...,7.75,137.5,137.5,8.454545,119.776335,12.333333,173.905724,12.173913,135.390395,4.0
4,548376,2012-05-20,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Royal Challengers Bangalore,A Ashish Reddy,5,0,0,80.0,...,7.0,126.0,126.0,8.454545,119.776335,11.0,103.222222,12.173913,135.390395,7.0


In [None]:
batsman_dataset.to_csv("batsman_model_dataset.csv", index=False)
print("Saved: batsman_model_dataset.csv") 


Saved: batsman_model_dataset.csv


In [None]:
# Select final bowler features
bowler_features = [
    "match_id", "date", "venue", "batting_team", "bowling_team",
    "bowler",
    "overs", "economy",
    "wickets_last_5", "wickets_last_10",
    "economy_last_5", "economy_last_10",
    "avg_wickets_venue", "avg_economy_venue",
    "avg_wickets_vs_team", "avg_economy_vs_team",
    "career_avg_wickets", "career_avg_economy",
    "next_match_wickets"
]

bowler_dataset = bowler_final[bowler_features]

bowler_dataset.head() 


Unnamed: 0,match_id,date,venue,batting_team,bowling_team,bowler,overs,economy,wickets_last_5,wickets_last_10,economy_last_5,economy_last_10,avg_wickets_venue,avg_economy_venue,avg_wickets_vs_team,avg_economy_vs_team,career_avg_wickets,career_avg_economy,next_match_wickets
0,548341,2012-04-26,Subrata Roy Sahara Stadium,Pune Warriors,Deccan Chargers,A Ashish Reddy,4.0,8.0,2.0,2.0,8.0,8.0,2.0,8.0,1.333333,9.535088,0.95,9.37553,1.0
1,548346,2012-04-29,Wankhede Stadium,Mumbai Indians,Deccan Chargers,A Ashish Reddy,2.333333,4.714286,1.5,1.5,6.357143,6.357143,1.0,4.714286,1.0,4.714286,0.95,9.37553,1.0
2,548348,2012-05-01,Barabati Stadium,Pune Warriors,Deccan Chargers,A Ashish Reddy,3.166667,10.105263,1.333333,1.333333,7.606516,7.606516,1.0,10.105263,1.333333,9.535088,0.95,9.37553,1.0
3,548352,2012-05-04,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,Deccan Chargers,A Ashish Reddy,2.166667,7.384615,1.25,1.25,7.551041,7.551041,0.5,12.692308,1.0,11.628205,0.95,9.37553,1.0
4,548356,2012-05-06,M Chinnaswamy Stadium,Royal Challengers Bangalore,Deccan Chargers,A Ashish Reddy,4.166667,8.64,1.2,1.2,7.768833,7.768833,0.5,11.705455,1.0,9.970303,0.95,9.37553,2.0


In [None]:
bowler_dataset.to_csv("bowler_model_dataset.csv", index=False)
print("Saved: bowler_model_dataset.csv") 


Saved: bowler_model_dataset.csv


In [None]:
import pandas as pd

# Load final feature-engineered datasets
batsman_export = pd.read_csv("batsman_model_dataset.csv") 
bowler_export = pd.read_csv("bowler_model_dataset.csv") 

# Add role column
batsman_export["role"] = "batsman"
bowler_export["role"] = "bowler"

# Combine datasets 
final_dataset = pd.concat(
    [batsman_export, bowler_export],
    ignore_index=True,
    sort=False
)

# Save combined dataset
final_dataset.to_csv("dataset.csv", index=False)

print("Saved combined dataset as dataset.csv")
print("Final dataset shape:", final_dataset.shape) 


Saved combined dataset as dataset.csv
Final dataset shape: (28290, 36)
