In [2]:
import pandas as pd

# Load the batting feature table and convert its 'date' column to datetime.
batting_df = pd.read_csv("batting_feature_engineered.csv").assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# Same treatment for the bowling feature table.
bowling_df = pd.read_csv("bowling_feature_engineered.csv").assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)


In [3]:

# Preview the first five rows of the batting table.
batting_df.head(n=5)


Unnamed: 0,matchId,batsman,runs_in_match,season,venue,team1,team2,date,batting_team,opponent_team,avg_last_3,avg_last_5,avg_last_10,avg_at_venue,avg_vs_opponent,career_avg
0,548346,A Ashish Reddy,10.0,2012,Wankhede Stadium,Mumbai Indians,Deccan Chargers,2012-04-29,Deccan Chargers,Mumbai Indians,10.0,10.0,10.0,10.0,13.5,12.173913
1,548352,A Ashish Reddy,3.0,2012,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,Deccan Chargers,2012-05-04,Deccan Chargers,Chennai Super Kings,6.5,6.5,6.5,19.5,15.0,12.173913
2,548359,A Ashish Reddy,8.0,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Kings XI Punjab,2012-05-08,Deccan Chargers,Kings XI Punjab,7.0,7.0,7.0,8.454545,12.333333,12.173913
3,548373,A Ashish Reddy,10.0,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Rajasthan Royals,2012-05-18,Deccan Chargers,Rajasthan Royals,7.0,7.75,7.75,8.454545,12.333333,12.173913
4,548376,A Ashish Reddy,4.0,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Royal Challengers Bangalore,2012-05-20,Deccan Chargers,Royal Challengers Bangalore,7.333333,7.0,7.0,8.454545,11.0,12.173913


In [4]:
# Order rows by player, then by match date, so the per-player shifts used in
# the following cells step through matches chronologically.
batting_df = batting_df.sort_values(by=['batsman', 'date'])
bowling_df = bowling_df.sort_values(by=['bowler', 'date'])


In [5]:
# Rolling mean of each batsman's runs over their previous 5 / 10 matches.
#
# The shift AND the rolling window both run inside a single grouped
# transform. Calling `.rolling()` on the result of `groupby(...).shift(1)`
# would operate on a plain, ungrouped Series, letting a window straddle the
# boundary between two adjacent batsmen and mix their scores.
#
# `shift(1)` drops the current match from its own window. A batsman's first
# 5 (resp. 10) rows have no complete window and are left as NaN.
batting_df['avg_last_5'] = (
    batting_df.groupby('batsman')['runs_in_match']
    .transform(lambda runs: runs.shift(1).rolling(5).mean())
)

batting_df['avg_last_10'] = (
    batting_df.groupby('batsman')['runs_in_match']
    .transform(lambda runs: runs.shift(1).rolling(10).mean())
)


In [6]:
# Rolling mean of each bowler's wickets over their previous 5 matches.
#
# Shift and rolling are applied together inside the grouped transform so a
# window never spans two different bowlers; rolling over the ungrouped output
# of `groupby(...).shift(1)` would mix adjacent bowlers' rows.
#
# `shift(1)` excludes the current match. A bowler's first 5 rows have no
# complete window and are left as NaN.
bowling_df['avg_last_5'] = (
    bowling_df.groupby('bowler')['wickets_in_match']
    .transform(lambda wickets: wickets.shift(1).rolling(5).mean())
)


In [7]:
# Per-batsman average at the venue / against the opponent, computed from
# PRIOR matches only.
#
# A plain `transform('mean')` averages the whole group, which includes the
# current row's `runs_in_match` and every later match, so the value being
# predicted would leak into its own feature. Shifting by one inside the group
# before the expanding mean restricts each row to matches that precede it
# (rows are expected to already be sorted by batsman and date).
#
# The first match of each (batsman, venue) or (batsman, opponent) pair has no
# history and is left as NaN.
batting_df['venue_avg'] = (
    batting_df.groupby(['batsman', 'venue'])['runs_in_match']
    .transform(lambda runs: runs.shift(1).expanding().mean())
)

batting_df['opponent_avg'] = (
    batting_df.groupby(['batsman', 'opponent_team'])['runs_in_match']
    .transform(lambda runs: runs.shift(1).expanding().mean())
)


In [8]:
# Running career average of each batsman, excluding the current match.
#
# The expanding mean and the one-row shift are both evaluated inside the
# grouped transform. Shifting the concatenated output of
# `groupby(...).expanding().mean()` instead would move values across group
# boundaries, so every batsman's first row would inherit the previous
# batsman's final career average.
#
# Each batsman's first match has no history and is left as NaN.
batting_df['career_avg'] = (
    batting_df.groupby('batsman')['runs_in_match']
    .transform(lambda runs: runs.expanding().mean().shift(1))
)


In [9]:
# Name of the column used as the target (presumably read by later modelling
# cells that are not visible here).
target = "runs_in_match"


In [10]:
# Replace every missing value in the frame with 0. This applies to all
# columns, including lag features whose NaN means "no history yet".
batting_df = batting_df.fillna(value=0)


In [11]:
# Write the engineered batting table to disk without the row index.
batting_df.to_csv(path_or_buf="dataset.csv", index=False)


In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import joblib


In [13]:
# Columns that are one-hot encoded by the preprocessor.
categorical_cols = ['batting_team', 'opponent_team', 'venue', 'season']

# Columns that must never be passed through as numeric features:
#   - identifiers and the date,
#   - the prediction targets ('target_runs' is created in a later cell; it is
#     listed here so re-running this cell afterwards cannot leak it),
#   - 'Unnamed: 0', the row-index artifact carried in from the CSV.
non_feature_cols = categorical_cols + [
    'batsman', 'date', 'matchId',
    'runs_in_match', 'target_runs',
    'Unnamed: 0',
]

# Restrict to numeric dtypes so string columns such as 'team1' / 'team2'
# are not handed to the 'passthrough' numeric branch.
numeric_cols = [
    c for c in batting_df.select_dtypes(include='number').columns
    if c not in non_feature_cols
]


In [14]:
# Column-wise preprocessing: one-hot encode the categorical columns
# (categories unseen at fit time are ignored rather than raising) and pass
# the numeric columns through unchanged.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', 'passthrough', numeric_cols),
])


In [15]:
# Persist the preprocessor object to disk.
# NOTE(review): no fit call on `preprocessor` appears in the cells before this
# one, so the pickled object looks unfitted — confirm that is intended.
joblib.dump(preprocessor, filename="feature_pipeline.pkl")


['feature_pipeline.pkl']

In [16]:
# Re-sort so each batsman's matches are in chronological order before the
# lag features are rebuilt.
batting_df = batting_df.sort_values(['batsman', 'date'])

# Rolling mean of each batsman's runs over their previous 5 / 10 matches.
#
# Shift and rolling both run inside the grouped transform; rolling over the
# ungrouped output of `groupby(...).shift(1)` would let a window span two
# adjacent batsmen and mix their scores.
#
# NOTE: rows without a complete window are NaN again after this cell. The
# earlier `fillna(0)` ran before this recomputation and does not cover them.
batting_df['avg_last_5'] = (
    batting_df.groupby('batsman')['runs_in_match']
    .transform(lambda runs: runs.shift(1).rolling(5).mean())
)

batting_df['avg_last_10'] = (
    batting_df.groupby('batsman')['runs_in_match']
    .transform(lambda runs: runs.shift(1).rolling(10).mean())
)


In [17]:
# For every row, take the same batsman's runs from the following row
# (shift by -1 within the batsman group). A batsman's last row has no
# successor and gets NaN.
batting_df['target_runs'] = batting_df.groupby('batsman')['runs_in_match'].shift(periods=-1)
