In [1]:
# 1. Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error
import pickle

# 2. Load and preprocess dataset
df = pd.read_csv("t20i_info.csv")



In [2]:
# Ensure numeric runs
df['runs'] = pd.to_numeric(df['runs'], errors='coerce')



In [3]:
# Feature engineering
df['current_score'] = df.groupby('match_id')['runs'].cumsum()
df['over'] = df['ball'].apply(lambda x: str(x).split(".")[0])
df['ball_no'] = df['ball'].apply(lambda x: str(x).split(".")[1])
df['ball_bowled'] = (df['over'].astype(int)*6 + df['ball_no'].astype(int))
df['balls_left'] = 120 - df['ball_bowled']
df['balls_left'] = df['balls_left'].apply(lambda x: 0 if x < 0 else x)

df['player_dismissed'] = df['player_dismissed'].apply(lambda x: 1 if x != '0' else 0)
df['player_dismissed'] = df['player_dismissed'].astype(int)
df['player_dismissed'] = df.groupby('match_id')['player_dismissed'].cumsum()
df['wicket_left'] = 10 - df['player_dismissed']

df['current_run_rate'] = (df['current_score']*6) / df['ball_bowled']



In [4]:
df

Unnamed: 0.1,Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue,current_score,over,ball_no,ball_bowled,balls_left,wicket_left,current_run_rate
0,0,2,Australia,Sri Lanka,0.1,0,0,,Melbourne Cricket Ground,0,0,1,1,119,10,0.000000
1,1,2,Australia,Sri Lanka,0.2,0,0,,Melbourne Cricket Ground,0,0,2,2,118,10,0.000000
2,2,2,Australia,Sri Lanka,0.3,1,0,,Melbourne Cricket Ground,1,0,3,3,117,10,2.000000
3,3,2,Australia,Sri Lanka,0.4,2,0,,Melbourne Cricket Ground,3,0,4,4,116,10,4.500000
4,4,2,Australia,Sri Lanka,0.5,0,0,,Melbourne Cricket Ground,3,0,5,5,115,10,3.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63883,121,964,Sri Lanka,Australia,19.3,1,8,Colombo,R Premadasa Stadium,125,19,3,117,3,2,6.410256
63884,122,964,Sri Lanka,Australia,19.4,0,8,Colombo,R Premadasa Stadium,125,19,4,118,2,2,6.355932
63885,123,964,Sri Lanka,Australia,19.5,0,9,Colombo,R Premadasa Stadium,125,19,5,119,1,1,6.302521
63886,124,964,Sri Lanka,Australia,19.6,2,9,Colombo,R Premadasa Stadium,127,19,6,120,0,1,6.350000


In [5]:
# Rolling last five overs (30 balls)
groups = df.groupby('match_id')
match_id = df['match_id'].unique()
last_five = []
for id in match_id:
    group_data = groups.get_group(id).copy()
    group_data['runs'] = pd.to_numeric(group_data['runs'], errors='coerce')
    rolling_sum = group_data['runs'].rolling(window=30).sum()
    last_five.extend(rolling_sum.values.tolist())
df['last_five'] = last_five



In [6]:
df

Unnamed: 0.1,Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue,current_score,over,ball_no,ball_bowled,balls_left,wicket_left,current_run_rate,last_five
0,0,2,Australia,Sri Lanka,0.1,0,0,,Melbourne Cricket Ground,0,0,1,1,119,10,0.000000,
1,1,2,Australia,Sri Lanka,0.2,0,0,,Melbourne Cricket Ground,0,0,2,2,118,10,0.000000,
2,2,2,Australia,Sri Lanka,0.3,1,0,,Melbourne Cricket Ground,1,0,3,3,117,10,2.000000,
3,3,2,Australia,Sri Lanka,0.4,2,0,,Melbourne Cricket Ground,3,0,4,4,116,10,4.500000,
4,4,2,Australia,Sri Lanka,0.5,0,0,,Melbourne Cricket Ground,3,0,5,5,115,10,3.600000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63883,121,964,Sri Lanka,Australia,19.3,1,8,Colombo,R Premadasa Stadium,125,19,3,117,3,2,6.410256,32.0
63884,122,964,Sri Lanka,Australia,19.4,0,8,Colombo,R Premadasa Stadium,125,19,4,118,2,2,6.355932,32.0
63885,123,964,Sri Lanka,Australia,19.5,0,9,Colombo,R Premadasa Stadium,125,19,5,119,1,1,6.302521,32.0
63886,124,964,Sri Lanka,Australia,19.6,2,9,Colombo,R Premadasa Stadium,127,19,6,120,0,1,6.350000,33.0


In [7]:
# Final dataset
final_df = df.groupby('match_id').sum()['runs'].reset_index().merge(df, on='match_id')
final_df['current_run_rate'] = final_df['current_score'] / (120 - final_df['balls_left']) * 6
final_df = final_df[['batting_team','bowling_team','city',
                     'current_score','balls_left','wicket_left',
                     'current_run_rate','last_five','runs_x']]
final_df.dropna(inplace=True)



In [8]:
final_df

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wicket_left,current_run_rate,last_five,runs_x
154,Australia,Sri Lanka,Victoria,35,92,9,7.500000,35.0,173
155,Australia,Sri Lanka,Victoria,36,91,9,7.448276,36.0,173
156,Australia,Sri Lanka,Victoria,38,90,9,7.600000,37.0,173
157,Australia,Sri Lanka,Victoria,42,89,9,8.129032,41.0,173
158,Australia,Sri Lanka,Victoria,42,88,9,7.875000,37.0,173
...,...,...,...,...,...,...,...,...,...
63883,Sri Lanka,Australia,Colombo,125,3,2,6.410256,32.0,128
63884,Sri Lanka,Australia,Colombo,125,2,2,6.355932,32.0,128
63885,Sri Lanka,Australia,Colombo,125,1,1,6.302521,32.0,128
63886,Sri Lanka,Australia,Colombo,127,0,1,6.350000,33.0,128


In [9]:
# 3. Split data
X = final_df.drop(columns=['runs_x'])
y = final_df['runs_x']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




In [10]:
# 4. Define pipeline using column names
categorical_features = ['batting_team', 'bowling_team', 'city']
numeric_features = ['current_score', 'balls_left', 'wicket_left', 'current_run_rate', 'last_five']

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'), categorical_features),
    ('num', StandardScaler(), numeric_features)
])

pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(n_estimators=1000, learning_rate=0.2, max_depth=12, random_state=1))
])



In [11]:
# 5. Train pipeline
pipe.fit(X_train, y_train)



0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [12]:
# 6. Evaluate
y_pred = pipe.predict(X_test)
print("R2 Score:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))



R2 Score: 0.988996684551239
MAE: 1.544480562210083


In [13]:
# 7. Save pipeline
pickle.dump(pipe, open('pipe.pkl', 'wb'))