In [None]:
!pip install scikit-learn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
class DatasetProcessor:
    """Derive per-delivery features from a ball-by-ball CSV.

    The input must provide the columns ``match_id``, ``season``, ``venue``,
    ``innings``, ``ball``, ``striker``, ``bowler``, ``runs_off_bat``,
    ``extras`` and ``wicket_type``.

    Every running total produced here describes the state *before* the
    delivery on that row (the row's own runs/wicket are excluded), so the
    features never leak the outcome they are used to predict.
    """

    # Outcome label assigned to a delivery on which a wicket fell.
    WICKET_OUTCOME = 7
    # Outcome label removed from the processed frame by process().
    EXCLUDED_OUTCOME = 5

    def __init__(self, name):
        """Read the CSV at ``name``; missing values are replaced with 0."""
        self.df = pd.read_csv(name).fillna(0)

    def _prior_cumsum(self, keys, column):
        """Return the per-group running sum of ``column`` excluding the current row.

        Equivalent to ``x.shift().fillna(0).cumsum()`` within each group, but
        computed without a Python-level callback per group. The result is
        cast to float to match the dtype produced by the shift-based form.
        """
        running = self.df.groupby(keys)[column].cumsum()
        return (running - self.df[column]).astype(float)

    def calculate_innings_stats(self):
        """Add innings-level state, the delivery index, the over and the outcome.

        Overwrites ``ball`` in place (over.ball notation -> delivery index), so
        it must be called only once per loaded frame.
        """
        innings_keys = ["match_id", "innings"]

        self.df["innings_total"] = (
            self._prior_cumsum(innings_keys, "runs_off_bat")
            + self._prior_cumsum(innings_keys, "extras")
        )
        # wicket_type is 0 (from fillna) when no wicket fell on the delivery.
        self.df["wicket"] = (self.df["wicket_type"] != 0).astype(int)
        self.df["innings_wickets"] = self._prior_cumsum(innings_keys, "wicket")

        # ``ball`` arrives as <over>.<ball in over>, e.g. 4.6. The fractional part
        # is not exactly representable in binary floating point (4.6 - 4 is
        # 0.5999999999999996), so round it to a whole ball before combining;
        # otherwise the index is a hair off an integer and ``// 6`` / groupby
        # on ``ball`` behave inconsistently from row to row.
        raw_ball = self.df["ball"]
        whole_overs = raw_ball.astype(int)
        ball_in_over = ((raw_ball - whole_overs) * 10).round()
        self.df["ball"] = (whole_overs * 6 + ball_in_over).astype(float)

        # NOTE(review): with this definition the sixth ball of over k lands in
        # over k + 1 (index 6 // 6 == 1). Kept as-is to preserve the feature the
        # model is trained on; confirm whether that is intended.
        self.df["over"] = self.df["ball"] // 6

        # Outcome is the runs off the bat, or WICKET_OUTCOME when a wicket fell.
        self.df["outcome"] = np.where(
            self.df["wicket"] == 1, self.WICKET_OUTCOME, self.df["runs_off_bat"]
        )

    def calculate_striker_stats(self):
        """Add the striker's runs and number of earlier rows in this match."""
        striker_keys = ["match_id", "striker"]
        self.df["striker_runs"] = self._prior_cumsum(striker_keys, "runs_off_bat")
        self.df["striker_balls"] = self.df.groupby(striker_keys).cumcount()

    def calculate_bowler_stats(self):
        """Add the bowler's conceded runs, earlier rows and wickets in this match.

        Requires the ``wicket`` column created by calculate_innings_stats().
        """
        bowler_keys = ["match_id", "bowler"]
        self.df["bowler_runs"] = self._prior_cumsum(bowler_keys, "runs_off_bat")
        self.df["bowler_balls"] = self.df.groupby(bowler_keys).cumcount()
        self.df["bowler_wickets"] = self._prior_cumsum(bowler_keys, "wicket")

    def select_season(self, season):
        """Keep only the rows whose ``season`` equals ``season``."""
        # Copy so the derived columns are assigned to an independent frame
        # instead of a slice of the frame that was read from disk.
        self.df = self.df[self.df["season"] == season].copy()

    def process(self, season=2023):
        """Run the full feature pipeline and return the model-ready columns.

        Parameters
        ----------
        season : value compared against the ``season`` column, default 2023.

        Returns
        -------
        pandas.DataFrame
            An independent copy holding the identifying columns, the derived
            features, ``runs_off_bat`` and ``outcome``. Rows whose outcome is
            EXCLUDED_OUTCOME are dropped.
        """
        self.select_season(season)
        self.calculate_innings_stats()
        self.calculate_striker_stats()
        self.calculate_bowler_stats()
        self.df = self.df[self.df["outcome"] != self.EXCLUDED_OUTCOME]

        columns = ["season",
                   "venue",
                   "innings",
                   "innings_total",
                   "innings_wickets",
                   "ball",
                   "over",
                   "striker",
                   "striker_runs",
                   "striker_balls",
                   "bowler",
                   "bowler_runs",
                   "bowler_balls",
                   "bowler_wickets",
                   "runs_off_bat",
                   "outcome"]
        # Return a copy so callers can add columns without mutating self.df.
        return self.df[columns].copy()

In [None]:
class ExpectedModel:
    """Fit a classifier that predicts a delivery's outcome from match-state features."""

    # Columns of the dataset used as model inputs.
    FEATURES = ["innings_total",
                "innings_wickets",
                "over",
                "striker_runs",
                "striker_balls",
                "bowler_runs",
                "bowler_balls",
                "bowler_wickets"]

    # Column of the dataset used as the class label.
    TARGET = "outcome"

    def __init__(self, dataset):
        """Store ``dataset``, a frame containing FEATURES and TARGET columns."""
        self.dataset = dataset

    def train(self, test_size=0.3, random_state=None):
        """Fit a StandardScaler + MLPClassifier pipeline on a random train split.

        Parameters
        ----------
        test_size : fraction of rows held out from fitting (default 0.3).
        random_state : seed forwarded to both the split and the MLP. The default
            ``None`` leaves both unseeded, so repeated calls give different
            models; pass an int for a reproducible run.

        Returns
        -------
        The fitted scikit-learn pipeline.

        Side effects
        ------------
        The held-out rows are kept on the instance as ``X_test`` / ``y_test``
        so the returned pipeline can be scored on data it was not fitted on.
        """
        pipeline = make_pipeline(StandardScaler(),
                                 MLPClassifier(random_state=random_state))

        features = self.dataset[self.FEATURES]
        labels = self.dataset[self.TARGET]
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=test_size, random_state=random_state
        )
        pipeline.fit(X_train, y_train)

        self.X_test = X_test
        self.y_test = y_test

        return pipeline

In [None]:
# Read the ball-by-ball CSV (missing values are filled with 0 on load).
p = DatasetProcessor("IPL_ball_by_ball_updated.csv")
# Build the per-delivery feature frame; process() keeps the 2023 season by default.
df = p.process()
# Wrap the processed frame; nothing is fitted until trainer.train() is called.
trainer = ExpectedModel(df)

In [None]:
# Show the processed feature frame as this cell's output.
df

In [None]:
# Fit the scaler + MLP classifier pipeline on a random 70% of the rows.
model = trainer.train()

In [None]:
# Feature columns, matching the ones ExpectedModel.train() fits on.
X = ["innings_total",
     "innings_wickets",
     "over",
     "striker_runs",
     "striker_balls",
     "bowler_runs",
     "bowler_balls",
     "bowler_wickets"]

# predict_proba returns one column per entry of model.classes_. Derive the run
# value of each column from the fitted labels rather than hard-coding
# [0, 1, 2, 3, 4, 6, 0]: if the random training split happens to miss an
# outcome, a fixed vector would be misaligned with (or fail to broadcast
# against) the probability matrix.
outcome_labels = np.asarray(model.classes_)
run_values = np.where(outcome_labels == 7, 0, outcome_labels)  # 7 encodes a wicket: no runs

# Expected runs per delivery: probability-weighted sum over the outcomes.
xR = (model.predict_proba(df[X]) * run_values).sum(axis=1)
df["preds"] = xR

In [None]:
# Mean actual vs. model-expected runs at each delivery index of the innings.
ball_group = df.groupby("ball", as_index=False).agg({ "runs_off_bat": "mean", "preds": "mean" })

# Draw both series on one explicit Axes and label them so they can be told apart.
fig, ax = plt.subplots()
sns.scatterplot(data=ball_group, x="ball", y="runs_off_bat", label="actual", ax=ax)
sns.scatterplot(data=ball_group, x="ball", y="preds", label="expected (model)", ax=ax)
ax.set(title="Runs per delivery by delivery index",
       xlabel="Delivery index in innings",
       ylabel="Mean runs off bat")
ax.legend();

In [None]:
# Mean actual vs. model-expected runs as a function of the striker's score so far.
runs_group = df.groupby("striker_runs", as_index=False).agg({ "runs_off_bat": "mean", "preds": "mean" })

# Draw both series on one explicit Axes and label them so they can be told apart.
fig, ax = plt.subplots()
sns.scatterplot(data=runs_group, x="striker_runs", y="runs_off_bat", label="actual", ax=ax)
sns.scatterplot(data=runs_group, x="striker_runs", y="preds", label="expected (model)", ax=ax)
ax.set(title="Runs per delivery by striker score",
       xlabel="Striker runs before the delivery",
       ylabel="Mean runs off bat")
ax.legend();

In [None]:
# Mean actual vs. model-expected runs as a function of the innings total so far.
total_group = df.groupby("innings_total", as_index=False).agg({ "runs_off_bat": "mean", "preds": "mean" })

# Draw both series on one explicit Axes and label them so they can be told apart.
fig, ax = plt.subplots()
sns.scatterplot(data=total_group, x="innings_total", y="runs_off_bat", label="actual", ax=ax)
sns.scatterplot(data=total_group, x="innings_total", y="preds", label="expected (model)", ax=ax)
ax.set(title="Runs per delivery by innings total",
       xlabel="Innings total before the delivery",
       ylabel="Mean runs off bat")
ax.legend();

In [None]:
import pickle

# Persist the fitted pipeline. The context manager closes the handle when the
# block exits, so the pickle is flushed to disk and the descriptor is released
# even if dump() raises.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)