In [2]:
import pandas as pd

In [3]:
def compute_current_innings_score(df):
    df = df.copy()

    df = df.sort_values(["match_id", "innings", "ball"])

    df["total_runs"] = df["runs_off_bat"].fillna(0) + df["extras"].fillna(0)
    df["is_wicket"] = df["player_dismissed"].notna().astype(int)

    g = df.groupby(["match_id", "innings"], sort=False)

    df["current_innings_runs"] = g["total_runs"].cumsum() - df["total_runs"]
    df["current_innings_wickets"] = g["is_wicket"].cumsum() - df["is_wicket"]
    df["over"] = df["ball"].astype(int) + 1

    return df

def compute_final_innings_runs(df):
    df = df.copy()

    df = df.sort_values(["match_id", "innings", "ball"])

    df["total_runs"] = df["runs_off_bat"].fillna(0) + df["extras"].fillna(0)

    g = df.groupby(["match_id", "innings"], sort=False)

    df["final_innings_runs"] = g["total_runs"].transform("sum")

    return df
  


In [50]:
df = pd.read_csv("../data/all_matches.csv")

  df = pd.read_csv("../data/all_matches.csv")


In [51]:
df = compute_current_innings_score(df)
df = compute_final_innings_runs(df)

In [52]:
table = df[["current_innings_runs", "current_innings_wickets", "over", "final_innings_runs"]]

In [53]:
# Drop missing values just in case
table = table.dropna()
X = table[[
    "current_innings_runs",
    "current_innings_wickets",
    "over",
]]

y = table["final_innings_runs"]

In [58]:
table[(table["current_innings_wickets"] <= 1) & (table["over"] == 7)]["final_innings_runs"].mean()

np.float64(171.27215189873417)

In [61]:
table[(table["current_innings_wickets"] <= 0) & (table["over"] == 7)]["final_innings_runs"].mean()

np.float64(176.94529123571039)

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)

print("Test MAE:", mae)

Test MAE: 26.188358389805877


In [33]:
coefficients = pd.Series(
    model.coef_,
    index=X.columns,
    name="coefficient"
)

intercept = model.intercept_

print("Intercept:", intercept)
print(coefficients)

Intercept: 165.07250807703747
current_innings_runs       1.018721
current_innings_wickets   -7.101213
over                      -6.385890
Name: coefficient, dtype: float64


In [29]:
model.predict([[100, 9, 10]])



array([162.92551378])

In [11]:
import joblib
joblib.dump(model, "../data/expected_runs_model.joblib")

['../data/expected_runs_model.joblib']

In [12]:
(6*df[["total_runs", "is_wicket"]]).describe()

Unnamed: 0,total_runs,is_wicket
count,468268.0,468268.0
mean,8.20885,0.32689
std,9.847296,1.361795
min,0.0,0.0
25%,0.0,0.0
50%,6.0,0.0
75%,6.0,0.0
max,48.0,6.0


In [13]:
df.groupby(["match_id", "over", "innings"])[["total_runs", "is_wicket"]] \
  .sum() \
  .describe(percentiles=[0.05, 0.10, 0.25, 0.5, 0.75, 0.90, 0.95])


Unnamed: 0,total_runs,is_wicket
count,75771.0,75771.0
mean,8.455174,0.336699
std,4.70408,0.560644
min,0.0,0.0
5%,2.0,0.0
10%,3.0,0.0
25%,5.0,0.0
50%,8.0,0.0
75%,11.0,1.0
90%,15.0,1.0
