<a href="https://colab.research.google.com/github/anubhab16/Hinglish-AI-assistant/blob/main/ipl_prediction_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

# Load the match table: rows without a recorded winner are discarded and the
# key column is renamed so it matches the `match_id` column used by deliveries.
matches = (
    pd.read_csv("/content/matches.csv")
    .dropna(subset=["winner"])
    .rename(columns={"id": "match_id"})
)
deliveries = pd.read_csv("/content/deliveries.csv")

# One row per (match, batting side) holding the runs that side scored.
total_scores = (
    deliveries.groupby(["match_id", "batting_team"])["total_runs"]
    .sum()
    .rename("final_score")
    .reset_index()
)

# Attach each side's total to the match row: team1 first, then team2.
df = matches
for team_col, score_col in (("team1", "team1_score"), ("team2", "team2_score")):
    df = (
        df.merge(
            total_scores,
            left_on=["match_id", team_col],
            right_on=["match_id", "batting_team"],
            how="left",
        )
        .rename(columns={"final_score": score_col})
        .drop("batting_team", axis=1)
    )

# Keep only matches for which both totals were found.
df = df.dropna(subset=["team1_score", "team2_score"])


In [None]:
# Recent-form feature: for every (team, match), the mean of that team's totals
# in its previous RECENT_WINDOW matches.
#
# Fixes relative to the earlier version of this cell:
#   * The rolling values are computed directly on the per-match table, so each
#     value stays attached to its own match_id. The previous cross-join on
#     batting_team followed by drop_duplicates tagged every rolling value with
#     the team's first match_id.
#   * Rows are ordered by match date before rolling (sorting `matches` alone
#     never affected the rolling computation).
#   * The current match is excluded via shift(1); otherwise the feature
#     contains the very score it is later used to predict.
#   * dropna is restricted to the new feature columns so NaNs in unrelated
#     match columns do not silently discard rows.
RECENT_WINDOW = 3
RECENT_AVG_COLS = ["team1_recent_avg", "team2_recent_avg"]

# Parse dates so "recent" is defined chronologically.
matches["date"] = pd.to_datetime(matches["date"])
matches = matches.sort_values("date")

# One date per match; de-duplicated so the join below cannot multiply rows.
match_dates = matches[["match_id", "date"]].drop_duplicates("match_id")

# Per-team, per-match totals in chronological order. The inner join keeps only
# matches that are still present in `matches`.
recent_stats = (
    deliveries.groupby(["batting_team", "match_id"])["total_runs"]
    .sum()
    .reset_index()
    .merge(match_dates, on="match_id", how="inner")
    .sort_values(["batting_team", "date", "match_id"])
    .reset_index(drop=True)
)

# shift(1) drops the current match from its own window; a team's first match
# therefore has no history and gets NaN.
recent_stats["avg_runs_last3"] = recent_stats.groupby("batting_team")["total_runs"].transform(
    lambda runs: runs.shift(1).rolling(window=RECENT_WINDOW, min_periods=1).mean()
)
rolling_stats = recent_stats[["batting_team", "match_id", "avg_runs_last3"]]

# Make the cell safe to re-run: remove feature columns left by a previous run.
df = df.drop(columns=RECENT_AVG_COLS, errors="ignore")

# Merge rolling stats to matches, once per side. validate= guards against a
# right-hand table with repeated (team, match) keys.
for team_col, avg_col in zip(["team1", "team2"], RECENT_AVG_COLS):
    df = (
        df.merge(
            rolling_stats,
            left_on=[team_col, "match_id"],
            right_on=["batting_team", "match_id"],
            how="left",
            validate="many_to_one",
        )
        .rename(columns={"avg_runs_last3": avg_col})
        .drop("batting_team", axis=1)
    )

# Only rows lacking recent-form history are removed.
df = df.dropna(subset=RECENT_AVG_COLS)

In [None]:
# Match winner: binary flag, 1 when team1 is the recorded winner, else 0.
team1_won = df["winner"] == df["team1"]
team2_won = df["winner"] == df["team2"]
df["match_winner"] = np.where(team1_won, 1, 0)

# Winning and losing score, taken from the recorded winner rather than from
# max/min of the two totals: the side with the larger aggregate is not
# guaranteed to be the winner (e.g. a chase against a revised target).
# Rows whose `winner` matches neither side get NaN instead of a guessed value.
df["winning_score"] = df["team1_score"].where(team1_won, df["team2_score"].where(team2_won))
df["losing_score"] = df["team2_score"].where(team1_won, df["team1_score"].where(team2_won))


In [None]:
import pandas as pd
import numpy as np

# Read both raw tables.
matches = pd.read_csv("/content/matches.csv")
deliveries = pd.read_csv("/content/deliveries.csv")

# Discard matches with no recorded winner and rename the key column so it
# lines up with `match_id` in the deliveries table.
matches = matches.dropna(subset=["winner"]).rename(columns={"id": "match_id"})

# Runs scored by each batting side in each match.
runs_by_side = deliveries.groupby(["match_id", "batting_team"])["total_runs"].sum()
total_scores = runs_by_side.reset_index().rename(columns={"total_runs": "final_score"})


def _with_team_score(frame, team_col, score_col):
    """Left-join the total of the side named in `team_col` onto `frame` as `score_col`."""
    joined = frame.merge(
        total_scores,
        left_on=["match_id", team_col],
        right_on=["match_id", "batting_team"],
        how="left",
    )
    return joined.rename(columns={"final_score": score_col}).drop("batting_team", axis=1)


# Attach both sides' totals to the match rows.
df = _with_team_score(matches, "team1", "team1_score")
df = _with_team_score(df, "team2", "team2_score")

# Matches missing either total are unusable downstream.
df = df.dropna(subset=["team1_score", "team2_score"])

# Recent-form feature: for every (team, match), the mean of that team's totals
# in its previous RECENT_WINDOW matches.
#
# Fixes relative to the earlier version of this step:
#   * Rolling values are computed directly on the per-match table, so each one
#     stays attached to its own match_id. The previous cross-join on
#     batting_team followed by drop_duplicates(["batting_team", "match_id"])
#     gave every match the team's FIRST rolling value, i.e. a per-team constant.
#   * Rows are ordered by match date before rolling (sorting `matches` alone
#     never affected the rolling computation).
#   * The current match is excluded via shift(1); otherwise the feature
#     contains the very score it is later used to predict.
#   * dropna is restricted to the new feature columns, and the emptiness check
#     runs AFTER it, which is the only point where the frame can become empty.
RECENT_WINDOW = 3
RECENT_AVG_COLS = ["team1_recent_avg", "team2_recent_avg"]

# Parse dates so "recent" is defined chronologically.
matches["date"] = pd.to_datetime(matches["date"])
matches = matches.sort_values("date")

# One date per match; de-duplicated so the join below cannot multiply rows.
match_dates = matches[["match_id", "date"]].drop_duplicates("match_id")

# Per-team, per-match totals in chronological order. The inner join keeps only
# matches that are still present in `matches`.
recent_stats = (
    deliveries.groupby(["batting_team", "match_id"])["total_runs"]
    .sum()
    .reset_index()
    .merge(match_dates, on="match_id", how="inner")
    .sort_values(["batting_team", "date", "match_id"])
    .reset_index(drop=True)
)

# shift(1) drops the current match from its own window; a team's first match
# therefore has no history and gets NaN.
recent_stats["avg_runs_last3"] = recent_stats.groupby("batting_team")["total_runs"].transform(
    lambda runs: runs.shift(1).rolling(window=RECENT_WINDOW, min_periods=1).mean()
)
rolling_stats = recent_stats[["batting_team", "match_id", "avg_runs_last3"]]

# Make the step safe to re-run: remove feature columns left by a previous run.
df = df.drop(columns=RECENT_AVG_COLS, errors="ignore")

# Merge rolling stats to matches, once per side. validate= guards against a
# right-hand table with repeated (team, match) keys.
for team_col, avg_col in zip(["team1", "team2"], RECENT_AVG_COLS):
    df = (
        df.merge(
            rolling_stats,
            left_on=[team_col, "match_id"],
            right_on=["batting_team", "match_id"],
            how="left",
            validate="many_to_one",
        )
        .rename(columns={"avg_runs_last3": avg_col})
        .drop("batting_team", axis=1)
    )

# Only rows lacking recent-form history are removed.
df = df.dropna(subset=RECENT_AVG_COLS)

# Report an empty result so downstream cells do not fail with an opaque error.
if df.empty:
    print("DataFrame 'df' is empty after merging rolling stats. Check your data and merging logic.")

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Train two linear models that predict the winner's and the loser's total from
# pre-match information only.

# Targets are derived here so this cell does not rely on an earlier cell having
# added them to `df`. They follow the recorded `winner` rather than max/min of
# the two totals, because the larger aggregate is not guaranteed to belong to
# the winner (e.g. a chase against a revised target). Rows whose `winner`
# matches neither side get NaN targets and are dropped below.
team1_won = df["winner"] == df["team1"]
team2_won = df["winner"] == df["team2"]
df = df.assign(
    match_winner=np.where(team1_won, 1, 0),
    winning_score=df["team1_score"].where(team1_won, df["team2_score"].where(team2_won)),
    losing_score=df["team2_score"].where(team1_won, df["team1_score"].where(team2_won)),
)

# Features for prediction. team1_score / team2_score are deliberately NOT
# included: both targets are computed from those two columns, so using them as
# inputs leaks the answer into the model.
features = ['team1_recent_avg', 'team2_recent_avg']
targets = ['winning_score', 'losing_score']

# LinearRegression cannot handle NaN; keep only fully populated rows.
model_df = df.dropna(subset=features + targets)
if model_df.empty:
    raise ValueError("No rows with complete features and targets; check the feature-engineering cells.")

# Split the data (both targets share one split so rows stay aligned).
X = model_df[features]
y_win = model_df['winning_score']
y_lose = model_df['losing_score']
X_train, X_test, y_win_train, y_win_test, y_lose_train, y_lose_test = train_test_split(
    X, y_win, y_lose, test_size=0.2, random_state=42
)

# Create and train the models
reg_win = LinearRegression()
reg_win.fit(X_train, y_win_train)

reg_lose = LinearRegression()
reg_lose.fit(X_train, y_lose_train)


import scipy.stats as st

def get_confidence_interval(model, X, n_bootstraps=100, X_train=None, y_train=None,
                            alpha=0.05, random_state=None):
    """Bootstrap percentile interval for the model's predictions on ``X``.

    When ``X_train`` and ``y_train`` are given, the training rows are resampled
    with replacement, a copy of ``model`` is refit on each resample, and every
    refit predicts ``X`` in its original row order. ``lower[i]`` / ``upper[i]``
    then bound the fitted prediction for row ``i`` of ``X``. This reflects
    uncertainty in the fitted model, not the noise of an individual outcome.

    When no training data is given, the legacy behaviour is kept for backward
    compatibility: the rows of ``X`` themselves are resampled and predicted.
    In that mode position ``i`` of the result mixes predictions for randomly
    chosen rows, so it describes the spread of predictions across ``X`` and is
    NOT an interval for row ``i``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` (and ``fit`` for the refit mode).
    X : pandas.DataFrame of rows to predict.
    n_bootstraps : number of bootstrap replicates (>= 1).
    X_train, y_train : optional training data; must be passed together.
    alpha : two-sided miscoverage; 0.05 yields the 2.5th / 97.5th percentiles.
    random_state : seed (or anything accepted by ``numpy.random.default_rng``)
        for reproducible resampling.

    Returns
    -------
    (lower, upper) : two arrays with one entry per row of ``X``.

    Raises
    ------
    ValueError
        If ``n_bootstraps`` < 1, ``alpha`` is outside (0, 1), or only one of
        ``X_train`` / ``y_train`` is supplied.
    """
    import copy  # local import: only this function needs it

    if n_bootstraps < 1:
        raise ValueError("n_bootstraps must be at least 1")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    if (X_train is None) != (y_train is None):
        raise ValueError("X_train and y_train must be provided together")

    rng = np.random.default_rng(random_state)

    def _take_rows(data, positions):
        # Positional row selection that works for pandas objects and arrays.
        return data.iloc[positions] if hasattr(data, "iloc") else np.asarray(data)[positions]

    preds = []
    if X_train is not None:
        n_train = len(X_train)
        for _ in range(n_bootstraps):
            positions = rng.integers(0, n_train, size=n_train)
            # Refit a copy so the caller's fitted model is left untouched.
            boot_model = copy.deepcopy(model)
            boot_model.fit(_take_rows(X_train, positions), _take_rows(y_train, positions))
            preds.append(np.asarray(boot_model.predict(X)))
    else:
        for _ in range(n_bootstraps):
            # Integer seed drawn from rng keeps this path reproducible too.
            seed = int(rng.integers(0, 2**32 - 1))
            sample = X.sample(frac=1, replace=True, random_state=seed)
            preds.append(model.predict(sample))

    preds = np.array(preds)
    lower = np.percentile(preds, 100 * alpha / 2, axis=0)
    upper = np.percentile(preds, 100 * (1 - alpha / 2), axis=0)
    return lower, upper

# Pass the training data so the intervals are tied to each test row, and fix
# the seed so the printed numbers are reproducible.
lower_win, upper_win = get_confidence_interval(
    reg_win, X_test, X_train=X_train, y_train=y_win_train, random_state=42
)
lower_lose, upper_lose = get_confidence_interval(
    reg_lose, X_test, X_train=X_train, y_train=y_lose_train, random_state=42
)

# Print for first prediction
print(f"Winning score CI: {lower_win[0]:.1f} - {upper_win[0]:.1f}")
print(f"Losing score CI: {lower_lose[0]:.1f} - {upper_lose[0]:.1f}")

Winning score CI: 129.8 - 187.6
Losing score CI: 46.2 - 129.4


In [None]:
# Per-batter summary of `batsman_runs`: the mean over that batter's delivery
# rows and the overall sum, ordered from the highest sum downwards.
runs_by_batter = deliveries.groupby("batter")["batsman_runs"]
player_stats = runs_by_batter.agg(["mean", "sum"])
player_stats = player_stats.sort_values(by="sum", ascending=False)
print(player_stats.head(5))

               mean   sum
batter                   
V Kohli    1.285119  8014
S Dhawan   1.234543  6769
RG Sharma  1.279182  6630
DA Warner  1.354300  6567
SK Raina   1.325353  5536
