## IPL Transformer Model for Match Outcome Prediction
### Initialize Environment

In [3]:
# === CELL 1: Imports and Setup ===
import os
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

In [4]:
DATA_PATH = "../ipl_json"
SEQUENCE_LENGTH = 100
BATCH_SIZE = 32
EMBEDDING_DIM = 64
NUM_HEADS = 4
NUM_LAYERS = 2
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
def extract_deliveries(json_path):
    deliveries = []
    with open(json_path, "r", encoding="utf-8") as f:
        match = json.load(f)
    for innings in match.get("innings", []):
        for over in innings.get("overs", []):
            over_num = over.get("over", 0)
            for delivery in over.get("deliveries", []):
                deliveries.append({
                    'over': over_num,
                    'batter': delivery.get("batter"),
                    'bowler': delivery.get("bowler"),
                    'runs': delivery.get("runs", {}).get("total", 0),
                    'wicket': 1 if "wickets" in delivery else 0
                })
    return deliveries


In [6]:
all_files = [os.path.join(DATA_PATH, f) for f in os.listdir(DATA_PATH) if f.endswith(".json")]
raw_data = []
for f in all_files:
    raw_data.extend(extract_deliveries(f))

df = pd.DataFrame(raw_data)
batter_enc = LabelEncoder()
bowler_enc = LabelEncoder()
df['batter_id'] = batter_enc.fit_transform(df['batter'])
df['bowler_id'] = bowler_enc.fit_transform(df['bowler'])

In [7]:
class IPLDataset(Dataset):
    def __init__(self, df, seq_len):
        self.seq_len = seq_len
        self.data = []
        for i in range(len(df) - seq_len):
            seq = df.iloc[i:i+seq_len]
            x = seq[['batter_id', 'bowler_id', 'over', 'runs', 'wicket']].values
            y = df.iloc[i+seq_len]['runs']
            self.data.append((x, y))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x, y = self.data[idx]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.float)


In [8]:
class IPLTransformer(nn.Module):
    def __init__(self, vocab_size, emb_dim, num_heads, num_layers):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=emb_dim, nhead=num_heads)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(emb_dim, 1)

    def forward(self, x):
        # x shape: [batch_size, seq_len, features]
        x = self.embed(x[:,:,0]) + self.embed(x[:,:,1])  # embed batter and bowler
        x = x.permute(1, 0, 2)  # Transformer expects [seq_len, batch_size, emb_dim]
        x = self.transformer(x)
        out = self.fc(x[-1])
        return out.squeeze()

In [9]:
dataset = IPLDataset(df, SEQUENCE_LENGTH)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

model = IPLTransformer(vocab_size=len(batter_enc.classes_), emb_dim=EMBEDDING_DIM,
                       num_heads=NUM_HEADS, num_layers=NUM_LAYERS).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.MSELoss()




In [9]:
epochs = 3
for epoch in range(epochs):
    model.train()
    for batch_x, batch_y in dataloader:
        batch_x, batch_y = batch_x.to(DEVICE), batch_y.to(DEVICE)
        optimizer.zero_grad()
        preds = model(batch_x)
        loss = criterion(preds, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}: Loss = {loss.item():.4f}")

Epoch 1: Loss = 3.8633
Epoch 2: Loss = 1.4995
Epoch 3: Loss = 1.9061


In [10]:
torch.save(model.state_dict(), "ipl_transformer.pt")
import joblib
joblib.dump((batter_enc, bowler_enc), "encoders.pkl")

['encoders.pkl']

In [10]:
model.eval()
test_results = []

with torch.no_grad():
    for batch_x, batch_y in dataloader:
        batch_x = batch_x.to(DEVICE)
        preds = model(batch_x).cpu().numpy()
        for i in range(len(preds)):
            test_results.append({
                'expected_runs': round(preds[i], 2),
                'actual_runs': batch_y[i].item()
            })

# Create a DataFrame of predictions vs actuals
log_df = pd.DataFrame(test_results)
log_df['error'] = abs(log_df['expected_runs'] - log_df['actual_runs'])
accuracy = 1 - (log_df['error'].mean() / (log_df['actual_runs'].mean() + 1e-6))

print(f"Model Accuracy (1 - MAE / Mean Actual): {accuracy:.4f}")
display(log_df.head(10)) 

Model Accuracy (1 - MAE / Mean Actual): -0.0939


Unnamed: 0,expected_runs,actual_runs,error
0,0.83,1.0,0.17
1,0.27,6.0,5.73
2,0.4,0.0,0.4
3,-0.42,0.0,0.42
4,0.1,1.0,0.9
5,-0.81,0.0,0.81
6,0.01,0.0,0.01
7,0.02,0.0,0.02
8,0.14,0.0,0.14
9,0.07,1.0,0.93


In [26]:
# === CELL X: Load a Real Match and Simulate Prediction ===
import os
import json
import numpy as np
import torch

# Step 1: Choose a real match JSON file from the folder
json_files = [os.path.join("../ipl_json", f) for f in os.listdir("../ipl_json") if f.endswith(".json")]
if not json_files:
    raise Exception("No JSON files found in ../ipl_json")

# For this example, we use the first JSON file.
match_file = json_files[0]

# Use the provided extraction utility to get ball-by-ball deliveries
match_deliveries = extract_deliveries(match_file)

# Convert the list of deliveries to a DataFrame
match_df = pd.DataFrame(match_deliveries)

# For simulation purposes, we assume we're using data from one innings.
# If your JSON contains multiple innings, you might want to filter for one.
# (For example, you could select deliveries where over < some_threshold)

# Ensure the match has at least SEQUENCE_LENGTH deliveries.
if len(match_df) < SEQUENCE_LENGTH:
    raise Exception("Not enough deliveries in this match for simulation.")

# Step 2: Encode the batter and bowler columns using the existing encoders.
# (Assuming batter_enc and bowler_enc are already fit on a larger dataset.)
match_df['batter_id'] = batter_enc.transform(match_df['batter'])
match_df['bowler_id'] = bowler_enc.transform(match_df['bowler'])

# Step 3: Prepare the seed sequence.
# Here, we take the last SEQUENCE_LENGTH deliveries as the seed.
seed_seq = match_df[['batter_id', 'bowler_id', 'over', 'runs', 'wicket']].values[-SEQUENCE_LENGTH:]

# Determine the current state from the seed: using the last ball's info.
batter_id = seed_seq[-1, 0]
bowler_id = seed_seq[-1, 1]
current_over = int(seed_seq[-1, 2])

# For a T20 match, there are 120 balls in an innings.
num_balls_remaining = 120 - SEQUENCE_LENGTH

# Step 4: Define the simulation function (if not already defined)
def simulate_innings(model, seed_seq, num_balls_remaining, batter_id, bowler_id, start_over):
    """
    Simulates an innings by rolling the seed sequence forward, predicting each next ball.
    
    Parameters:
      model: Trained Transformer model.
      seed_seq: Numpy array of shape (seq_len, 5) containing the seed deliveries.
      num_balls_remaining: Number of balls left to simulate.
      batter_id: Batter identifier to use for simulated balls.
      bowler_id: Bowler identifier to use for simulated balls.
      start_over: Starting over number for the simulation.
    
    Returns:
      predicted_runs: List of predicted runs for each simulated ball.
    """
    model.eval()
    predicted_runs = []
    # Copy the seed sequence to avoid modifying the original.
    seq = seed_seq.copy()
    current_over = start_over
    ball_in_over = 0

    for _ in range(num_balls_remaining):
        # Prepare input with shape [1, seq_len, features]
        input_seq = torch.tensor(seq, dtype=torch.long).unsqueeze(0).to(DEVICE)
        with torch.no_grad():
            pred = model(input_seq).item()
        # Round the prediction to the nearest integer (runs are discrete)
        run = int(round(pred))
        predicted_runs.append(run)
        
        # Create a new ball record:
        # Using the same batter_id and bowler_id for simplicity,
        # the current over, predicted run, and wicket set to 0.
        new_ball = [batter_id, bowler_id, current_over, run, 0]
        
        # Slide the window: remove the first ball and append the new one.
        seq = np.vstack([seq[1:], new_ball])
        
        ball_in_over += 1
        if ball_in_over == 6:
            ball_in_over = 0
            current_over += 1

    return predicted_runs

# Step 5: Generate predictions for the remaining balls.
predicted_remaining_runs = simulate_innings(model, seed_seq, num_balls_remaining, batter_id, bowler_id, current_over)

# Step 6: Compute the predicted final innings score.
# We assume the seed sequence represents the score so far.
seed_runs = seed_seq[:, 3].sum()  # Sum of runs in the seed sequence
predicted_total_score = seed_runs + sum(predicted_remaining_runs)

print("Predicted runs for remaining balls:", predicted_remaining_runs)
print("Predicted final innings score:", predicted_total_score)


Predicted runs for remaining balls: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Predicted final innings score: 154
