In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib
import os

In [6]:
# Show the column names of the `data` frame.
# NOTE(review): this cell carries execution count 6 while the cell that reads the CSV into
# `data` carries count 9, so on a fresh top-to-bottom run `data` may not exist yet here — confirm ordering.
print("Columns in data:", data.columns.tolist())

Columns in data: ['full_scorecard', 'team1', 'team2', 'team1_score', 'team2_score', 'toss_winner', 'toss_choice', 'winner', 'margin', 'man_of_the_match', 'stadium', 'place']


In [9]:
# Absolute path of the cleaned match table that the rest of the notebook works on.
file_path = '/content/ipl_dataset_cleaned.csv'

# Guard clause: report the missing file and stop the cell before any read is attempted.
if not os.path.exists(file_path):
    print("Error: File not found. Ensure 'ipl_dataset_cleaned.csv' is in /content/.")
    raise FileNotFoundError("Upload or generate the cleaned data first.")

# File is present: read it into `data` and echo its shape and column names.
data = pd.read_csv(file_path)
print("Data loaded. Shape:", data.shape)
print("Columns:", list(data.columns))

Data loaded. Shape: (958, 12)
Columns: ['full_scorecard', 'team1', 'team2', 'team1_score', 'team2_score', 'toss_winner', 'toss_choice', 'winner', 'margin', 'man_of_the_match', 'stadium', 'place']


In [10]:
# Feature engineering on `data`.
# Appends, in this order: win flags, last-5 rolling form, venue averages,
# head-to-head (team-vs-team) average, and expanding "career" averages.

# Win flags: 1 where the `winner` value equals that row's team1 / team2 value, else 0.
data['team1_win'] = (data['winner'] == data['team1']).astype(int)
data['team2_win'] = (data['winner'] == data['team2']).astype(int)

# Every win-based feature below carries no information if a flag never varies.
# NOTE(review): the scaled sample printed by the preprocessing cell shows exactly 0 in all
# win-derived columns, which is what a constant column scales to — presumably `winner`
# is spelled differently from `team1`/`team2`; confirm against the raw CSV.
for win_flag in ('team1_win', 'team2_win'):
    if data[win_flag].nunique() <= 1:
        print("Warning: '" + win_flag + "' is constant; check that 'winner' uses the same names as 'team1'/'team2'.")

# No sort is performed: the existing row order is taken as match order and the index is
# only renumbered 0..n-1 so the grouped results below align back onto the rows.
data = data.reset_index(drop=True)

# (grouping column, value column, suffix of the derived column name)
team_stat_specs = [
    ('team1', 'team1_win', 'wins'),
    ('team1', 'team1_score', 'score'),
    ('team2', 'team2_win', 'wins'),
    ('team2', 'team2_score', 'score'),
]

# 1. Rolling Averages (Team Form): mean over the current row and up to 4 earlier rows
#    of the same team in the same column (min_periods=1 keeps the first rows defined).
for team_col, value_col, suffix in team_stat_specs:
    data[f'{team_col}_rolling_{suffix}'] = (
        data.groupby(team_col)[value_col]
        .rolling(5, min_periods=1)
        .mean()
        .reset_index(0, drop=True)  # drop the group level so values align on the row index
    )

# 2. Venue Averages: mean team1 win flag and team1 score per stadium.
#    These are computed over every row of `data`, i.e. before any train/test split.
venue_wins = data.groupby('stadium')['team1_win'].mean().to_dict()
venue_scores = data.groupby('stadium')['team1_score'].mean().to_dict()
data['venue_win_avg'] = data['stadium'].map(venue_wins)
data['venue_score_avg'] = data['stadium'].map(venue_scores)

# 3. Opponent-Specific Stats (Team vs Team - TvT): mean team1 win flag per ordered
#    (team1, team2) pair, again over every row of `data`. Pairs absent from the lookup get 0.
tvt_wins = data.groupby(['team1', 'team2'])['team1_win'].mean().to_dict()
# Plain dict lookups over the zipped columns give the same values as a row-wise
# DataFrame.apply without building a Series per row.
data['tvt_win_avg'] = [
    tvt_wins.get(pair, 0) for pair in zip(data['team1'], data['team2'])
]

# 4. Career Stats: expanding (cumulative) mean per team, current row included.
for team_col, value_col, suffix in team_stat_specs:
    data[f'{team_col}_career_{suffix}'] = (
        data.groupby(team_col)[value_col]
        .expanding()
        .mean()
        .reset_index(0, drop=True)
    )

# Fill NaNs with 0. This applies to every column of `data`, not only the engineered ones.
data.fillna(0, inplace=True)

print("Features engineered. Shape:", data.shape)

Features engineered. Shape: (958, 25)


In [11]:
# Targets: for each row, the value the same team records in its next row within the same
# team column (shift(-1) inside each team1 / team2 group).
# Maps target column -> (grouping column, value column); dict order fixes the column order.
label_sources = {
    'label_team1_score_next': ('team1', 'team1_score'),
    'label_team2_score_next': ('team2', 'team2_score'),
    'label_team1_win_next': ('team1', 'team1_win'),
    'label_team2_win_next': ('team2', 'team2_win'),
}
for label_col, (team_col, value_col) in label_sources.items():
    data[label_col] = data.groupby(team_col)[value_col].shift(-1)

# The last row of every team1 group has no following row, so its team1 score target is NaN;
# only that column decides which rows are removed (team2 targets may still hold NaN).
data.dropna(subset=['label_team1_score_next'], inplace=True)

print("Labels created. Shape after dropping NaNs:", data.shape)


Labels created. Shape after dropping NaNs: (943, 29)


In [12]:
 # No 'season' for time-series, so use random split (80% train, 20% test)
# Stratify on the match winner when that column is present; otherwise do a plain random split.
stratify_on = data['winner'] if 'winner' in data.columns else None
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=stratify_on,
)

print("Train shape:", train_data.shape, "Test shape:", test_data.shape)

# Engineered columns used as model inputs.
features = [
    'team1_rolling_wins',
    'team1_rolling_score',
    'team2_rolling_wins',
    'team2_rolling_score',
    'venue_win_avg',
    'venue_score_avg',
    'tvt_win_avg',
    'team1_career_wins',
    'team1_career_score',
    'team2_career_wins',
    'team2_career_score',
]

# Inputs and the two team1 targets (next score, next win flag) for each split.
X_train, X_test = train_data[features], test_data[features]
y_train_score, y_test_score = (
    train_data['label_team1_score_next'],
    test_data['label_team1_score_next'],
)
y_train_win, y_test_win = (
    train_data['label_team1_win_next'],
    test_data['label_team1_win_next'],
)

Train shape: (754, 29) Test shape: (189, 29)


In [18]:
# Cell 6: Preprocessing and Save (Fixed for Mixed Data Types)
#
# 1. Label-encode the categorical columns of `data` (one encoder per column).
# 2. Reduce `margin` to a number in `data` and in copies of the train/test splits.
# 3. Standard-scale the numerical features (fit on train, applied to test).
# 4. Save the encoded frame as CSV and the fitted preprocessors as a joblib pickle.
#
# NOTE: step 1 overwrites columns of `data` in place, so running this cell twice
# re-encodes the already encoded integers.


def _margin_to_number(margin):
    """Return the first run of digits in each margin entry as a float; 0 where there is none."""
    return (
        margin.astype(str)
        .str.extract(r'(\d+)', expand=False)  # expand=False -> a Series, not a 1-column frame
        .astype(float)
        .fillna(0)
    )


# Make explicit copies of train_data and test_data to ensure transformations apply correctly
train_data_processed = train_data.copy()
test_data_processed = test_data.copy()

# Encode categorical columns in the main 'data' DataFrame (for saving later)
# Note: train_data and test_data will retain original categorical values unless explicitly processed
categorical_cols = ['team1', 'team2', 'toss_winner', 'toss_choice', 'winner', 'man_of_the_match', 'stadium', 'place', 'full_scorecard']

# One fitted encoder per column. Refitting a single shared LabelEncoder would keep only the
# classes of the last column, leaving every other column impossible to transform or invert
# from the saved pipeline.
encoders = {}
encoder = LabelEncoder()  # stays unfitted only when none of the listed columns exist
for col in categorical_cols:
    if col in data.columns:
        encoder = LabelEncoder()
        data[col] = encoder.fit_transform(data[col].astype(str))  # Convert to strings and encode
        encoders[col] = encoder

# Convert 'margin' to numeric in the main frame and in both processed split copies.
data['margin'] = _margin_to_number(data['margin'])
train_data_processed['margin'] = _margin_to_number(train_data_processed['margin'])
test_data_processed['margin'] = _margin_to_number(test_data_processed['margin'])

# Define numerical features (for scaling; include engineered and original numericals)
numerical_features = ['team1_rolling_wins', 'team1_rolling_score', 'team2_rolling_wins', 'team2_rolling_score', 'venue_win_avg', 'venue_score_avg', 'tvt_win_avg', 'team1_career_wins', 'team1_career_score', 'team2_career_wins', 'team2_career_score', 'team1_score', 'team2_score', 'margin']

# Update X_train and X_test to use numerical features from the processed data
X_train_num = train_data_processed[numerical_features]
y_train_score = train_data_processed['label_team1_score_next']
y_train_win = train_data_processed['label_team1_win_next']
X_test_num = test_data_processed[numerical_features]
y_test_score = test_data_processed['label_team1_score_next']
y_test_win = test_data_processed['label_team1_win_next']

# Scale numerical features: statistics come from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_num)
X_test_scaled = scaler.transform(X_test_num)

# Save feature-engineered dataset
data.to_csv('/content/dataset.csv', index=False)
print("Feature-engineered dataset saved as 'dataset.csv'.")

# Save preprocessor pipeline.
# 'encoder' is kept for existing consumers and holds the encoder of the last categorical
# column that was present; 'encoders' maps every encoded column name to its own encoder.
joblib.dump({'scaler': scaler, 'encoder': encoder, 'encoders': encoders}, '/content/feature_pipeline.pkl')
print("Preprocessor saved as 'feature_pipeline.pkl'.")

# Optional: Quick check
print("Sample scaled features shape:", X_train_scaled.shape)
print("Sample scaled features:", X_train_scaled[:3])

Feature-engineered dataset saved as 'dataset.csv'.
Preprocessor saved as 'feature_pipeline.pkl'.
Sample scaled features shape: (754, 14)
Sample scaled features: [[ 0.          1.13561035  0.         -0.15244438  0.          0.09906591
   0.          0.          0.47448872  0.          0.11889257  0.293608
   0.3687034  -0.31247129]
 [ 0.          0.20554507  0.          0.77356908  0.         -0.18333829
   0.          0.         -0.73987822  0.         -0.02026064  0.72678795
   1.34301075 -0.36014497]
 [ 0.          0.32512489  0.          1.20209659  0.          0.31978101
   0.          0.          0.35301453  0.          1.32667181  0.7777503
   1.25443736 -0.50316603]]
