In [13]:
import pandas as pd

# Load the dataset
df = pd.read_csv('IPL.csv', low_memory=False)

# Filter for first 3 overs (over starts at 0, so 0, 1, 2)
df_first_3 = df[df['over'] <= 2]

# Calculate total runs per ball
df_first_3['total_runs'] = df_first_3['runs_batter'] + df_first_3['runs_extras']

# Group by match_id and innings
runs_per_match = df_first_3.groupby(['match_id', 'innings'])['total_runs'].sum().reset_index()

# Create target variable
runs_per_match['target'] = (runs_per_match['total_runs'] >= 30).astype(int)

# Check distribution and stats
print("Target Distribution:")
print(runs_per_match['target'].value_counts())
print("\nRuns Stats:")
print(runs_per_match['total_runs'].describe())
print("\nSample Data:")
print(runs_per_match.head())

Target Distribution:
target
0    1886
1     331
Name: count, dtype: int64

Runs Stats:
count    2217.000000
mean       21.519621
std         8.055185
min         2.000000
25%        16.000000
50%        21.000000
75%        26.000000
max        66.000000
Name: total_runs, dtype: float64

Sample Data:
   match_id  innings  total_runs  target
0    335982        1          27       0
1    335982        2          12       0
2    335983        1          27       0
3    335983        2          27       0
4    335984        1          17       0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_first_3['total_runs'] = df_first_3['runs_batter'] + df_first_3['runs_extras']


In [3]:
# Filter for first 3 overs (adjust column names based on your dataset)
# Assuming 'over' starts at 1 (1, 2, 3) and 'inning' differentiates innings
df_first_3 = df[df['over'] <= 3]  # Modify 'over' to match your column name

# Calculate total runs per match and innings
# Assuming 'runs_off_bat' and 'extras' are columns for runs
df_first_3['total_runs'] = df_first_3.get('runs_off_bat', 0) + df_first_3.get('extras', 0)

# Group by match_id and inning to get runs in first 3 overs
runs_per_match = df_first_3.groupby(['match_id', 'innings'])['total_runs'].sum().reset_index()

# Create target variable: 1 if runs >= 30, 0 if < 30
runs_per_match['target'] = (runs_per_match['total_runs'] >= 30).astype(int)

print(runs_per_match.head())

   match_id  innings  total_runs  target
0    335982        1           0       0
1    335982        2           0       0
2    335983        1           0       0
3    335983        2           0       0
4    335984        1           0       0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_first_3['total_runs'] = df_first_3.get('runs_off_bat', 0) + df_first_3.get('extras', 0)


In [6]:
# Merge with original dataset to get metadata (e.g., teams, venue)
match_info = df[['match_id', 'innings', 'batting_team', 'bowling_team', 'venue']].drop_duplicates()

# Combine with runs data
data = runs_per_match.merge(match_info, on=['match_id', 'innings'], how='left')

# Example features
# 1. Historical team performance in first 3 overs
team_avg_runs = data.groupby('batting_team')['total_runs'].mean().reset_index()
team_avg_runs.columns = ['batting_team', 'avg_team_runs']

# Merge back to main data
data = data.merge(team_avg_runs, on='batting_team', how='left')

# 2. Venue scoring average
venue_avg_runs = data.groupby('venue')['total_runs'].mean().reset_index()
venue_avg_runs.columns = ['venue', 'avg_venue_runs']
data = data.merge(venue_avg_runs, on='venue', how='left')

print(data.head())

   match_id  innings  total_runs  target                 batting_team  \
0    335982        1           0       0        Kolkata Knight Riders   
1    335982        2           0       0  Royal Challengers Bangalore   
2    335983        1           0       0          Chennai Super Kings   
3    335983        2           0       0              Kings XI Punjab   
4    335984        1           0       0             Rajasthan Royals   

                  bowling_team                                       venue  \
0  Royal Challengers Bangalore                       M Chinnaswamy Stadium   
1        Kolkata Knight Riders                       M Chinnaswamy Stadium   
2              Kings XI Punjab  Punjab Cricket Association Stadium, Mohali   
3          Chennai Super Kings  Punjab Cricket Association Stadium, Mohali   
4             Delhi Daredevils                            Feroz Shah Kotla   

   avg_team_runs  avg_venue_runs  
0            0.0             0.0  
1            0.0      

In [7]:
# Select features and target
features = ['avg_team_runs', 'avg_venue_runs']  # Add more features as needed
X = data[features]
y = data['target']

# Handle missing values (if any)
X = X.fillna(X.mean())

# Split into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape)

(1773, 2) (444, 2)


In [10]:
# Merge with match metadata
match_info = df[['match_id', 'innings', 'batting_team', 'bowling_team', 'venue']].drop_duplicates()
data = runs_per_match.merge(match_info, on=['match_id', 'innings'], how='left')

# Feature 1: Average runs by batting team in first 3 overs
team_avg_runs = data.groupby('batting_team')['total_runs'].mean().reset_index()
team_avg_runs.columns = ['batting_team', 'avg_team_runs']
data = data.merge(team_avg_runs, on='batting_team', how='left')

# Feature 2: Average runs at venue in first 3 overs
venue_avg_runs = data.groupby('venue')['total_runs'].mean().reset_index()
venue_avg_runs.columns = ['venue', 'avg_venue_runs']
data = data.merge(venue_avg_runs, on='venue', how='left')

# Feature 3: Average runs conceded by bowling team in first 3 overs
bowler_avg_runs = data.groupby('bowling_team')['total_runs'].mean().reset_index()
bowler_avg_runs.columns = ['bowling_team', 'avg_bowler_runs']
data = data.merge(bowler_avg_runs, on='bowling_team', how='left')

print(data.head())

   match_id  innings  total_runs  target                 batting_team  \
0    335982        1           0       0        Kolkata Knight Riders   
1    335982        2           0       0  Royal Challengers Bangalore   
2    335983        1           0       0          Chennai Super Kings   
3    335983        2           0       0              Kings XI Punjab   
4    335984        1           0       0             Rajasthan Royals   

                  bowling_team                                       venue  \
0  Royal Challengers Bangalore                       M Chinnaswamy Stadium   
1        Kolkata Knight Riders                       M Chinnaswamy Stadium   
2              Kings XI Punjab  Punjab Cricket Association Stadium, Mohali   
3          Chennai Super Kings  Punjab Cricket Association Stadium, Mohali   
4             Delhi Daredevils                            Feroz Shah Kotla   

   avg_team_runs  avg_venue_runs  avg_bowler_runs  
0            0.0             0.0        

In [11]:
# Select features and target
features = ['avg_team_runs', 'avg_venue_runs', 'avg_bowler_runs']
X = data[features]
y = data['target']

# Handle missing values (shouldn’t be any, but just in case)
X = X.fillna(X.mean())

# Split into training and testing sets with stratification
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Check class distribution in y_train
print(y_train.value_counts())

target
0    1773
Name: count, dtype: int64


In [14]:
# Feature engineering
match_info = df[['match_id', 'innings', 'batting_team', 'bowling_team', 'venue']].drop_duplicates()
data = runs_per_match.merge(match_info, on=['match_id', 'innings'], how='left')

team_avg_runs = data.groupby('batting_team')['total_runs'].mean().reset_index()
team_avg_runs.columns = ['batting_team', 'avg_team_runs']
data = data.merge(team_avg_runs, on='batting_team', how='left')

venue_avg_runs = data.groupby('venue')['total_runs'].mean().reset_index()
venue_avg_runs.columns = ['venue', 'avg_venue_runs']
data = data.merge(venue_avg_runs, on='venue', how='left')

bowler_avg_runs = data.groupby('bowling_team')['total_runs'].mean().reset_index()
bowler_avg_runs.columns = ['bowling_team', 'avg_bowler_runs']
data = data.merge(bowler_avg_runs, on='bowling_team', how='left')

# Prepare data
features = ['avg_team_runs', 'avg_venue_runs', 'avg_bowler_runs']
X = data[features]
y = data['target']
X = X.fillna(X.mean())

# Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print(y_train.value_counts())

# Train
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

target
0    1508
1     265
Name: count, dtype: int64
Accuracy: 0.86


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Initialize and train
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.86


In [16]:
from sklearn.metrics import classification_report

# Predict and evaluate
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.86

Classification Report:
              precision    recall  f1-score   support

           0       0.86      1.00      0.92       378
           1       1.00      0.03      0.06        66

    accuracy                           0.86       444
   macro avg       0.93      0.52      0.49       444
weighted avg       0.88      0.86      0.79       444



In [17]:
# Example prediction
new_match = [[25, 24, 22]]  # Adjust based on realistic values
probability = model.predict_proba(new_match)[0][1]
prediction = model.predict(new_match)
print(f"\nProbability of 30+ runs: {probability:.2f}")
print(f"Prediction: {'Yes' if prediction[0] == 1 else 'No'}")


Probability of 30+ runs: 0.31
Prediction: No




In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
df = pd.read_csv('IPL.csv', low_memory=False)

# Filter for first 3 overs
df_first_3 = df[df['over'] <= 2].copy()
df_first_3.loc[:, 'total_runs'] = df_first_3['runs_batter'] + df_first_3['runs_extras']

# Group by match_id and innings
runs_per_match = df_first_3.groupby(['match_id', 'innings'])['total_runs'].sum().reset_index()
runs_per_match['target'] = (runs_per_match['total_runs'] >= 30).astype(int)

# Feature engineering
match_info = df[['match_id', 'innings', 'batting_team', 'bowling_team', 'venue']].drop_duplicates()
data = runs_per_match.merge(match_info, on=['match_id', 'innings'], how='left')

# Create lookup tables
team_avg_runs = data.groupby('batting_team')['total_runs'].mean().reset_index()
team_avg_runs.columns = ['batting_team', 'avg_team_runs']

venue_avg_runs = data.groupby('venue')['total_runs'].mean().reset_index()
venue_avg_runs.columns = ['venue', 'avg_venue_runs']

bowler_avg_runs = data.groupby('bowling_team')['total_runs'].mean().reset_index()
bowler_avg_runs.columns = ['bowling_team', 'avg_bowler_runs']

# Prepare data for training
data = data.merge(team_avg_runs, on='batting_team', how='left')
data = data.merge(venue_avg_runs, on='venue', how='left')
data = data.merge(bowler_avg_runs, on='bowling_team', how='left')

features = ['avg_team_runs', 'avg_venue_runs', 'avg_bowler_runs']
X = data[features]
y = data['target']
X = X.fillna(X.mean())

# Split and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Prediction function
def predict_runs(batting_team, bowling_team, venue):
    try:
        # Fetch feature values from lookup tables
        avg_team_runs = team_avg_runs.loc[team_avg_runs['batting_team'] == batting_team, 'avg_team_runs'].values[0]
        avg_venue_runs = venue_avg_runs.loc[venue_avg_runs['venue'] == venue, 'avg_venue_runs'].values[0]
        avg_bowler_runs = bowler_avg_runs.loc[bowler_avg_runs['bowling_team'] == bowling_team, 'avg_bowler_runs'].values[0]
        
        # Create input for model
        new_match = [[avg_team_runs, avg_venue_runs, avg_bowler_runs]]
        
        # Predict
        probability = model.predict_proba(new_match)[0][1]
        prediction = model.predict(new_match)[0]
        
        # Output
        print(f"\nPrediction for {batting_team} vs {bowling_team} at {venue}:")
        print(f"Probability of 30+ runs: {probability:.2f}")
        print(f"Prediction: {'Yes' if prediction == 1 else 'No'}")
        
        return probability, prediction
    except IndexError:
        print("Error: One or more inputs not found in the dataset. Check team names and venue.")
        return None, None


Accuracy: 0.86

Classification Report:
              precision    recall  f1-score   support

           0       0.86      1.00      0.92       378
           1       1.00      0.03      0.06        66

    accuracy                           0.86       444
   macro avg       0.93      0.52      0.49       444
weighted avg       0.88      0.86      0.79       444



In [3]:

# Usage
predict_runs("Kolkata Knight Riders", "Chennai Super Kings", "Eden Gardens")



Prediction for Kolkata Knight Riders vs Chennai Super Kings at Eden Gardens:
Probability of 30+ runs: 0.18
Prediction: No




(0.1805725360595095, 0)

In [5]:
import joblib
# Save the model and lookup tables
joblib.dump(model, 'ipl_runs_model.pkl')
joblib.dump(team_avg_runs, 'team_avg_runs.pkl')
joblib.dump(venue_avg_runs, 'venue_avg_runs.pkl')
joblib.dump(bowler_avg_runs, 'bowler_avg_runs.pkl')
print("\nModel and lookup tables saved successfully!")


Model and lookup tables saved successfully!


In [6]:
import joblib

# Load the model and lookup tables
model = joblib.load('ipl_runs_model.pkl')
team_avg_runs = joblib.load('team_avg_runs.pkl')
venue_avg_runs = joblib.load('venue_avg_runs.pkl')
bowler_avg_runs = joblib.load('bowler_avg_runs.pkl')

# Prediction function
def predict_runs(batting_team, bowling_team, venue):
    try:
        avg_team_runs = team_avg_runs.loc[team_avg_runs['batting_team'] == batting_team, 'avg_team_runs'].values[0]
        avg_venue_runs = venue_avg_runs.loc[venue_avg_runs['venue'] == venue, 'avg_venue_runs'].values[0]
        avg_bowler_runs = bowler_avg_runs.loc[bowler_avg_runs['bowling_team'] == bowling_team, 'avg_bowler_runs'].values[0]
        
        new_match = [[avg_team_runs, avg_venue_runs, avg_bowler_runs]]
        probability = model.predict_proba(new_match)[0][1]
        prediction = model.predict(new_match)[0]
        
        print(f"\nPrediction for {batting_team} vs {bowling_team} at {venue}:")
        print(f"Probability of 30+ runs: {probability:.2f}")
        print(f"Prediction: {'Yes' if prediction == 1 else 'No'}")
        
        return probability, prediction
    except IndexError:
        print("Error: One or more inputs not found in the dataset. Check team names and venue.")
        return None, None

# Example usage
predict_runs("Mumbai Indians", "Royal Challengers Bangalore", "Wankhede Stadium")
predict_runs("Delhi Capitals", "Rajasthan Royals", "Arun Jaitley Stadium")


Prediction for Mumbai Indians vs Royal Challengers Bangalore at Wankhede Stadium:
Probability of 30+ runs: 0.10
Prediction: No

Prediction for Delhi Capitals vs Rajasthan Royals at Arun Jaitley Stadium:
Probability of 30+ runs: 0.21
Prediction: No




(0.21398450345480413, 0)