In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import json


### Prediction Pipeline Flow

##### Toss Prediction --> Decision Prediction --> Final Prediction

In [None]:
### Loading Data Cleaning files

with open('team_name_correction.json', 'r') as f:
    team_name_correction = json.load(f)

with open('team_mapping.json', 'r') as f:
    team_mapping = json.load(f)

with open('stadium_mapping.json', 'r') as f:
    stadium_mapping = json.load(f)

with open('stadium_name_correction.json', 'r') as f:
    stadium_name_correction = json.load(f)


with open('reverse_team_mapping.json', 'r') as f:
    reverse_team_mapping = json.load(f)

In [5]:
balls_df=pd.read_csv('deliveries.csv')
balls_df = balls_df.replace(team_name_correction)

In [6]:
matches_df=pd.read_csv('matches.csv')
matches_df = matches_df.replace(team_name_correction)
matches_df = matches_df.replace(stadium_name_correction)


###  Toss_Model

In [7]:
toss_data=matches_df[['team1','team2','toss_winner']]
toss_data=toss_data.replace(team_mapping)

Toss Training

In [8]:
from sklearn.ensemble import RandomForestClassifier
Toss_Model = RandomForestClassifier(
    n_estimators=20,  
    max_depth=15,        
    random_state=42
)

In [9]:
mai=toss_data.drop('toss_winner',axis=1)
target=toss_data['toss_winner']

In [10]:
Toss_Model.fit(mai,target)

Toss Prediction

In [11]:
a="Kolkata Knight Riders"
b="Chennai Super Kings"
print(reverse_team_mapping.get(str(Toss_Model.predict([[team_mapping[a],team_mapping[b]]])[0])))


Chennai Super Kings


### Decision after wining Toss

Toss_Model Training

In [14]:
decision_making=matches_df[['team1','team2','toss_winner','toss_decision','venue']]

In [15]:
decision_making=decision_making.replace(team_mapping)
decision_making=decision_making.replace(stadium_mapping)
decision_making['toss_decision']=decision_making['toss_decision'].map({'field':0,'bat':1})

In [16]:
decision_making

Unnamed: 0,team1,team2,toss_winner,toss_decision,venue
0,3,0,3,0,1
1,5,1,1,1,2
2,6,2,2,1,3
3,7,3,7,1,4
4,0,4,4,1,5
...,...,...,...,...,...
1090,5,4,5,1,7
1091,4,0,4,1,35
1092,3,2,2,0,35
1093,4,2,2,0,8


In [17]:
inp=decision_making.drop('toss_decision',axis=1)
out=decision_making['toss_decision']

In [18]:
Toss_Decision_Model = RandomForestClassifier(
    n_estimators=20,  
    max_depth=15,        
    random_state=42
)

In [19]:
Toss_Decision_Model.fit(inp,out)

Decision Prediction

In [20]:
a="Kolkata Knight Riders"
b="Chennai Super Kings"
toss_won="Chennai Super Kings"
venue="Nehru Stadium"

decision=int(Toss_Decision_Model.predict([[team_mapping[a],team_mapping[b],team_mapping[toss_won],stadium_mapping[venue]]])[0])

print("Field" if decision==0 else "Bat")

Field


### Final Toss_Model 

# Preparing Dataset

In [22]:
matches_df.head()

Unnamed: 0,id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2
0,335982,2007/08,Bangalore,2008-04-18,League,BB McCullum,"M. Chinnaswamy Stadium, Bengaluru",Royal Challengers Bengaluru,Kolkata Knight Riders,Royal Challengers Bengaluru,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,,Asad Rauf,RE Koertzen
1,335983,2007/08,Chandigarh,2008-04-19,League,MEK Hussey,"Punjab Cricket Association IS Bindra Stadium, ...",Punjab Kings,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33.0,241.0,20.0,N,,MR Benson,SL Shastri
2,335984,2007/08,Delhi,2008-04-19,League,MF Maharoof,"Arun Jaitley Stadium, Delhi",Delhi Capitals,Rajasthan Royals,Rajasthan Royals,bat,Delhi Capitals,wickets,9.0,130.0,20.0,N,,Aleem Dar,GA Pratapkumar
3,335985,2007/08,Mumbai,2008-04-20,League,MV Boucher,"Wankhede Stadium, Mumbai",Mumbai Indians,Royal Challengers Bengaluru,Mumbai Indians,bat,Royal Challengers Bengaluru,wickets,5.0,166.0,20.0,N,,SJ Davis,DJ Harper
4,335986,2007/08,Kolkata,2008-04-20,League,DJ Hussey,"Eden Gardens, Kolkata",Kolkata Knight Riders,Sunrisers Hyderabad,Sunrisers Hyderabad,bat,Kolkata Knight Riders,wickets,5.0,111.0,20.0,N,,BF Bowden,K Hariharan


##### Team stats, while making final inference we can take some results from here directly 

In [23]:
team_stats = {}


for _, row in matches_df.iterrows():
    team1 = row['team1']
    team2 = row['team2']
    toss_winner = row['toss_winner']
    toss_decision = row['toss_decision']
    match_winner = row['winner']

    toss_loser = team1 if toss_winner == team2 else team2
    
    for team in [team1, team2]:
        if team not in team_stats:
            team_stats[team] = {
                'bat_first_won': 0, 'bat_first_lost': 0,
                'chase_won': 0, 'chase_lost': 0, '4_hp': 0, '6_hp': 0, 'w_tp': 0
            }

    batting_first_team = toss_winner if toss_decision == 'bat' else toss_loser
    chasing_team = toss_loser if toss_decision == 'bat' else toss_winner
    
    if match_winner == batting_first_team:
        team_stats[batting_first_team]['bat_first_won'] += 1
        team_stats[chasing_team]['chase_lost'] += 1
    else:
        team_stats[batting_first_team]['bat_first_lost'] += 1
        team_stats[chasing_team]['chase_won'] += 1

In [24]:
deliveries_df=pd.read_csv('deliveries.csv')
deliveries_df=deliveries_df.replace(team_name_correction)

##### adding team 4s and 6s data to team stats

In [25]:

boundaries = deliveries_df[deliveries_df["batsman_runs"].isin([4, 6])].groupby(["batting_team", "batsman_runs"])["batsman_runs"].count().unstack()
matches_played = deliveries_df.groupby("batting_team")["match_id"].nunique()
boundaries_normalized = boundaries.div(matches_played, axis=0)
boundaries_normalized = boundaries_normalized.fillna(0)
boundaries_normalized = boundaries_normalized.rename(columns={4: "avg_fours", 6: "avg_sixes"})

In [26]:
for team, row in boundaries_normalized.iterrows():
    team_stats[team]['4_hp'] = row['avg_fours'].round(2)
    team_stats[team]['6_hp'] = row['avg_sixes'].round(2)

##### adding wicket per match

In [None]:


wickets_per_team = deliveries_df[deliveries_df["is_wicket"] == 1].groupby("bowling_team")["is_wicket"].count()

matches_per_team = deliveries_df.groupby("bowling_team")["match_id"].nunique()
avg_wickets_per_match = (wickets_per_team / matches_per_team).round(2)
avg_wickets_df = avg_wickets_per_match.reset_index()
avg_wickets_df.columns = ["bowling_team", "avg_wickets_per_match"]
for i,(j,e) in (avg_wickets_df.iterrows()):
    team_stats[j]['w_tp'] = e


In [28]:
team_stats_Dataframe = pd.DataFrame(team_stats).T

In [30]:
team_stats_Dataframe.head()

Unnamed: 0,bat_first_won,bat_first_lost,chase_won,chase_lost,4_hp,6_hp,w_tp
Royal Challengers Bengaluru,58.0,72.0,66.0,59.0,13.25,6.48,0.0
Kolkata Knight Riders,53.0,67.0,78.0,53.0,13.79,5.96,0.0
Punjab Kings,51.0,77.0,61.0,57.0,13.93,6.16,0.0
Chennai Super Kings,70.0,60.0,69.0,39.0,13.49,6.37,0.0
Delhi Capitals,48.0,65.0,67.0,72.0,13.92,5.36,0.0


checking consistency, (Bat_first_won + bat_first_lost == chase_won + chase_lost)

In [74]:
team_stats_Dataframe['bat_first_won'].sum() + team_stats_Dataframe['bat_first_lost'].sum()


np.float64(1095.0)

In [75]:
team_stats_Dataframe['chase_won'].sum() + team_stats_Dataframe['chase_lost'].sum()

np.float64(1095.0)

In [76]:
team_stats_Dataframe.to_csv('team_stats.csv')

#### while making infer we can use Head to Head probablities

In [32]:
head_to_head = {}

for _, row in matches_df.iterrows():
    team1, team2, winner = row["team1"], row["team2"], row["winner"]

    
    if team1 not in head_to_head:
        head_to_head[team1] = {}
    if team2 not in head_to_head:
        head_to_head[team2] = {}

    
    head_to_head[team1].setdefault(team2, 0)
    head_to_head[team2].setdefault(team1, 0)

   
    if winner == team1:
        head_to_head[team1][team2] += 1
    elif winner == team2:
        head_to_head[team2][team1] += 1


In [33]:
head_to_head_df= pd.DataFrame(head_to_head)
head_to_head_df.to_csv("head_to_head.csv")

##### Main Dataset

## Final dataset on which final model is trained 


##### coloumns are ["teamA", "teamB", "batting_first", "first_inning_win_A", "second_inning_win_B", "4s_hitting_a", "4s_hitting_b","6s_hitting_a""6s_hitting_b", "wicket_taking_power_a","wicket_taking_power_b", "prev_probability_A","prev_probability_B","stadium","winner"]

In [34]:
main_data=[]



for _, row in matches_df.iterrows():
    temp=[]
    team1, team2 = row["team1"], row["team2"]
    venue = row["venue"]
    toss_winner = row["toss_winner"]
    toss_decision = row["toss_decision"]

    temp.append(team1)
    temp.append(team2)

    
    if ((toss_winner==team1) and (toss_decision=='bat')) or ((toss_winner==team2) and (toss_decision=='field')):
        temp.append(team_mapping[team1])
        temp.append(team_stats[team1]['bat_first_won'] / team_stats[team1]['bat_first_lost'])
        temp.append(team_stats[team2]['chase_won'] / team_stats[team2]['chase_lost'])

    else:
        temp.append(team_mapping[team2])
        temp.append(team_stats[team1]['chase_won'] / team_stats[team1]['chase_lost'])
        temp.append(team_stats[team2]['bat_first_won'] / team_stats[team2]['bat_first_lost'])    

    temp.append((team_stats[team1]['4_hp']))
    temp.append(team_stats[team2]['4_hp'])
    temp.append(team_stats[team1]['6_hp'])
    temp.append(team_stats[team2]['6_hp'])
    temp.append(team_stats[team1]['w_tp'])
    temp.append(team_stats[team2]['w_tp'])
    temp.append(head_to_head[team1][team2])
    temp.append(head_to_head[team2][team1])
    
    temp.append(row['venue'])
    temp.append(row['winner'])

    main_data.append(temp)    
        

In [35]:
import numpy as np
np_data=np.array(main_data)

columns = ["teamA", "teamB", "batting_first",
           "first_inning_win_A", "second_inning_win_B",
           "4s_hitting_a", "4s_hitting_b",
           "6s_hitting_a", "6s_hitting_b",
           "wicket_taking_power_a","wicket_taking_power_b", "prev_probability_A","prev_probability_B",
           "stadium","winner"]


df = pd.DataFrame(np_data, columns=columns)


In [36]:
df["first_inning_win_A"] = df["first_inning_win_A"].astype(float).round(2)
df["second_inning_win_B"] = df["second_inning_win_B"].astype(float).round(2)
df.to_csv("final_data_for_model_training.csv")


### Final_Toss_Model_Training


In [37]:
Final_Model = RandomForestClassifier(
    n_estimators=20,  
    max_depth=15,        
    random_state=42
)

In [38]:
feat=df.drop("winner",axis=1)
label=df["winner"]

In [39]:
feat['teamA'] = feat['teamA'].map(team_mapping)

In [40]:
feat['teamB'] = feat['teamB'].map(team_mapping)

In [41]:
feat['stadium']=feat['stadium'].map(stadium_mapping)

In [42]:
Final_Model.fit(feat,label)

In [43]:
Final_Model.score(feat,label)

0.817351598173516

## Prediction for 2025 ipl winner

In [44]:
points_table={}

In [50]:
fixture=pd.read_csv("ipl-2025-UTC.csv")

In [51]:
fixture['Home Team']=fixture['Home Team'].map(team_mapping)
fixture['Away Team']=fixture['Away Team'].map(team_mapping)

#### removing playoffs because we dont have venue decided for those matches

In [52]:
fixture=fixture[fixture['Match Number']<=70] 

In [53]:
fixture.drop("Result",axis=1,inplace=True)
fixture=fixture.fillna(0)  ## Missing stadium will marked as 0

In [54]:
fixture['Location']=fixture['Location'].map(stadium_name_correction)

In [55]:
fixture['Location']=fixture['Location'].map(stadium_mapping)

In [56]:
fixture.head()

Unnamed: 0,Match Number,Round Number,Date,Location,Home Team,Away Team
0,1,1,22/03/2025 14:00,5.0,0.0,3.0
1,2,1,23/03/2025 10:00,7.0,4.0,2.0
2,3,1,23/03/2025 14:00,8.0,1.0,7.0
3,4,1,24/03/2025 14:00,,6.0,11.0
4,5,1,25/03/2025 14:00,,10.0,5.0


## final Pipeline

 ##### Making infer on each match before playoffs and preparing points table

In [59]:
def generate_points_table(fixture):
    points_table={}
    matches_interupted=0
    for i,(j,e,t,venue,teamA,teamB) in fixture.iterrows():
        
        teamA=int(teamA)
        teamB=int(teamB)
        final_infer=[]
        final_infer.append(teamA)
        final_infer.append(teamB)

        if reverse_team_mapping.get(str(teamA)) not in points_table:
            
            points_table[reverse_team_mapping.get(str(teamA))]={'win':0,'loss':0}

        if reverse_team_mapping.get(str(teamB)) not in points_table:
            points_table[reverse_team_mapping.get(str(teamB))]={'win':0,'loss':0}

        toss_winner=(int(Toss_Model.predict([[teamA,teamB]])[0]))
        winner_decision= int(Toss_Decision_Model.predict([[teamA,teamB,toss_winner,venue]])[0])

        batter=0
        chaser=0

        if(winner_decision):
            batter=toss_winner
            chaser=teamB if batter==teamA else teamA
        else:
            chaser=toss_winner
            batter=teamB if chaser==teamA else teamA

        final_infer.append(batter)
        batter=int(batter)
        chaser=int(chaser)
        final_infer.append(team_stats[reverse_team_mapping.get(str(batter))]['bat_first_won'] / team_stats[reverse_team_mapping.get(str(batter))]['bat_first_lost'])
        final_infer.append(team_stats[reverse_team_mapping.get(str(chaser))]['chase_won'] / team_stats[reverse_team_mapping.get(str(chaser))]['chase_lost'])

        final_infer.append(team_stats[reverse_team_mapping.get(str(teamA))]['4_hp'])
        final_infer.append(team_stats[reverse_team_mapping.get(str(teamB))]['4_hp'])
        final_infer.append(team_stats[reverse_team_mapping.get(str(teamA))]['6_hp'])
        final_infer.append(team_stats[reverse_team_mapping.get(str(teamB))]['6_hp'])
        final_infer.append(team_stats[reverse_team_mapping.get(str(teamA))]['w_tp'])
        final_infer.append(team_stats[reverse_team_mapping.get(str(teamB))]['w_tp'])
        final_infer.append(head_to_head[reverse_team_mapping.get(str(teamA))][reverse_team_mapping.get(str(teamB))])
        final_infer.append(head_to_head[reverse_team_mapping.get(str(teamB))][reverse_team_mapping.get(str(teamA))])
        final_infer.append(venue)

        out=Final_Model.predict([final_infer])

        if str(out[0]).lower() == 'nan':
            matches_interupted+=1
        else:
            points_table[out[0]]['win'] += 1
            losser = teamB if out[0] == reverse_team_mapping.get(str(teamA)) else teamA
            points_table[reverse_team_mapping.get(str(losser))]['loss'] += 1
    print("Number of matches interupted :",matches_interupted)        
    return points_table
     

In [60]:
points_table=generate_points_table(fixture)

Number of matches interupted : 1


In [61]:
points_table

{'Kolkata Knight Riders': {'win': 6, 'loss': 8},
 'Royal Challengers Bengaluru': {'win': 6, 'loss': 8},
 'Sunrisers Hyderabad': {'win': 4, 'loss': 10},
 'Rajasthan Royals': {'win': 7, 'loss': 7},
 'Chennai Super Kings': {'win': 9, 'loss': 4},
 'Mumbai Indians': {'win': 8, 'loss': 6},
 'Delhi Capitals': {'win': 6, 'loss': 8},
 'Lucknow Super Giants': {'win': 8, 'loss': 4},
 'Gujarat Titans': {'win': 9, 'loss': 6},
 'Punjab Kings': {'win': 6, 'loss': 8}}

In [79]:
teams_with_difference = [
    (team, (data['win'] - data['loss'])) for team, data in points_table.items()
]
sorted_teams = sorted(teams_with_difference, key=lambda x: x[1])
top_2_teams = sorted_teams[8:]
print(top_2_teams)

[('Lucknow Super Giants', 4), ('Chennai Super Kings', 5)]


#### Prepared a Dataset manually with random location and simulating it

In [None]:
match_data = {
    'Match Number': 1,
    'Round Number': 1,
    'Date': '22/03/2025 14:00',
    'Location': 5.0,  
    'Home Team': 'Chennai Super Kings',
    'Away Team': 'Lucknow Super Giants'
}

finals= pd.DataFrame([match_data])
finals['Home Team']=finals['Home Team'].map(team_mapping)
finals['Away Team']=finals['Away Team'].map(team_mapping)


##### Final Prediction

In [None]:
fg=generate_points_table(finals)

Number of matches interupted : 0


In [77]:
teams_with_difference = [
    (team, (data['win'] - data['loss'])) for team, data in fg.items()
]
final_winners = sorted(teams_with_difference, key=lambda x: x[1])

In [None]:
print("Our Final Winnner of IPL 2025 is: ",final_winners[-1][0])

Our Final Winnner of IPL 2025 is:  Chennai Super Kings
