In [10]:
import os
import glob
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np
from collections import defaultdict
import chardet
from sklearn.preprocessing import MinMaxScaler


## Extracting the data from folder

In [11]:
file_pattern = '66*.csv'
files = glob.glob(file_pattern)
dataframes = {}
for file in files:
    base_name = os.path.basename(file)
    df_name = base_name.split('_')[1].split('.')[0]
    df = pd.read_csv(file)
    dataframes[df_name]=df
    
bowler=dataframes['bowler']
bowler['match_dt'] = pd.to_datetime(bowler['match_dt'])
batsman=dataframes['batsman']
batsman['match_dt'] = pd.to_datetime(batsman['match_dt'],format="%d-%m-%Y")
match=dataframes['match']
match['match_dt'] = pd.to_datetime(match['match_dt'])
train=dataframes['train']
# train['match_dt'] = pd.to_datetime(train['match_dt'])


## Calculate the effect of toss on the venue


In [12]:
def tossEffect(df,venue_name):
    def calculate_toss_win_percentage(group):
        total_matches = len(group)
        toss_wins = (group['toss winner'] == group['winner']).sum()
        return (toss_wins / total_matches) * 100
    result=df.groupby('venue')
    percentage_df = result.apply(calculate_toss_win_percentage).reset_index(name='toss_win_percentage')   
    x=percentage_df[percentage_df["venue"]==venue_name]
    return x['toss_win_percentage']
x=tossEffect(match,"Sd Vr Nn Sh Il Sm Rr")


## Team percentage win


In [13]:
match = match.sort_values(by='match_dt')

def calculate_win_percentage_last_n(team_name,df, n=5,):
    # Filter matches where team_name is in team1 or team2
    team_matches = df[(df['team1'] == team_name) | (df['team2'] == team_name)]
    
    # Get the last n matches
    last_n_matches = team_matches.tail(n)
    
    # Calculate win percentage
    total_matches = len(last_n_matches)
    if total_matches == 0:
        return 0.0
    
    wins = (last_n_matches['winner'] == team_name).sum()
    win_percentage = (wins / total_matches) * 100
    
    return win_percentage

# Team name to analyze
team_name = 'Kt'

# Calculate win percentage for 'Team A' in their last 5 matches
win_percentage = calculate_win_percentage_last_n(team_name,match)
win_percentage

20.0

## Calculate the effect of first bat or first balling on the venue

In [14]:
def toss_decision(df,venue_name,toss_decision):
    def calculate_decision_win_percentage(group):
        # Initialize a dictionary to store the results
        results = {
            'bat_win_percentage': 0,
            'bowl_win_percentage': 0
        }

        bat_matches = group[group['toss decision'] == 'bat']
        bowl_matches = group[group['toss decision'] == 'field']
        if not bat_matches.empty:
            bat_wins = (bat_matches['toss winner'] == bat_matches['winner']).sum()
            results['bat_win_percentage'] = (bat_wins / len(bat_matches)) * 100

        if not bowl_matches.empty:
            bowl_wins = (bowl_matches['toss winner'] == bowl_matches['winner']).sum()
            results['bowl_win_percentage'] = (bowl_wins / len(bowl_matches)) * 100

        return pd.Series(results)
    result_toss_decision = df.groupby('venue').apply(calculate_decision_win_percentage).reset_index()
    x=result_toss_decision[result_toss_decision["venue"]==venue_name]
    if toss_decision =="bat":
        return x['bat_win_percentage']
    return x['bowl_win_percentage']
    return result_toss_decision


##  Calculate effect of season in cricket

In [15]:
def season_var(df,season,toss_decision):
    def calculate_season_win_percentage(group):
        results = {
            'bat_win_percentage': 0,
            'bowl_win_percentage': 0
        }
        n=len(group)
        bat_matches = group[group['toss decision'] == 'bat']
        bowl_matches = group[group['toss decision'] == 'field']
        if not bat_matches.empty:
            bat_wins = (bat_matches['toss winner'] == bat_matches['winner']).sum()
            results['bat_win_percentage'] = (bat_wins / len(bat_matches)) * 100

        if not bowl_matches.empty:
            bowl_wins = (bowl_matches['toss winner'] == bowl_matches['winner']).sum()
            results['bowl_win_percentage'] = (bowl_wins / len(bowl_matches)) * 100

        return pd.Series(results)

    grouped = df.groupby(['season'])

    result_season_effect = grouped.apply(calculate_season_win_percentage).reset_index()
    x=result_season_effect[result_season_effect["season"]==season]
    if toss_decision=='bat':
        return x['bat_win_percentage']
    return x['bowl_win_percentage']


## Average runs scored by the team in last 10 match

In [16]:
def avgScoreTeam(df,team): 
    df = df.sort_values(by='match_dt')
    def calculate_average_last_10(group):
        # Filter matches where team was toss winner and chose to bat or field
        filtered_matches = group[group['toss winner'] == group['team1']]  # Assuming team1 is always the toss winner

        # Calculate number of matches available
        total_matches = len(filtered_matches)

        # If total matches available is less than 10, take all matches
        if total_matches < 10:
            last_10_matches = filtered_matches
        else:
            last_10_matches = filtered_matches.tail(10)

        # Calculate average innings balls
        if group['toss decision'].iloc[0] == 'bat':
            average_score = last_10_matches['inning1_runs'].mean()
        else:
            average_score = last_10_matches['inning2_runs'].mean()
        
        return average_score

    # Group by 'team1' and apply the function to calculate average score in last 10 matches
    result_df = df.groupby('team1').apply(calculate_average_last_10).reset_index(name='average_innings_Score')
    return result_df


## Bowlers record creation 

In [17]:
def bowlerInfo(df,bowler_id=None, inning=None):
    group=df.groupby(['bowler_id','inning'])
    def calculate_economy(group):
        total_runs = group['runs'].sum()
        total_balls = group['balls_bowled'].sum()
        economy_rate = (total_runs / total_balls) * 6
        bowler_style = group['bowler_details'].iloc[0].split(':')[2].strip()
        return pd.Series({
            'strike_rate':total_runs/group['wicket_count'].sum(), 
            'total_runs': total_runs,
            'total_wickets': group['wicket_count'].sum(),
            'total_balls_bowled': total_balls,
            'economy_rate': economy_rate,
            
            'total_maidens': group['maiden'].sum(),
            'total_dots': group['dots'].sum(),
            'total_fours': group['Fours'].sum(),
            'total_sixes': group['Sixes'].sum(),
            'total_wides': group['wides'].sum(),
            'total_noballs': group['noballs'].sum(),
            'bowler_style': bowler_style
        })

    group_summary_bowler=group.apply(calculate_economy).reset_index()
    group_summary_bowler.loc[np.isinf(group_summary_bowler['strike_rate']), 'strike_rate'] = -1
    dropcol=[ 'total_maidens', 'total_dots',
           'total_fours', 'total_sixes', 'total_wides', 'total_noballs']
    group_summary_bowler=group_summary_bowler.drop(dropcol,axis=1)
    if bowler_id is not None and inning is not None:
        filtered_data = group_summary_bowler[(group_summary_bowler['bowler_id'] == bowler_id) & 
                                             (group_summary_bowler['inning'] == inning)]
        return filtered_data
    else:
        return group_summary_bowler
bowlerInfo(bowler,34061.0,1)

  'strike_rate':total_runs/group['wicket_count'].sum(),


Unnamed: 0,bowler_id,inning,strike_rate,total_runs,total_wickets,total_balls_bowled,economy_rate,bowler_style
0,34061.0,1,30.428571,639,21,476,8.054622,Right-arm fast-medium


## batsman record creation 

In [18]:
batt=batsman.rename(columns={'bowler_details': 'batsman_bowler_details'})

# Merge the dataframes
batsman_match = pd.merge(batt, match, on='match id', how='left')
combined_df = pd.merge(batsman_match, bowler, on=['match id', 'bowler_id'], how='left')

# Create a 'team' column which indicates which team the batsman played against
combined_df['opponent_team'] = combined_df.apply(
    lambda row: row['team1'] if row['batsman'] in row['team2'] else row['team2'], axis=1)

# Function to split bowler details
def split_bowler_details(row):
    if pd.isna(row):
        return np.nan, np.nan, np.nan
    details = row.split(':')
    return details[0], details[1], details[2]

# Splitting the bowler details into separate columns
combined_df[['bowler_country', 'bowler_bat_style', 'bowler_bowl_style']] = combined_df['bowler_details'].apply(lambda x: pd.Series(split_bowler_details(x)))
combined_df[['batsman_bowler_country', 'batsman_bowler_bat_style', 'batsman_bowler_bowl_style']] = combined_df['batsman_bowler_details'].apply(lambda x: pd.Series(split_bowler_details(x)))

# Group by batsman and opponent_team
grouped = combined_df.groupby(['batsman_id', 'opponent_team'])

# Aggregate function
def aggregate_batsman(group):
    total_runs = group['runs_x'].sum()
    total_balls = group['balls_faced'].sum()
    not_outs = group['wicket kind'].isna().sum()
    
    # Filtering the rows where the batsman got out
    out_records = group.dropna(subset=['wicket kind'])
    
    # Counting dismissals by different bowling styles
    dismissals = out_records.groupby(['bowler_country', 'bowler_bowl_style']).size().reset_index(name='dismissal_count')
    
    # Converting the dismissals DataFrame to a dictionary
    dismissals_dict = dismissals.to_dict('records')
    
    return pd.Series({
        'total_runs': total_runs,
        'total_balls': total_balls,
        'strike_rate':total_runs/total_balls,
        'not_outs': not_outs,
        'average':total_runs/(len(group)-not_outs),
        'dismissals': dismissals_dict
    })

# Applying the aggregate function
opponent_summary = grouped.apply(aggregate_batsman).reset_index()

# Grouping by batsman_id again to nest opponent team details
final_summary = opponent_summary.groupby('batsman_id').apply(
    lambda df: pd.Series({
        'total_runs': df['total_runs'].sum(),
        'total_balls': df['total_balls'].sum(),
        'not_outs': df['not_outs'].sum(),
        'opponent_team_summary': df[['opponent_team', 'total_runs', 'total_balls', 'not_outs', 'dismissals']].to_dict('records')
    })
).reset_index()
final_summary['strike_rate']=(final_summary['total_runs']/final_summary['total_balls'])*100
final_summary['opponent_team_summary']=final_summary['opponent_team_summary'].apply(convert)

  'average':total_runs/(len(group)-not_outs),
  'average':total_runs/(len(group)-not_outs),
  'average':total_runs/(len(group)-not_outs),
  'average':total_runs/(len(group)-not_outs),
  'average':total_runs/(len(group)-not_outs),
  'average':total_runs/(len(group)-not_outs),
  'average':total_runs/(len(group)-not_outs),
  'average':total_runs/(len(group)-not_outs),
  'average':total_runs/(len(group)-not_outs),
  'average':total_runs/(len(group)-not_outs),
  'average':total_runs/(len(group)-not_outs),
  'average':total_runs/(len(group)-not_outs),
  'average':total_runs/(len(group)-not_outs),
  'average':total_runs/(len(group)-not_outs),
  'average':total_runs/(len(group)-not_outs),
  'average':total_runs/(len(group)-not_outs),
  'average':total_runs/(len(group)-not_outs),
  'average':total_runs/(len(group)-not_outs),
  'average':total_runs/(len(group)-not_outs),
  'average':total_runs/(len(group)-not_outs),
  'average':total_runs/(len(group)-not_outs),
  'average':total_runs/(len(group)

NameError: name 'convert' is not defined

In [None]:
final_summary

In [None]:
def season_var(df,season,toss_decision):
    def calculate_season_win_percentage(group):
        results = {
            'bat_win_percentage': 0,
            'bowl_win_percentage': 0
        }
        n=len(group)
        bat_matches = group[group['toss decision'] == 'bat']
        bowl_matches = group[group['toss decision'] == 'field']
        if not bat_matches.empty:
            bat_wins = (bat_matches['toss winner'] == bat_matches['winner']).sum()
            results['bat_win_percentage'] = (bat_wins / len(bat_matches)) * 100

        if not bowl_matches.empty:
            bowl_wins = (bowl_matches['toss winner'] == bowl_matches['winner']).sum()
            results['bowl_win_percentage'] = (bowl_wins / len(bowl_matches)) * 100

        return pd.Series(results)

    grouped = df.groupby(['season'])

    result_season_effect = grouped.apply(calculate_season_win_percentage).reset_index()
    x=result_season_effect[result_season_effect["season"]==season]
    if toss_decision=='bat':
        return x['bat_win_percentage']
    return x['bowl_win_percentage']

In [None]:
def avgScoreTeam(df,team): 
    df = df.sort_values(by='match_dt')
    def calculate_average_last_10(group):
        # Filter matches where team was toss winner and chose to bat or field
        filtered_matches = group[group['toss winner'] == group['team1']]  # Assuming team1 is always the toss winner

        # Calculate number of matches available
        total_matches = len(filtered_matches)

        # If total matches available is less than 10, take all matches
        if total_matches < 10:
            last_10_matches = filtered_matches
        else:
            last_10_matches = filtered_matches.tail(10)

        # Calculate average innings balls
        if group['toss decision'].iloc[0] == 'bat':
            average_score = last_10_matches['inning1_runs'].mean()
        else:
            average_score = last_10_matches['inning2_runs'].mean()
        
        return average_score

    # Group by 'team1' and apply the function to calculate average score in last 10 matches
    result_df = df.groupby('team1').apply(calculate_average_last_10).reset_index(name='average_innings_Score')
    return result_df[result_df['team1']==team]['average_innings_Score']


In [None]:
def toss_decision_(df,venue_name,toss_decision):
    def calculate_decision_win_percentage(group):
        # Initialize a dictionary to store the results
        results = {
            'bat_win_percentage': 0,
            'bowl_win_percentage': 0
        }

        bat_matches = group[group['toss decision'] == 'bat']
        bowl_matches = group[group['toss decision'] == 'field']
        if not bat_matches.empty:
            bat_wins = (bat_matches['toss winner'] == bat_matches['winner']).sum()
            results['bat_win_percentage'] = (bat_wins / len(bat_matches)) * 100

        if not bowl_matches.empty:
            bowl_wins = (bowl_matches['toss winner'] == bowl_matches['winner']).sum()
            results['bowl_win_percentage'] = (bowl_wins / len(bowl_matches)) * 100

        return pd.Series(results)
    result_toss_decision = df.groupby('venue').apply(calculate_decision_win_percentage).reset_index()
    x=result_toss_decision[result_toss_decision["venue"]==venue_name]
    if toss_decision =="bat":
        return x['bat_win_percentage'][0]
    return x['bowl_win_percentage']
    return result_toss_decision[0]


In [None]:
testing_df=train.head(1)
detail={}
detail['team1']=testing_df['team1'][0]
detail['team2']=testing_df['team2'][0]
detail['season']=testing_df['season'][0]
detail['venue']=testing_df['venue'][0]
detail['match_id']=testing_df['match id'][0]
detail['match_dt']=testing_df['match_dt'][0]
detail['toss_winner']=testing_df['toss winner'][0]
detail['toss_decision']=testing_df['toss decision'][0]
if (detail['toss_winner']==detail['team1'] and detail['toss_decision']=='bat') and detail['toss_winner']!=detail['team1'] and detail['toss_decision']=='feild' :
    bowl_innin_team1=2
    bowl_innin_team2=1
else:
    bowl_innin_team1=1
    bowl_innin_team2=2
play1_list=testing_df['team1_roster_ids'][0].split(":")
play2_list=testing_df['team2_roster_ids'][0].split(":")

match_temp=match[match['match_dt']< pd.to_datetime(detail['match_dt'])]
bowler_temp=bowler[bowler['match_dt']< pd.to_datetime(detail['match_dt'])]
win_chance_toss=toss_decision_(match,detail['venue'],detail['toss_decision']).iloc[0]
win_chance_season=season_var(match_temp,detail['season'],detail['toss_decision']).iloc[0]
# Ratio of average score scored by a team in last 10 games 
chance=(avgScoreTeam(match_temp,detail['team1']).iloc[0])/avgScoreTeam(match_temp,detail['team1']).iloc[0]
bowlerList1=bowler_info_extraxt(bowler_temp,play1_list,bowl_innin_team1)
bowlerList2=bowler_info_extraxt(bowler_temp,play2_list,bowl_innin_team2)
bowl_power=score_cal(bowlerList1,bowlerList2)
bowl_strong=(1+bowl_power[0])/(1+bowl_power[1])
batsman_list1=batsmen_info_extraxt(play1_list,detail['team2'],final_summary)
batsman_list2=batsmen_info_extraxt(play2_list,detail['team1'],final_summary)
bat_power=bats_score(batsman_list1,batsman_list2)
bat_strong=(1+bat_power[0])/(1+bat_power[1])


In [None]:
def bats_score(df1,df2):
    team1,team2=0,0
    x1=df1.drop(columns=['batsman_id','opponent_team_summary']).sum()
    x2=df2.drop(columns=['batsman_id','opponent_team_summary']).sum()
    i1,j1,k1,s1=x1['total_runs'],x1['total_balls'],x1['not_outs'],x1['total_runs']/x1['total_balls']
    i2,j2,k2,s2=x2['total_runs'],x2['total_balls'],x2['not_outs'],x2['total_runs']/x2['total_balls']
    ta1,tb1=x1['total_runs_against'],x1['total_balls_against']
    ta2,tb2=x2['total_runs_against'],x2['total_balls_against']
    team1 += 1 if i1 > i2 else 0
    team2 += 1 if i1 < i2 else 0
    team1 += 1 if k1 > k2 else 0
    team2 += 1 if k1 < k2 else 0
    team1 += 1 if s1 > s2 else 0
    team2 += 1 if s1 < s2 else 0
    team1 += 1 if ta1 > ta2 else 0
    team2 += 1 if ta1 < ta2 else 0
    team1 += 1 if tb1 > tb2 else 0
    team2 += 1 if tb1 < tb2 else 0
    return [team1,team2]

In [None]:
def score_cal(df1,df2):
    team1=0
    team2=0
    x1=df1.sum()
    i1=x1['total_wickets']
    j1=x1['total_balls_bowled']
    k1=x1['total_runs']
    team1eco=(k1/j1)*6
    x2=df2.sum()
    i2=x2['total_wickets']
    j2=x2['total_balls_bowled']
    k2=x2['total_runs']
    team2eco=(k2/j2)*6
    cnt1=len(df1[(df1['strike_rate'] < 20) & (df1['strike_rate'] != -1)])
    cnt2=len(df2[(df2['strike_rate'] < 20) & (df2['strike_rate'] != -1)])
    team1 += 1 if cnt1 > cnt2 else 0
    team2 += 1 if cnt2 > cnt1 else 0
    team1 += 1 if team1eco < team2eco else 0 
    team2 += 1 if team1eco > team2eco else 0
    team1 += 1 if i1 > i2 else  0
    team2 += 1 if i1 < i2 else 0
    team1 += 1 if k1/i1 < k2/i2 else 0
    team2 += 1 if k1/i1 > k2/i2 else 0
    cnt1=x1['inning']
    cnt2=x2['inning']
    team1 += 1 if cnt1 > cnt2 else 0
    team2 += 1 if cnt2 > cnt1 else 0
    return [team1 ,team2]

In [None]:
def bowlerInfo(df,bowler_id=None, inning=None):
    group=df.groupby(['bowler_id','inning'])
    def calculate_economy(group):
        total_runs = group['runs'].sum()
        total_balls = group['balls_bowled'].sum()
        economy_rate = (total_runs / total_balls) * 6
        bowler_style = group['bowler_details'].iloc[0].split(':')[2].strip()
        if group['wicket_count'].sum()!=0:
            strike_rate=total_runs/group['wicket_count'].sum()
        else:
            strike_rate=-1
        return pd.Series({
            'strike_rate':strike_rate, 
            'total_runs': total_runs,
            'total_wickets': group['wicket_count'].sum(),
            'total_balls_bowled': total_balls,
            'economy_rate': economy_rate,
            
            'total_maidens': group['maiden'].sum(),
            'total_dots': group['dots'].sum(),
            'total_fours': group['Fours'].sum(),
            'total_sixes': group['Sixes'].sum(),
            'total_wides': group['wides'].sum(),
            'total_noballs': group['noballs'].sum(),
            'bowler_style': bowler_style
        })

    group_summary_bowler=group.apply(calculate_economy).reset_index()
    dropcol=[ 'total_maidens', 'total_dots',
           'total_fours', 'total_sixes', 'total_wides', 'total_noballs']
    group_summary_bowler=group_summary_bowler.drop(dropcol,axis=1)
    if bowler_id is not None and inning is not None:
        filtered_data = group_summary_bowler[(group_summary_bowler['bowler_id'] == bowler_id) & 
                                             (group_summary_bowler['inning'] == inning)]
        return filtered_data
    else:
        return group_summary_bowler
bowlerInfo(bowler,34061.0,1)

In [None]:
def batsmen_info_extract(listA,opponent,final_summary):
    final=pd.DataFrame()
    for x in listA:
        x=float(x)
        temp=final_summary[final_summary['batsman_id']==x]
        against_opp=temp['opponent_team_summary']
        if opponent in against_opp:
            record=against_opp[opponent]
            temp['total_runs_against']=record['total_runs']
            temp['total_balls_against']=record['total_balls']
        else:
            temp['total_runs_against']=0
            temp['total_balls_against']=0
        final=pd.concat([final, temp], axis=0)
    return final

In [None]:
def bowler_info_extract(df,listA,innings):
    temp=bowlerInfo(df)
    final=pd.DataFrame()
    for x in play1_list:
        x=float(x)
        filtered_data = temp[(temp['bowler_id'] == x) & (temp['inning'] == innings)]
        final=pd.concat([final, filtered_data], axis=0)
    return final

In [None]:
batt=batsman.rename(columns={'bowler_details': 'batsman_bowler_details'})

# Merge the dataframes
batsman_match = pd.merge(batt, match, on='match id', how='left')
combined_df = pd.merge(batsman_match, bowler, on=['match id', 'bowler_id'], how='left')

# Create a 'team' column which indicates which team the batsman played against
combined_df['opponent_team'] = combined_df.apply(
    lambda row: row['team1'] if row['batsman'] in row['team2'] else row['team2'], axis=1)

# Function to split bowler details
def split_bowler_details(row):
    if pd.isna(row):
        return np.nan, np.nan, np.nan
    details = row.split(':')
    return details[0], details[1], details[2]

# Splitting the bowler details into separate columns
combined_df[['bowler_country', 'bowler_bat_style', 'bowler_bowl_style']] = combined_df['bowler_details'].apply(lambda x: pd.Series(split_bowler_details(x)))
combined_df[['batsman_bowler_country', 'batsman_bowler_bat_style', 'batsman_bowler_bowl_style']] = combined_df['batsman_bowler_details'].apply(lambda x: pd.Series(split_bowler_details(x)))

# Group by batsman and opponent_team
grouped = combined_df.groupby(['batsman_id', 'opponent_team'])

# Aggregate function
def aggregate_batsman(group):
    total_runs = group['runs_x'].sum()
    total_balls = group['balls_faced'].sum()
    not_outs = group['wicket kind'].isna().sum()
    
    # Filtering the rows where the batsman got out
    out_records = group.dropna(subset=['wicket kind'])
    
    # Counting dismissals by different bowling styles
    dismissals = out_records.groupby(['bowler_country', 'bowler_bowl_style']).size().reset_index(name='dismissal_count')
    
    # Converting the dismissals DataFrame to a dictionary
    dismissals_dict = dismissals.to_dict('records')
    if total_balls!=0:
        strike_rate=total_runs/total_balls
    else:
        strike_rate=total_runs
    if (len(group)-not_outs)!=0:
        average=total_runs/(len(group)-not_outs)
    else:
        average=total_runs
    return pd.Series({
        'total_runs': total_runs,
        'total_balls': total_balls,
        'strike_rate':strike_rate,
        'not_outs': not_outs,
        'average':average,
        'dismissals': dismissals_dict
    })

# Applying the aggregate function
opponent_summary = grouped.apply(aggregate_batsman).reset_index()

# Grouping by batsman_id again to nest opponent team details
final_summary = opponent_summary.groupby('batsman_id').apply(
    lambda df: pd.Series({
        'total_runs': df['total_runs'].sum(),
        'total_balls': df['total_balls'].sum(),
        'not_outs': df['not_outs'].sum(),
        'opponent_team_summary': df[['opponent_team', 'total_runs', 'total_balls', 'not_outs', 'dismissals']].to_dict('records')
    })
).reset_index()
final_summary['strike_rate']=(final_summary['total_runs']/final_summary['total_balls'])*100
final_summary['opponent_team_summary']=final_summary['opponent_team_summary'].apply(convert)


def season_var(df,season,toss_decision):
    def calculate_season_win_percentage(group):
        results = {
            'bat_win_percentage': 0,
            'bowl_win_percentage': 0
        }
        n=len(group)
        bat_matches = group[group['toss decision'] == 'bat']
        bowl_matches = group[group['toss decision'] == 'field']
        if not bat_matches.empty:
            bat_wins = (bat_matches['toss winner'] == bat_matches['winner']).sum()
            results['bat_win_percentage'] = (bat_wins / len(bat_matches)) * 100

        if not bowl_matches.empty:
            bowl_wins = (bowl_matches['toss winner'] == bowl_matches['winner']).sum()
            results['bowl_win_percentage'] = (bowl_wins / len(bowl_matches)) * 100

        return pd.Series(results)

    grouped = df.groupby(['season'])

    result_season_effect = grouped.apply(calculate_season_win_percentage).reset_index()
    x=result_season_effect[result_season_effect["season"]==season]
    if toss_decision=='bat':
        return x['bat_win_percentage']
    return x['bowl_win_percentage']
def toss_decision(df,venue_name,toss_decision):
    def calculate_decision_win_percentage(group):
        # Initialize a dictionary to store the results
        results = {
            'bat_win_percentage': 0,
            'bowl_win_percentage': 0
        }

        bat_matches = group[group['toss decision'] == 'bat']
        bowl_matches = group[group['toss decision'] == 'field']
        if not bat_matches.empty:
            bat_wins = (bat_matches['toss winner'] == bat_matches['winner']).sum()
            results['bat_win_percentage'] = (bat_wins / len(bat_matches)) * 100

        if not bowl_matches.empty:
            bowl_wins = (bowl_matches['toss winner'] == bowl_matches['winner']).sum()
            results['bowl_win_percentage'] = (bowl_wins / len(bowl_matches)) * 100

        return pd.Series(results)
    result_toss_decision = df.groupby('venue').apply(calculate_decision_win_percentage).reset_index()
    x=result_toss_decision[result_toss_decision["venue"]==venue_name]
    if toss_decision =="bat":
        return x['bat_win_percentage']
    return x['bowl_win_percentage']
    return result_toss_decision

def avgScoreTeam(df,team): 
    df = df.sort_values(by='match_dt')
    def calculate_average_last_10(group):
        # Filter matches where team was toss winner and chose to bat or field
        filtered_matches = group[group['toss winner'] == group['team1']]  # Assuming team1 is always the toss winner

        # Calculate number of matches available
        total_matches = len(filtered_matches)

        # If total matches available is less than 10, take all matches
        if total_matches < 10:
            last_10_matches = filtered_matches
        else:
            last_10_matches = filtered_matches.tail(10)

        # Calculate average innings balls
        if group['toss decision'].iloc[0] == 'bat':
            average_score = last_10_matches['inning1_runs'].mean()
        else:
            average_score = last_10_matches['inning2_runs'].mean()
        
        return average_score

    # Group by 'team1' and apply the function to calculate average score in last 10 matches
    result_df = df.groupby('team1').apply(calculate_average_last_10).reset_index(name='average_innings_Score')
    return result_df[result_df['team1']==team]['average_innings_Score']

def score_cal(df1,df2):
    team1=0
    team2=0
    x1=df1.sum()
    i1=x1['total_wickets']
    j1=x1['total_balls_bowled']
    k1=x1['total_runs']
    team1eco=(k1/j1)*6
    x2=df2.sum()
    i2=x2['total_wickets']
    j2=x2['total_balls_bowled']
    k2=x2['total_runs']
    team2eco=(k2/j2)*6
    cnt1=len(df1[(df1['strike_rate'] < 20) & (df1['strike_rate'] != -1)])
    cnt2=len(df2[(df2['strike_rate'] < 20) & (df2['strike_rate'] != -1)])
    team1 += 1 if cnt1 > cnt2 else 0
    team2 += 1 if cnt2 > cnt1 else 0
    team1 += 1 if team1eco < team2eco else 0 
    team2 += 1 if team1eco > team2eco else 0
    team1 += 1 if i1 > i2 else  0
    team2 += 1 if i1 < i2 else 0
    team1 += 1 if k1/i1 < k2/i2 else 0
    team2 += 1 if k1/i1 > k2/i2 else 0
    cnt1=x1['inning']
    cnt2=x2['inning']
    team1 += 1 if cnt1 > cnt2 else 0
    team2 += 1 if cnt2 > cnt1 else 0
    return [team1 ,team2]
def bats_score(df1,df2):
    team1,team2=0,0
    x1=df1.drop(columns=['batsman_id','opponent_team_summary']).sum()
    x2=df2.drop(columns=['batsman_id','opponent_team_summary']).sum()
    i1,j1,k1,s1=x1['total_runs'],x1['total_balls'],x1['not_outs'],x1['total_runs']/x1['total_balls']
    i2,j2,k2,s2=x2['total_runs'],x2['total_balls'],x2['not_outs'],x2['total_runs']/x2['total_balls']
    ta1,tb1=x1['total_runs_against'],x1['total_balls_against']
    ta2,tb2=x2['total_runs_against'],x2['total_balls_against']
    team1 += 1 if i1 > i2 else 0
    team2 += 1 if i1 < i2 else 0
    team1 += 1 if k1 > k2 else 0
    team2 += 1 if k1 < k2 else 0
    team1 += 1 if s1 > s2 else 0
    team2 += 1 if s1 < s2 else 0
    team1 += 1 if ta1 > ta2 else 0
    team2 += 1 if ta1 < ta2 else 0
    team1 += 1 if tb1 > tb2 else 0
    team2 += 1 if tb1 < tb2 else 0
    return [team1,team2]
def bowlerInfo(df,bowler_id=None, inning=None):
    group=df.groupby(['bowler_id','inning'])
    def calculate_economy(group):
        total_runs = group['runs'].sum()
        total_balls = group['balls_bowled'].sum()
        economy_rate = (total_runs / total_balls) * 6
        bowler_style = group['bowler_details'].iloc[0].split(':')[2].strip()
        if group['wicket_count'].sum()!=0:
            strike_rate=total_runs/group['wicket_count'].sum()
        else:
            strike_rate=-1
        return pd.Series({
            'strike_rate':strike_rate, 
            'total_runs': total_runs,
            'total_wickets': group['wicket_count'].sum(),
            'total_balls_bowled': total_balls,
            'economy_rate': economy_rate,
            
            'total_maidens': group['maiden'].sum(),
            'total_dots': group['dots'].sum(),
            'total_fours': group['Fours'].sum(),
            'total_sixes': group['Sixes'].sum(),
            'total_wides': group['wides'].sum(),
            'total_noballs': group['noballs'].sum(),
            'bowler_style': bowler_style
        })

    group_summary_bowler=group.apply(calculate_economy).reset_index()
    dropcol=[ 'total_maidens', 'total_dots',
           'total_fours', 'total_sixes', 'total_wides', 'total_noballs']
    group_summary_bowler=group_summary_bowler.drop(dropcol,axis=1)
    if bowler_id is not None and inning is not None:
        filtered_data = group_summary_bowler[(group_summary_bowler['bowler_id'] == bowler_id) & 
                                             (group_summary_bowler['inning'] == inning)]
        return filtered_data
    else:
        return group_summary_bowler
def batsmen_info_extract(listA,opponent,final_summary):
    final=pd.DataFrame()
    for x in listA:
        x=float(x)
        temp=final_summary[final_summary['batsman_id']==x]
        against_opp=temp['opponent_team_summary']
        if opponent in against_opp:
            record=against_opp[opponent]
            temp['total_runs_against']=record['total_runs']
            temp['total_balls_against']=record['total_balls']
        else:
            temp['total_runs_against']=0
            temp['total_balls_against']=0
        final=pd.concat([final, temp], axis=0)
    return final
def bowler_info_extract(df,listA,innings):
    temp=bowlerInfo(df)
    final=pd.DataFrame()
    for x in play1_list:
        x=float(x)
        filtered_data = temp[(temp['bowler_id'] == x) & (temp['inning'] == innings)]
        final=pd.concat([final, filtered_data], axis=0)
    return final
def toss_decision_(df,venue_name,toss_decision):
    def calculate_decision_win_percentage(group):
        # Initialize a dictionary to store the results
        results = {
            'bat_win_percentage': 0,
            'bowl_win_percentage': 0
        }

        bat_matches = group[group['toss decision'] == 'bat']
        bowl_matches = group[group['toss decision'] == 'field']
        if not bat_matches.empty:
            bat_wins = (bat_matches['toss winner'] == bat_matches['winner']).sum()
            results['bat_win_percentage'] = (bat_wins / len(bat_matches)) * 100

        if not bowl_matches.empty:
            bowl_wins = (bowl_matches['toss winner'] == bowl_matches['winner']).sum()
            results['bowl_win_percentage'] = (bowl_wins / len(bowl_matches)) * 100

        return pd.Series(results)
    result_toss_decision = df.groupby('venue').apply(calculate_decision_win_percentage).reset_index()
    x=result_toss_decision[result_toss_decision["venue"]==venue_name]
    if toss_decision =="bat":
        return x['bat_win_percentage'][0]
    return x['bowl_win_percentage']
    return result_toss_decision[0]

def major_traindata(df,match,bowler,batsman,final_summary):
    detail={}
    detail['team1']=df['team1'][0]
    detail['team2']=df['team2'][0]
    detail['season']=df['season'][0]
    detail['venue']=df['venue'][0]
    detail['match_id']=df['match id'][0]
    detail['match_dt']=df['match_dt'][0]
    detail['toss_winner']=df['toss winner'][0]
    detail['toss_decision']=df['toss decision'][0]
    if (detail['toss_winner']==detail['team1'] and detail['toss_decision']=='bat') and detail['toss_winner']!=detail['team1'] and detail['toss_decision']=='feild' :
        bowl_innin_team1=2
        bowl_innin_team2=1
    else:
        bowl_innin_team1=1
        bowl_innin_team2=2
    play1_list=df['team1_roster_ids'][0].split(":")
    play2_list=df['team2_roster_ids'][0].split(":")

    match_temp=match[match['match_dt']< pd.to_datetime(detail['match_dt'])]
    bowler_temp=bowler[bowler['match_dt']< pd.to_datetime(detail['match_dt'])]
    batsman_temp=batsman[batsman['match_dt']< pd.to_datetime(detail['match_dt'])]
    win_chance_toss=toss_decision_(match_temp,detail['venue'],detail['toss_decision']).iloc[0]
    win_chance_season=season_var(match_temp,detail['season'],detail['toss_decision']).iloc[0]
    # Ratio of average score scored by a team in last 10 games 
    chance=(avgScoreTeam(match_temp,detail['team1']).iloc[0])/avgScoreTeam(match_temp,detail['team1']).iloc[0]
    bowlerList1=bowler_info_extraxt(bowler_temp,play1_list,bowl_innin_team1)
    bowlerList2=bowler_info_extraxt(bowler_temp,play2_list,bowl_innin_team2)
    bowl_power=score_cal(bowlerList1,bowlerList2)
    bowl_strong=(1+bowl_power[0])/(1+bowl_power[1])
    batsman_list1=batsmen_info_extraxt(play1_list,detail['team2'],final_summary)
    batsman_list2=batsmen_info_extraxt(play2_list,detail['team1'],final_summary)
    bat_power=bats_score(batsman_list1,batsman_list2)
    bat_strong=(1+bat_power[0])/(1+bat_power[1])
    df['bowl_strong']=bowl_strong
    df['bat_strong']=bat_strong
    df['chance']=chance
    df['win_chance_toss']=win_chance_toss/(1-win_chance_toss)
    df['win_chance_season']=win_chance_season/(1-win_chance_season)
    dropCol=['match id','team1_id','team1_roster_ids','team2_id','team2_roster_ids','match_dt','ground_id','season',
            'series_name','lighting','venue','city','winner_id']
    dff=df.drop(dropCol,axis=1)
    return dff

In [None]:
import pandas as pd

# Define your function
def major_traindata(df, match, bowler, batsman, final_summary):
    detail = {}
    detail['team1'] = df['team1'].values[0]
    detail['team2'] = df['team2'].values[0]
    detail['season'] = df['season'].values[0]
    detail['venue'] = df['venue'].values[0]
    detail['match_id'] = df['match id'].values[0]
    detail['match_dt'] = df['match_dt'].values[0]
    detail['toss_winner'] = df['toss winner'].values[0]
    detail['toss_decision'] = df['toss decision'].values[0]
    
    if (detail['toss_winner'] == detail['team1'] and detail['toss_decision'] == 'bat') or (detail['toss_winner'] == detail['team2'] and detail['toss_decision'] == 'field'):
        bowl_innin_team1 = 2
        bowl_innin_team2 = 1
    else:
        bowl_innin_team1 = 1
        bowl_innin_team2 = 2
    
    play1_list = df['team1_roster_ids'].values[0].split(":")
    play2_list = df['team2_roster_ids'].values[0].split(":")

    match_temp = match[match['match_dt'] < pd.to_datetime(detail['match_dt'])]
    bowler_temp = bowler[bowler['match_dt'] < pd.to_datetime(detail['match_dt'])]
    batsman_temp = batsman[batsman['match_dt'] < pd.to_datetime(detail['match_dt'])]
    
    win_chance_toss = toss_decision_(match_temp, detail['venue'], detail['toss_decision']).iloc[0]
    win_chance_season = season_var(match_temp, detail['season'], detail['toss_decision']).iloc[0]
    
    chance = avgScoreTeam(match_temp, detail['team1']).iloc[0] / avgScoreTeam(match_temp, detail['team2']).iloc[0]
    
    bowlerList1 = bowler_info_extract(bowler_temp, play1_list, bowl_innin_team1)
    bowlerList2 = bowler_info_extract(bowler_temp, play2_list, bowl_innin_team2)
    bowl_power = score_cal(bowlerList1, bowlerList2)
    bowl_strong = (1 + bowl_power[0]) / (1 + bowl_power[1])
    
    batsman_list1 = batsmen_info_extract(play1_list, detail['team2'], final_summary)
    batsman_list2 = batsmen_info_extract(play2_list, detail['team1'], final_summary)
    bat_power = bats_score(batsman_list1, batsman_list2)
    bat_strong = (1 + bat_power[0]) / (1 + bat_power[1])
    
    df['bowl_strong'] = bowl_strong
    df['bat_strong'] = bat_strong
    df['chance'] = chance
    df['win_chance_toss'] = win_chance_toss / (1 - win_chance_toss)
    df['win_chance_season'] = win_chance_season / (1 - win_chance_season)
    
    dropCol = ['match id', 'team1_id', 'team1_roster_ids', 'team2_id', 'team2_roster_ids', 'match_dt', 'ground_id', 'season',
               'series_name', 'lighting', 'venue', 'city', 'winner_id']
    dff=df.drop(dropCol, axis=1 )
    
    return dff

# Apply the function to each row
def apply_to_each_row(df, match, bowler, batsman, final_summary):
    results = df.apply(lambda row: major_traindata(row.to_frame().T, match, bowler, batsman, final_summary), axis=1)
    return pd.concat(results.values, ignore_index=True)

# Example usage
# Assuming df, match, bowler, batsman, and final_summary are already defined
result_df = apply_to_each_row(df, match, bowler, batsman, final_summary)


In [None]:
train.head(1)['team1'].val