In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib
from matplotlib import pyplot as plt

Reading from the CSVs

In [None]:
batsman = pd.read_csv('/content/663e2b548c98c_batsman_level_scorecard (1).csv')
bowler = pd.read_csv('/content/663e2b2c60743_bowler_level_scorecard (1).csv')
matches = pd.read_csv('/content/663e2b6d54457_train_data_with_samplefeatures (1).csv')

In [None]:
matches.dtypes
matches['match_dt'] = pd.to_datetime(matches['match_dt'], dayfirst=True, errors='coerce')
batsman['match_dt'] = pd.to_datetime(batsman['match_dt'], dayfirst=True, errors='coerce')
bowler['match_dt'] = pd.to_datetime(bowler['match_dt'], dayfirst=True, errors='coerce')


matches.columns

  matches['match_dt'] = pd.to_datetime(matches['match_dt'], dayfirst=True, errors='coerce')


Index(['match id', 'team1', 'team1_id', 'team1_roster_ids', 'team2',
       'team2_id', 'team2_roster_ids', 'winner', 'winner_id', 'toss winner',
       'toss decision', 'venue', 'city', 'match_dt', 'lighting', 'series_name',
       'season', 'ground_id', 'team_count_50runs_last15', 'team_winp_last5',
       'team1only_avg_runs_last15', 'team1_winp_team2_last15',
       'ground_avg_runs_last15'],
      dtype='object')

One hot encoding of day and Night

In [None]:

lighting_dummies = pd.get_dummies(matches['lighting'], prefix='lighting')

# Concatenate the original DataFrame with the new one-hot encoded columns
matches = pd.concat([matches, lighting_dummies], axis=1)
matches.columns

Index(['match id', 'team1', 'team1_id', 'team1_roster_ids', 'team2',
       'team2_id', 'team2_roster_ids', 'winner', 'winner_id', 'toss winner',
       'toss decision', 'venue', 'city', 'match_dt', 'lighting', 'series_name',
       'season', 'ground_id', 'team_count_50runs_last15', 'team_winp_last5',
       'team1only_avg_runs_last15', 'team1_winp_team2_last15',
       'ground_avg_runs_last15', 'lighting_day match',
       'lighting_day/night match', 'lighting_night match'],
      dtype='object')

In [None]:
batsman[['country', 'bat-type', 'ball-type']] = batsman['batsman_details'].str.split(':', expand=True).iloc[:, 0:3]
bowler[['country', 'bat-type', 'ball-type']] = bowler['bowler_details'].str.split(':', expand=True).iloc[:, 0:3]
batsman.columns
bowler.columns

Index(['match id', 'bowler', 'bowler_id', 'bowler_details',
       'is_bowler_captain', 'is_bowler_keeper', 'inning', 'runs',
       'wicket_count', 'balls_bowled', 'economy', 'maiden', 'dots', 'Fours',
       'Sixes', 'wides', 'noballs', 'match_dt', 'country', 'bat-type',
       'ball-type'],
      dtype='object')

In [None]:
match_lvl_data = matches
batsman_lvl_data= batsman
bowler_lvl_data = bowler

Average Strike Rate in last n matches

In [None]:
def avgstrikerate(player_id, date, n):
    # Ensure date is a pandas datetime object
    date = pd.to_datetime(date)

    # Filter and sort the DataFrame
    filtered_df = batsman[(batsman['match_dt'] < date) & (batsman['batsman_id'] == player_id)].sort_values(by='match_dt', ascending=False).head(n)

    # Calculate the mean of the 'strike_rate' column
    mean_strike_rate = filtered_df['strike_rate'].mean()
    return mean_strike_rate

In [None]:
def avgStrikeRateTeam(match_id, n, date, team, row):
    # Get the list of players for the team
    if row['team1'] == team:
        player_list = row['team1_roster_ids']
    else:
        player_list = row['team2_roster_ids']

    # Convert the player list from string to a list of integers
    player_list = [int(float(x)) for x in player_list.split(':')]

    count = 0
    total_strike_rate = 0

    for player_id in player_list:
        if player_id in batsman['batsman_id'].values:
            count += 1
            total_strike_rate += avgstrikerate(player_id, date, n)

    # Return average strike rate
    return total_strike_rate / max(count, 1)  # Avoid division by zero


In [None]:
def compute_avg_strike_rate(row, n):
    match_id = row['match id']
    date = row['match_dt']
    team1 = row['team1']
    team2 = row['team2']

    team1_avg_strike_rate = avgStrikeRateTeam(match_id, n, date, team1, row)
    team2_avg_strike_rate = avgStrikeRateTeam(match_id, n, date, team2, row)

    return pd.Series([team1_avg_strike_rate, team2_avg_strike_rate])

Win rate at that stadium in last n matches for that team

In [None]:
def avgRunsGroundTeam(ground_id, date, n, matchid):
    '''
    Function to calculate average runs scored in ground/venue.

    Input-
    1. ground_id: ID of the ground to calculate the feature for.
    2. date: match date of the current game to calculate the feature for.
    3. n: look-back window of games for the ground.

    Output- None

    Returns- Average runs scored in the ground.
    '''
    # filter out games with ground_id being the input ground_id and date earlier than current game's input date. Sort desc by date, and select top n rows (games).
    df_rel = matches[(matches['match_dt']<pd.to_datetime(date))&(matches['ground_id']==ground_id)].sort_values(by='match_dt', ascending=False).head(n)
    df_rel['avg_runs_inn'] = (df_rel['inning1_runs']+df_rel['inning2_runs'])/2 # take the mean of inning1_runs and inning2_runs in a separate column.
    return df_rel['avg_runs_inn'].mean() # Return the mean value of the computed column above.


Function to calculate percentage of matches won during d/n/(d/n)

In [None]:
def calculate_win_percentages(team_name, date, n):
    # Convert date to pandas datetime
    date = pd.to_datetime(date)

    # Filter matches for the given team and date
    team_matches = matches[(matches['team1'] == team_name) | (matches['team2'] == team_name)]
    team_matches = team_matches[pd.to_datetime(team_matches['match_dt']) <= date].tail(n)

    # Total matches played by the team
    total_matches = len(team_matches)

    if total_matches == 0:
        return {'day match': 0, 'night match': 0, 'day/night match': 0}

    # Initialize win counts
    win_counts = {'day match': 0, 'night match': 0, 'day/night match': 0}

    # Iterate through the filtered matches
    for _, match in team_matches.iterrows():
        if match['winner'] == team_name:
            # Accessing the boolean columns directly
            if match['lighting_day match']:
                win_counts['day match'] += 1
            elif match['lighting_night match']:
                win_counts['night match'] += 1
            elif match['lighting_day/night match']:
                win_counts['day/night match'] += 1

    # Calculate win percentages
    win_percentages = {key: (count / total_matches) * 100 for key, count in win_counts.items()}

    return win_percentages


In [None]:
def compute_win_percentage(row, n):
    match_id = row['match id']
    date = row['match_dt']
    team1 = row['team1']
    team2 = row['team2']

    team1_avg_win_percentage = calculate_win_percentages(team1, date, n)
    team1_avg_win_percent = team1_avg_win_percentage[row['lighting']]

    team2_avg_win_percentage = calculate_win_percentages(team2, date, n)
    team2_avg_win_percent = team2_avg_win_percentage[row['lighting']]

    return pd.Series([team1_avg_win_percent, team2_avg_win_percent])


Tester

In [None]:
print(calculate_win_percentages('Ae Ss', '26-12-2022', 10))

{'day match': 10.0, 'night match': 40.0, 'day/night match': 10.0}


  date = pd.to_datetime(date)


Win ratio in direct encounters

In [None]:
# Function to calculate win ratio between two teams before a given date
def calculate_win_ratio( team1, team2, date):
    # Convert date to pandas datetime
    date = pd.to_datetime(date)

    # Filter matches where team1 and team2 have played against each other before the given date
    relevant_matches = matches[((matches['team1'] == team1) & (matches['team2'] == team2)) |
                               ((matches['team1'] == team2) & (matches['team2'] == team1))]
    relevant_matches = relevant_matches[pd.to_datetime(relevant_matches['match_dt']) < date]

    # Total matches played between the two teams
    total_matches = len(relevant_matches)

    if total_matches == 0:
        return  pd.Series([0, 0])

    # Initialize win counts
    win_counts = {team1: 0, team2: 0}
    # print(relevant_matches)
    # Iterate through the filtered matches
    for _, match1 in relevant_matches.iterrows():
        if match1['winner'] == team1:
            win_counts[team1] += 1
        elif match1['winner'] == team2:
            win_counts[team2] += 1

    # Calculate win ratios
    win_ratio = {team: (count / total_matches) * 100 for team, count in win_counts.items()}

    return pd.Series([win_ratio[team1],win_ratio[team2]])


In [None]:
print(calculate_win_ratio('Ae Ss','Ph Ss', '26-12-2022'))

0    0
1    0
dtype: int64


  date = pd.to_datetime(date)


%matches won on that venue

In [None]:
def calculate_win_percentage_ong(team, date, ground):
    # Convert date to pandas datetime
    date = pd.to_datetime(date, dayfirst=True, errors='coerce')
    # print(date.year)
    if pd.isna(date):
        raise ValueError("Incorrect date format. Please provide the date in a valid format.")

    # Filter matches where the specified team played at the specified ground before the given date
    relevant_matches = matches[((matches['team1'] == team) | (matches['team2'] == team)) & (matches['venue'] == ground) & (matches['match_dt'] < date)]
    # print(relevant_matches)

    # Total matches played by the team at the specified ground
    total_matches = len(relevant_matches)
    # print(total_matches)

    if total_matches == 0:
        return 0  # If no matches are found, return 0

    # Count the number of wins for the specified team
    win_count = sum(relevant_matches['winner'] == team)

    # Calculate win percentage
    win_percentage = (win_count / total_matches) * 100

    return win_percentage


In [None]:
def compute_venue(row):
    match_id = row['match id']
    date = row['match_dt']
    team1 = row['team1']
    team2 = row['team2']
    ground= row['venue']
    team1_venue = calculate_win_percentage_ong(team1, date, ground)
    team2_venue = calculate_win_percentage_ong(team2, date, ground)

    return pd.Series([team1_venue, team2_venue])

In [None]:
print(calculate_win_percentage_ong('Mx', '02-06-2023','Taunton' ))

0


Avg wicket taken by that bowler in Last n  games

In [None]:
def calculate_avg_wickets( players, date, n):
    player_id=players.split(':')
    player_ids = [np.int64(float(id_str)) for id_str in player_id]
    wickets=0
    date = pd.to_datetime(date, dayfirst=True, errors='coerce')

    if pd.isna(date):
        raise ValueError("Incorrect date format. Please provide the date in a valid format.")
    for bowler_id in player_ids:
      # Convert date to pandas datetime


      # Filter data for the specified bowler and games before the given date
      relevant_games = bowler[(bowler['bowler_id'] == bowler_id) &
                                  (pd.to_datetime(bowler['match_dt'], dayfirst=True, errors='coerce') < date)]

      # Sort games by date in descending order (most recent first)
      relevant_games = relevant_games.sort_values(by='match_dt', ascending=False)

      # Select the last n games
      last_n_games = relevant_games.head(n)

      # Calculate the average number of wickets
      wickets += last_n_games['wicket_count'].sum()

    return wickets/n


In [None]:
def compute_avg_wickets(row,n):
    match_id = row['match id']
    date = row['match_dt']
    players1 = row['team1_roster_ids']
    players2 = row['team2_roster_ids']
    # ground= row['venue']
    team1_venue = calculate_avg_wickets( players1, date, n)
    team2_venue = calculate_avg_wickets( players2, date, n)

    return pd.Series([team1_venue, team2_venue])

In [None]:
print(calculate_avg_wickets('3220566.0:5229230.0:2082499.0:5764576.0:4005406.0:2083227.0:1482249.0:1707628.0:2538983.0:6060844.0:3973528.0','01-06-2022', 5))

6.2


%Win of a team till now in that season

In [None]:
def calculate_win_percentage_for_season( team, season, date):
    # Convert date to pandas datetime
    date = pd.to_datetime(date, dayfirst=True, errors='coerce')

    if pd.isna(date):
        raise ValueError("Incorrect date format. Please provide the date in a valid format.")

    # Filter matches for the specified team and season up to the given date
    relevant_matches = matches[((matches['team1'] == team) | (matches['team2'] == team)) &
                               (matches['season'] == season) &
                               (pd.to_datetime(matches['match_dt'], dayfirst=True, errors='coerce') <= date)]

    # Total matches played by the team in the specified season up to the given date
    total_matches = len(relevant_matches)

    if total_matches == 0:
        return 0  # If no matches are found, return 0

    # Count the number of wins for the specified team
    win_count = sum(relevant_matches['winner'] == team)

    # Calculate win percentage
    win_percentage = (win_count / total_matches) * 100

    return win_percentage


In [None]:
def compute_avg_win_season(row,n):
    match_id = row['match id']
    date = row['match_dt']
    team1=row['team1']
    team2=row['team2']
    players1 = row['team1_roster_ids']
    players2 = row['team2_roster_ids']
    season= row['season']
    # ground= row['venue']
    team1_season = calculate_win_percentage_for_season( team1, season, date)
    team2_season = calculate_win_percentage_for_season( team2, season, date)

    return pd.Series([team1_season, team2_season])

In [None]:
print(calculate_win_percentage_for_season( 'Ge', '2023', '20-06-2023'))

42.857142857142854


Avg Dot balls in last n

In [None]:
def avg_dot_balls_last_n_matches(n,team_id,match_date):
    match_date=pd.to_datetime(match_date, dayfirst=True)
    last_n_matches = matches[((matches['team1'] == team_id) | (matches['team2'] == team_id)) & ( pd.to_datetime(matches['match_dt'], dayfirst=True) < match_date)].sort_values('match_dt', ascending=False).head(n)
    total_dots = 0
    for _, match in last_n_matches.iterrows():  # Iterate over DataFrame rows
        if team_id == match['team2']:
            players = match['team1_roster_ids'].split('.0:')  # Access roster_ids from the row
        else:
            players = match['team2_roster_ids'].split('.0:')  # Access roster_ids from the row

        int_players = [int(float(player_id)) for player_id in players if player_id.strip()]  # Convert to int
        for player_id in int_players:
            player_data = bowler[(bowler['bowler_id'] == player_id) & (bowler['match id'] == match['match id'])]
            if not player_data.empty :
              total_dots += player_data['dots'].sum()

    total_dots = total_dots / n if n > 0 else 1
    return total_dots

In [None]:
def compute_avg_dot_balls_last(row,n):
    match_id = row['match id']
    date = row['match_dt']
    team1=row['team1']
    team2=row['team2']
    players1 = row['team1_roster_ids']
    players2 = row['team2_roster_ids']
    season= row['season']
    # ground= row['venue']
    team1_dot = avg_dot_balls_last_n_matches(n,team1,date)
    team2_dot = avg_dot_balls_last_n_matches(n,team2,date)

    return pd.Series([team1_dot, team2_dot])

Win%_in_series

In [None]:
def extract_player_ids(match_id, team):
    # Filter dataset for the given match id
    match_row = match_lvl_data[match_lvl_data['match id'] == match_id].iloc[0]  # Assuming match_id is unique

    # Determine which team's roster to extract based on team_id
    if team == match_row['team1']:
        players = match_row['team1_roster_ids'].split('.0:')  # Access roster_ids from the row
    else:
        players = match_row['team2_roster_ids'].split('.0:')  # Access roster_ids from the ro
    int_players = [int(float(player_id)) for player_id in players if player_id.strip()]  # Convert to int
    return int_players

In [None]:
print(avg_dot_balls_last_n_matches(5,'Mx','02-06-2023'))

35.8


In [None]:
# Feature 1 :: Ratio of win % of team 1 and team 2 in this series
def ratio_of_win_percentage_in_series(team1,series):
    # Filter the dataframe for the specific series
    series_df = match_lvl_data[match_lvl_data['series_name'] == series]
    # Count total matches the team1 participated in
    total_matches_1 = series_df[(series_df['team1'] == team1) | (series_df['team2'] == team1)].shape[0]
    # Count total matches the team2 participated in
    wins1 = series_df[series_df['winner'] == team1].shape[0]
    # Count matches won by the team2
    # Calculate the win percentage
    win_percentage1 = (wins1 / max(total_matches_1,1)) * 100

    return win_percentage1

In [None]:
def compute_win_percent_series(row,n):
    match_id = row['match id']
    date = row['match_dt']
    team1=row['team1']
    team2=row['team2']
    players1 = row['team1_roster_ids']
    players2 = row['team2_roster_ids']
    season= row['season']
    series=row['series_name']
    # ground= row['venue']
    team1_series = ratio_of_win_percentage_in_series(team1,series)
    team2_series = ratio_of_win_percentage_in_series(team2,series)

    return pd.Series([team1_series, team2_series])

Avg_boundaries

In [None]:
def avg_boundaries_last_n_matches(n ,match_date,team_id):
    match_date = pd.to_datetime(match_date, dayfirst=True)
    last_n_matches = match_lvl_data[((match_lvl_data['team1'] == team_id) | (match_lvl_data['team2'] == team_id)) & (match_lvl_data['match_dt'] < match_date)].sort_values('match_dt', ascending=False).head(n)
    total_boundaries = 0
    for _, match in last_n_matches.iterrows():  # Iterate over DataFrame rows
        int_players = extract_player_ids(match['match id'],team_id)  # Convert to int
        for player_id in int_players:
            player_data = batsman_lvl_data[(batsman_lvl_data['batsman_id'] == player_id) & (batsman_lvl_data['match id'] == match['match id'])]
            if not player_data.empty :
              total_boundaries += player_data['Fours'].sum()
              total_boundaries += player_data['Sixes'].sum()

    total_boundaries = total_boundaries / n if n > 0 else 1
    return total_boundaries

In [None]:
def compute_avg_boundaries(row,n):
    match_id = row['match id']
    date = row['match_dt']
    team1=row['team1']
    team2=row['team2']
    players1 = row['team1_roster_ids']
    players2 = row['team2_roster_ids']
    season= row['season']
    series=row['series_name']
    # ground= row['venue']
    team1_bound = avg_boundaries_last_n_matches(n , date , team1)
    team2_bound = avg_boundaries_last_n_matches(n , date , team2)

    return pd.Series([team1_bound, team2_bound])

Avg runs of top3 players

In [None]:
def avg_runs_scored_by_top3_batsman_n_matches(n ,match_date,team_id):
    match_date = pd.to_datetime(match_date, dayfirst=True)
    last_n_matches = match_lvl_data[((match_lvl_data['team1'] == team_id) | (match_lvl_data['team2'] == team_id)) & (match_lvl_data['match_dt'] < match_date)].sort_values('match_dt', ascending=False).head(n)
    total_runs_top3 = 0 # Initialize total_runs_top3 to zero before incrementing
    matches_counted = 0 # Initialize matches_counted to zero before incrementing
    for _, match in last_n_matches.iterrows():  # Iterate over DataFrame rows
        int_players = extract_player_ids(match['match id'],team_id)
        match_runs = []
        for player_id in int_players:
            player_data = batsman_lvl_data[
                (batsman_lvl_data['batsman_id'] == player_id) &
                (batsman_lvl_data['match id'] == match['match id'])
            ]

            if not player_data.empty:
                match_runs.append(player_data['runs'].sum())

        if match_runs:
            top3_runs = sorted(match_runs, reverse=True)[:3]
            total_runs_top3 += sum(top3_runs) # Now you can safely increment
            matches_counted += 1 # Now you can safely increment

    if matches_counted > 0:
        avg_runs_top3 = total_runs_top3 / matches_counted
    else:
        avg_runs_top3 = 0

    return avg_runs_top3

In [None]:
def compute_avg_top3_batsman(row,n):
    match_id = row['match id']
    date = row['match_dt']
    team1=row['team1']
    team2=row['team2']
    players1 = row['team1_roster_ids']
    players2 = row['team2_roster_ids']
    season= row['season']
    series=row['series_name']
    # ground= row['venue']
    team1_top3 = avg_runs_scored_by_top3_batsman_n_matches(n ,date,team1)
    team2_top3 = avg_runs_scored_by_top3_batsman_n_matches(n ,date,team2)

    return pd.Series([team1_top3, team2_top3])

avg_economy

In [None]:
def avg_economy_rate_last_n_matches(n,team_id, date):
    date = pd.to_datetime(date, dayfirst=True)
    last_n_matches = match_lvl_data[((match_lvl_data['team1'] == team_id) | (match_lvl_data['team2'] == team_id)) & (match_lvl_data['match_dt'] < date)].sort_values('match_dt', ascending=False).head(n)
    total_runs_conceded = 0
    total_balls_bowled = 0
    for _, match in last_n_matches.iterrows():  # Iterate over DataFrame rows
        if team_id == match['team1']:
            players = match['team1_roster_ids'].split('.0:')  # Access roster_ids from the row
        else:
            players = match['team2_roster_ids'].split('.0:')  # Access roster_ids from the row

        int_players = [int(float(player_id)) for player_id in players if player_id.strip()]  # Convert to int
        for player_id in int_players:
            player_data = bowler_lvl_data[(bowler_lvl_data['bowler_id'] == player_id) & (bowler_lvl_data['match id'] == match['match id'])]
            if not player_data.empty :
                total_runs_conceded += player_data['runs'].sum()
                total_balls_bowled += player_data['balls_bowled'].sum()
    if total_balls_bowled > 0:
        economy_rate = 6 * total_runs_conceded / total_balls_bowled
    else:
        economy_rate = 0

    return economy_rate


In [None]:
def compute_avg_economy(row,n):
    match_id = row['match id']
    date = row['match_dt']
    team1=row['team1']
    team2=row['team2']
    players1 = row['team1_roster_ids']
    players2 = row['team2_roster_ids']
    season= row['season']
    series=row['series_name']
    # ground= row['venue']
    team1_economy = avg_economy_rate_last_n_matches(n,team1, date)
    team2_economy = avg_economy_rate_last_n_matches(n,team2, date)

    return pd.Series([team1_economy, team2_economy])

Toss-Win ratio

In [None]:
def calculate_toss_win_ratio():
    # Total matches where the toss winner is the match winner
    matches_won_toss = matches[matches['toss winner'] == matches['winner']]

    # Total matches where the toss winner is not the match winner
    total_matches = len(matches)

    # Calculate the ratio of matches won when the toss is won
    toss_win_ratio = len(matches_won_toss) / total_matches if total_matches > 0 else 0

    return toss_win_ratio

In [None]:
def assign_toss_ratio(row, toss_win_ratio):
    if row['toss winner'] == row['team1']:
        team1_toss = toss_win_ratio
        team2_toss = 1 - toss_win_ratio
    else:
        team1_toss = 1 - toss_win_ratio
        team2_toss = toss_win_ratio
    return pd.Series([team1_toss, team2_toss])

In [None]:
batsman.columns

Index(['match id', 'batsman', 'batsman_id', 'batsman_details',
       'is_batsman_captain', 'is_batsman_keeper', 'inning', 'runs',
       'balls_faced', 'over_faced_first', 'wicket kind', 'out_by_bowler',
       'out_by_fielder', 'bowler_id', 'bowler_details', 'is_bowler_keeper',
       'is_bowler_captain', 'strike_rate', 'Fours', 'Sixes', 'match_dt',
       'country', 'bat-type', 'ball-type'],
      dtype='object')

In [None]:
batsman['ball-type']  .unique()

array(['Right-arm medium-fast', 'None', 'Right-arm offbreak',
       'Right-arm fast-medium', 'Right-arm medium',
       'Slow left-arm orthodox', 'Left-arm fast-medium', 'Legbreak',
       'Left-arm wrist-spin', 'Left-arm medium-fast', 'Right-arm fast',
       'Legbreak googly', 'Left-arm medium', 'Left-arm fast',
       'Right-arm slow-medium', 'Right-arm slow', 'Left-arm slow-medium',
       'Left-arm slow', 'Right-arm bowler'], dtype=object)

In [None]:
batsman['bat-type']  .unique()

array(['Right-hand bat', 'Left-hand bat', 'None'], dtype=object)

In [None]:
batsman['country']  .unique()

array(['NZ', 'AUS', 'PAK', 'SA', 'AFG', 'ENG', 'WI', 'ITA', 'NED', 'IND',
       'NEP', 'NAM', 'IRE', 'SL', 'ZIM', 'BAN', 'USA', 'SCOT', 'BER',
       'UAE', 'PNG', 'OMA', 'HKG', 'JER', 'CAY', 'COK', 'GER', 'RWN',
       'UGA', 'CZK-R', 'Aut', 'MOZ', 'GRC', 'BOT', 'BHR', 'None', 'SGP',
       'NGA', 'KENYA', 'TAN', 'GHA', 'DEN', 'QAT', 'CAN', 'KUW', 'MAS',
       'MWI'], dtype=object)

In [None]:
train_data_set = matches[['match id', 'team1', 'team2', 'match_dt', 'winner','lighting', 'team1_roster_ids', 'team2_roster_ids', 'ground_id','venue', 'city', 'lighting_day match','lighting_day/night match', 'lighting_night match','season', 'series_name', 'toss winner']
       ]
n=5
train_data_set['match_dt'] = pd.to_datetime(train_data_set['match_dt'], dayfirst=True, errors='coerce')
train_data_set.columns


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_set['match_dt'] = pd.to_datetime(train_data_set['match_dt'], dayfirst=True, errors='coerce')


Index(['match id', 'team1', 'team2', 'match_dt', 'winner', 'lighting',
       'team1_roster_ids', 'team2_roster_ids', 'ground_id', 'venue', 'city',
       'lighting_day match', 'lighting_day/night match',
       'lighting_night match', 'season', 'series_name', 'toss winner'],
      dtype='object')

In [None]:
train_data_set[['team1_avg_strike_rate', 'team2_avg_strike_rate']] = train_data_set.apply(lambda row: compute_avg_strike_rate(row, n), axis=1)
train_data_set[['team1_avg_strike_rate', 'team2_avg_strike_rate']].describe()
train_data_set.columns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_set[['team1_avg_strike_rate', 'team2_avg_strike_rate']] = train_data_set.apply(lambda row: compute_avg_strike_rate(row, n), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_set[['team1_avg_strike_rate', 'team2_avg_strike_rate']] = train_data_set.apply(lambda row: compute_avg_strike_rate(row, n), axis=1)


Index(['match id', 'team1', 'team2', 'match_dt', 'winner', 'lighting',
       'team1_roster_ids', 'team2_roster_ids', 'ground_id', 'venue', 'city',
       'lighting_day match', 'lighting_day/night match',
       'lighting_night match', 'season', 'series_name', 'toss winner',
       'team1_avg_strike_rate', 'team2_avg_strike_rate'],
      dtype='object')

In [None]:
n=5
train_data_set['avg_Runs_Ground_Team']= train_data_set.apply(lambda row: avgRunsGroundTeam(int(row['ground_id']),row['match_dt'], n,row['match id']), axis=1)
train_data_set['avg_Runs_Ground_Team'].describe()

KeyError: 'inning1_runs'

In [None]:
train_data_set.columns

In [None]:
train_data_set[['team1_avg_win_percent', 'team2_avg_win_percent']] = train_data_set.apply(lambda row: compute_win_percentage(row, n), axis=1)
train_data_set[['team1_avg_win_percent', 'team2_avg_win_percent']].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_set[['team1_avg_win_percent', 'team2_avg_win_percent']] = train_data_set.apply(lambda row: compute_win_percentage(row, n), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_set[['team1_avg_win_percent', 'team2_avg_win_percent']] = train_data_set.apply(lambda row: compute_win_percentage(row, n), axis=1)


Unnamed: 0,team1_avg_win_percent,team2_avg_win_percent
count,948.0,948.0
mean,33.350914,34.998242
std,28.371245,28.88595
min,0.0,0.0
25%,20.0,20.0
50%,25.0,33.333333
75%,50.0,50.0
max,100.0,100.0


In [None]:
train_data_set[['direct_ecounter_1', 'direct_ecounter_2']] = train_data_set.apply(lambda row: calculate_win_ratio(row['team1'], row['team2'], row['match_dt']), axis=1)
train_data_set['direct_ecounter_1'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_set[['direct_ecounter_1', 'direct_ecounter_2']] = train_data_set.apply(lambda row: calculate_win_ratio(row['team1'], row['team2'], row['match_dt']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_set[['direct_ecounter_1', 'direct_ecounter_2']] = train_data_set.apply(lambda row: calculate_win_ratio(row['team1'], row['team2'], row['match_dt']), axis=1)


count    948.000000
mean      28.236970
std       41.006966
min        0.000000
25%        0.000000
50%        0.000000
75%       50.000000
max      100.000000
Name: direct_ecounter_1, dtype: float64

In [None]:
train_data_set[['team1_venue', 'team2_venue']] = train_data_set.apply(lambda row: compute_venue(row), axis=1)
train_data_set['team1_venue'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_set[['team1_venue', 'team2_venue']] = train_data_set.apply(lambda row: compute_venue(row), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_set[['team1_venue', 'team2_venue']] = train_data_set.apply(lambda row: compute_venue(row), axis=1)


count    948.000000
mean      29.683396
std       39.260551
min        0.000000
25%        0.000000
50%        0.000000
75%       51.388889
max      100.000000
Name: team1_venue, dtype: float64

In [None]:
train_data_set[['team1_avg_wickets', 'team2_avg_wickets']] = train_data_set.apply(lambda row: compute_avg_wickets(row,n), axis=1)
train_data_set['team1_avg_wickets'].describe()

count    948.000000
mean       5.722785
std        2.012345
min        0.000000
25%        4.600000
50%        6.000000
75%        7.000000
max       11.000000
Name: team1_avg_wickets, dtype: float64

In [None]:
train_data_set[['team1_season', 'team2_season']] = train_data_set.apply(lambda row: compute_avg_win_season(row,n), axis=1)
train_data_set['team1_season'].describe()

count    948.000000
mean      51.078046
std       32.164979
min        0.000000
25%       32.500000
50%       50.000000
75%       75.000000
max      100.000000
Name: team1_season, dtype: float64

In [None]:
train_data_set[['team1_dot', 'team2_dot']] = train_data_set.apply(lambda row: compute_avg_dot_balls_last(row,n), axis=1)
train_data_set['team1_dot'].describe()

count    948.000000
mean      31.103586
std       14.488845
min        0.000000
25%       23.000000
50%       36.000000
75%       41.600000
max       57.600000
Name: team1_dot, dtype: float64

In [None]:
train_data_set[['team1_series', 'team2_series']] = train_data_set.apply(lambda row: compute_win_percent_series(row,n), axis=1)
train_data_set['team1_series'].describe()

count    948.000000
mean      50.146867
std       22.840814
min        0.000000
25%       35.968379
50%       50.000000
75%       63.636364
max      100.000000
Name: team1_series, dtype: float64

In [None]:
train_data_set[['team1_bound', 'team2_bound']] = train_data_set.apply(lambda row: compute_avg_boundaries(row,n), axis=1)
train_data_set['team1_bound'].describe()

count    948.000000
mean      14.066245
std        7.452144
min        0.000000
25%        8.550000
50%       15.800000
75%       19.850000
max       29.800000
Name: team1_bound, dtype: float64

In [None]:
train_data_set[['team1_top3', 'team2_top3']] = train_data_set.apply(lambda row: compute_avg_top3_batsman(row,n), axis=1)
train_data_set['team1_top3'].describe()

count    948.000000
mean      99.908017
std       37.536575
min        0.000000
25%       92.000000
50%      108.600000
75%      122.400000
max      199.000000
Name: team1_top3, dtype: float64

In [None]:
train_data_set[['team1_economy', 'team2_economy']] = train_data_set.apply(lambda row: compute_avg_economy(row,n), axis=1)
train_data_set['team1_economy'].describe()

count    948.000000
mean       7.368602
std        2.477896
min        0.000000
25%        7.129077
50%        8.000000
75%        8.636130
max       12.562500
Name: team1_economy, dtype: float64

In [None]:
toss_win_ratio=calculate_toss_win_ratio()
print(toss_win_ratio)
train_data_set[['team1_toss', 'team2_toss']] = train_data_set.apply(lambda row: assign_toss_ratio(row, toss_win_ratio), axis=1)
train_data_set['team1_toss'].describe()

0.48523206751054854


count    948.000000
mean       0.504455
std        0.014087
min        0.485232
25%        0.485232
50%        0.514768
75%        0.514768
max        0.514768
Name: team1_toss, dtype: float64

In [None]:
train_data_set.columns

Index(['match id', 'team1', 'team2', 'match_dt', 'winner', 'lighting',
       'team1_roster_ids', 'team2_roster_ids', 'ground_id', 'venue', 'city',
       'lighting_day match', 'lighting_day/night match',
       'lighting_night match', 'season', 'series_name', 'toss winner',
       'team1_avg_strike_rate', 'team2_avg_strike_rate',
       'team1_avg_win_percent', 'team2_avg_win_percent', 'direct_ecounter_1',
       'direct_ecounter_2', 'team1_venue', 'team2_venue', 'team1_avg_wickets',
       'team2_avg_wickets', 'team1_season', 'team2_season', 'team1_dot',
       'team2_dot', 'team1_series', 'team2_series', 'team1_bound',
       'team2_bound', 'team1_top3', 'team2_top3', 'team1_economy',
       'team2_economy', 'team1_toss', 'team2_toss'],
      dtype='object')

In [None]:
train_data_set.dtypes

match id                             int64
team1                               object
team2                               object
match_dt                    datetime64[ns]
winner                              object
lighting                            object
team1_roster_ids                    object
team2_roster_ids                    object
ground_id                            int64
venue                               object
city                                object
lighting_day match                    bool
lighting_day/night match              bool
lighting_night match                  bool
season                              object
series_name                         object
toss winner                         object
team1_avg_strike_rate              float64
team2_avg_strike_rate              float64
team1_avg_win_percent              float64
team2_avg_win_percent              float64
direct_ecounter_1                  float64
direct_ecounter_2                  float64
team1_venue

In [None]:
train_data_set['winners'] = train_data_set.apply(lambda row: 1 if row['winner'] == row['team1'] else 0, axis=1)


In [None]:
train_data_set.columns

Index(['match id', 'team1', 'team2', 'match_dt', 'winner', 'lighting',
       'team1_roster_ids', 'team2_roster_ids', 'ground_id', 'venue', 'city',
       'lighting_day match', 'lighting_day/night match',
       'lighting_night match', 'season', 'series_name', 'toss winner',
       'team1_avg_strike_rate', 'team2_avg_strike_rate',
       'team1_avg_win_percent', 'team2_avg_win_percent', 'direct_ecounter_1',
       'direct_ecounter_2', 'team1_venue', 'team2_venue', 'team1_avg_wickets',
       'team2_avg_wickets', 'team1_season', 'team2_season', 'team1_dot',
       'team2_dot', 'team1_series', 'team2_series', 'team1_bound',
       'team2_bound', 'team1_top3', 'team2_top3', 'team1_economy',
       'team2_economy', 'team1_toss', 'team2_toss', 'winners'],
      dtype='object')

In [None]:
train_data_set.drop(['winner'], axis=1, inplace= True);
train_data_set.dtypes

match id                             int64
team1                               object
team2                               object
match_dt                    datetime64[ns]
lighting                            object
team1_roster_ids                    object
team2_roster_ids                    object
ground_id                            int64
venue                               object
city                                object
lighting_day match                    bool
lighting_day/night match              bool
lighting_night match                  bool
season                              object
series_name                         object
toss winner                         object
team1_avg_strike_rate              float64
team2_avg_strike_rate              float64
team1_avg_win_percent              float64
team2_avg_win_percent              float64
direct_ecounter_1                  float64
direct_ecounter_2                  float64
team1_venue                        float64
team2_venue

In [None]:
train_data_set.to_csv('train_data_set.csv', index=False)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from sklearn.metrics import accuracy_score


In [None]:
# Separate the target variable 'winner'
X = train_data_set.drop(columns=['winners'])
y = train_data_set['winners']

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Handling missing values
imputer = SimpleImputer(strategy='most_frequent')
X = imputer.fit_transform(X)

# Encoding categorical variables if any
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Convert X back to a DataFrame to handle columns if necessary
X = pd.DataFrame(X, columns=train_data_set.drop(columns=['winners']).columns)



In [None]:
# Drop rows with missing target values
train_data_set = train_data_set.dropna(subset=['winners'])

# Separate target variable
y = train_data_set['winners']

# Columns to exclude from training
exclude_columns = [
     'team1', 'team2', 'match_dt', 'winners','lighting',
       'team1_roster_ids', 'team2_roster_ids', 'ground_id', 'venue', 'city',
       'lighting_day match', 'lighting_day/night match',
       'lighting_night match', 'season', 'series_name', 'toss winner'
]

# All columns in the dataframe
all_columns = train_data_set.columns.tolist()

# Columns to use for training
feature_columns = [col for col in all_columns if col not in exclude_columns]

X = train_data_set[feature_columns]
X= pd.concat([X, matches[['match id','team1_id','team2_id']]], axis=1)
X_df = pd.DataFrame(X)  # If X is not already a DataFrame

X_df.to_csv('features.csv', index=False)

y_df = pd.DataFrame(y)  # If y is not already a DataFrame

y_df.to_csv('target.csv', index=False, header=['winner'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Concatenate X_train and y_train
train_set = pd.concat([X_train, y_train], axis=1)

# Concatenate X_test and y_test
test_set = pd.concat([X_test, y_test], axis=1)
# X_train
X_train.drop(['match id','team1_id','team2_id'], axis=1, inplace=True)
X_test.drop(['match id','team1_id','team2_id'], axis=1, inplace=True)
# print(X_train)
# Optionally, you can reset index if needed
train_set.reset_index(drop=True, inplace=True)
test_set.reset_index(drop=True, inplace=True)

train_set.describe()

Unnamed: 0,match id,team1_avg_strike_rate,team2_avg_strike_rate,team1_avg_win_percent,team2_avg_win_percent,direct_ecounter_1,direct_ecounter_2,team1_venue,team2_venue,team1_avg_wickets,...,team1_top3,team2_top3,team1_economy,team2_economy,team1_toss,team2_toss,match id.1,team1_id,team2_id,winners
count,758.0,534.0,508.0,758.0,758.0,758.0,758.0,758.0,758.0,758.0,...,758.0,758.0,758.0,758.0,758.0,758.0,758.0,758.0,758.0,758.0
mean,9319652.0,109.073994,107.813094,33.23219,35.200088,28.463346,25.494437,29.150623,28.478935,5.727704,...,99.630343,102.301715,7.344758,7.514097,0.504754,0.495246,9319652.0,22026.827177,21858.254617,0.497361
std,229651.9,15.169385,14.217567,28.228249,28.822867,41.273008,39.580115,39.011786,39.143718,2.007949,...,37.461375,36.483411,2.473322,2.310662,0.013991,0.013991,229651.9,17711.027965,17610.911838,0.500323
min,8797053.0,29.17,62.068357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.485232,0.485232,8797053.0,20.0,20.0,0.0
25%,9097075.0,100.519023,98.138932,20.0,20.0,0.0,0.0,0.0,0.0,4.65,...,91.85,93.0,7.12022,7.208882,0.485232,0.485232,9097075.0,6838.0,7258.0,0.0
50%,9327496.0,109.153318,107.857636,25.0,33.333333,0.0,0.0,0.0,0.0,6.0,...,108.225,109.0,7.990608,8.020811,0.514768,0.485232,9327496.0,18360.0,17982.0,0.0
75%,9478318.0,118.661848,117.964955,50.0,50.0,55.357143,50.0,50.0,50.0,7.0,...,122.383333,124.95,8.629374,8.745892,0.514768,0.514768,9478318.0,36126.0,36126.0,1.0
max,9866373.0,153.6172,149.160348,100.0,100.0,100.0,100.0,100.0,100.0,11.0,...,179.0,193.0,12.5625,11.65,0.514768,0.514768,9866373.0,49657.0,49657.0,1.0


In [None]:
print("Shape of X_train:", X_train.shape)
print("Shape of train_set:", train_set.shape)

Shape of X_train: (853, 28)
Shape of train_set: (758, 29)


XGBoost


In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Initialize the XGBoost classifier
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Train the model
xgb_model.fit(X_train, y_train)

# Predict on the training data
y_train_pred = xgb_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy * 100:.2f}%")

# Predict on the test data
y_test_pred = xgb_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


Training Accuracy: 100.00%
Test Accuracy: 85.26%


Gradient Boosting Machine

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Example: Assuming X_train, X_test, y_train, y_test are already defined from previous steps

imputer = SimpleImputer(strategy='mean')

# Fit and transform the imputer on the training data
X_train = imputer.fit_transform(X_train)

# Transform the test data using the fitted imputer
X_test = imputer.transform(X_test)
# Initialize GBM classifier
gbm = GradientBoostingClassifier(random_state=42)


# Fit the model on training data
gbm.fit(X_train, y_train)

# Predict on training set
y_train_pred = gbm.predict(X_train)

# Calculate training accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy (GBM): {train_accuracy}")

# Predict on test set
y_test_pred = gbm.predict(X_test)

# Calculate test accuracy
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy (GBM): {test_accuracy}")


Training Accuracy (GBM): 0.9881266490765171
Test Accuracy (GBM): 0.8526315789473684


In [None]:
import lightgbm as lgb

# Convert data to lightgbm Dataset format
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

# Set parameters for LightGBM
params = {
    'objective': 'binary',
    'metric': 'binary_error',
    'verbosity': -1,
    'seed': 42
}

# Train LightGBM model
lgb_model = lgb.train(params, train_data, num_boost_round=100, valid_sets=[train_data, test_data])

# Predict on training set
y_train_pred_lgb = lgb_model.predict(X_train)
y_train_pred_lgb = [round(pred) for pred in y_train_pred_lgb]

# Calculate training accuracy
train_accuracy_lgb = accuracy_score(y_train, y_train_pred_lgb)
print(f"Training Accuracy (LightGBM): {train_accuracy_lgb}")

# Predict on test set
y_test_pred_lgb = lgb_model.predict(X_test)
y_test_pred_lgb = [round(pred) for pred in y_test_pred_lgb]

# Calculate test accuracy
test_accuracy_lgb = accuracy_score(y_test, y_test_pred_lgb)
print(f"Test Accuracy (LightGBM): {test_accuracy_lgb}")


Training Accuracy (LightGBM): 1.0
Test Accuracy (LightGBM): 0.8526315789473684


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [None]:
from catboost import CatBoostClassifier

# Initialize CatBoost classifier
catboost_model = CatBoostClassifier(iterations=1000, random_state=42, logging_level='Silent')

# Fit the model on training data
catboost_model.fit(X_train, y_train)

# Predict on training set
y_train_pred_cb = catboost_model.predict(X_train)

# Calculate training accuracy
train_accuracy_cb = accuracy_score(y_train, y_train_pred_cb)
print(f"Training Accuracy (CatBoost): {train_accuracy_cb}")

# Predict on test set
y_test_pred_cb = catboost_model.predict(X_test)

# Calculate test accuracy
test_accuracy_cb = accuracy_score(y_test, y_test_pred_cb)
print(f"Test Accuracy (CatBoost): {test_accuracy_cb}")

Training Accuracy (CatBoost): 1.0
Test Accuracy (CatBoost): 0.8789473684210526


Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Assuming you have already prepared your features X and target variable y
# Split the data into training and test sets (adjust test_size and random_state as needed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

imputer = SimpleImputer(strategy='mean')

# Fit and transform the imputer on the training data
X_train = imputer.fit_transform(X_train)

# Transform the test data using the fitted imputer
X_test = imputer.transform(X_test)

# Initialize Random Forest classifier with 100 trees
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model on the training data
rf_model.fit(X_train, y_train)

# Predict on training set
y_train_pred = rf_model.predict(X_train)

# Calculate training accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy}")

# Predict on test set
y_test_pred = rf_model.predict(X_test)

# Calculate test accuracy
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy}")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_test_pred))

# If you want to extract feature importances
feature_importances = rf_model.feature_importances_
print("Feature Importances:")
print(feature_importances)


Training Accuracy: 1.0
Test Accuracy: 0.8842105263157894
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.87      0.88        45
           1       0.88      0.90      0.89        50

    accuracy                           0.88        95
   macro avg       0.88      0.88      0.88        95
weighted avg       0.88      0.88      0.88        95

Feature Importances:
[0.02187892 0.02217316 0.02240554 0.05300149 0.05269262 0.00840851
 0.00634786 0.00933527 0.00845291 0.01777967 0.01766434 0.16689903
 0.19642417 0.02180023 0.02128744 0.07483895 0.0626712  0.02637244
 0.02397579 0.02526479 0.02167739 0.0260757  0.02523313 0.0030842
 0.00323983 0.02404445 0.01620943 0.02076155]


File Generation

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:

X_featureNames=X.columns
imputer = SimpleImputer(strategy='mean')

# Fit and transform the imputer on the training data
X_train = imputer.fit_transform(X_train)

# Transform the test data using the fitted imputer
X_test = imputer.transform(X_test)
X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)
Xtp=pd.DataFrame(X_train);
Xtp.describe()

Unnamed: 0,match id,team1_avg_strike_rate,team2_avg_strike_rate,team1_avg_win_percent,team2_avg_win_percent,direct_ecounter_1,direct_ecounter_2,team1_venue,team2_venue,team1_avg_wickets,...,team2_bound,team1_top3,team2_top3,team1_economy,team2_economy,team1_toss,team2_toss,match id.1,team1_id,team2_id
count,853.0,853.0,853.0,853.0,853.0,853.0,853.0,853.0,853.0,853.0,...,853.0,853.0,853.0,853.0,853.0,853.0,853.0,853.0,853.0,853.0
mean,9328845.0,109.004483,107.739245,32.825322,34.960922,28.160084,25.649998,29.341654,29.873938,5.691676,...,13.981243,99.995428,102.283177,7.377841,7.571238,0.504657,0.495343,9328845.0,22063.656506,21931.780774
std,230415.0,12.50106,11.516708,28.073509,28.396409,40.866679,39.420917,39.026963,39.696605,2.033704,...,7.178137,37.353908,35.517918,2.47036,2.25022,0.014023,0.014023,230415.0,17677.463967,17617.786581
min,8797053.0,29.17,62.068357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.485232,0.485232,8797053.0,20.0,20.0
25%,9097227.0,104.901818,103.379364,0.0,20.0,0.0,0.0,0.0,0.0,4.6,...,8.6,92.0,92.8,7.12987,7.265823,0.485232,0.485232,9097227.0,6838.0,7258.0
50%,9330915.0,109.004483,107.739245,25.0,33.333333,0.0,0.0,0.0,0.0,6.0,...,15.4,108.6,109.2,8.016667,8.066087,0.514768,0.485232,9330915.0,18570.0,17982.0
75%,9484600.0,114.519394,111.961455,50.0,50.0,50.0,50.0,50.0,50.0,7.0,...,19.4,122.333333,124.2,8.635514,8.752941,0.514768,0.514768,9484600.0,36126.0,36126.0
max,9887863.0,153.6172,150.7334,100.0,100.0,100.0,100.0,100.0,100.0,11.0,...,29.8,199.0,193.0,12.5625,11.65,0.514768,0.514768,9887863.0,49657.0,49657.0


In [None]:
# Check the shapes of your dataframes and predictions
print("Shape of X_train:", X_train.shape)
print("Shape of train_set:", train_set.shape)
# print("Length of predictions:", len(clf_gbm.predict(X_train)))

# If there's a mismatch, re-examine your data splitting process.
# Make sure no target variable information is leaking into X_train.

Shape of X_train: (853, 28)
Shape of train_set: (758, 29)


In [None]:

# user-defined parameters

algo_name = 'GradientBoostingClassifier'
is_ensemble = 'no'
n_trees = 10
depth = 2
lr = 0.1

In [None]:
clf_gbm = GradientBoostingClassifier(n_estimators = n_trees, max_depth = depth, learning_rate = lr).fit(X_train,y_train)

In [None]:
train_set['y_pred_01'] = clf_gbm.predict(X_train)
test_set['y_pred_01'] = clf_gbm.predict(X_test)

ValueError: Length of values (853) does not match length of index (758)

Shape of X_train: (853, 28)
Shape of train_set: (758, 29)
Length of predictions: 853


In [None]:
from sklearn.metrics import classification_report

In [None]:
# Train accuracy
print(classification_report(y_train, clf_gbm.predict(X_train), labels=[0,1]))

              precision    recall  f1-score   support

           0       0.85      0.82      0.84       694
           1       0.82      0.85      0.84       657

    accuracy                           0.84      1351
   macro avg       0.84      0.84      0.84      1351
weighted avg       0.84      0.84      0.84      1351



In [None]:
train_set['win_pred_score'] = clf_gbm.predict_proba(X_train)[:,1]
test_set['win_pred_score'] = clf_gbm.predict_proba(X_test)[:,1]

In [None]:
train_set['win_pred_score'] = np.where( (train_set['y_pred_01']==0), (1-train_set['win_pred_score']), train_set['win_pred_score'])
test_set['win_pred_score'] = np.where( (test_set['y_pred_01']==0), (1-test_set['win_pred_score']), test_set['win_pred_score'])

In [None]:
train_set['win_pred_team_id'] = np.where( (train_set['y_pred_01']==0), (train_set['team1_id']), train_set['team2_id'])
test_set['win_pred_team_id'] = np.where( (test_set['y_pred_01']==0), (test_set['team1_id']), test_set['team2_id'])

In [None]:
X_train = pd.DataFrame(X_train)
print(X_train.columns)
df_feat_importance = pd.DataFrame({'feat_name':X_featureNames.tolist(), 'model_feat_imp_train':clf_gbm.feature_importances_}).sort_values(by='model_feat_imp_train', ascending=False)\
                                                                                                                        .reset_index(drop=True)
df_feat_importance


RangeIndex(start=0, stop=25, step=1)


ValueError: All arrays must be of the same length

In [None]:
train_set['dataset_type'] = 'train'
test_set['dataset_type'] = 'r1'

In [None]:
## refactor

df_file1 = pd.concat([test_set[['match id','dataset_type','win_pred_team_id','win_pred_score',] + list(df_feat_importance['feat_name'].head(10))], \
                     train_set[['match id','dataset_type','win_pred_team_id','win_pred_score',] + list(df_feat_importance['feat_name'].head(10))]])

renaming_dict = {}
for i,col in enumerate(list(df_feat_importance['feat_name'].head(10))):
    renaming_dict[col] = f'indep_feat_id{i+1}'
df_file1.rename(columns=renaming_dict, inplace=True)

for i in range(1,11):
    if f'indep_feat_id{i}' not in df_file1.columns:
        df_file1[f'indep_feat_id{i}'] = np.nan

df_file1['train_algorithm'] = algo_name
df_file1['is_ensemble'] = is_ensemble
df_file1['train_hps_trees'] = n_trees
df_file1['train_hps_depth'] = depth
df_file1['train_hps_lr'] = lr

In [None]:
df_file1.shape
df_file1.head()

Unnamed: 0,match id,match id.1,dataset_type,win_pred_team_id,win_pred_score,indep_feat_id1,indep_feat_id2,indep_feat_id3,indep_feat_id4,indep_feat_id5,indep_feat_id6,indep_feat_id7,indep_feat_id8,indep_feat_id9,indep_feat_id10,train_algorithm,is_ensemble,train_hps_trees,train_hps_depth,train_hps_lr
0,8752029,8752029,r1,8917,0.601186,37.5,40.0,20.0,0.0,52.459016,119.178439,8301,0.497336,0.502664,8.839286,GradientBoostingClassifier,no,10,2,0.1
1,8764433,8764433,r1,76,0.74846,0.0,100.0,100.0,0.0,0.0,,293,0.497336,0.502664,9.675,GradientBoostingClassifier,no,10,2,0.1
2,8752043,8752043,r1,7608,0.586758,60.0,71.428571,20.0,40.0,42.857143,117.414303,8056,0.502664,0.497336,9.051724,GradientBoostingClassifier,no,10,2,0.1
3,8710589,8710589,r1,14286,0.691975,0.0,66.666667,33.333333,0.0,28.571429,,45919,0.497336,0.502664,9.352941,GradientBoostingClassifier,no,10,2,0.1
4,8710519,8710519,r1,11157,0.688622,33.333333,0.0,0.0,33.333333,30.0,100.32625,11157,0.502664,0.497336,8.675676,GradientBoostingClassifier,no,10,2,0.1


In [None]:
# df_feat_importance.rename(index={0:'feat_id'}, inplace=True)
df_file2 = df_feat_importance
df_file2['feat_id'] = [i+1 for i in df_file2.index]
df_file2['feat_rank_train'] = [i+1 for i in df_file2.index]
df_file2 = df_file2.set_index('feat_id')
df_file2['feat_description'] = "Lund_mera"

NameError: name 'df_feat_importance' is not defined

In [None]:
df_file1.to_csv('file1.csv', index=False)
df_file2.to_csv('file2.csv')

In [None]:
import pandas as pd
import sys


# Instructions for participants :
'''
Participants can use this code to run on labeled train/out-of-sample data to mimic evaluation process.
### Datasets required:
This script takes in 3 files as follows:

primary_submission.csv -> This contains the match_id, dataset_type, win_pred_team_id, win_pred_score, train_algorithm, is_ensemble, train_hps_trees, train_hps_depth, train_hps_lr, *top 10 feature values. This is file submitted by participant.
secondary_submission.csv -> This contains feature_name, feature_description, model_feature_importance_rank, model_feature_importance_percentage, feature_correlation_dep_var. This is file submitted by participant.
dep_var.csv    -> This contains match_id, dataset_type, win_team_id. Participants can generate from the labeled train data.

Please ensure that the predicted_score column does not have any null columns and the column names are exactly matching as above.
Please ensure that all these files are stored as ',' separated csv files.

### How to use:
To use this, first open the command line terminal, and call evaluation code script by passing the locations of submission and actual files respectively.
Sample example of using commandline for running the script:

python Evaluation_Code.py path_to_primary_submission_file path_to_secondary_submission_file path_to_DepVar_file
'''


def checkDataType1(df):
    assert (df['match id'].isna().sum() == 0), 'match id should not have NaNs'
    assert (df['match id'].dtype == 'int64'), ('match id is not int64 type')
    assert df['win_pred_team_id'].isna().sum(
    ) == 0, 'win_pred_team_id should not have NaNs'
    assert df['win_pred_team_id'].dtype == 'int64', (
        'win_pred_team_id is not int64 type')
    assert df['win_pred_score'].isna().sum(
    ) == 0, 'win_pred_score should not have NaNs'
    assert df['win_pred_score'].dtype == 'float64', (
        'win_pred_score is not float64 type')
    assert df['train_algorithm'].isna().sum(
    ) == 0, 'train_algorithm should not have NaNs'
    assert df['train_algorithm'].dtype == 'object', (
        'train_algorithm is not object type')
    assert df['is_ensemble'].isna().sum(
    ) == 0, 'is_ensemble should not have NaNs'
    assert df['is_ensemble'].dtype == 'object', (
        'is_ensemble is not object type')
    assert df['train_hps_trees'].isna().sum(
    ) == 0, 'train_hps_trees should not have NaNs'
    assert df['train_hps_depth'].isna().sum(
    ) == 0, 'train_hps_depth should not have NaNs'
    assert df['train_hps_lr'].isna().sum(
    ) == 0, 'train_hps_lr should not have NaNs'
    return None


def checkDataType2(df):
    assert df['feat_id'].isna().sum() == 0, 'feat_id should not have NaNs'
    assert df['feat_id'].dtype == 'int64', ('feat_id is not int type')
    assert df['feat_name'].isna().sum() == 0, 'feat_name should not have NaNs'
    assert df['feat_name'].dtype == 'object', ('feat_name is not object type')
    assert df['feat_description'].isna().sum(
    ) == 0, 'feat_description should not have NaNs'
    assert df['feat_description'].dtype == 'object', (
        'feat_description is not object type')
    assert df['model_feat_imp_train'].isna().sum(
    ) == 0, ' model_feat_imp_train should not have NaNs'
    assert df['model_feat_imp_train'].dtype == 'float64', (
        'model_feat_imp_train is not float type')
    assert df['feat_rank_train'].isna().sum(
    ) == 0, 'feat_rank_train should not have NaNs'
    assert df['feat_rank_train'].dtype == 'int64', (
        'feat_rank_train is not int64 type')
    return None


def getAccuracy(df):
    return round(df[df['winner_id'] == df['win_pred_team_id']].shape[0]*100/df.shape[0], 4)

if len(sys.argv) != 4:
  sys.exit("Please pass three files only as mentioned in the Instructions.")

# Location of submission file. Header here should include match_id, dataset_type, win_team_id. The file should be comma separated.
input1_address = /content/file1.csv
df_input1 = pd.read_csv(input1_address, sep=",", header=0)

input2_address = /content/file2.csv
df_input2 = pd.read_csv(input2_address, sep=",", header=0)

# For participants Team : Location of Dependent Variable file. Header here would be match_id, dataset_type, win_team_id. Participants can generate from the labeled train data. These files are comma separated
round_eval = sys.argv[3]
df_round = pd.read_csv(round_eval, sep=",", header=0)

assert set(['match id', 'dataset_type', 'win_pred_team_id', 'win_pred_score', 'train_algorithm', 'is_ensemble', 'train_hps_trees',
           'train_hps_depth', 'train_hps_lr']).issubset(set(df_input1.columns.tolist())), 'Required columns not present in primary submission file'
assert set(['indep_feat_id1', 'indep_feat_id2', 'indep_feat_id3', 'indep_feat_id4', 'indep_feat_id5', 'indep_feat_id6', 'indep_feat_id7', 'indep_feat_id8',
           'indep_feat_id9', 'indep_feat_id10']).issubset(set(df_input1.columns.tolist())), 'Required indepedent feature columns not present in primary submission file'
assert set(['feat_id', 'feat_name', 'feat_description', 'model_feat_imp_train', 'feat_rank_train']).issubset(
    set(df_input2.columns.tolist())), 'Required columns not present in secondary submission file'

checkDataType1(df_input1)
checkDataType2(df_input2)

assert df_input1.shape[0] == df_input1.drop_duplicates(
    'match id').shape[0], 'Input file should be unique on match id'
# assert df_input1.shape[
#     0] == 1219, f'Input file size number of rows incorrect. Expected rowsize 1219 not equal to uploaded data rowsize {df_input1.shape[0]}'
assert df_input1.shape[1] == 19, 'Input file number of columns not correct. '
assert (df_input1.win_pred_score.min() >= 0) & (df_input1.win_pred_score.max(
) <= 1), 'Win prediction score should be in range [0,1]'
assert df_input1['train_algorithm'].nunique(
) == 1, 'only one algorithm can be used for all data'
assert (len(df_input1['is_ensemble'].unique().tolist()) == 1) & ((df_input1['is_ensemble'].unique().tolist()[
    0] == 'yes') | (df_input1['is_ensemble'].unique().tolist()[0] == 'no')), 'is_ensemble can take only \'yes\' or \'no\''
assert df_input1.apply(lambda x: 0 if (len(str(x['train_algorithm']).split(';')) == len(str(x['train_hps_trees']).split(';'))) &
                       (len(str(x['train_algorithm']).split(';')) == len(str(x['train_hps_depth']).split(';'))) & (len(str(x['train_algorithm']).split(';')) == len(str(x['train_hps_lr']).split(';'))) else 1, axis=1).max() == 0, 'number of fields in algorithm & hyper-parameters column should be same.'

'''
shape_before_join = df_round.shape[0]

r1_size = df_input1[df_input1['dataset_type'] == 'r1'].shape[0]
assert (r1_size ==
        df_round.shape[0]), f'R1 data size in input file is incorrect. Expected rowsize 271 not equal to r1 dataset_type present {r1_size}'
'''

# merging predicted file and dependent variable file
eval_data = pd.merge(df_round, df_input1, on=[
                     'match id'], how='inner').drop_duplicates()
assert (eval_data.shape[0] == df_round.shape[0]
        ), 'match ids in submission template does not match eval data'

print('All checks passed...')
print('Accuracy: ', round(getAccuracy(eval_data), 2))