In [None]:
pip install catboost

In [72]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier, Pool
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import VotingClassifier

In [16]:
train_data = pd.read_csv('train_data.csv')
match_data = pd.read_csv('match_level_data.csv')
batsman_data = pd.read_csv('batsman_data.csv')
bowler_data = pd.read_csv('bowler_data.csv')

In [17]:
train_data.columns

Index(['match id', 'team1', 'team1_id', 'team1_roster_ids', 'team2',
       'team2_id', 'team2_roster_ids', 'winner', 'winner_id', 'toss winner',
       'toss decision', 'venue', 'city', 'match_dt', 'lighting', 'series_name',
       'season', 'ground_id', 'team_count_50runs_last15', 'team_winp_last5',
       'team1only_avg_runs_last15', 'team1_winp_team2_last15',
       'ground_avg_runs_last15'],
      dtype='object')

In [8]:
match_data.columns

Index(['match id', 'team1', 'team2', 'winner', 'by', 'win amount',
       'toss winner', 'toss decision', 'venue', 'city', 'match_dt', 'lighting',
       'series_name', 'season', 'ground_id', 'umpire1', 'umpire2',
       'inning1_runs', 'inning1_wickets', 'inning1_balls', 'inning2_runs',
       'inning2_wickets', 'inning2_balls', 'team1_id', 'team1_roster_ids',
       'team2_id', 'team2_roster_ids', 'series_type', 'winner_id',
       'player_of_the_match_id'],
      dtype='object')

In [9]:
batsman_data.columns

Index(['match id', 'batsman', 'batsman_id', 'batsman_details',
       'is_batsman_captain', 'is_batsman_keeper', 'inning', 'runs',
       'balls_faced', 'over_faced_first', 'wicket kind', 'out_by_bowler',
       'out_by_fielder', 'bowler_id', 'bowler_details', 'is_bowler_keeper',
       'is_bowler_captain', 'strike_rate', 'Fours', 'Sixes', 'match_dt'],
      dtype='object')

In [10]:
bowler_data.columns

Index(['match id', 'bowler', 'bowler_id', 'bowler_details',
       'is_bowler_captain', 'is_bowler_keeper', 'inning', 'runs',
       'wicket_count', 'balls_bowled', 'economy', 'maiden', 'dots', 'Fours',
       'Sixes', 'wides', 'noballs', 'match_dt'],
      dtype='object')

In [19]:
import pandas as pd

def calculate_economy_ratio(match_data, bowler_data, test_data):
    # Step 1: Parse the roster IDs into individual player IDs for both team1 and team2
    match_data['team1_roster_list'] = match_data['team1_roster_ids'].str.split(':')
    match_data['team2_roster_list'] = match_data['team2_roster_ids'].str.split(':')

    # Step 2: Function to calculate average economy for a team
    def calculate_team_economy(roster_list, bowler_data):
        economies = []
        for player_id in roster_list:
            player_id = float(player_id)  # Convert to float if necessary
            if player_id in bowler_data['bowler_id'].values:
                economy = bowler_data[bowler_data['bowler_id'] == player_id]['economy'].values[0]
                economies.append(economy)

        if len(economies) > 0:
            return sum(economies) / len(economies)
        else:
            return None

    # Step 3: Sort match_data by match_dt in descending order
    match_data = match_data.sort_values(by='match_dt', ascending=False)

    # Step 4: Calculate average economy for the last 15 matches for each team
    teams = pd.concat([match_data['team1_id'], match_data['team2_id']]).unique()
    team_avg_economy = {}

    for team_id in teams:
        team_matches = match_data[(match_data['team1_id'] == team_id) | (match_data['team2_id'] == team_id)].head(15)
        team_roster = team_matches.apply(lambda x: x['team1_roster_list'] if x['team1_id'] == team_id else x['team2_roster_list'], axis=1)
        team_economies = team_roster.apply(lambda x: calculate_team_economy(x, bowler_data))
        team_avg_economy[team_id] = team_economies.mean()

    # Step 5: Calculate the ratio of average economies for each match in test_data
    test_data['team1_avg_economy'] = test_data['team1_id'].map(team_avg_economy)
    test_data['team2_avg_economy'] = test_data['team2_id'].map(team_avg_economy)
    test_data['avg_economy_ratio'] = test_data.apply(
        lambda row: row['team1_avg_economy'] / row['team2_avg_economy'] if row['team2_avg_economy'] and row['team1_avg_economy'] else 0,
        axis=1
    )

    test_data.drop(columns=['team1_avg_economy', 'team2_avg_economy'], inplace=True)

    return test_data


In [20]:
train_data = calculate_economy_ratio(match_data, bowler_data, train_data)
train_data.head()

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,winner,winner_id,toss winner,...,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,avg_economy_ratio
0,9331181,Ba,11283,9373356.0:7857520.0:4232164.0:4566540.0:329940...,Hl Ph,12634,3500958.0:4231751.0:2735081.0:2035102.0:369833...,Hl Ph,12634,Hl Ph,...,day/night match,Sd Mq Ai Ty,2022/23,7398,1.666667,0.672131,139.0,100.0,157.178571,1.185952
1,8797060,Ed,20,2089079.0:6139370.0:2076192.0:62432.0:2083409....,Wt Is,41,4690258.0:4069666.0:4230127.0:1942317.0:161392...,Ed,20,Wt Is,...,day/night match,Ed tr of Wt Is,2021/22,1406,1.285714,1.952381,156.0,50.0,103.5,1.110736
2,9433269,We,10576,3298427.0:2288789.0:7773338.0:3519011.0:368195...,Ne,8987,4003390.0:1749075.0:1626526.0:4172447.0:551672...,We,10576,Ne,...,day/night match,Vy Bt,2023,251,0.857143,0.672131,173.266667,0.0,154.333333,0.8809
3,9587073,Ga An Ws,36084,8127230.0:4690328.0:4069666.0:7960847.0:469018...,Bs Rs,36070,3462080.0:2436405.0:1798705.0:7550857.0:574247...,Ga An Ws,36084,Ga An Ws,...,day match,Cn Pr Le,2023,14300,2.166667,1.97561,164.266667,50.0,144.25,1.050725
4,9516457,Pb Ks,30407,8127181.0:197658.0:4239038.0:2398346.0:5053082...,Gt Ts,48341,1958683.0:7491224.0:8059029.0:4377610.0:225245...,Gt Ts,48341,Gt Ts,...,night match,In Pr Le,2023,7118,0.818182,1.327869,164.666667,0.0,189.0,1.142877


In [21]:
import pandas as pd

def add_avg_strike_rate_ratio_feature(match_data, batsman_data, dataset):
    # Step 1: Parse the roster IDs into individual player IDs for both team1 and team2
    match_data['team1_roster_list'] = match_data['team1_roster_ids'].str.split(':')
    match_data['team2_roster_list'] = match_data['team2_roster_ids'].str.split(':')

    # Step 2: Function to calculate average strike rate for a team
    def calculate_team_strike_rate(roster_list, batsman_data):
        strike_rates = []
        for player_id in roster_list:
            try:
                player_id = float(player_id)  # Convert to float if necessary
                if player_id in batsman_data['batsman_id'].values:
                    strike_rate = batsman_data[batsman_data['batsman_id'] == player_id]['strike_rate'].values[0]
                    strike_rates.append(strike_rate)
            except ValueError:
                print(f"Invalid player ID: {player_id}")

        if len(strike_rates) > 0:
            return sum(strike_rates) / len(strike_rates)
        else:
            return 0  # Return 0 if no batsmen are found

    # Step 3: Sort match_data by match_dt in descending order
    match_data = match_data.sort_values(by='match_dt', ascending=False)

    # Step 4: Calculate average strike rate for the last 15 matches for each team
    teams = pd.concat([match_data['team1_id'], match_data['team2_id']]).unique()
    team_avg_strike_rates = {}

    for team_id in teams:
        team_matches = match_data[(match_data['team1_id'] == team_id) | (match_data['team2_id'] == team_id)].head(15)
        team_roster = team_matches.apply(lambda x: x['team1_roster_list'] if x['team1_id'] == team_id else x['team2_roster_list'], axis=1)
        team_strike_rates = team_roster.apply(lambda x: calculate_team_strike_rate(x, batsman_data))
        team_avg_strike_rates[team_id] = team_strike_rates.mean()

    # Step 5: Calculate the ratio of average strike rates for each match in dataset
    dataset['team1_avg_strike_rate'] = dataset['team1_id'].map(team_avg_strike_rates)
    dataset['team2_avg_strike_rate'] = dataset['team2_id'].map(team_avg_strike_rates)
    dataset['avg_strike_rate_ratio'] = dataset.apply(
        lambda row: row['team1_avg_strike_rate'] / row['team2_avg_strike_rate'] if row['team2_avg_strike_rate'] != 0 else 1,
        axis=1
    )

    dataset.drop(columns=['team1_avg_strike_rate', 'team2_avg_strike_rate'], inplace=True)

    return dataset


In [22]:
train_data = add_avg_strike_rate_ratio_feature(match_data, batsman_data, train_data)
train_data.head()

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,winner,winner_id,toss winner,...,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,avg_economy_ratio,avg_strike_rate_ratio
0,9331181,Ba,11283,9373356.0:7857520.0:4232164.0:4566540.0:329940...,Hl Ph,12634,3500958.0:4231751.0:2735081.0:2035102.0:369833...,Hl Ph,12634,Hl Ph,...,Sd Mq Ai Ty,2022/23,7398,1.666667,0.672131,139.0,100.0,157.178571,1.185952,1.532148
1,8797060,Ed,20,2089079.0:6139370.0:2076192.0:62432.0:2083409....,Wt Is,41,4690258.0:4069666.0:4230127.0:1942317.0:161392...,Ed,20,Wt Is,...,Ed tr of Wt Is,2021/22,1406,1.285714,1.952381,156.0,50.0,103.5,1.110736,0.800575
2,9433269,We,10576,3298427.0:2288789.0:7773338.0:3519011.0:368195...,Ne,8987,4003390.0:1749075.0:1626526.0:4172447.0:551672...,We,10576,Ne,...,Vy Bt,2023,251,0.857143,0.672131,173.266667,0.0,154.333333,0.8809,0.666431
3,9587073,Ga An Ws,36084,8127230.0:4690328.0:4069666.0:7960847.0:469018...,Bs Rs,36070,3462080.0:2436405.0:1798705.0:7550857.0:574247...,Ga An Ws,36084,Ga An Ws,...,Cn Pr Le,2023,14300,2.166667,1.97561,164.266667,50.0,144.25,1.050725,0.933639
4,9516457,Pb Ks,30407,8127181.0:197658.0:4239038.0:2398346.0:5053082...,Gt Ts,48341,1958683.0:7491224.0:8059029.0:4377610.0:225245...,Gt Ts,48341,Gt Ts,...,In Pr Le,2023,7118,0.818182,1.327869,164.666667,0.0,189.0,1.142877,1.256099


In [26]:
import pandas as pd

def add_win_percentage_ratio_feature(match_data, train_data):
    # Step 1: Calculate overall win percentage for each team
    total_matches = pd.concat([match_data['team1_id'], match_data['team2_id']]).value_counts()
    total_wins = match_data['winner_id'].value_counts()

    win_percentage = (total_wins / total_matches).fillna(0)  # Handle teams with no wins

    # Step 2: Add win percentage ratio to train_data
    train_data['team1_win_percentage'] = train_data['team1_id'].map(win_percentage)
    train_data['team2_win_percentage'] = train_data['team2_id'].map(win_percentage)
    train_data['win_percentage_ratio'] = train_data.apply(
        lambda row: row['team1_win_percentage'] / row['team2_win_percentage'] if row['team2_win_percentage'] != 0 else 1,
        axis=1
    )

    train_data.drop(columns=['team1_win_percentage', 'team2_win_percentage'], inplace=True)

    return train_data

# Assuming match_data and train_data are already loaded as pandas DataFrames
train_data = add_win_percentage_ratio_feature(match_data, train_data)


In [27]:
train_data

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,winner,winner_id,toss winner,...,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,avg_economy_ratio,avg_strike_rate_ratio,win_percentage_ratio
0,9331181,Ba,11283,9373356.0:7857520.0:4232164.0:4566540.0:329940...,Hl Ph,12634,3500958.0:4231751.0:2735081.0:2035102.0:369833...,Hl Ph,12634,Hl Ph,...,2022/23,7398,1.666667,0.672131,139.000000,100.00,157.178571,1.185952,1.532148,0.810526
1,8797060,Ed,20,2089079.0:6139370.0:2076192.0:62432.0:2083409....,Wt Is,41,4690258.0:4069666.0:4230127.0:1942317.0:161392...,Ed,20,Wt Is,...,2021/22,1406,1.285714,1.952381,156.000000,50.00,103.500000,1.110736,0.800575,1.310638
2,9433269,We,10576,3298427.0:2288789.0:7773338.0:3519011.0:368195...,Ne,8987,4003390.0:1749075.0:1626526.0:4172447.0:551672...,We,10576,Ne,...,2023,251,0.857143,0.672131,173.266667,0.00,154.333333,0.880900,0.666431,0.698925
3,9587073,Ga An Ws,36084,8127230.0:4690328.0:4069666.0:7960847.0:469018...,Bs Rs,36070,3462080.0:2436405.0:1798705.0:7550857.0:574247...,Ga An Ws,36084,Ga An Ws,...,2023,14300,2.166667,1.975610,164.266667,50.00,144.250000,1.050725,0.933639,1.215278
4,9516457,Pb Ks,30407,8127181.0:197658.0:4239038.0:2398346.0:5053082...,Gt Ts,48341,1958683.0:7491224.0:8059029.0:4377610.0:225245...,Gt Ts,48341,Gt Ts,...,2023,7118,0.818182,1.327869,164.666667,0.00,189.000000,1.142877,1.256099,0.700000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,9128601,Pb Ks,30407,2789079.0:197658.0:2398346.0:2827327.0:2082044...,Ci Sr Ks,30414,7422673.0:249087.0:3519011.0:5001170.0:232000....,Pb Ks,30407,Ci Sr Ks,...,2022,5004,0.823529,1.000000,147.333333,66.67,166.400000,0.899728,1.172023,0.750000
944,9433241,Mx,8700,4421689.0:7752989.0:1941743.0:4489722.0:767287...,St,9701,6139370.0:7694581.0:3294444.0:3239102.0:632036...,St,9701,St,...,2023,1042,1.571429,0.012346,167.400000,0.00,170.466667,1.132232,0.666012,0.370370
945,9097227,Bd,22497,4239773.0:1941743.0:3007969.0:4172972.0:155625...,Wn Pe,23869,323049.0:4876122.0:4164978.0:1837205.0:3373138...,Wn Pe,23869,Bd,...,2021/22,1224,3.000000,1.000000,,0.00,,0.959823,0.857122,0.916667
946,9516695,Rn Rs,30428,8058959.0:2162782.0:2981614.0:4690188.0:212569...,Ss Hd,36014,5958840.0:7491294.0:3127354.0:3057312.0:420349...,Ss Hd,36014,Rn Rs,...,2023,4661,0.789474,1.487805,182.800000,66.67,133.375000,1.120217,1.213375,1.551515


In [38]:
import pandas as pd

def add_ground_specific_win_percentage_ratio_feature(match_data, train_data):
    # Step 1: Calculate total matches and wins for each team on each ground
    match_data['match_count'] = 1

    ground_team_matches = match_data.groupby(['ground_id', 'team1_id']).size().unstack(fill_value=0)
    ground_team_matches += match_data.groupby(['ground_id', 'team2_id']).size().unstack(fill_value=0)

    ground_team_wins = match_data.groupby(['ground_id', 'winner_id']).size().unstack(fill_value=0).fillna(0)

    # Ensure the columns are consistent
    for col in ground_team_matches.columns:
        if col not in ground_team_wins.columns:
            ground_team_wins[col] = 0

    for col in ground_team_wins.columns:
        if col not in ground_team_matches.columns:
            ground_team_matches[col] = 0

    # Step 2: Calculate win percentage for each team on each ground
    ground_team_win_percentage = ground_team_wins / ground_team_matches

    # Step 3: Add ground-specific win percentage ratio to train_data
    def get_win_percentage(ground_id, team_id):
        try:
            return ground_team_win_percentage.at[ground_id, team_id]
        except KeyError:
            return 0

    train_data['team1_ground_win_percentage'] = train_data.apply(lambda row: get_win_percentage(row['ground_id'], row['team1_id']), axis=1)
    train_data['team2_ground_win_percentage'] = train_data.apply(lambda row: get_win_percentage(row['ground_id'], row['team2_id']), axis=1)

    train_data['ground_win_percentage_ratio'] = train_data.apply(
        lambda row: row['team1_ground_win_percentage'] / row['team2_ground_win_percentage'] if row['team2_ground_win_percentage'] != 0 else 0,
        axis=1
    )

    train_data.drop(columns=['team1_ground_win_percentage', 'team2_ground_win_percentage'], inplace=True)

    return train_data

# Assuming match_data and train_data are already loaded as pandas DataFrames
train_data = add_ground_specific_win_percentage_ratio_feature(match_data, train_data)


In [39]:
train_data

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,winner,winner_id,toss winner,...,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,avg_economy_ratio,avg_strike_rate_ratio,win_percentage_ratio,ground_win_percentage_ratio
0,9331181,Ba,11283,9373356.0:7857520.0:4232164.0:4566540.0:329940...,Hl Ph,12634,3500958.0:4231751.0:2735081.0:2035102.0:369833...,Hl Ph,12634,Hl Ph,...,7398,1.666667,0.672131,139.000000,100.00,157.178571,1.185952,1.532148,0.810526,0.333333
1,8797060,Ed,20,2089079.0:6139370.0:2076192.0:62432.0:2083409....,Wt Is,41,4690258.0:4069666.0:4230127.0:1942317.0:161392...,Ed,20,Wt Is,...,1406,1.285714,1.952381,156.000000,50.00,103.500000,1.110736,0.800575,1.310638,0.666667
2,9433269,We,10576,3298427.0:2288789.0:7773338.0:3519011.0:368195...,Ne,8987,4003390.0:1749075.0:1626526.0:4172447.0:551672...,We,10576,Ne,...,251,0.857143,0.672131,173.266667,0.00,154.333333,0.880900,0.666431,0.698925,0.772727
3,9587073,Ga An Ws,36084,8127230.0:4690328.0:4069666.0:7960847.0:469018...,Bs Rs,36070,3462080.0:2436405.0:1798705.0:7550857.0:574247...,Ga An Ws,36084,Ga An Ws,...,14300,2.166667,1.975610,164.266667,50.00,144.250000,1.050725,0.933639,1.215278,1.000000
4,9516457,Pb Ks,30407,8127181.0:197658.0:4239038.0:2398346.0:5053082...,Gt Ts,48341,1958683.0:7491224.0:8059029.0:4377610.0:225245...,Gt Ts,48341,Gt Ts,...,7118,0.818182,1.327869,164.666667,0.00,189.000000,1.142877,1.256099,0.700000,0.250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,9128601,Pb Ks,30407,2789079.0:197658.0:2398346.0:2827327.0:2082044...,Ci Sr Ks,30414,7422673.0:249087.0:3519011.0:5001170.0:232000....,Pb Ks,30407,Ci Sr Ks,...,5004,0.823529,1.000000,147.333333,66.67,166.400000,0.899728,1.172023,0.750000,0.771429
944,9433241,Mx,8700,4421689.0:7752989.0:1941743.0:4489722.0:767287...,St,9701,6139370.0:7694581.0:3294444.0:3239102.0:632036...,St,9701,St,...,1042,1.571429,0.012346,167.400000,0.00,170.466667,1.132232,0.666012,0.370370,0.000000
945,9097227,Bd,22497,4239773.0:1941743.0:3007969.0:4172972.0:155625...,Wn Pe,23869,323049.0:4876122.0:4164978.0:1837205.0:3373138...,Wn Pe,23869,Bd,...,1224,3.000000,1.000000,,0.00,,0.959823,0.857122,0.916667,0.600000
946,9516695,Rn Rs,30428,8058959.0:2162782.0:2981614.0:4690188.0:212569...,Ss Hd,36014,5958840.0:7491294.0:3127354.0:3057312.0:420349...,Ss Hd,36014,Rn Rs,...,4661,0.789474,1.487805,182.800000,66.67,133.375000,1.120217,1.213375,1.551515,0.333333


In [40]:
def add_avg_four_ratio_feature_last_15_matches(match_data, batsman_data, dataset):
    # Step 1: Parse the roster IDs into individual player IDs for both team1 and team2
    match_data['team1_roster_list'] = match_data['team1_roster_ids'].str.split(':')
    match_data['team2_roster_list'] = match_data['team2_roster_ids'].str.split(':')

    # Step 2: Function to calculate total fours for a team in a match
    def calculate_team_fours(roster_list, batsman_data):
        total_fours = 0
        for player_id in roster_list:
            player_id = float(player_id)  # Convert to float if necessary
            player_fours = batsman_data[batsman_data['batsman_id'] == player_id]['Fours']
            if not player_fours.empty:
                total_fours += player_fours.sum()
        return total_fours

    # Step 3: Calculate total fours for team1 and team2 in each match
    match_data['team1_total_fours'] = match_data['team1_roster_list'].apply(calculate_team_fours, args=(batsman_data,))
    match_data['team2_total_fours'] = match_data['team2_roster_list'].apply(calculate_team_fours, args=(batsman_data,))

    # Step 4: Calculate average fours per match for each team in their last 15 matches
    teams = pd.concat([match_data['team1_id'], match_data['team2_id']]).unique()
    team_avg_fours_last_15 = {}

    for team_id in teams:
        # Get the last 15 matches for the team
        team_matches = match_data[(match_data['team1_id'] == team_id) | (match_data['team2_id'] == team_id)]
        team_matches = team_matches.sort_values(by='match_dt', ascending=False).head(15)

        total_fours = team_matches.apply(
            lambda x: x['team1_total_fours'] if x['team1_id'] == team_id else x['team2_total_fours'], axis=1
        )
        team_avg_fours_last_15[team_id] = total_fours.mean()

    # Step 5: Map the average fours to the dataset
    dataset['team1_avg_fours_last_15'] = dataset['team1_id'].map(team_avg_fours_last_15)
    dataset['team2_avg_fours_last_15'] = dataset['team2_id'].map(team_avg_fours_last_15)

    # Step 6: Calculate the ratio of average fours for team1 and team2
    dataset['avg_four_ratio_last_15'] = dataset.apply(
        lambda row: row['team1_avg_fours_last_15'] / row['team2_avg_fours_last_15'] if row['team2_avg_fours_last_15'] != 0 else 1,
        axis=1
    )

    # Step 7: Drop the intermediate columns
    dataset.drop(columns=['team1_avg_fours_last_15', 'team2_avg_fours_last_15'], inplace=True)

    return dataset

# Assuming match_data, batsman_data, and dataset are already loaded as pandas DataFrames
train_data = add_avg_four_ratio_feature_last_15_matches(match_data, batsman_data, train_data)


In [41]:
train_data

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,winner,winner_id,toss winner,...,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,avg_economy_ratio,avg_strike_rate_ratio,win_percentage_ratio,ground_win_percentage_ratio,avg_four_ratio_last_15
0,9331181,Ba,11283,9373356.0:7857520.0:4232164.0:4566540.0:329940...,Hl Ph,12634,3500958.0:4231751.0:2735081.0:2035102.0:369833...,Hl Ph,12634,Hl Ph,...,1.666667,0.672131,139.000000,100.00,157.178571,1.185952,1.532148,0.810526,0.333333,1.885096
1,8797060,Ed,20,2089079.0:6139370.0:2076192.0:62432.0:2083409....,Wt Is,41,4690258.0:4069666.0:4230127.0:1942317.0:161392...,Ed,20,Wt Is,...,1.285714,1.952381,156.000000,50.00,103.500000,1.110736,0.800575,1.310638,0.666667,1.979635
2,9433269,We,10576,3298427.0:2288789.0:7773338.0:3519011.0:368195...,Ne,8987,4003390.0:1749075.0:1626526.0:4172447.0:551672...,We,10576,Ne,...,0.857143,0.672131,173.266667,0.00,154.333333,0.880900,0.666431,0.698925,0.772727,0.444444
3,9587073,Ga An Ws,36084,8127230.0:4690328.0:4069666.0:7960847.0:469018...,Bs Rs,36070,3462080.0:2436405.0:1798705.0:7550857.0:574247...,Ga An Ws,36084,Ga An Ws,...,2.166667,1.975610,164.266667,50.00,144.250000,1.050725,0.933639,1.215278,1.000000,0.961681
4,9516457,Pb Ks,30407,8127181.0:197658.0:4239038.0:2398346.0:5053082...,Gt Ts,48341,1958683.0:7491224.0:8059029.0:4377610.0:225245...,Gt Ts,48341,Gt Ts,...,0.818182,1.327869,164.666667,0.00,189.000000,1.142877,1.256099,0.700000,0.250000,0.830525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,9128601,Pb Ks,30407,2789079.0:197658.0:2398346.0:2827327.0:2082044...,Ci Sr Ks,30414,7422673.0:249087.0:3519011.0:5001170.0:232000....,Pb Ks,30407,Ci Sr Ks,...,0.823529,1.000000,147.333333,66.67,166.400000,0.899728,1.172023,0.750000,0.771429,0.798709
944,9433241,Mx,8700,4421689.0:7752989.0:1941743.0:4489722.0:767287...,St,9701,6139370.0:7694581.0:3294444.0:3239102.0:632036...,St,9701,St,...,1.571429,0.012346,167.400000,0.00,170.466667,1.132232,0.666012,0.370370,0.000000,0.607580
945,9097227,Bd,22497,4239773.0:1941743.0:3007969.0:4172972.0:155625...,Wn Pe,23869,323049.0:4876122.0:4164978.0:1837205.0:3373138...,Wn Pe,23869,Bd,...,3.000000,1.000000,,0.00,,0.959823,0.857122,0.916667,0.600000,1.205197
946,9516695,Rn Rs,30428,8058959.0:2162782.0:2981614.0:4690188.0:212569...,Ss Hd,36014,5958840.0:7491294.0:3127354.0:3057312.0:420349...,Ss Hd,36014,Rn Rs,...,0.789474,1.487805,182.800000,66.67,133.375000,1.120217,1.213375,1.551515,0.333333,1.274788


In [48]:
import pandas as pd

def add_avg_six_ratio_feature_last_15_matches(match_data, batsman_data, dataset):
    # Step 1: Parse the roster IDs into individual player IDs for both team1 and team2
    match_data['team1_roster_list'] = match_data['team1_roster_ids'].str.split(':')
    match_data['team2_roster_list'] = match_data['team2_roster_ids'].str.split(':')

    # Step 2: Function to calculate total sixes for a team in a match
    def calculate_team_sixes(roster_list, batsman_data):
        total_sixes = 0
        for player_id in roster_list:
            player_id = float(player_id)  # Convert to float if necessary
            player_sixes = batsman_data[batsman_data['batsman_id'] == player_id]['Sixes']
            if not player_sixes.empty:
                total_sixes += player_sixes.sum()
        return total_sixes

    # Step 3: Calculate total sixes for team1 and team2 in each match
    match_data['team1_total_sixes'] = match_data['team1_roster_list'].apply(calculate_team_sixes, args=(batsman_data,))
    match_data['team2_total_sixes'] = match_data['team2_roster_list'].apply(calculate_team_sixes, args=(batsman_data,))

    # Step 4: Calculate average sixes per match for each team in their last 15 matches
    teams = pd.concat([match_data['team1_id'], match_data['team2_id']]).unique()
    team_avg_sixes_last_15 = {}

    for team_id in teams:
        # Get the last 15 matches for the team
        team_matches = match_data[(match_data['team1_id'] == team_id) | (match_data['team2_id'] == team_id)]
        team_matches = team_matches.sort_values(by='match_dt', ascending=False).head(15)

        total_sixes = team_matches.apply(
            lambda x: x['team1_total_sixes'] if x['team1_id'] == team_id else x['team2_total_sixes'], axis=1
        )
        team_avg_sixes_last_15[team_id] = total_sixes.mean()

    # Step 5: Map the average sixes to the dataset
    dataset['team1_avg_sixes_last_15'] = dataset['team1_id'].map(team_avg_sixes_last_15)
    dataset['team2_avg_sixes_last_15'] = dataset['team2_id'].map(team_avg_sixes_last_15)

    # Step 6: Calculate the ratio of average sixes for team1 and team2
    dataset['avg_six_ratio_last_15'] = dataset.apply(
        lambda row: row['team1_avg_sixes_last_15'] / row['team2_avg_sixes_last_15'] if row['team2_avg_sixes_last_15'] != 0 else 1,
        axis=1
    )

    # Step 7: Drop the intermediate columns
    dataset.drop(columns=['team1_avg_sixes_last_15', 'team2_avg_sixes_last_15'], inplace=True)

    return dataset



In [49]:
train_data = add_avg_six_ratio_feature_last_15_matches(match_data, batsman_data, train_data)
# Display the updated train_data DataFrame with the new column
train_data.head()

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,winner,winner_id,toss winner,...,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,avg_economy_ratio,avg_strike_rate_ratio,win_percentage_ratio,ground_win_percentage_ratio,avg_four_ratio_last_15,avg_six_ratio_last_15
0,9331181,Ba,11283,9373356.0:7857520.0:4232164.0:4566540.0:329940...,Hl Ph,12634,3500958.0:4231751.0:2735081.0:2035102.0:369833...,Hl Ph,12634,Hl Ph,...,0.672131,139.0,100.0,157.178571,1.185952,1.532148,0.810526,0.333333,1.885096,1.832099
1,8797060,Ed,20,2089079.0:6139370.0:2076192.0:62432.0:2083409....,Wt Is,41,4690258.0:4069666.0:4230127.0:1942317.0:161392...,Ed,20,Wt Is,...,1.952381,156.0,50.0,103.5,1.110736,0.800575,1.310638,0.666667,1.979635,1.004384
2,9433269,We,10576,3298427.0:2288789.0:7773338.0:3519011.0:368195...,Ne,8987,4003390.0:1749075.0:1626526.0:4172447.0:551672...,We,10576,Ne,...,0.672131,173.266667,0.0,154.333333,0.8809,0.666431,0.698925,0.772727,0.444444,0.362616
3,9587073,Ga An Ws,36084,8127230.0:4690328.0:4069666.0:7960847.0:469018...,Bs Rs,36070,3462080.0:2436405.0:1798705.0:7550857.0:574247...,Ga An Ws,36084,Ga An Ws,...,1.97561,164.266667,50.0,144.25,1.050725,0.933639,1.215278,1.0,0.961681,1.025801
4,9516457,Pb Ks,30407,8127181.0:197658.0:4239038.0:2398346.0:5053082...,Gt Ts,48341,1958683.0:7491224.0:8059029.0:4377610.0:225245...,Gt Ts,48341,Gt Ts,...,1.327869,164.666667,0.0,189.0,1.142877,1.256099,0.7,0.25,0.830525,1.157634


In [50]:
import pandas as pd

def calculate_wickets_lost_ratio_last_15_matches(match_data, batsman_data, test_data):
    # Step 1: Parse the roster IDs into individual player IDs for both team1 and team2
    match_data['team1_roster_list'] = match_data['team1_roster_ids'].str.split(':')
    match_data['team2_roster_list'] = match_data['team2_roster_ids'].str.split(':')

    # Step 2: Function to calculate total wickets lost for a team in their last 15 matches
    def calculate_team_wickets_lost(roster_list, batsman_data):
        total_wickets_lost = 0
        for player_id in roster_list:
            player_id = float(player_id)  # Convert to float if necessary
            if player_id in batsman_data['batsman_id'].values:
                wickets_lost = batsman_data[(batsman_data['batsman_id'] == player_id) & (batsman_data['wicket kind'].notnull())].shape[0]
                total_wickets_lost += wickets_lost
        return total_wickets_lost

    # Step 3: Calculate total wickets lost for team1 and team2 in their last 15 matches
    match_data['team1_total_wickets_lost'] = match_data['team1_roster_list'].apply(calculate_team_wickets_lost, args=(batsman_data,))
    match_data['team2_total_wickets_lost'] = match_data['team2_roster_list'].apply(calculate_team_wickets_lost, args=(batsman_data,))

    # Step 4: Calculate average wickets lost per match for each team in their last 15 matches
    teams = pd.concat([match_data['team1_id'], match_data['team2_id']]).unique()
    team_avg_wickets_lost_last_15 = {}

    for team_id in teams:
        # Get the last 15 matches for the team
        team_matches = match_data[(match_data['team1_id'] == team_id) | (match_data['team2_id'] == team_id)]
        team_matches = team_matches.sort_values(by='match_dt', ascending=False).head(15)

        total_wickets_lost = team_matches.apply(
            lambda x: x['team1_total_wickets_lost'] if x['team1_id'] == team_id else x['team2_total_wickets_lost'], axis=1
        )
        team_avg_wickets_lost_last_15[team_id] = total_wickets_lost.mean()

    # Step 5: Map the average wickets lost to the test_data
    test_data['team1_avg_wickets_lost_last_15'] = test_data['team1_id'].map(team_avg_wickets_lost_last_15)
    test_data['team2_avg_wickets_lost_last_15'] = test_data['team2_id'].map(team_avg_wickets_lost_last_15)

    # Step 6: Calculate the ratio of average wickets lost for team1 and team2
    test_data['wickets_lost_ratio_last_15'] = test_data.apply(
        lambda row: row['team1_avg_wickets_lost_last_15'] / row['team2_avg_wickets_lost_last_15'] if row['team2_avg_wickets_lost_last_15'] and row['team1_avg_wickets_lost_last_15'] else 0,
        axis=1
    )

    # Step 7: Drop the intermediate columns
    test_data.drop(columns=['team1_avg_wickets_lost_last_15', 'team2_avg_wickets_lost_last_15'], inplace=True)

    return test_data

# Assuming match_data, batsman_data, and test_data are already loaded as pandas DataFrames
train_data = calculate_wickets_lost_ratio_last_15_matches(match_data, batsman_data, train_data)


In [51]:
train_data.head()

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,winner,winner_id,toss winner,...,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,avg_economy_ratio,avg_strike_rate_ratio,win_percentage_ratio,ground_win_percentage_ratio,avg_four_ratio_last_15,avg_six_ratio_last_15,wickets_lost_ratio_last_15
0,9331181,Ba,11283,9373356.0:7857520.0:4232164.0:4566540.0:329940...,Hl Ph,12634,3500958.0:4231751.0:2735081.0:2035102.0:369833...,Hl Ph,12634,Hl Ph,...,139.0,100.0,157.178571,1.185952,1.532148,0.810526,0.333333,1.885096,1.832099,2.616273
1,8797060,Ed,20,2089079.0:6139370.0:2076192.0:62432.0:2083409....,Wt Is,41,4690258.0:4069666.0:4230127.0:1942317.0:161392...,Ed,20,Wt Is,...,156.0,50.0,103.5,1.110736,0.800575,1.310638,0.666667,1.979635,1.004384,1.246449
2,9433269,We,10576,3298427.0:2288789.0:7773338.0:3519011.0:368195...,Ne,8987,4003390.0:1749075.0:1626526.0:4172447.0:551672...,We,10576,Ne,...,173.266667,0.0,154.333333,0.8809,0.666431,0.698925,0.772727,0.444444,0.362616,0.468424
3,9587073,Ga An Ws,36084,8127230.0:4690328.0:4069666.0:7960847.0:469018...,Bs Rs,36070,3462080.0:2436405.0:1798705.0:7550857.0:574247...,Ga An Ws,36084,Ga An Ws,...,164.266667,50.0,144.25,1.050725,0.933639,1.215278,1.0,0.961681,1.025801,1.05378
4,9516457,Pb Ks,30407,8127181.0:197658.0:4239038.0:2398346.0:5053082...,Gt Ts,48341,1958683.0:7491224.0:8059029.0:4377610.0:225245...,Gt Ts,48341,Gt Ts,...,164.666667,0.0,189.0,1.142877,1.256099,0.7,0.25,0.830525,1.157634,0.921828


In [52]:
train_data_new = pd.read_excel('train_data (1).xlsx')

In [53]:
train_data_new.columns

Index(['match id', 'team1_id', 'team1_roster_ids', 'team2_id',
       'team2_roster_ids', 'winner_id', 'toss winner', 'toss decision',
       'lighting', 'ground_id', 'team_count_50runs_last15', 'team_winp_last5',
       'team1only_avg_runs_last15', 'team1_winp_team2_last15',
       'ground_avg_runs_last15', 'team1_avg_runs_on_ground',
       'team2_avg_runs_on_ground', 'avg_strike_rate_ratio',
       'avg_economy_ratio', 'ratio_avg_dots', 'avg_wicket_ratio',
       'avg_four_ratio', 'avg_six_ratio', 'win_percentage_chase_ratio',
       'win_percentage_target_ratio', 'wickets_lost_ratio'],
      dtype='object')

In [54]:
train_data_new['avg_strike_rate_ratio'] = train_data['avg_strike_rate_ratio']
train_data_new['avg_economy_ratio'] = train_data['avg_economy_ratio']
train_data_new['avg_four_ratio'] =train_data['avg_four_ratio_last_15']
train_data_new['avg_six_ratio'] = train_data['avg_six_ratio_last_15']
train_data_new['wickets_lost_ratio'] = train_data['wickets_lost_ratio_last_15']
train_data_new['win_percentage_ratio'] = train_data['win_percentage_ratio']
train_data_new['ground_win_percentage_ratio'] = train_data['ground_win_percentage_ratio']

In [62]:
train_data_new = train_data_new.fillna(0)

In [64]:
train_data_new.to_csv(r'train_data_new.csv', index = False)

In [65]:

X = train_data_new.drop(columns=['team1_id', 'team2_id','team1_roster_ids','team2_roster_ids','winner_id','match id','team2_id','ground_id'])  # Drop unnecessary columns
toss_winner = train_data_new.apply(lambda row: 0 if row['toss winner'] == row['team1_id'] else 1, axis=1)
X['toss winner'] = toss_winner
y = train_data_new.apply(lambda row: 0 if row['winner_id'] == row['team1_id'] else 1, axis=1)

# Splitting the data into train and validation sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialize models
xgb_model = XGBClassifier(eval_metric='logloss')
lgb_model = LGBMClassifier()
catboost_model = CatBoostClassifier(verbose=0)
gbm_model = GradientBoostingClassifier()

# Train models
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=30, verbose=False)
lgb_model.fit(X_train, y_train)
catboost_model.fit(X_train, y_train)
gbm_model.fit(X_train, y_train)

# Evaluate models
models = [xgb_model, lgb_model, catboost_model, gbm_model]
model_names = ['XGBoost', 'LightGBM', 'CatBoost', 'GradientBoosting']
results = {}

for model, name in zip(models, model_names):
    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)
    y_pred_test = model.predict(X_test)
    accuracy_train = accuracy_score(y_train,y_pred_train)
    accuracy_val = accuracy_score(y_val, y_pred_val)
    f1_val = f1_score(y_val, y_pred_val)
    accuracy_test = accuracy_score(y_test, y_pred_test)
    f1_test = f1_score(y_test, y_pred_test)
    results[name] = {'train_accuracy': accuracy_train, 'val_accuracy': accuracy_val, 'val_f1': f1_val, 'test_accuracy': accuracy_test, 'test_f1': f1_test}

# Print results
for model_name, metrics in results.items():
    print(f"{model_name}:")
    print(f"  accuracy_train: {metrics['train_accuracy']:.4f}")
    print(f"  Validation Accuracy: {metrics['val_accuracy']:.4f}")
    print(f"  Validation F1 Score: {metrics['val_f1']:.4f}")
    print(f"  Test Accuracy: {metrics['test_accuracy']:.4f}")
    print(f"  Test F1 Score: {metrics['test_f1']:.4f}")

# Selecting the best model based on validation F1 score
best_model_name = max(results, key=lambda x: results[x]['val_f1'])
print(f"\nBest model based on validation F1 score: {best_model_name}")

# Re-training the best model on the entire dataset (train + validation)
best_model = {
    'XGBoost': XGBClassifier(eval_metric='logloss'),
    'LightGBM': LGBMClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0),
    'GradientBoosting': GradientBoostingClassifier()
}[best_model_name]

best_model.fit(X, y)

# # Example of predicting a new match with the best model
# # Replace with actual data from your test dataset
# new_data = pd.DataFrame({
#     'team1_id': [101],
#     'team2_id': [102],
#     'toss_winner': [101],
#     'toss_decision': [1],
#     'lighting': [1],
#     'ground_id': [1],
#     'team_count_50runs_last15': [3],
#     'team_winp_last5': [0.6],
#     'team1only_avg_runs_last15': [250],
#     'team1_winp_team2_last15': [0.5],
#     'ground_avg_runs_last15': [280]
# })

# X_new = new_data.drop(columns=['team1_id', 'team2_id'])
# prediction = best_model.predict(X_new)
# print(f"Predicted winner id: {new_data['team1_id'].values[0] if prediction == 0 else new_data['team2_id'].values[0]}")



[LightGBM] [Info] Number of positive: 386, number of negative: 372
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000550 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3358
[LightGBM] [Info] Number of data points in the train set: 758, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.509235 -> initscore=0.036944
[LightGBM] [Info] Start training from score 0.036944
XGBoost:
  accuracy_train: 0.9815
  Validation Accuracy: 0.7158
  Validation F1 Score: 0.7158
  Test Accuracy: 0.6632
  Test F1 Score: 0.6667
LightGBM:
  accuracy_train: 1.0000
  Validation Accuracy: 0.7263
  Validation F1 Score: 0.7111
  Test Accuracy: 0.6947
  Test F1 Score: 0.6947
CatBoost:
  accuracy_train: 0.9908
  Validation Accuracy: 0.7263
  Validation F1 Score: 0.7234
  Test Accuracy: 0.7053
  Test F1 Score: 0.7021
GradientBoosting:
  ac

<catboost.core.CatBoostClassifier at 0x7fa581e9ab00>

In [67]:
pool_data = Pool(data=X, label=y)
feature_importances = best_model.get_feature_importance(pool_data)
features_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

In [68]:
features_df

Unnamed: 0,Feature,Importance
0,toss winner,0.917648
1,toss decision,1.01712
2,lighting,2.38218
3,team_count_50runs_last15,4.938698
4,team_winp_last5,4.902845
5,team1only_avg_runs_last15,3.817487
6,team1_winp_team2_last15,4.62779
7,ground_avg_runs_last15,3.947841
8,team1_avg_runs_on_ground,4.648212
9,team2_avg_runs_on_ground,3.3761


In [73]:
xgb_param_grid = {
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'n_estimators': [100, 200, 300]
}
lgb_param_grid = {
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200, 300]
}
catboost_param_grid = {
    'learning_rate': [0.05, 0.1, 0.2],
    'depth': [4, 6, 8],
    'iterations': [100, 200, 300]
}
gb_param_grid = {
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200, 300]
}
xgb_model = XGBClassifier()
xgb_grid_search = GridSearchCV(estimator=xgb_model, param_grid=xgb_param_grid, cv=3, scoring='f1_weighted', verbose=1)
xgb_grid_search.fit(X_train, y_train)
print("XGBoost - Best Parameters:", xgb_grid_search.best_params_)
print("XGBoost - Best Score:", xgb_grid_search.best_score_)
best_xgb = xgb_grid_search.best_estimator_
lgb_model = LGBMClassifier()
lgb_grid_search = GridSearchCV(estimator=lgb_model, param_grid=lgb_param_grid, cv=3, scoring='f1_weighted', verbose=1)
lgb_grid_search.fit(X_train, y_train)
print("LightGBM - Best Parameters:", lgb_grid_search.best_params_)
print("LightGBM - Best Score:", lgb_grid_search.best_score_)
best_lgb = lgb_grid_search.best_estimator_
catboost_model = CatBoostClassifier()
catboost_grid_search = GridSearchCV(estimator=catboost_model, param_grid=catboost_param_grid, cv=3, scoring='f1_weighted', verbose=1)
catboost_grid_search.fit(X_train, y_train)
print("CatBoost - Best Parameters:", catboost_grid_search.best_params_)
print("CatBoost - Best Score:", catboost_grid_search.best_score_)
best_catboost = catboost_grid_search.best_estimator_
gb_model = GradientBoostingClassifier()
gb_grid_search = GridSearchCV(estimator=gb_model, param_grid=gb_param_grid, cv=3, scoring='f1_weighted', verbose=1)
gb_grid_search.fit(X_train, y_train)
print("GradientBoosting - Best Parameters:", gb_grid_search.best_params_)
print("GradientBoosting - Best Score:", gb_grid_search.best_score_)
best_gb = gb_grid_search.best_estimator_
voting_clf = VotingClassifier(estimators=[
    ('xgb', best_xgb),
    ('lgb', best_lgb),
    ('catboost', best_catboost),
    ('gb', best_gb)
], voting='soft')

# Train the Voting Classifier on the training data
voting_clf.fit(X_train, y_train)

# Predictions
train_preds = voting_clf.predict(X_train)
val_preds = voting_clf.predict(X_val)
test_preds = voting_clf.predict(X_test)

# Evaluate the Voting Classifier
train_accuracy = accuracy_score(y_train, train_preds)
val_accuracy = accuracy_score(y_val, val_preds)
test_accuracy = accuracy_score(y_test, test_preds)

train_f1 = f1_score(y_train, train_preds, average='weighted')
val_f1 = f1_score(y_val, val_preds, average='weighted')
test_f1 = f1_score(y_test, test_preds, average='weighted')

print(f"Train Accuracy: {train_accuracy:.4f}, Train F1 Score: {train_f1:.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f}, Validation F1 Score: {val_f1:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}, Test F1 Score: {test_f1:.4f}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
21:	learn: 0.4262491	total: 309ms	remaining: 2.5s
22:	learn: 0.4208002	total: 322ms	remaining: 2.48s
23:	learn: 0.4128871	total: 344ms	remaining: 2.52s
24:	learn: 0.4064072	total: 359ms	remaining: 2.51s
25:	learn: 0.4016380	total: 372ms	remaining: 2.49s
26:	learn: 0.3968530	total: 385ms	remaining: 2.46s
27:	learn: 0.3909223	total: 397ms	remaining: 2.44s
28:	learn: 0.3824188	total: 410ms	remaining: 2.42s
29:	learn: 0.3766872	total: 423ms	remaining: 2.39s
30:	learn: 0.3713458	total: 436ms	remaining: 2.38s
31:	learn: 0.3675449	total: 449ms	remaining: 2.36s
32:	learn: 0.3620926	total: 462ms	remaining: 2.34s
33:	learn: 0.3550389	total: 475ms	remaining: 2.32s
34:	learn: 0.3511518	total: 488ms	remaining: 2.3s
35:	learn: 0.3473892	total: 500ms	remaining: 2.28s
36:	learn: 0.3431706	total: 513ms	remaining: 2.26s
37:	learn: 0.3392940	total: 526ms	remaining: 2.24s
38:	learn: 0.3341592	total: 538ms	remaining: 2.22s
39:	learn: 0.332297

In [81]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import train_test_split
import numpy as np

# Assume you have X_train, X_val, y_train, y_val defined

# Define a custom scorer based on F1 score (since it's a binary classification)
f1_scorer = make_scorer(f1_score)

# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [3, 5, 7, 10],
    'l2_leaf_reg': [1, 3, 5, 7],
    'subsample': [0.6, 0.8, 1.0],
    'iterations': [100, 300, 500]
}

# Initialize CatBoost classifier
catboost_model = CatBoostClassifier(verbose=False, random_seed=42)

# Perform GridSearchCV for hyperparameter tuning
catboost_grid_search = GridSearchCV(
    estimator=catboost_model,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=5,
    verbose=3,
    n_jobs=-1
)

# Fit GridSearchCV
catboost_grid_search.fit(X_train, y_train)

# Get the best model and its hyperparameters
best_catboost_model = catboost_grid_search.best_estimator_
best_params = catboost_grid_search.best_params_

print("Best Hyperparameters for CatBoost:")
print(best_params)

# Evaluate on validation set
y_pred_val = best_catboost_model.predict(X_val)
val_f1 = f1_score(y_val, y_pred_val)

print(f"CatBoost Validation F1 Score: {val_f1:.4f}")


Fitting 5 folds for each of 432 candidates, totalling 2160 fits


KeyboardInterrupt: 