In [105]:
import pandas as pd
import numpy as np
import joblib
import random

# Feature Extraction and Data Cleaning

## Feature Engineering for batsman and bowler

The runs/match is the new metric for a batsman. \
The eco/match is the new metric for a bowler.

If batsman/bowler isnt in the dict, then avg score will be imputed.

In [106]:
batsman_df = pd.read_csv('663e2b548c98c_batsman_level_scorecard.csv')
bowler_df = pd.read_csv('663e2b2c60743_bowler_level_scorecard.csv')

In [107]:
batsman_scores = {}
bowler_scores = {}
for i, row in batsman_df.iterrows():
    if row['batsman_id'] in batsman_scores:
        batsman_scores[row['batsman_id']][1] += 1
        batsman_scores[row['batsman_id']][0] = (batsman_scores[row['batsman_id']][0]*(batsman_scores[row['batsman_id']][1] - 1) + row['runs']) / batsman_scores[row['batsman_id']][1]
    else:
        batsman_scores[row['batsman_id']] = [row['runs'], 1]

for i, row in bowler_df.iterrows():
    if row['bowler_id'] in bowler_scores:
        bowler_scores[row['bowler_id']][1] += 1
        bowler_scores[row['bowler_id']][0] = (bowler_scores[row['bowler_id']][0]*(bowler_scores[row['bowler_id']][1] - 1) + row['economy']) / bowler_scores[row['bowler_id']][1]
    else:
        bowler_scores[row['bowler_id']] = [row['economy'], 1]

## Mods to original df


1. Drop season
2. Drop city
3. Expand and drop roster_ids
4. Change toss winner team into its ID
5. Change date time object into crt format and order by date 
6. series_name ? -- Drop?
7. venue ? -- Drop?
8. Maybe remove "team_count_50runs_last15" ?? 

In [108]:
def extract_player_ids(df: pd.DataFrame) -> pd.DataFrame:
    t1_players = df['team1_roster_ids'].str.split(':', expand=True)
    t2_players = df['team2_roster_ids'].str.split(':', expand=True)
    
    # Convert split strings to integers
    t1_players = t1_players.apply(pd.to_numeric, errors='coerce').astype('Int64')
    t2_players = t2_players.apply(pd.to_numeric, errors='coerce').astype('Int64')
    
    # Some cols had extra team members, remove them
    t1_players.drop(columns=11, inplace=True)
    t2_players.drop(columns=11, inplace=True)
    
    t1_players_headings = [f"t1_p{i+1}" for i in range(t1_players.shape[1])]
    t2_players_headings = [f"t2_p{i+1}" for i in range(t2_players.shape[1])]
    
    t1_players.columns = t1_players_headings
    t2_players.columns = t2_players_headings
    df = pd.concat([df, t1_players, t2_players], axis=1)
    df.drop(columns=['team1_roster_ids', 'team2_roster_ids'], inplace=True)

    return df


In [109]:
def extract_batsman_bowler_scores(new_df: pd.DataFrame, batsman_scores: dict, bowler_scores: dict) -> pd.DataFrame: 
    new_df["t1_batsman_scores"] = 0
    new_df["t1_bowler_scores"] = 0

    new_df["t2_batsman_scores"] = 0
    new_df["t2_bowler_scores"] = 0

    new_players = 0

    t1_players = new_df["team1_roster_ids"].str.split(":")
    for i, row in new_df.iterrows():
        for p in t1_players[i]:
            p = float(p)
            if p in batsman_scores.keys():
                new_df.at[i, "t1_batsman_scores"] += batsman_scores[p][0]
            if p in bowler_scores.keys():
                new_df.at[i, "t1_bowler_scores"] += bowler_scores[p][0]
            if p not in batsman_scores.keys() and p not in bowler_scores.keys():
                new_players += 1
                key_bat, val = random.choice(list(batsman_scores.items()))
                key_bowl, val = random.choice(list(bowler_scores.items()))
                new_df.at[i, "t1_batsman_scores"] += batsman_scores[key_bat][0]
                new_df.at[i, "t1_bowler_scores"] += bowler_scores[key_bowl][0]
    
    t2_players = new_df["team2_roster_ids"].str.split(":")
    for i, row in new_df.iterrows():
        for p in t2_players[i]:
            p = float(p)
            if p in batsman_scores.keys():
                new_df.at[i, "t2_batsman_scores"] += batsman_scores[p][0]
            if p in bowler_scores.keys():
                new_df.at[i, "t2_bowler_scores"] += bowler_scores[p][0]
            if p not in batsman_scores.keys() and p not in bowler_scores.keys():
                new_players += 1
                key_bat, val = random.choice(list(batsman_scores.items()))
                key_bowl, val = random.choice(list(bowler_scores.items()))
                new_df.at[i, "t2_batsman_scores"] += batsman_scores[key_bat][0]
                new_df.at[i, "t2_bowler_scores"] += bowler_scores[key_bowl][0]

    new_df.drop(columns=['team1_roster_ids', 'team2_roster_ids'], inplace=True)

    return new_df

In [110]:
def change_date_time(df: pd.DataFrame) -> pd.DataFrame:
    df['match_dt'] = pd.to_datetime(df['match_dt'], format='%Y-%m-%d')
    df = df.sort_values(by='match_dt')
    df.reset_index(drop=True, inplace=True)

    return df

In [111]:
def add_toss_winner_id(df: pd.DataFrame) -> pd.DataFrame:
    df["toss winner id"] = None  # initialize the column with None values
    for i, row in df.iterrows():
        if row['toss winner'] == row['team1']:
            df.at[i, 'toss winner id'] = row['team1_id']
        else:
            df.at[i, 'toss winner id'] = row['team2_id']

    df["toss winner id"] = df["toss winner id"].astype(int)

    df.drop(columns=['toss winner'], inplace=True)

    return df

In [112]:
def encode_winner_id(df: pd.DataFrame) -> pd.DataFrame:
    # 1- team1, 0 - team2
    df["winning team"] = None
    for i, row in df.iterrows():
        if row['winner_id'] == row['team1_id']:
            df.at[i, 'winning team'] = 1
        else:
            df.at[i, 'winning team'] = 0
    
    df["winning team"] = df["winning team"].astype(int)

    df.drop(columns=['winner_id'], inplace=True)

    return df


In [113]:
def drop_redun_cols(df: pd.DataFrame) -> pd.DataFrame:
    df.drop(columns=['season', 'city', 'series_name', 'venue', 'team1', 'team2',], inplace=True)
    
    return df

# Load the Data for Prediction

In [114]:
df = pd.read_csv('6644a1e287df6_test_data_with_samplefeatures.csv')
df

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,toss winner,toss decision,venue,...,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15
0,9250275,Jy,28594,7438955.0:8271969.0:8369661.0:3685247.0:259025...,Ud Ss of Aa,90,2436944.0:8117500.0:6703528.0:3010748.0:161387...,Ud Ss of Aa,field,Bo Ac Cb,...,2022-07-11,day match,Ud Ss of Aa tr of Ze,2022,3226,0.000000,0.019608,,0.00,
1,9262189,Ga An Ws,36084,3715690.0:6818622.0:4069666.0:393014.0:4690188...,Ja Ts,36098,4690258.0:3761246.0:8464385.0:5742470.0:315072...,Ja Ts,field,Pe Sm Ga,...,2022-09-21,night match,Cn Pr Le,2022,13915,0.615385,0.344262,151.285714,66.67,153.500000
2,9128776,Rn Rs,30428,8058959.0:2162782.0:2981614.0:7833195.0:755605...,Lw Sr Gs,48334,2654014.0:2954769.0:8058903.0:3479860.0:329940...,Rn Rs,bat,Be Sm Mi,...,2022-05-15,night match,In Pr Le,2022,2764,0.842105,0.753086,171.066667,100.00,179.625000
3,9586919,Ja Ts,36098,4690258.0:8464385.0:1613898.0:5744780.0:315072...,St La Ks,36112,2331475.0:313809.0:391103.0:2742837.0:2097017....,St La Ks,field,Dn Sy Nl Ct Sm Gs It St La,...,2023-08-16,night match,Cn Pr Le,2023,11892,0.285714,1.487805,166.000000,75.00,155.500000
4,9128538,Ci Sr Ks,30414,7422673.0:249087.0:62432.0:232000.0:5001170.0:...,Gt Ts,48341,1958683.0:7491224.0:3339160.0:5452754.0:225245...,Gt Ts,field,Ma Ct An Sm Pe,...,2022-04-17,night match,In Pr Le,2022,18752,2.375000,0.310330,169.933333,0.00,164.125000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,9094371,Ln St,46773,3200973.0:2958164.0:4223883.0:172199.0:4489974...,Wh Fe,46752,8364726.0:4003390.0:2158869.0:3651472.0:225245...,Wh Fe,field,Ls Ln,...,2022-08-24,night match,Te Hd Ms Cn,2022,83,0.916667,2.904762,144.545455,0.00,145.181818
267,9440500,Si La,69,7200598.0:4403531.0:2398346.0:5490582.0:326056...,Ia,55,5043310.0:7491224.0:3127354.0:3125562.0:437761...,Ia,field,Ma Ct An Sm Pe,...,2023-01-05,night match,Si La tr of Ia,2022/23,18752,0.714286,1.000000,159.066667,40.00,154.950000
268,9085173,Sx,9967,2263736.0:164233.0:8820496.0:4215098.0:74087.0...,Gn,7573,2531913.0:1722048.0:319948.0:5164844.0:4174610...,Gn,field,Sa Gs Cf,...,2022-06-19,day match,Vy Bt,2022,4521,1.888889,2.904762,174.400000,50.00,146.821429
269,8887752,Sy Tr,33963,7572123.0:1749075.0:6718802.0:5788320.0:753465...,Me Rs,33942,5406540.0:37351.0:46794.0:2336473.0:7534687.0:...,Me Rs,field,Ds Sm Me,...,2022-01-08,night match,Bg Bh Le,2021/22,8252,0.333333,1.487805,171.066667,100.00,161.750000


In [115]:
df = extract_batsman_bowler_scores(df, batsman_scores, bowler_scores)
# df = encode_winner_id(df)
df = add_toss_winner_id(df)
df = change_date_time(df)
df = drop_redun_cols(df)
df

  new_df.at[i, "t1_bowler_scores"] += bowler_scores[key_bowl][0]
  new_df.at[i, "t1_batsman_scores"] += batsman_scores[p][0]
  new_df.at[i, "t2_batsman_scores"] += batsman_scores[p][0]
  new_df.at[i, "t2_bowler_scores"] += bowler_scores[p][0]


Unnamed: 0,match id,team1_id,team2_id,toss decision,match_dt,lighting,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,t1_batsman_scores,t1_bowler_scores,t2_batsman_scores,t2_bowler_scores,toss winner id
0,8887752,33963,33942,field,2022-01-08,night match,8252,0.333333,1.487805,171.066667,100.00,161.750000,163.964132,69.524817,154.605878,69.149852,33942
1,8887780,33928,33963,bat,2022-01-10,day match,440,1.800000,1.000000,168.000000,50.00,158.777778,191.473110,63.218740,154.001169,67.789817,33928
2,8887794,33914,33921,bat,2022-01-12,night match,1476,0.750000,1.000000,160.533333,33.33,164.000000,158.351526,42.089740,206.556882,65.508094,33914
3,8887801,33928,33963,bat,2022-01-13,day/night match,8252,1.600000,1.000000,168.000000,50.00,161.750000,196.260752,74.511240,171.751169,72.789817,33928
4,8887745,33914,33949,field,2022-01-15,day match,930,0.692308,0.512195,160.533333,33.33,158.250000,182.849350,48.669964,146.638853,83.195539,33949
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,9615073,41,20,field,2023-12-14,day match,7930,0.923077,2.904762,169.600000,60.00,159.400000,205.452588,60.122461,203.581966,66.932422,20
267,9615087,20,41,field,2023-12-19,day/night match,14300,0.800000,1.000000,174.266667,60.00,151.625000,203.826411,66.173255,203.574071,68.835991,41
268,9702748,33949,33963,bat,2023-12-23,day/night match,6579,0.692308,0.047619,159.333333,50.00,197.000000,171.882387,79.204907,167.990754,61.701148,33949
269,9702776,33921,33963,bat,2023-12-27,night match,1476,0.461538,3.857143,163.800000,60.00,163.300000,155.234453,73.525631,171.990754,60.825011,33921


## Finding Historical Relationship

In [116]:
match_data = pd.read_csv("664389efa0868_match_level_scorecard.csv")
match_data = change_date_time(match_data)
match_data

Unnamed: 0,match id,team1,team2,winner,by,win amount,toss winner,toss decision,venue,city,...,inning2_runs,inning2_wickets,inning2_balls,team1_id,team1_roster_ids,team2_id,team2_roster_ids,series_type,winner_id,player_of_the_match_id
0,8638034,Nn Ds,Wn,Wn,wickets,9.0,Wn,field,By Ol,Mount Maunganui,...,152.0,1.0,97.0,17982,7907451.0:4381761.0:31464.0:258649.0:4949790.0...,18570,2653993.0:6718326.0:6718382.0:2486896.0:228878...,other_domestic,18570,
1,8588005,Me Rs,Sy Tr,Sy Tr,runs,7.0,Sy Tr,field,Ca Ol,Carrara,...,117.0,2.0,74.0,33942,37351.0:46794.0:5406540.0:2231928.0:181404.0:1...,33963,1506098.0:1749075.0:36665.0:2083409.0:7534652....,other_domestic,33963,1749075.0
2,8587837,Sy Ss,Be Ht,Be Ht,wickets,4.0,Sy Ss,bat,Be Ct Gd,Brisbane,...,171.0,6.0,119.0,33956,7869987.0:7620283.0:2076192.0:4002340.0:306369...,33921,7620269.0:2286437.0:87191.0:5786766.0:3114803....,other_domestic,33921,3890984.0
3,8638041,Nn Ds,Oo,Oo,wickets,2.0,Oo,field,By Ol,Mount Maunganui,...,156.0,8.0,126.0,17982,7907451.0:4381761.0:31464.0:4949790.0:258649.0...,18360,2319638.0:256080.0:7918280.0:3913447.0:2690498...,other_domestic,18360,
4,8587921,Ht Hs,Me Ss,Ht Hs,runs,21.0,Me Ss,field,Be Ol,Hobart,...,143.0,9.0,123.0,33928,4223883.0:2161599.0:1655436.0:5788418.0:319948...,33949,363047.0:2275097.0:3901078.0:2275195.0:4230127...,other_domestic,33928,3125849.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1684,9717504,Nw Zd,Bh,Bh,wickets,5.0,Bh,field,Mn Pk Nr,Napier,...,137.0,5.0,114.0,48,6718326.0:4381761.0:2672214.0:5764576.0:306995...,188,3758565.0:2104332.0:4403419.0:3056752.0:693023...,international,188,5831622.0
1685,9866373,Oo,Wn,Wn,wickets,9.0,Oo,bat,Mx Pk Aa,Alexandra,...,53.0,1.0,29.0,18360,2319638.0:8444855.0:4172706.0:7907444.0:631714...,18570,9036516.0:8271969.0:3913447.0:9523954.0:835478...,other_domestic,18570,6718410.0
1686,9702783,Ht Hs,Me Ss,Me Ss,wickets,7.0,Me Ss,field,Be Ol Ht,Hobart,...,67.0,3.0,42.0,33928,1611364.0:5843200.0:5716402.0:3890963.0:194364...,33949,5406540.0:2275195.0:2275097.0:3901078.0:796099...,other_domestic,33949,2275195.0
1687,9702790,Ae Ss,Me Rs,Me Rs,wickets,4.0,Ae Ss,bat,Ds Sm Me,Melbourne,...,178.0,5.0,115.0,33914,4239038.0:2161599.0:2286472.0:5469008.0:368195...,33942,4003390.0:2654014.0:8176356.0:46794.0:1635773....,other_domestic,33942,8176356.0


In [117]:
def update_history(new_df: pd.DataFrame) -> pd.DataFrame:
    new_df["t1_points"] = 0
    new_df["t2_points"] = 0

    for i, row in new_df.iterrows():
        c_t1_id = row['team1_id']
        c_t2_id = row['team2_id']
        match_date = row['match_dt']
        
        for j, row2 in match_data.iterrows():
            if row2['match_dt'] > match_date:
                break
            
            if row2['team1_id'] == c_t1_id and row2['team2_id'] == c_t2_id:
                if row2['winner_id'] == c_t1_id:
                    new_df.at[i, 't1_points'] += 1
                elif row2['winner_id'] == c_t2_id:
                    new_df.at[i, 't2_points'] += 1

    return new_df

In [118]:
df = update_history(df)
df

Unnamed: 0,match id,team1_id,team2_id,toss decision,match_dt,lighting,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,t1_batsman_scores,t1_bowler_scores,t2_batsman_scores,t2_bowler_scores,toss winner id,t1_points,t2_points
0,8887752,33963,33942,field,2022-01-08,night match,8252,0.333333,1.487805,171.066667,100.00,161.750000,163.964132,69.524817,154.605878,69.149852,33942,0,0
1,8887780,33928,33963,bat,2022-01-10,day match,440,1.800000,1.000000,168.000000,50.00,158.777778,191.473110,63.218740,154.001169,67.789817,33928,0,0
2,8887794,33914,33921,bat,2022-01-12,night match,1476,0.750000,1.000000,160.533333,33.33,164.000000,158.351526,42.089740,206.556882,65.508094,33914,1,1
3,8887801,33928,33963,bat,2022-01-13,day/night match,8252,1.600000,1.000000,168.000000,50.00,161.750000,196.260752,74.511240,171.751169,72.789817,33928,0,0
4,8887745,33914,33949,field,2022-01-15,day match,930,0.692308,0.512195,160.533333,33.33,158.250000,182.849350,48.669964,146.638853,83.195539,33949,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,9615073,41,20,field,2023-12-14,day match,7930,0.923077,2.904762,169.600000,60.00,159.400000,205.452588,60.122461,203.581966,66.932422,20,1,1
267,9615087,20,41,field,2023-12-19,day/night match,14300,0.800000,1.000000,174.266667,60.00,151.625000,203.826411,66.173255,203.574071,68.835991,41,2,2
268,9702748,33949,33963,bat,2023-12-23,day/night match,6579,0.692308,0.047619,159.333333,50.00,197.000000,171.882387,79.204907,167.990754,61.701148,33949,1,2
269,9702776,33921,33963,bat,2023-12-27,night match,1476,0.461538,3.857143,163.800000,60.00,163.300000,155.234453,73.525631,171.990754,60.825011,33921,1,1


In [119]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271 entries, 0 to 270
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   match id                   271 non-null    int64         
 1   team1_id                   271 non-null    int64         
 2   team2_id                   271 non-null    int64         
 3   toss decision              271 non-null    object        
 4   match_dt                   271 non-null    datetime64[ns]
 5   lighting                   271 non-null    object        
 6   ground_id                  271 non-null    int64         
 7   team_count_50runs_last15   271 non-null    float64       
 8   team_winp_last5            271 non-null    float64       
 9   team1only_avg_runs_last15  264 non-null    float64       
 10  team1_winp_team2_last15    271 non-null    float64       
 11  ground_avg_runs_last15     261 non-null    float64       
 12  t1_batsm

# Load Model

In [120]:
loaded_clf = joblib.load('xgb_pipeline_model.pkl')

In [121]:
y_pred = loaded_clf.predict(df)

In [122]:
y_pred_prob = loaded_clf.predict_proba(df)

In [123]:
y_pred_df = pd.DataFrame(y_pred, columns=['y_pred'])
y_pred_prob_df = pd.DataFrame(y_pred_prob, columns=['y_pred_prob_0', 'y_pred_prob_1'])

In [124]:
final_df = pd.concat([df, y_pred_df, y_pred_prob_df], axis=1)

final_df

Unnamed: 0,match id,team1_id,team2_id,toss decision,match_dt,lighting,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,...,t1_batsman_scores,t1_bowler_scores,t2_batsman_scores,t2_bowler_scores,toss winner id,t1_points,t2_points,y_pred,y_pred_prob_0,y_pred_prob_1
0,8887752,33963,33942,field,2022-01-08,night match,8252,0.333333,1.487805,171.066667,...,163.964132,69.524817,154.605878,69.149852,33942,0,0,1,0.212151,0.787849
1,8887780,33928,33963,bat,2022-01-10,day match,440,1.800000,1.000000,168.000000,...,191.473110,63.218740,154.001169,67.789817,33928,0,0,1,0.077616,0.922384
2,8887794,33914,33921,bat,2022-01-12,night match,1476,0.750000,1.000000,160.533333,...,158.351526,42.089740,206.556882,65.508094,33914,1,1,1,0.199798,0.800202
3,8887801,33928,33963,bat,2022-01-13,day/night match,8252,1.600000,1.000000,168.000000,...,196.260752,74.511240,171.751169,72.789817,33928,0,0,1,0.207946,0.792054
4,8887745,33914,33949,field,2022-01-15,day match,930,0.692308,0.512195,160.533333,...,182.849350,48.669964,146.638853,83.195539,33949,0,1,0,0.993977,0.006023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,9615073,41,20,field,2023-12-14,day match,7930,0.923077,2.904762,169.600000,...,205.452588,60.122461,203.581966,66.932422,20,1,1,1,0.483179,0.516821
267,9615087,20,41,field,2023-12-19,day/night match,14300,0.800000,1.000000,174.266667,...,203.826411,66.173255,203.574071,68.835991,41,2,2,1,0.188995,0.811005
268,9702748,33949,33963,bat,2023-12-23,day/night match,6579,0.692308,0.047619,159.333333,...,171.882387,79.204907,167.990754,61.701148,33949,1,2,0,0.916210,0.083790
269,9702776,33921,33963,bat,2023-12-27,night match,1476,0.461538,3.857143,163.800000,...,155.234453,73.525631,171.990754,60.825011,33921,1,1,0,0.559654,0.440346


In [125]:

final_df['win_pred_team_id'] = final_df.apply(
    lambda row: row['team1_id'] if row['y_pred_prob_1'] > row['y_pred_prob_0'] else row['team2_id'], 
    axis=1
)

final_df['win_pred_score'] = final_df.apply(
    lambda row: row['y_pred_prob_1'] if row['y_pred_prob_1'] > row['y_pred_prob_0'] else row['y_pred_prob_0'], 
    axis=1
)

final_df


Unnamed: 0,match id,team1_id,team2_id,toss decision,match_dt,lighting,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,...,t2_batsman_scores,t2_bowler_scores,toss winner id,t1_points,t2_points,y_pred,y_pred_prob_0,y_pred_prob_1,win_pred_team_id,win_pred_score
0,8887752,33963,33942,field,2022-01-08,night match,8252,0.333333,1.487805,171.066667,...,154.605878,69.149852,33942,0,0,1,0.212151,0.787849,33963,0.787849
1,8887780,33928,33963,bat,2022-01-10,day match,440,1.800000,1.000000,168.000000,...,154.001169,67.789817,33928,0,0,1,0.077616,0.922384,33928,0.922384
2,8887794,33914,33921,bat,2022-01-12,night match,1476,0.750000,1.000000,160.533333,...,206.556882,65.508094,33914,1,1,1,0.199798,0.800202,33914,0.800202
3,8887801,33928,33963,bat,2022-01-13,day/night match,8252,1.600000,1.000000,168.000000,...,171.751169,72.789817,33928,0,0,1,0.207946,0.792054,33928,0.792054
4,8887745,33914,33949,field,2022-01-15,day match,930,0.692308,0.512195,160.533333,...,146.638853,83.195539,33949,0,1,0,0.993977,0.006023,33949,0.993977
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,9615073,41,20,field,2023-12-14,day match,7930,0.923077,2.904762,169.600000,...,203.581966,66.932422,20,1,1,1,0.483179,0.516821,41,0.516821
267,9615087,20,41,field,2023-12-19,day/night match,14300,0.800000,1.000000,174.266667,...,203.574071,68.835991,41,2,2,1,0.188995,0.811005,20,0.811005
268,9702748,33949,33963,bat,2023-12-23,day/night match,6579,0.692308,0.047619,159.333333,...,167.990754,61.701148,33949,1,2,0,0.916210,0.083790,33963,0.916210
269,9702776,33921,33963,bat,2023-12-27,night match,1476,0.461538,3.857143,163.800000,...,171.990754,60.825011,33921,1,1,0,0.559654,0.440346,33963,0.559654


In [126]:
final_df.to_csv('test_prediction.csv', index=False)

In [127]:
classifier = loaded_clf.named_steps['classifier']
params = classifier.get_params()
params

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': 0.7,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': 0.4,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': 0.6,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': 10,
 'max_leaves': None,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [128]:
n_estimators = params.get('n_estimators', 'Not set')
max_depth = params.get('max_depth', 'Not set')
learning_rate = params.get('learning_rate', 'Not set')

print(f"Number of Trees (n_estimators): {n_estimators}")
print(f"Max Depth of Trees (max_depth): {max_depth}")
print(f"Learning Rate (learning_rate): {learning_rate}")


Number of Trees (n_estimators): None
Max Depth of Trees (max_depth): 10
Learning Rate (learning_rate): 0.6
