In [25]:
import pandas as pd
import numpy as np
import joblib
import random

# Feature Extraction and Data Cleaning

## Feature Engineering for batsman and bowler

The runs/match is the new metric for a batsman. \
The eco/match is the new metric for a bowler.

If batsman/bowler isnt in the dict, then avg score will be imputed.

In [26]:
batsman_df = pd.read_csv('663e2b548c98c_batsman_level_scorecard.csv')
bowler_df = pd.read_csv('663e2b2c60743_bowler_level_scorecard.csv')

In [27]:
batsman_scores = {}
bowler_scores = {}
for i, row in batsman_df.iterrows():
    if row['batsman_id'] in batsman_scores:
        batsman_scores[row['batsman_id']][1] += 1
        batsman_scores[row['batsman_id']][0] = (batsman_scores[row['batsman_id']][0]*(batsman_scores[row['batsman_id']][1] - 1) + row['runs']) / batsman_scores[row['batsman_id']][1]
    else:
        batsman_scores[row['batsman_id']] = [row['runs'], 1]

for i, row in bowler_df.iterrows():
    if row['bowler_id'] in bowler_scores:
        bowler_scores[row['bowler_id']][1] += 1
        bowler_scores[row['bowler_id']][0] = (bowler_scores[row['bowler_id']][0]*(bowler_scores[row['bowler_id']][1] - 1) + row['economy']) / bowler_scores[row['bowler_id']][1]
    else:
        bowler_scores[row['bowler_id']] = [row['economy'], 1]

## Mods to original df


1. Drop season
2. Drop city
3. Expand and drop roster_ids
4. Change toss winner team into its ID
5. Change date time object into crt format and order by date 
6. series_name ? -- Drop?
7. venue ? -- Drop?
8. Maybe remove "team_count_50runs_last15" ?? 

In [28]:
def extract_player_ids(df: pd.DataFrame) -> pd.DataFrame:
    t1_players = df['team1_roster_ids'].str.split(':', expand=True)
    t2_players = df['team2_roster_ids'].str.split(':', expand=True)
    
    # Convert split strings to integers
    t1_players = t1_players.apply(pd.to_numeric, errors='coerce').astype('Int64')
    t2_players = t2_players.apply(pd.to_numeric, errors='coerce').astype('Int64')
    
    # Some cols had extra team members, remove them
    t1_players.drop(columns=11, inplace=True)
    t2_players.drop(columns=11, inplace=True)
    
    t1_players_headings = [f"t1_p{i+1}" for i in range(t1_players.shape[1])]
    t2_players_headings = [f"t2_p{i+1}" for i in range(t2_players.shape[1])]
    
    t1_players.columns = t1_players_headings
    t2_players.columns = t2_players_headings
    df = pd.concat([df, t1_players, t2_players], axis=1)
    df.drop(columns=['team1_roster_ids', 'team2_roster_ids'], inplace=True)

    return df


In [29]:
def extract_batsman_bowler_scores(new_df: pd.DataFrame, batsman_scores: dict, bowler_scores: dict) -> pd.DataFrame: 
    new_df["t1_batsman_scores"] = 0
    new_df["t1_bowler_scores"] = 0

    new_df["t2_batsman_scores"] = 0
    new_df["t2_bowler_scores"] = 0

    new_players = 0

    t1_players = new_df["team1_roster_ids"].str.split(":")
    for i, row in new_df.iterrows():
        for p in t1_players[i]:
            p = float(p)
            if p in batsman_scores.keys():
                new_df.at[i, "t1_batsman_scores"] += batsman_scores[p][0]
            if p in bowler_scores.keys():
                new_df.at[i, "t1_bowler_scores"] += bowler_scores[p][0]
            if p not in batsman_scores.keys() and p not in bowler_scores.keys():
                new_players += 1
                key_bat, val = random.choice(list(batsman_scores.items()))
                key_bowl, val = random.choice(list(bowler_scores.items()))
                new_df.at[i, "t1_batsman_scores"] += batsman_scores[key_bat][0]
                new_df.at[i, "t1_bowler_scores"] += bowler_scores[key_bowl][0]
    
    t2_players = new_df["team2_roster_ids"].str.split(":")
    for i, row in new_df.iterrows():
        for p in t2_players[i]:
            p = float(p)
            if p in batsman_scores.keys():
                new_df.at[i, "t2_batsman_scores"] += batsman_scores[p][0]
            if p in bowler_scores.keys():
                new_df.at[i, "t2_bowler_scores"] += bowler_scores[p][0]
            if p not in batsman_scores.keys() and p not in bowler_scores.keys():
                new_players += 1
                key_bat, val = random.choice(list(batsman_scores.items()))
                key_bowl, val = random.choice(list(bowler_scores.items()))
                new_df.at[i, "t2_batsman_scores"] += batsman_scores[key_bat][0]
                new_df.at[i, "t2_bowler_scores"] += bowler_scores[key_bowl][0]

    new_df.drop(columns=['team1_roster_ids', 'team2_roster_ids'], inplace=True)

    return new_df

In [30]:
def change_date_time(df: pd.DataFrame) -> pd.DataFrame:
    df['match_dt'] = pd.to_datetime(df['match_dt'], format='%Y-%m-%d')
    df = df.sort_values(by='match_dt')
    df.reset_index(drop=True, inplace=True)

    return df

In [31]:
def add_toss_winner_id(df: pd.DataFrame) -> pd.DataFrame:
    df["toss winner id"] = None  # initialize the column with None values
    for i, row in df.iterrows():
        if row['toss winner'] == row['team1']:
            df.at[i, 'toss winner id'] = row['team1_id']
        else:
            df.at[i, 'toss winner id'] = row['team2_id']

    df["toss winner id"] = df["toss winner id"].astype(int)

    df.drop(columns=['toss winner'], inplace=True)

    return df

In [32]:
def encode_winner_id(df: pd.DataFrame) -> pd.DataFrame:
    # 1- team1, 0 - team2
    df["winning team"] = None
    for i, row in df.iterrows():
        if row['winner_id'] == row['team1_id']:
            df.at[i, 'winning team'] = 1
        else:
            df.at[i, 'winning team'] = 0
    
    df["winning team"] = df["winning team"].astype(int)

    df.drop(columns=['winner_id'], inplace=True)

    return df


In [33]:
def drop_redun_cols(df: pd.DataFrame) -> pd.DataFrame:
    df.drop(columns=['season', 'city', 'series_name', 'venue', 'team1', 'team2',], inplace=True)
    
    return df

# Load the Data for Prediction

In [34]:
df = pd.read_csv('663e2b6d54457_train_data_with_samplefeatures.csv')
df

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,winner,winner_id,toss winner,...,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15
0,9331181,Ba,11283,9373356.0:7857520.0:4232164.0:4566540.0:329940...,Hl Ph,12634,3500958.0:4231751.0:2735081.0:2035102.0:369833...,Hl Ph,12634,Hl Ph,...,2022-10-20,day/night match,Sd Mq Ai Ty,2022/23,7398,1.666667,0.672131,139.000000,100.00,157.178571
1,8797060,Ed,20,2089079.0:6139370.0:2076192.0:62432.0:2083409....,Wt Is,41,4690258.0:4069666.0:4230127.0:1942317.0:161392...,Ed,20,Wt Is,...,2022-01-23,day/night match,Ed tr of Wt Is,2021/22,1406,1.285714,1.952381,156.000000,50.00,103.500000
2,9433269,We,10576,3298427.0:2288789.0:7773338.0:3519011.0:368195...,Ne,8987,4003390.0:1749075.0:1626526.0:4172447.0:551672...,We,10576,Ne,...,2023-06-02,day/night match,Vy Bt,2023,251,0.857143,0.672131,173.266667,0.00,154.333333
3,9587073,Ga An Ws,36084,8127230.0:4690328.0:4069666.0:7960847.0:469018...,Bs Rs,36070,3462080.0:2436405.0:1798705.0:7550857.0:574247...,Ga An Ws,36084,Ga An Ws,...,2023-09-10,day match,Cn Pr Le,2023,14300,2.166667,1.975610,164.266667,50.00,144.250000
4,9516457,Pb Ks,30407,8127181.0:197658.0:4239038.0:2398346.0:5053082...,Gt Ts,48341,1958683.0:7491224.0:8059029.0:4377610.0:225245...,Gt Ts,48341,Gt Ts,...,2023-04-13,night match,In Pr Le,2023,7118,0.818182,1.327869,164.666667,0.00,189.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,9128601,Pb Ks,30407,2789079.0:197658.0:2398346.0:2827327.0:2082044...,Ci Sr Ks,30414,7422673.0:249087.0:3519011.0:5001170.0:232000....,Pb Ks,30407,Ci Sr Ks,...,2022-04-25,night match,In Pr Le,2022,5004,0.823529,1.000000,147.333333,66.67,166.400000
944,9433241,Mx,8700,4421689.0:7752989.0:1941743.0:4489722.0:767287...,St,9701,6139370.0:7694581.0:3294444.0:3239102.0:632036...,St,9701,St,...,2023-06-02,day/night match,Vy Bt,2023,1042,1.571429,0.012346,167.400000,0.00,170.466667
945,9097227,Bd,22497,4239773.0:1941743.0:3007969.0:4172972.0:155625...,Wn Pe,23869,323049.0:4876122.0:4164978.0:1837205.0:3373138...,Wn Pe,23869,Bd,...,2022-02-07,day match,CA T0 Ce,2021/22,1224,3.000000,1.000000,,0.00,
946,9516695,Rn Rs,30428,8058959.0:2162782.0:2981614.0:4690188.0:212569...,Ss Hd,36014,5958840.0:7491294.0:3127354.0:3057312.0:420349...,Ss Hd,36014,Rn Rs,...,2023-05-07,night match,In Pr Le,2023,4661,0.789474,1.487805,182.800000,66.67,133.375000


In [35]:
df = extract_batsman_bowler_scores(df, batsman_scores, bowler_scores)
# df = encode_winner_id(df)
df = add_toss_winner_id(df)
df = change_date_time(df)
df = drop_redun_cols(df)
df

  new_df.at[i, "t1_batsman_scores"] += batsman_scores[p][0]
  new_df.at[i, "t1_bowler_scores"] += bowler_scores[key_bowl][0]
  new_df.at[i, "t2_batsman_scores"] += batsman_scores[p][0]
  new_df.at[i, "t2_bowler_scores"] += bowler_scores[key_bowl][0]


Unnamed: 0,match id,team1_id,team2_id,winner,winner_id,toss decision,match_dt,lighting,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,t1_batsman_scores,t1_bowler_scores,t2_batsman_scores,t2_bowler_scores,toss winner id
0,8887675,33921,33928,Be Ht,33921,field,2022-01-01,day/night match,6348,0.583333,0.672131,166.933333,0.0,162.333333,164.990940,64.865668,180.486717,59.819847,33928
1,8887689,33935,33949,Ph Ss,33935,bat,2022-01-02,day/night match,2932,1.222222,1.327869,178.733333,100.0,,175.942939,65.563866,145.693795,66.384949,33935
2,8887703,33949,33942,Me Rs,33942,bat,2022-01-03,night match,440,0.400000,41.000000,171.066667,50.0,162.687500,131.260843,65.224949,178.413958,72.468163,33949
3,8887717,33928,33914,Ae Ss,33914,field,2022-01-05,night match,930,1.000000,61.000000,173.333333,100.0,161.045455,209.260752,64.958383,160.236141,49.645070,33914
4,8887731,33921,33942,Me Rs,33942,field,2022-01-06,night match,3961,0.363636,1.952381,167.000000,100.0,,160.291849,71.456329,171.886181,77.495298,33942
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,9717504,48,188,Bh,188,field,2023-12-27,night match,3184,1.000000,0.603960,177.400000,80.0,159.666667,168.948516,82.523262,179.682191,74.880587,188
944,9866373,18360,18570,Wn,18570,bat,2023-12-28,day/night match,5382,1.400000,1.487805,165.733333,0.0,,146.101642,67.486819,125.940873,52.543500,18360
945,9702783,33928,33949,Me Ss,33949,field,2023-12-28,night match,6348,1.444444,1.000000,175.933333,40.0,152.785714,162.632115,53.129189,175.110821,77.188414,33949
946,9702790,33914,33942,Me Rs,33942,bat,2023-12-29,night match,8252,0.529412,1.000000,162.133333,40.0,158.500000,182.642780,62.116303,179.579465,50.631640,33914


## Finding Historical Relationship

In [36]:
match_data = pd.read_csv("664389efa0868_match_level_scorecard.csv")
match_data = change_date_time(match_data)
match_data

Unnamed: 0,match id,team1,team2,winner,by,win amount,toss winner,toss decision,venue,city,...,inning2_runs,inning2_wickets,inning2_balls,team1_id,team1_roster_ids,team2_id,team2_roster_ids,series_type,winner_id,player_of_the_match_id
0,8638034,Nn Ds,Wn,Wn,wickets,9.0,Wn,field,By Ol,Mount Maunganui,...,152.0,1.0,97.0,17982,7907451.0:4381761.0:31464.0:258649.0:4949790.0...,18570,2653993.0:6718326.0:6718382.0:2486896.0:228878...,other_domestic,18570,
1,8588005,Me Rs,Sy Tr,Sy Tr,runs,7.0,Sy Tr,field,Ca Ol,Carrara,...,117.0,2.0,74.0,33942,37351.0:46794.0:5406540.0:2231928.0:181404.0:1...,33963,1506098.0:1749075.0:36665.0:2083409.0:7534652....,other_domestic,33963,1749075.0
2,8587837,Sy Ss,Be Ht,Be Ht,wickets,4.0,Sy Ss,bat,Be Ct Gd,Brisbane,...,171.0,6.0,119.0,33956,7869987.0:7620283.0:2076192.0:4002340.0:306369...,33921,7620269.0:2286437.0:87191.0:5786766.0:3114803....,other_domestic,33921,3890984.0
3,8638041,Nn Ds,Oo,Oo,wickets,2.0,Oo,field,By Ol,Mount Maunganui,...,156.0,8.0,126.0,17982,7907451.0:4381761.0:31464.0:4949790.0:258649.0...,18360,2319638.0:256080.0:7918280.0:3913447.0:2690498...,other_domestic,18360,
4,8587921,Ht Hs,Me Ss,Ht Hs,runs,21.0,Me Ss,field,Be Ol,Hobart,...,143.0,9.0,123.0,33928,4223883.0:2161599.0:1655436.0:5788418.0:319948...,33949,363047.0:2275097.0:3901078.0:2275195.0:4230127...,other_domestic,33928,3125849.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1684,9717504,Nw Zd,Bh,Bh,wickets,5.0,Bh,field,Mn Pk Nr,Napier,...,137.0,5.0,114.0,48,6718326.0:4381761.0:2672214.0:5764576.0:306995...,188,3758565.0:2104332.0:4403419.0:3056752.0:693023...,international,188,5831622.0
1685,9866373,Oo,Wn,Wn,wickets,9.0,Oo,bat,Mx Pk Aa,Alexandra,...,53.0,1.0,29.0,18360,2319638.0:8444855.0:4172706.0:7907444.0:631714...,18570,9036516.0:8271969.0:3913447.0:9523954.0:835478...,other_domestic,18570,6718410.0
1686,9702783,Ht Hs,Me Ss,Me Ss,wickets,7.0,Me Ss,field,Be Ol Ht,Hobart,...,67.0,3.0,42.0,33928,1611364.0:5843200.0:5716402.0:3890963.0:194364...,33949,5406540.0:2275195.0:2275097.0:3901078.0:796099...,other_domestic,33949,2275195.0
1687,9702790,Ae Ss,Me Rs,Me Rs,wickets,4.0,Ae Ss,bat,Ds Sm Me,Melbourne,...,178.0,5.0,115.0,33914,4239038.0:2161599.0:2286472.0:5469008.0:368195...,33942,4003390.0:2654014.0:8176356.0:46794.0:1635773....,other_domestic,33942,8176356.0


In [37]:
def update_history(new_df: pd.DataFrame) -> pd.DataFrame:
    new_df["t1_points"] = 0
    new_df["t2_points"] = 0

    for i, row in new_df.iterrows():
        c_t1_id = row['team1_id']
        c_t2_id = row['team2_id']
        match_date = row['match_dt']
        
        for j, row2 in match_data.iterrows():
            if row2['match_dt'] > match_date:
                break
            
            if row2['team1_id'] == c_t1_id and row2['team2_id'] == c_t2_id:
                if row2['winner_id'] == c_t1_id:
                    new_df.at[i, 't1_points'] += 1
                elif row2['winner_id'] == c_t2_id:
                    new_df.at[i, 't2_points'] += 1

    return new_df

In [38]:
df = update_history(df)
df

Unnamed: 0,match id,team1_id,team2_id,winner,winner_id,toss decision,match_dt,lighting,ground_id,team_count_50runs_last15,...,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,t1_batsman_scores,t1_bowler_scores,t2_batsman_scores,t2_bowler_scores,toss winner id,t1_points,t2_points
0,8887675,33921,33928,Be Ht,33921,field,2022-01-01,day/night match,6348,0.583333,...,166.933333,0.0,162.333333,164.990940,64.865668,180.486717,59.819847,33928,1,0
1,8887689,33935,33949,Ph Ss,33935,bat,2022-01-02,day/night match,2932,1.222222,...,178.733333,100.0,,175.942939,65.563866,145.693795,66.384949,33935,2,0
2,8887703,33949,33942,Me Rs,33942,bat,2022-01-03,night match,440,0.400000,...,171.066667,50.0,162.687500,131.260843,65.224949,178.413958,72.468163,33949,0,2
3,8887717,33928,33914,Ae Ss,33914,field,2022-01-05,night match,930,1.000000,...,173.333333,100.0,161.045455,209.260752,64.958383,160.236141,49.645070,33914,0,1
4,8887731,33921,33942,Me Rs,33942,field,2022-01-06,night match,3961,0.363636,...,167.000000,100.0,,160.291849,71.456329,171.886181,77.495298,33942,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,9717504,48,188,Bh,188,field,2023-12-27,night match,3184,1.000000,...,177.400000,80.0,159.666667,168.948516,82.523262,179.682191,74.880587,188,6,3
944,9866373,18360,18570,Wn,18570,bat,2023-12-28,day/night match,5382,1.400000,...,165.733333,0.0,,146.101642,67.486819,125.940873,52.543500,18360,0,4
945,9702783,33928,33949,Me Ss,33949,field,2023-12-28,night match,6348,1.444444,...,175.933333,40.0,152.785714,162.632115,53.129189,175.110821,77.188414,33949,2,1
946,9702790,33914,33942,Me Rs,33942,bat,2023-12-29,night match,8252,0.529412,...,162.133333,40.0,158.500000,182.642780,62.116303,179.579465,50.631640,33914,2,3


In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 948 entries, 0 to 947
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   match id                   948 non-null    int64         
 1   team1_id                   948 non-null    int64         
 2   team2_id                   948 non-null    int64         
 3   winner                     948 non-null    object        
 4   winner_id                  948 non-null    int64         
 5   toss decision              948 non-null    object        
 6   match_dt                   948 non-null    datetime64[ns]
 7   lighting                   948 non-null    object        
 8   ground_id                  948 non-null    int64         
 9   team_count_50runs_last15   948 non-null    float64       
 10  team_winp_last5            948 non-null    float64       
 11  team1only_avg_runs_last15  927 non-null    float64       
 12  team1_wi

# Load Model

In [40]:
loaded_clf = joblib.load('xgb_pipeline_model.pkl')

In [41]:
y_pred = loaded_clf.predict(df)

In [42]:
y_pred_prob = loaded_clf.predict_proba(df)

In [43]:
y_pred_df = pd.DataFrame(y_pred, columns=['y_pred'])
y_pred_prob_df = pd.DataFrame(y_pred_prob, columns=['y_pred_prob_0', 'y_pred_prob_1'])

In [44]:
final_df = pd.concat([df, y_pred_df, y_pred_prob_df], axis=1)

final_df

Unnamed: 0,match id,team1_id,team2_id,winner,winner_id,toss decision,match_dt,lighting,ground_id,team_count_50runs_last15,...,t1_batsman_scores,t1_bowler_scores,t2_batsman_scores,t2_bowler_scores,toss winner id,t1_points,t2_points,y_pred,y_pred_prob_0,y_pred_prob_1
0,8887675,33921,33928,Be Ht,33921,field,2022-01-01,day/night match,6348,0.583333,...,164.990940,64.865668,180.486717,59.819847,33928,1,0,1,0.000848,0.999152
1,8887689,33935,33949,Ph Ss,33935,bat,2022-01-02,day/night match,2932,1.222222,...,175.942939,65.563866,145.693795,66.384949,33935,2,0,1,0.004440,0.995560
2,8887703,33949,33942,Me Rs,33942,bat,2022-01-03,night match,440,0.400000,...,131.260843,65.224949,178.413958,72.468163,33949,0,2,0,0.997922,0.002078
3,8887717,33928,33914,Ae Ss,33914,field,2022-01-05,night match,930,1.000000,...,209.260752,64.958383,160.236141,49.645070,33914,0,1,0,0.998843,0.001157
4,8887731,33921,33942,Me Rs,33942,field,2022-01-06,night match,3961,0.363636,...,160.291849,71.456329,171.886181,77.495298,33942,1,1,0,0.995797,0.004203
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,9717504,48,188,Bh,188,field,2023-12-27,night match,3184,1.000000,...,168.948516,82.523262,179.682191,74.880587,188,6,3,0,0.589581,0.410419
944,9866373,18360,18570,Wn,18570,bat,2023-12-28,day/night match,5382,1.400000,...,146.101642,67.486819,125.940873,52.543500,18360,0,4,0,0.996873,0.003127
945,9702783,33928,33949,Me Ss,33949,field,2023-12-28,night match,6348,1.444444,...,162.632115,53.129189,175.110821,77.188414,33949,2,1,1,0.030759,0.969241
946,9702790,33914,33942,Me Rs,33942,bat,2023-12-29,night match,8252,0.529412,...,182.642780,62.116303,179.579465,50.631640,33914,2,3,1,0.201247,0.798753


In [45]:

final_df['win_pred_team_id'] = final_df.apply(
    lambda row: row['team1_id'] if row['y_pred_prob_1'] > row['y_pred_prob_0'] else row['team2_id'], 
    axis=1
)

final_df['win_pred_score'] = final_df.apply(
    lambda row: row['y_pred_prob_1'] if row['y_pred_prob_1'] > row['y_pred_prob_0'] else row['y_pred_prob_0'], 
    axis=1
)

final_df


Unnamed: 0,match id,team1_id,team2_id,winner,winner_id,toss decision,match_dt,lighting,ground_id,team_count_50runs_last15,...,t2_batsman_scores,t2_bowler_scores,toss winner id,t1_points,t2_points,y_pred,y_pred_prob_0,y_pred_prob_1,win_pred_team_id,win_pred_score
0,8887675,33921,33928,Be Ht,33921,field,2022-01-01,day/night match,6348,0.583333,...,180.486717,59.819847,33928,1,0,1,0.000848,0.999152,33921,0.999152
1,8887689,33935,33949,Ph Ss,33935,bat,2022-01-02,day/night match,2932,1.222222,...,145.693795,66.384949,33935,2,0,1,0.004440,0.995560,33935,0.995560
2,8887703,33949,33942,Me Rs,33942,bat,2022-01-03,night match,440,0.400000,...,178.413958,72.468163,33949,0,2,0,0.997922,0.002078,33942,0.997922
3,8887717,33928,33914,Ae Ss,33914,field,2022-01-05,night match,930,1.000000,...,160.236141,49.645070,33914,0,1,0,0.998843,0.001157,33914,0.998843
4,8887731,33921,33942,Me Rs,33942,field,2022-01-06,night match,3961,0.363636,...,171.886181,77.495298,33942,1,1,0,0.995797,0.004203,33942,0.995797
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,9717504,48,188,Bh,188,field,2023-12-27,night match,3184,1.000000,...,179.682191,74.880587,188,6,3,0,0.589581,0.410419,188,0.589581
944,9866373,18360,18570,Wn,18570,bat,2023-12-28,day/night match,5382,1.400000,...,125.940873,52.543500,18360,0,4,0,0.996873,0.003127,18570,0.996873
945,9702783,33928,33949,Me Ss,33949,field,2023-12-28,night match,6348,1.444444,...,175.110821,77.188414,33949,2,1,1,0.030759,0.969241,33928,0.969241
946,9702790,33914,33942,Me Rs,33942,bat,2023-12-29,night match,8252,0.529412,...,179.579465,50.631640,33914,2,3,1,0.201247,0.798753,33914,0.798753


In [46]:
final_df.to_csv('train_prediction.csv', index=False)

In [47]:
classifier = loaded_clf.named_steps['classifier']
params = classifier.get_params()
params

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': 0.7,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': 0.4,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': 0.6,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': 10,
 'max_leaves': None,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [48]:
n_estimators = params.get('n_estimators', 'Not set')
max_depth = params.get('max_depth', 'Not set')
learning_rate = params.get('learning_rate', 'Not set')

print(f"Number of Trees (n_estimators): {n_estimators}")
print(f"Max Depth of Trees (max_depth): {max_depth}")
print(f"Learning Rate (learning_rate): {learning_rate}")


Number of Trees (n_estimators): None
Max Depth of Trees (max_depth): 10
Learning Rate (learning_rate): 0.6
