In [1]:
import os
import gc


import numpy as np
import pandas as pd
import cv2


import math
import xgboost as xgb
import torch
import scipy.stats as sss


from sklearn.metrics import roc_auc_score, matthews_corrcoef
from glob import glob
from tqdm import tqdm
from shutil import copyfile

In [2]:
import random

In [3]:
def seed_everything(random_seed):
    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)
    os.environ["PYTHONHASHSEED"] = str(random_seed)
#     if torch.cuda.is_available():
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)
    # torch.backends.cudnn.enabled = False
    torch.backends.cudnn.benchmark = False        
    torch.backends.cudnn.deterministic = True

In [4]:
class Config:
    NAME = "xgb_post"

    seed = 42
    num_fold = 5
    
    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'learning_rate':0.005,
        'tree_method':'hist' if not torch.cuda.is_available() else 'gpu_hist'
    }

In [5]:
def setup(cfg):
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # set dirs
    cfg.EXP = cfg.NAME

    cfg.EXP_MODEL = os.path.join(cfg.EXP, 'model')
    cfg.EXP_FIG = os.path.join(cfg.EXP, 'fig')
    cfg.EXP_PREDS = os.path.join(cfg.EXP, 'preds')

    # make dirs
    for d in [cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
        os.makedirs(d, exist_ok=True)
        
    return cfg


In [6]:
cfg = setup(Config)

In [7]:
seed_everything(cfg.seed)

### Pre folds setup

In [8]:
df_train_df = pd.read_csv('./xgb_pre/model/xgb_G_oof.csv') 
    # train_df['contact_id'][:2] =>
    # 0    58168_003392_0_38590_43854
    # 1    58168_003392_0_38590_41257
    # Name: contact_id, dtype: object
df_train_df['step'] = df_train_df['contact_id'].apply(lambda x: int(x.split('_')[2]))
    # train_df['step'][:2] =>    
    # 0    0
    # 1    0
    # Name: contact_id, dtype: int64 

df_train_df['vid'] = df_train_df['contact_id'].apply(lambda x: '_'.join(x.split('_')[:2]))
    # train_df['vid'][:2] =>    
    # 0    58168_003392
    # 1    58168_003392
    # Name: contact_id, dtype: object
df_train_df['nfl_player_id_1'] = df_train_df['contact_id'].apply(lambda x: int(x.split('_')[3]))
    # train_df['nfl_player_id_1'][:2] =>    
    # 0    38590
    # 1    38590
    # Name: contact_id, dtype: int64    

df_train_df['nfl_player_id_2'] = 'G'


In [14]:
df_train_df.head()

Unnamed: 0,contact_id,fold,contact,pred,frame,step,vid,nfl_player_id_1,nfl_player_id_2
0,58188_001358_0_46075_G,2,0,0.001174,291.368,0,58188_001358,46075,G
1,58188_001358_0_46466_G,2,0,0.000684,291.368,0,58188_001358,46466,G
2,58188_001358_0_45215_G,2,0,0.000324,291.368,0,58188_001358,45215,G
3,58188_001358_0_42477_G,2,0,0.001358,291.368,0,58188_001358,42477,G
4,58188_001358_0_47973_G,2,0,0.001093,291.368,0,58188_001358,47973,G


### Train

In [9]:
# https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65
def reduce_mem_usage(props):
#     start_mem_usg = props.memory_usage().sum() / 1024**2 
#     print("Memory usage of properties dataframe is :",start_mem_usg," MB")
#     NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings
            
#             # Print current column type
#             print("******************************")
#             print("Column: ",col)
#             print("dtype before: ",props[col].dtype)
            
            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()
            
#             # Integer does not support NA, therefore, NA needs to be filled
#             if not np.isfinite(props[col]).all(): 
#                 NAlist.append(col)
#                 props[col].fillna(mn-1,inplace=True)  
                   
            # test if column dtype is int.
            if 'int' in props[col].dtype.name:
                IsInt = True

            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)    
            
            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)
            
#             # Print new column type
#             print("dtype after: ",props[col].dtype)
#             print("******************************")
    
#     # Print final result
#     print("___MEMORY USAGE AFTER COMPLETION:___")
#     mem_usg = props.memory_usage().sum() / 1024**2 
#     print("Memory usage is: ",mem_usg," MB")
#     print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
#     return props, NAlist
    return props

In [10]:
def get_oof(config_name):
    dfs = []
    for i in [0,1,2]:# [0,1,2,3,4]
#         df_pred = pd.read_csv(f'../cnn/outputs/{config_name}/oof_f{i}.csv')
        df_pred = pd.read_csv(f'val_results/oof_{config_name}_f{i}.csv')        
        dfs.append(df_pred)
    df_pred = pd.concat(dfs).reset_index(drop=True)

    # df_pred.columns => 
    # Index(['path', 'fold', 'contact', 'distance', 'step', 'e_empty', 's_empty', 'pred'],  dtype='object')
    # df_pred[['path'][0] => slicing_g/58506_001241_53443_G_0294_0
    
    pred_step_dict = {}
    for i, row in df_pred.iterrows():
        idx = row['path'].split('/')[-1]
        step = int(idx.split('_')[-1])
        root = '_'.join(idx.split('_')[:-2])
        idx = f'{root}_{step}'
        pred_step_dict[idx] = row['pred']        

    # pred_step_dict =>
    # {'58506_001241_53443_G_0': 0.2598580420017242,
    #  '58506_001241_52419_G_0': 0.166795015335083,
    #  '58506_001241_53431_G_0': 0.1642752438783645}    
    # ...
        
    return pred_step_dict

In [15]:
def feature_engineering(train_df):    
    
    pred_step_dict = get_oof('r50ir_csn_c11_m1_d2_G_all')
    pred_step_dict2 = get_oof('r50ir_csn_c15_m1_d2_G_all')    
    
    
    pred_step_dict1 = {}
    for i, row in train_df.iterrows():
        c_id = row['contact_id']
        fr_id = int(row['frame'])
        pred = row['pred']
        gk, gp, st, idx1, idx2 = c_id.split('_')
        # gk, gp, st, idx1, idx2 => 58188, 001358, 0, 46075, G
        
        idx = f'{gk}_{gp}_{idx1}_{idx2}_{st}'
        # idx => 58188_001358_46075_G_0
        
        pred_step_dict1[idx] = row['pred']    
    
    results = []
    for i, row in tqdm(train_df.iterrows()):
        
        vid = row['vid']
        idx1 = row['nfl_player_id_1']
        idx2 = row['nfl_player_id_2']        
        step = row['step']
        # step => 0
        
        ## row['frame'] => 291.368
        # fr_id = int(row['frame'])
        ## f'{fr_id:04d}' => 0291
        # idx = f'{vid}_{idx1}_{idx2}_{fr_id:04d}_{step}'        
        ## idx => 58188_001358_46075_G_0291_0
        contact = row['contact']        
        
        
        item = {'contact_id':row['contact_id'], 'contact':contact, 'step':row['step'], 'frame': row['frame'], 'fold':row['fold']}
        item['nfl_player_id_1'] = row['nfl_player_id_1']
        item['nfl_player_id_2'] = row['nfl_player_id_2']
        item['prob1'] = row['pred'] # xgb_pre probability
        item['vid'] = row['vid']
        
        for j in range(-15,15):
            this_idx = f'{vid}_{idx1}_{idx2}_{step+j}'
            # this_idx => 58188_001358_46075_G_-15

            prob = 0
            weight = 0
            if this_idx in pred_step_dict:
                prob += pred_step_dict[this_idx]
                weight += 1

            if this_idx in pred_step_dict2:
                prob += pred_step_dict2[this_idx]
                weight += 1

            if weight > 0:
                item[f'prob_{j}'] = prob/weight
            else:
                item[f'prob_{j}'] = np.nan

            if this_idx in pred_step_dict1:
                item[f'prob1_{j}'] = pred_step_dict1[this_idx]
            else:
                item[f'prob1_{j}'] = np.nan
            
            # item =>
            # {'contact_id': '58188_001358_0_46075_G', 'contact': 0, 'step': 0, 'frame': 291.368, 'fold': 2, 
            #  'nfl_player_id_1': 46075, 'nfl_player_id_2': 'G', 'prob1': 0.0011736668, 'vid': '58188_001358', 
            #  'prob_-15': nan, 'prob1_-15': nan}
            
        results.append(item) 

    train_df = pd.DataFrame(results)        

    k=0
    for i, row in tqdm(train_df.iterrows()):
        vid = row['vid']
        idx = row['nfl_player_id_1']
        idx = f'{vid}_{idx}'
        # step = row['step']

        item = {}

        for j in range(-15,15):
            # the cnn_prob and pre_xgb_prob from the 20 neighboring steps.
            if j>-10 and j < 10:
                item[f'prob_{j}'] = row[f'prob_{j}']
                item[f'prob1_{j}'] = row[f'prob1_{j}']
            # the ensemble probability from the 30 neighboring steps.
            item[f'prob3_{j}'] = 0.85*row[f'prob_{j}']  + 0.15*row[f'prob1_{j}']

        if k==0: 
            feature_cols = list(item.keys()) 
            # feature_cols => ['prob3_-15', 'prob3_-14', 'prob3_-13', 'prob3_-12', 'prob3_-11', ..., 'prob3_11', 'prob3_12', 'prob3_13', 'prob3_14']
            k+=1

        item['fold'] = row['fold']
        item['contact'] = row['contact']
        item['contact_id'] = row['contact_id']
        item['frame'] = row['frame']

        results.append(item)
        
    train_df = pd.DataFrame(results)        
    
    return train_df, feature_cols

In [16]:
# shutil.copyfile(src, dst, *, follow_symlinks=True) => 
# Copy the contents (no metadata) of the file named src to a file named dst and return dst in the most efficient way possible. 
# copyfile(os.path.basename(__file__), os.path.join(cfg.EXP_MODEL, os.path.basename(__file__)))

In [17]:
df_train_df, feature_cols = feature_engineering(df_train_df)


410633it [01:40, 4094.39it/s]
410633it [06:57, 982.77it/s] 


In [18]:
# len(feature_cols)
# ## 68

In [19]:
# 58200_003925_61_42352_43388

In [20]:
def fit_xgboost(cfg, params, add_suffix=''):
    oof_pred = []
    for fold in [2,1,0,3,4]:
        if fold == -1: continue

        x_train = df_train_df[df_train_df.fold!=fold][feature_cols]
        y_train = df_train_df[df_train_df.fold!=fold]['contact']

        x_val = df_train_df[df_train_df.fold==fold]

        x_valid = x_val[feature_cols]

        y_valid = df_train_df[df_train_df.fold==fold]['contact']

        print(x_train.shape, x_valid.shape)

        xgb_train = xgb.DMatrix(x_train, label=y_train)
        xgb_valid = xgb.DMatrix(x_valid, label=y_valid)
        evals = [(xgb_train,'train'),(xgb_valid,'eval')]
        
        model = xgb.train(
            params,
            xgb_train,
            num_boost_round=10_000,
            
            # validation metric needs to improve at least once in every early_stopping_rounds round(s) to continue training.
            early_stopping_rounds=200,
            evals=evals,
            verbose_eval=100,
#             xgb_model='../input/dk-1st-data-1/kaggle_data_1/xgb_fold1_xgb_1st.model'
        )

#         model_path = os.path.join(cfg.EXP_MODEL, f'xgb_fold{fold}{add_suffix}.model')
        model_path = f'xgb_fold{fold}{add_suffix}.model'
        model.save_model(model_path)
        model = xgb.Booster()
        model.load_model(model_path)

        dvalid = xgb.DMatrix(x_valid)

        pred_i = model.predict(dvalid) 
        print(pred_i.shape)
        # print(pred_i[:10], y_valid[:10])

        x_val['pred'] = pred_i
        x_val = x_val[['contact_id', 'fold', 'contact', 'pred', 'frame']]
        oof_pred.append(x_val)

        gt = y_valid.values
        all_pos = np.sum(gt==1)

        for thres in [0.0002,0.001, 0.003, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05,0.1,0.2,0.3, 0.4, 0.5]:
            pred = 1*(pred_i > thres)
            tp = np.sum((gt==1)*(pred==1))
            pred_pos = np.sum(pred==1)

            score = matthews_corrcoef(gt, pred > thres)

            print(f'thres {thres:.4f} tp {tp} all_pos {all_pos:.4f} pred_pos {pred_pos:.4f}, score {score:.4f}')

        score = round(roc_auc_score(y_valid, pred_i), 5)
        print(f'Performance of the prediction: {score}\n')
        del model; gc.collect()

    oof_df = pd.concat(oof_pred)
#     oof_df.to_csv(f'{cfg.EXP_MODEL}/xgb_G_oof.csv', index=False)
    oof_df.to_csv(f'xgb_post_G_oof.csv', index=False)    


    gt = oof_df.contact.values
    all_pos = np.sum(gt==1)
    for thres in range(20,50):
        thres = thres*0.01
        pred = 1*(oof_df.pred.values > thres)
        tp = np.sum((gt==1)*(pred==1))
        pred_pos = np.sum(pred==1)

        score = matthews_corrcoef(gt, pred > thres)

        print(f'thres {thres:.4f} tp {tp} all_pos {all_pos:.4f} pred_pos {pred_pos:.4f}, score {score:.4f}')
    return oof_df


In [21]:
oof_pred = fit_xgboost(cfg, cfg.xgb_params, add_suffix="_xgb_2nd")

(656700, 68) (164566, 68)
[0]	train-auc:0.95069	eval-auc:0.97882
[100]	train-auc:0.96341	eval-auc:0.98800
[200]	train-auc:0.96476	eval-auc:0.98874
[300]	train-auc:0.96556	eval-auc:0.98895
[400]	train-auc:0.96674	eval-auc:0.98906
[500]	train-auc:0.96810	eval-auc:0.98888
[596]	train-auc:0.96968	eval-auc:0.98867
(164566,)
thres 0.0002 tp 5238 all_pos 5238.0000 pred_pos 164566.0000, score 0.0000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


thres 0.0010 tp 5238 all_pos 5238.0000 pred_pos 164566.0000, score 0.0000
thres 0.0030 tp 5238 all_pos 5238.0000 pred_pos 164566.0000, score 0.0000
thres 0.0050 tp 5238 all_pos 5238.0000 pred_pos 164566.0000, score 0.0000
thres 0.0100 tp 5238 all_pos 5238.0000 pred_pos 164566.0000, score 0.0000
thres 0.0200 tp 5238 all_pos 5238.0000 pred_pos 164566.0000, score 0.0000
thres 0.0300 tp 5228 all_pos 5238.0000 pred_pos 63582.0000, score 0.2278
thres 0.0400 tp 5207 all_pos 5238.0000 pred_pos 36885.0000, score 0.3348
thres 0.0500 tp 5150 all_pos 5238.0000 pred_pos 29909.0000, score 0.3768
thres 0.1000 tp 4967 all_pos 5238.0000 pred_pos 12629.0000, score 0.5937
thres 0.2000 tp 4583 all_pos 5238.0000 pred_pos 7768.0000, score 0.7077
thres 0.3000 tp 4191 all_pos 5238.0000 pred_pos 5940.0000, score 0.7427
thres 0.4000 tp 3921 all_pos 5238.0000 pred_pos 5080.0000, score 0.7524
thres 0.5000 tp 3583 all_pos 5238.0000 pred_pos 4285.0000, score 0.7492
Performance of the prediction: 0.98867

(656838, 6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


thres 0.0010 tp 7984 all_pos 7984.0000 pred_pos 128726.0000, score 0.1190
thres 0.0030 tp 7984 all_pos 7984.0000 pred_pos 70517.0000, score 0.2607
thres 0.0050 tp 7984 all_pos 7984.0000 pred_pos 56296.0000, score 0.3131
thres 0.0100 tp 7972 all_pos 7984.0000 pred_pos 41601.0000, score 0.3874
thres 0.0200 tp 7946 all_pos 7984.0000 pred_pos 31780.0000, score 0.4588
thres 0.0300 tp 7868 all_pos 7984.0000 pred_pos 25288.0000, score 0.5208
thres 0.0400 tp 7784 all_pos 7984.0000 pred_pos 20811.0000, score 0.5764
thres 0.0500 tp 7719 all_pos 7984.0000 pred_pos 18220.0000, score 0.6161
thres 0.1000 tp 7524 all_pos 7984.0000 pred_pos 13903.0000, score 0.6965
thres 0.2000 tp 7193 all_pos 7984.0000 pred_pos 11116.0000, score 0.7498
thres 0.3000 tp 6847 all_pos 7984.0000 pred_pos 9667.0000, score 0.7671
thres 0.4000 tp 6512 all_pos 7984.0000 pred_pos 8716.0000, score 0.7689
thres 0.5000 tp 6187 all_pos 7984.0000 pred_pos 7938.0000, score 0.7658
Performance of the prediction: 0.98957

(656794, 68) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


thres 0.0010 tp 5104 all_pos 5104.0000 pred_pos 164472.0000, score 0.0000
thres 0.0030 tp 5104 all_pos 5104.0000 pred_pos 164472.0000, score 0.0000
thres 0.0050 tp 5104 all_pos 5104.0000 pred_pos 164472.0000, score 0.0000
thres 0.0100 tp 5104 all_pos 5104.0000 pred_pos 108860.0000, score 0.1279
thres 0.0200 tp 4922 all_pos 5104.0000 pred_pos 31907.0000, score 0.3486
thres 0.0300 tp 4780 all_pos 5104.0000 pred_pos 24292.0000, score 0.3979
thres 0.0400 tp 4696 all_pos 5104.0000 pred_pos 19672.0000, score 0.4414
thres 0.0500 tp 4646 all_pos 5104.0000 pred_pos 16619.0000, score 0.4805
thres 0.1000 tp 4435 all_pos 5104.0000 pred_pos 10681.0000, score 0.5839
thres 0.2000 tp 4228 all_pos 5104.0000 pred_pos 7457.0000, score 0.6736
thres 0.3000 tp 4072 all_pos 5104.0000 pred_pos 6102.0000, score 0.7203
thres 0.4000 tp 3887 all_pos 5104.0000 pred_pos 5408.0000, score 0.7313
thres 0.5000 tp 3658 all_pos 5104.0000 pred_pos 4715.0000, score 0.7379
Performance of the prediction: 0.97725

(656266, 68

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


thres 0.0010 tp 9298 all_pos 9520.0000 pred_pos 52654.0000, score 0.3491
thres 0.0030 tp 9066 all_pos 9520.0000 pred_pos 39844.0000, score 0.4110
thres 0.0050 tp 8850 all_pos 9520.0000 pred_pos 33372.0000, score 0.4481
thres 0.0100 tp 8500 all_pos 9520.0000 pred_pos 26566.0000, score 0.4927
thres 0.0200 tp 7924 all_pos 9520.0000 pred_pos 20548.0000, score 0.5304
thres 0.0300 tp 7592 all_pos 9520.0000 pred_pos 18014.0000, score 0.5461
thres 0.0400 tp 7270 all_pos 9520.0000 pred_pos 16338.0000, score 0.5506
thres 0.0500 tp 7038 all_pos 9520.0000 pred_pos 15270.0000, score 0.5522
thres 0.1000 tp 6104 all_pos 9520.0000 pred_pos 12122.0000, score 0.5384
thres 0.2000 tp 4758 all_pos 9520.0000 pred_pos 8600.0000, score 0.4984
thres 0.3000 tp 4022 all_pos 9520.0000 pred_pos 6796.0000, score 0.4748
thres 0.4000 tp 3346 all_pos 9520.0000 pred_pos 5408.0000, score 0.4429
thres 0.5000 tp 2562 all_pos 9520.0000 pred_pos 3892.0000, score 0.4003
Performance of the prediction: 0.95091

(658466, 68) (1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


thres 0.0010 tp 5782 all_pos 5782.0000 pred_pos 162800.0000, score 0.0000
thres 0.0030 tp 5782 all_pos 5782.0000 pred_pos 162800.0000, score 0.0000
thres 0.0050 tp 5782 all_pos 5782.0000 pred_pos 162800.0000, score 0.0000
thres 0.0100 tp 5782 all_pos 5782.0000 pred_pos 162800.0000, score 0.0000
thres 0.0200 tp 5782 all_pos 5782.0000 pred_pos 162800.0000, score 0.0000
thres 0.0300 tp 5782 all_pos 5782.0000 pred_pos 162800.0000, score 0.0000
thres 0.0400 tp 5502 all_pos 5782.0000 pred_pos 59608.0000, score 0.2332
thres 0.0500 tp 5410 all_pos 5782.0000 pred_pos 44012.0000, score 0.2875
thres 0.1000 tp 4856 all_pos 5782.0000 pred_pos 19280.0000, score 0.4284
thres 0.2000 tp 4174 all_pos 5782.0000 pred_pos 11690.0000, score 0.4832
thres 0.3000 tp 3360 all_pos 5782.0000 pred_pos 7088.0000, score 0.5055
thres 0.4000 tp 2352 all_pos 5782.0000 pred_pos 3856.0000, score 0.4834
thres 0.5000 tp 1084 all_pos 5782.0000 pred_pos 1588.0000, score 0.3470
Performance of the prediction: 0.93449

thres 0.