In [12]:
import os
import gc


import numpy as np
import pandas as pd
import cv2


import math
import xgboost as xgb
import torch
import scipy.stats as sss


from sklearn.metrics import roc_auc_score, matthews_corrcoef
from glob import glob
from tqdm import tqdm
from shutil import copyfile

In [13]:
import random

In [14]:
def seed_everything(random_seed):
    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)
    os.environ["PYTHONHASHSEED"] = str(random_seed)
#     if torch.cuda.is_available():
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)
    # torch.backends.cudnn.enabled = False
    torch.backends.cudnn.benchmark = False        
    torch.backends.cudnn.deterministic = True

In [15]:
class Config:
    NAME = "xgb_post_not_g"

    seed = 42
    num_fold = 5
    
    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'learning_rate':0.005,
        'tree_method':'hist' if not torch.cuda.is_available() else 'gpu_hist'
    }

In [16]:
def setup(cfg):
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # set dirs
    cfg.EXP = cfg.NAME

    cfg.EXP_MODEL = os.path.join(cfg.EXP, 'model')
    cfg.EXP_FIG = os.path.join(cfg.EXP, 'fig')
    cfg.EXP_PREDS = os.path.join(cfg.EXP, 'preds')

    # make dirs
    for d in [cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
        os.makedirs(d, exist_ok=True)
        
    return cfg


In [17]:
cfg = setup(Config)

In [18]:
seed_everything(cfg.seed)

### Pre folds setup

In [19]:
df_train_df = pd.read_csv('./xgb_pre_not_g/model/xgb_not_G_oof.csv') 
    # train_df['contact_id'][:2] =>
    # 0    58168_003392_0_38590_43854
    # 1    58168_003392_0_38590_41257
    # Name: contact_id, dtype: object
df_train_df['step'] = df_train_df['contact_id'].apply(lambda x: int(x.split('_')[2]))
    # train_df['step'][:2] =>    
    # 0    0
    # 1    0
    # Name: contact_id, dtype: int64 

df_train_df['vid'] = df_train_df['contact_id'].apply(lambda x: '_'.join(x.split('_')[:2]))
    # train_df['vid'][:2] =>    
    # 0    58168_003392
    # 1    58168_003392
    # Name: contact_id, dtype: object
    
print(df_train_df.shape)

(711383, 9)


In [20]:
df_train_df.head()

Unnamed: 0,contact_id,fold,contact,pred,frame,nfl_player_id_1,nfl_player_id_2,step,vid
0,58168_003392_0_38590_41944,0,0,3.1e-05,294.665,38590,41944,0,58168_003392
1,58168_003392_0_38590_47944,0,0,8.7e-05,294.665,38590,47944,0,58168_003392
2,58168_003392_0_38590_44822,0,0,8.3e-05,294.665,38590,44822,0,58168_003392
3,58168_003392_0_38590_39947,0,0,0.000234,294.665,38590,39947,0,58168_003392
4,58168_003392_0_38590_45695,0,0,1e-05,294.665,38590,45695,0,58168_003392


### Train

In [21]:
# https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65
def reduce_mem_usage(props):
#     start_mem_usg = props.memory_usage().sum() / 1024**2 
#     print("Memory usage of properties dataframe is :",start_mem_usg," MB")
#     NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings
            
#             # Print current column type
#             print("******************************")
#             print("Column: ",col)
#             print("dtype before: ",props[col].dtype)
            
            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()
            
#             # Integer does not support NA, therefore, NA needs to be filled
#             if not np.isfinite(props[col]).all(): 
#                 NAlist.append(col)
#                 props[col].fillna(mn-1,inplace=True)  
                   
            # test if column dtype is int.
            if 'int' in props[col].dtype.name:
                IsInt = True

            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)    
            
            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)
            
#             # Print new column type
#             print("dtype after: ",props[col].dtype)
#             print("******************************")
    
#     # Print final result
#     print("___MEMORY USAGE AFTER COMPLETION:___")
#     mem_usg = props.memory_usage().sum() / 1024**2 
#     print("Memory usage is: ",mem_usg," MB")
#     print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
#     return props, NAlist
    return props

In [22]:
def get_oof(config_name):
    dfs = []
    for i in [0,2,3,4]:#[0,1,2,3,4]
#         df_pred = pd.read_csv(f'../cnn/outputs/{config_name}/oof_f{i}.csv')
        df_pred = pd.read_csv(f'val_results/oof_{config_name}_f{i}.csv')        
        dfs.append(df_pred)
    df_pred = pd.concat(dfs).reset_index(drop=True)

    pred_step_dict = {}
    for i, row in df_pred.iterrows():
        idx = row['path'].split('/')[-1]
        step = int(idx.split('_')[-1])
        root = '_'.join(idx.split('_')[:-2])
        idx = f'{root}_{step}'
        pred_step_dict[idx] = row['pred']

    return pred_step_dict

In [23]:
def get_meta(path):
    xgb_df = pd.read_csv(path)
    xgb_dict = {}
    for i, row in xgb_df.iterrows():
        c_id = row['contact_id']
        fr_id = int(row['frame'])
        pred = row['pred']
        gk, gp, st, idx1, idx2 = c_id.split('_')
        idx = f'{gk}_{gp}_{idx1}_{idx2}_{st}'
        xgb_dict[idx] = pred

    return xgb_dict

In [27]:
def feature_engineering(train_df):    
    
    pred_step_dict1 = get_oof('r50ir_csn_c15_m1_d2_all')    
    xgb_dict = get_meta('./xgb_pre_not_g/model/xgb_not_G_oof.csv')
           
    
    results = []
    for i, row in tqdm(train_df.iterrows()):
        idx1 = row['nfl_player_id_1']
        idx2 = row['nfl_player_id_2']
        fr_id = int(row['frame'])
        vid = row['vid']
        idx = f'{vid}_{idx1}_{idx2}_{fr_id:04d}'
        step = row['step']
        idx = f'{idx}_{step}'

        item = {'contact_id':row['contact_id'], 'contact':row['contact'], 'step':row['step'], 'frame': row['frame'], 'fold':row['fold']}
        item['nfl_player_id_1'] = row['nfl_player_id_1']
        item['nfl_player_id_2'] = row['nfl_player_id_2']
        item['vid'] = row['vid']

        for j in range(-10,10):
            this_idx = f'{vid}_{idx1}_{idx2}_{step+j}'
            prob = 0
            weight = 0

            if this_idx in pred_step_dict1:
                prob += pred_step_dict1[this_idx]
                weight += 1

            if weight > 0:
                item[f'prob_{j}'] = prob/weight
            else:
                item[f'prob_{j}'] = np.nan

            if this_idx in xgb_dict:
                item[f'prob1_{j}'] = xgb_dict[this_idx]
            else:
                item[f'prob1_{j}'] = np.nan

        results.append(item) 

    train_df = pd.DataFrame(results)        

    results = []
    nan_val = np.nan
    # window_size = 2
    k=0
    for i, row in tqdm(train_df.iterrows()):
        vid = row['vid']
        idx = row['nfl_player_id_1']
        idx = f'{vid}_{idx}'

        idx2 = row['nfl_player_id_2']
        idx2 = f'{vid}_{idx2}'

        step = row['step']


        item = {}

        for j in range(-10,10):
            # the ensemble probability from the 20 neighboring steps.
            item[f'prob4_{j}'] = 0.8*row[f'prob_{j}'] + 0.2*row[f'prob1_{j}']
            
        if k==0: feature_cols = list(item.keys())
        k+=1
                
        # feature_cols =>
        # ['prob4_-10', 'prob4_-9', 'prob4_-8', 'prob4_-7', 'prob4_-6', 'prob4_-5', 'prob4_-4', 'prob4_-3', 'prob4_-2', 'prob4_-1', 'prob4_0', 'prob4_1', 'prob4_2', 'prob4_3', 'prob4_4', 'prob4_5', 'prob4_6', 'prob4_7', 'prob4_8', 'prob4_9']

        item['fold'] = row['fold']
        item['contact'] = row['contact']
        item['contact_id'] = row['contact_id']
        item['frame'] = row['frame']
        item['nfl_player_id_1'] = row['nfl_player_id_1']
        item['nfl_player_id_2'] = row['nfl_player_id_2']

        results.append(item)

    train_df = pd.DataFrame(results)

    return train_df, feature_cols 

In [25]:
# shutil.copyfile(src, dst, *, follow_symlinks=True) => 
# Copy the contents (no metadata) of the file named src to a file named dst and return dst in the most efficient way possible. 
# copyfile(os.path.basename(__file__), os.path.join(cfg.EXP_MODEL, os.path.basename(__file__)))

In [28]:
df_train_df, feature_cols = feature_engineering(df_train_df)


711383it [01:00, 11742.46it/s]
711383it [01:49, 6503.79it/s]


In [30]:
# len(feature_cols)
# ## 400

In [31]:
# 58200_003925_61_42352_43388

In [32]:
def fit_xgboost(cfg, params, add_suffix=''):
    oof_pred = []
    for fold in [2,1,0,3,4]:
        if fold == -1: continue

        x_train = df_train_df[df_train_df.fold!=fold][feature_cols]
        y_train = df_train_df[df_train_df.fold!=fold]['contact']

        x_val = df_train_df[df_train_df.fold==fold]

        x_valid = x_val[feature_cols]

        y_valid = df_train_df[df_train_df.fold==fold]['contact']

        print(x_train.shape, x_valid.shape)

        xgb_train = xgb.DMatrix(x_train, label=y_train)
        xgb_valid = xgb.DMatrix(x_valid, label=y_valid)
        evals = [(xgb_train,'train'),(xgb_valid,'eval')]
        
        model = xgb.train(
            params,
            xgb_train,
            num_boost_round=10_000,
            early_stopping_rounds=300,
            evals=evals,
            verbose_eval=100,
#             xgb_model='../input/dk-1st-data-1/kaggle_data_1/xgb_fold1_xgb_1st.model'
        )

#         model_path = os.path.join(cfg.EXP_MODEL, f'xgb_fold{fold}{add_suffix}.model')
        model_path = f'xgb_not_fold{fold}{add_suffix}.model'
        model.save_model(model_path)
        model = xgb.Booster()
        model.load_model(model_path)

        dvalid = xgb.DMatrix(x_valid)

        pred_i = model.predict(dvalid) 
        print(pred_i.shape)
        # print(pred_i[:10], y_valid[:10])

        x_val['pred'] = pred_i
        #x_val = x_val[['contact_id', 'fold', 'contact', 'pred', 'frame']]
        x_val = x_val[['contact_id', 'fold', 'contact', 'pred', 'frame', 'nfl_player_id_1', 'nfl_player_id_2']]
        oof_pred.append(x_val)

        gt = y_valid.values
        all_pos = np.sum(gt==1)

        for thres in [0.0002,0.001, 0.003, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05,0.1,0.2,0.3, 0.4, 0.5]:
            pred = 1*(pred_i > thres)
            tp = np.sum((gt==1)*(pred==1))
            pred_pos = np.sum(pred==1)

            score = matthews_corrcoef(gt, pred > thres)

            print(f'thres {thres:.4f} tp {tp} all_pos {all_pos:.4f} pred_pos {pred_pos:.4f}, score {score:.4f}')

        score = round(roc_auc_score(y_valid, pred_i), 5)
        print(f'Performance of the prediction: {score}\n')
        del model; gc.collect()

    oof_df = pd.concat(oof_pred)
#     oof_df.to_csv(f'{cfg.EXP_MODEL}/xgb_G_oof.csv', index=False)
    oof_df.to_csv(f'xgb_post_not_G_oof.csv', index=False)    


    gt = oof_df.contact.values
    all_pos = np.sum(gt==1)
    for thres in range(20,50):
        thres = thres*0.01
        pred = 1*(oof_df.pred.values > thres)
        tp = np.sum((gt==1)*(pred==1))
        pred_pos = np.sum(pred==1)

        score = matthews_corrcoef(gt, pred > thres)

        print(f'thres {thres:.4f} tp {tp} all_pos {all_pos:.4f} pred_pos {pred_pos:.4f}, score {score:.4f}')
    return oof_df


In [33]:
oof_pred = fit_xgboost(cfg, cfg.xgb_params, add_suffix="_xgb_2nd")

(580583, 20) (130800, 20)
[0]	train-auc:0.83587	eval-auc:0.98281
[100]	train-auc:0.84681	eval-auc:0.98341
[200]	train-auc:0.84944	eval-auc:0.98375
[300]	train-auc:0.85238	eval-auc:0.98422
[400]	train-auc:0.85371	eval-auc:0.98418
[500]	train-auc:0.85529	eval-auc:0.98414
[540]	train-auc:0.85582	eval-auc:0.98421
(130800,)
thres 0.0002 tp 8746 all_pos 8746.0000 pred_pos 130800.0000, score 0.0000
thres 0.0010 tp 8746 all_pos 8746.0000 pred_pos 130800.0000, score 0.0000
thres 0.0030 tp 8746 all_pos 8746.0000 pred_pos 130800.0000, score 0.0000
thres 0.0050 tp 8746 all_pos 8746.0000 pred_pos 130800.0000, score 0.0000
thres 0.0100 tp 8746 all_pos 8746.0000 pred_pos 130800.0000, score 0.0000
thres 0.0200 tp 8746 all_pos 8746.0000 pred_pos 130800.0000, score 0.0000
thres 0.0300 tp 8746 all_pos 8746.0000 pred_pos 130800.0000, score 0.0000
thres 0.0400 tp 8744 all_pos 8746.0000 pred_pos 118519.0000, score 0.0860
thres 0.0500 tp 8730 all_pos 8746.0000 pred_pos 111266.0000, score 0.1108
thres 0.1000 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


thres 0.2000 tp 8511 all_pos 8746.0000 pred_pos 19716.0000, score 0.6153
thres 0.3000 tp 8339 all_pos 8746.0000 pred_pos 16445.0000, score 0.6683
thres 0.4000 tp 8135 all_pos 8746.0000 pred_pos 14265.0000, score 0.7051
thres 0.5000 tp 7844 all_pos 8746.0000 pred_pos 12412.0000, score 0.7325
Performance of the prediction: 0.98421

(550415, 20) (160968, 20)
[0]	train-auc:0.96560	eval-auc:0.50000
[100]	train-auc:0.96741	eval-auc:0.50000
[200]	train-auc:0.96779	eval-auc:0.50000
[299]	train-auc:0.96824	eval-auc:0.50000
(160968,)
thres 0.0002 tp 9984 all_pos 9984.0000 pred_pos 160968.0000, score 0.0000
thres 0.0010 tp 9984 all_pos 9984.0000 pred_pos 160968.0000, score 0.0000
thres 0.0030 tp 9984 all_pos 9984.0000 pred_pos 160968.0000, score 0.0000
thres 0.0050 tp 9984 all_pos 9984.0000 pred_pos 160968.0000, score 0.0000
thres 0.0100 tp 9984 all_pos 9984.0000 pred_pos 160968.0000, score 0.0000
thres 0.0200 tp 9984 all_pos 9984.0000 pred_pos 160968.0000, score 0.0000
thres 0.0300 tp 9984 all_p

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


thres 0.0500 tp 9984 all_pos 9984.0000 pred_pos 160968.0000, score 0.0000
thres 0.1000 tp 9984 all_pos 9984.0000 pred_pos 160968.0000, score 0.0000
thres 0.2000 tp 0 all_pos 9984.0000 pred_pos 0.0000, score 0.0000
thres 0.3000 tp 0 all_pos 9984.0000 pred_pos 0.0000, score 0.0000
thres 0.4000 tp 0 all_pos 9984.0000 pred_pos 0.0000, score 0.0000
thres 0.5000 tp 0 all_pos 9984.0000 pred_pos 0.0000, score 0.0000
Performance of the prediction: 0.5

(574956, 20) (136427, 20)
[0]	train-auc:0.83000	eval-auc:0.94915
[100]	train-auc:0.84126	eval-auc:0.95743
[200]	train-auc:0.84795	eval-auc:0.95600
[300]	train-auc:0.85213	eval-auc:0.95712
[400]	train-auc:0.85350	eval-auc:0.95714
[404]	train-auc:0.85351	eval-auc:0.95708
(136427,)
thres 0.0002 tp 9501 all_pos 9501.0000 pred_pos 136427.0000, score 0.0000
thres 0.0010 tp 9501 all_pos 9501.0000 pred_pos 136427.0000, score 0.0000
thres 0.0030 tp 9501 all_pos 9501.0000 pred_pos 136427.0000, score 0.0000
thres 0.0050 tp 9501 all_pos 9501.0000 pred_pos 13

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


thres 0.1000 tp 8840 all_pos 9501.0000 pred_pos 20702.0000, score 0.5938
thres 0.2000 tp 8414 all_pos 9501.0000 pred_pos 15141.0000, score 0.6747
thres 0.3000 tp 7910 all_pos 9501.0000 pred_pos 11925.0000, score 0.7218
thres 0.4000 tp 7324 all_pos 9501.0000 pred_pos 9874.0000, score 0.7375
thres 0.5000 tp 6673 all_pos 9501.0000 pred_pos 8347.0000, score 0.7319
Performance of the prediction: 0.95708

(549981, 20) (161402, 20)
[0]	train-auc:0.83556	eval-auc:0.91175
[100]	train-auc:0.84499	eval-auc:0.92279
[200]	train-auc:0.84839	eval-auc:0.92670
[300]	train-auc:0.84943	eval-auc:0.92911
[400]	train-auc:0.85206	eval-auc:0.92982
[500]	train-auc:0.85347	eval-auc:0.93404
[600]	train-auc:0.85452	eval-auc:0.93465
[700]	train-auc:0.85550	eval-auc:0.93539
[800]	train-auc:0.85667	eval-auc:0.93604
[900]	train-auc:0.85722	eval-auc:0.93666
[1000]	train-auc:0.85789	eval-auc:0.93695
[1100]	train-auc:0.85832	eval-auc:0.93714
[1200]	train-auc:0.85895	eval-auc:0.93712
[1300]	train-auc:0.85940	eval-auc:0.9

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


thres 0.0400 tp 9980 all_pos 11053.0000 pred_pos 21550.0000, score 0.6133
thres 0.0500 tp 9850 all_pos 11053.0000 pred_pos 20108.0000, score 0.6294
thres 0.1000 tp 9313 all_pos 11053.0000 pred_pos 15995.0000, score 0.6747
thres 0.2000 tp 8306 all_pos 11053.0000 pred_pos 11848.0000, score 0.7049
thres 0.3000 tp 7444 all_pos 11053.0000 pred_pos 9642.0000, score 0.7021
thres 0.4000 tp 6597 all_pos 11053.0000 pred_pos 8059.0000, score 0.6808
thres 0.5000 tp 5806 all_pos 11053.0000 pred_pos 6801.0000, score 0.6521
Performance of the prediction: 0.93661

(589597, 20) (121786, 20)
[0]	train-auc:0.84310	eval-auc:0.94716
[100]	train-auc:0.85174	eval-auc:0.95595
[200]	train-auc:0.85330	eval-auc:0.95502
[300]	train-auc:0.85687	eval-auc:0.95696
[400]	train-auc:0.85802	eval-auc:0.95885
[500]	train-auc:0.85920	eval-auc:0.95859
[600]	train-auc:0.86042	eval-auc:0.95853
[700]	train-auc:0.86144	eval-auc:0.95910
[800]	train-auc:0.86243	eval-auc:0.95886
[900]	train-auc:0.86316	eval-auc:0.95910
[1000]	trai

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


thres 0.2000 tp 6350 all_pos 8383.0000 pred_pos 8764.0000, score 0.7212
thres 0.3000 tp 5757 all_pos 8383.0000 pred_pos 7261.0000, score 0.7201
thres 0.4000 tp 5114 all_pos 8383.0000 pred_pos 6057.0000, score 0.7007
thres 0.5000 tp 4454 all_pos 8383.0000 pred_pos 4995.0000, score 0.6722
Performance of the prediction: 0.95896

thres 0.2000 tp 31581 all_pos 47667.0000 pred_pos 55469.0000, score 0.5843
thres 0.2100 tp 31356 all_pos 47667.0000 pred_pos 54218.0000, score 0.5874
thres 0.2200 tp 31140 all_pos 47667.0000 pred_pos 52990.0000, score 0.5908
thres 0.2300 tp 30925 all_pos 47667.0000 pred_pos 51786.0000, score 0.5941
thres 0.2400 tp 30704 all_pos 47667.0000 pred_pos 50666.0000, score 0.5970
thres 0.2500 tp 30479 all_pos 47667.0000 pred_pos 49658.0000, score 0.5991
thres 0.2600 tp 30268 all_pos 47667.0000 pred_pos 48652.0000, score 0.6016
thres 0.2700 tp 30070 all_pos 47667.0000 pred_pos 47757.0000, score 0.6037
thres 0.2800 tp 29858 all_pos 47667.0000 pred_pos 46895.0000, score 0.60