In [1]:
import os
import gc


import numpy as np
import pandas as pd
import cv2


import math
import xgboost as xgb
import torch
import scipy.stats as sss


from sklearn.metrics import roc_auc_score, matthews_corrcoef
from glob import glob
from tqdm import tqdm
from shutil import copyfile

In [2]:
import random

In [3]:
def seed_everything(random_seed):
    """Seed every RNG used by the notebook and pin cuDNN to deterministic mode.

    Args:
        random_seed: Integer seed applied to Python's ``random``, NumPy,
            PyTorch (CPU and all CUDA devices) and exported as
            ``PYTHONHASHSEED``.
    """
    for cpu_seeder in (random.seed, np.random.seed, torch.manual_seed):
        cpu_seeder(random_seed)

    os.environ["PYTHONHASHSEED"] = str(random_seed)

    # The CUDA seeders are called unconditionally (no is_available() guard).
    for cuda_seeder in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        cuda_seeder(random_seed)

    # Disable the cuDNN autotuner and request deterministic kernels.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

In [4]:
class Config:
    """Static experiment configuration for the XGBoost pre-model."""

    # Experiment name; setup() uses it as the root output directory.
    NAME = "xgb_pre"

    # Seed passed to seed_everything() and number of CV folds.
    seed = 42
    num_fold = 5

    # Booster parameters handed to xgb.train(); the tree method is chosen
    # once, at class-definition time, from CUDA availability.
    xgb_params = dict(
        objective='binary:logistic',
        eval_metric='auc',
        learning_rate=0.005,
        tree_method='gpu_hist' if torch.cuda.is_available() else 'hist',
    )

In [5]:
def setup(cfg):
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # set dirs
    cfg.EXP = cfg.NAME

    cfg.EXP_MODEL = os.path.join(cfg.EXP, 'model')
    cfg.EXP_FIG = os.path.join(cfg.EXP, 'fig')
    cfg.EXP_PREDS = os.path.join(cfg.EXP, 'preds')

    # make dirs
    for d in [cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
        os.makedirs(d, exist_ok=True)
        
    return cfg


In [6]:
# Attach the device and output-directory attributes to Config and create the
# directories on disk. setup() mutates and returns the object it is given, so
# `cfg` is the Config class itself rather than an instance.
cfg = setup(Config)

In [7]:
# Seed the Python, NumPy and PyTorch RNGs with Config.seed before any other work.
seed_everything(cfg.seed)

### Pre-fold setup

In [8]:
# df_train_df = pd.read_csv('../input/dk-1st-data/kaggle_data/train_folds.csv') 
# df_train_df = df_train_df[df_train_df['nfl_player_id_2'] == 'G']
#     # train_df['contact_id'][:2] =>
#     # 0    58168_003392_0_38590_43854
#     # 1    58168_003392_0_38590_41257
#     # Name: contact_id, dtype: object
# df_train_df['step'] = df_train_df['contact_id'].apply(lambda x: int(x.split('_')[2]))
#     # train_df['step'][:2] =>    
#     # 0    0
#     # 1    0
#     # Name: contact_id, dtype: int64 

# df_train_df['vid'] = df_train_df['contact_id'].apply(lambda x: '_'.join(x.split('_')[:2]))
#     # train_df['vid'][:2] =>    
#     # 0    58168_003392
#     # 1    58168_003392
#     # Name: contact_id, dtype: object
# df_train_df['nfl_player_id_1'] = df_train_df['contact_id'].apply(lambda x: int(x.split('_')[3]))
#     # train_df['nfl_player_id_1'][:2] =>    
#     # 0    38590
#     # 1    38590
#     # Name: contact_id, dtype: int64    

# df_train_df.to_parquet("train_folds_G", compression="gzip", index=False)

### Train

In [9]:
# https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65
def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` in place to save memory.

    * NumPy integer columns are cast to the smallest unsigned (if the column
      minimum is >= 0) or signed integer dtype whose range contains the
      column's min and max.
    * NumPy floating-point columns are cast to ``float32``; float64 values
      lose precision in the process.
    * Every other column (object/strings, bool, datetime, timedelta,
      categorical and other extension dtypes) is left untouched. Casting
      those to float32 either raises or makes the column larger.

    Args:
        props: DataFrame to shrink. It is modified in place.

    Returns:
        The same ``props`` DataFrame, for call chaining.
    """
    for col in props.columns:
        dtype = props[col].dtype

        # Extension dtypes (category, nullable Int64, string, ...) are not
        # np.dtype instances and are skipped.
        if not isinstance(dtype, np.dtype):
            continue

        if dtype.kind in 'iu':
            # min()/max() of an empty column is NaN, so there is nothing to size.
            if len(props[col]) == 0:
                continue
            target = _smallest_int_dtype(int(props[col].min()), int(props[col].max()))
            if target is not None:
                props[col] = props[col].astype(target)
        elif dtype.kind == 'f':
            props[col] = props[col].astype(np.float32)

    return props


def _smallest_int_dtype(lo, hi):
    """Return the narrowest NumPy integer dtype holding ``[lo, hi]``, or None if none fits."""
    if lo >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        candidates = (np.int8, np.int16, np.int32, np.int64)
    for candidate in candidates:
        limits = np.iinfo(candidate)
        # Inclusive bounds: a column whose max is exactly 255 still fits uint8.
        if limits.min <= lo and hi <= limits.max:
            return candidate
    return None

In [10]:
def feature_engineering(train_df, trk_dict, window_size=25, chunk_size=300000, out_prefix="tmp_"):
    """Build windowed tracking features per row and spill them to parquet chunks.

    For each row of ``train_df`` the player's tracking records are looked up
    in ``trk_dict`` under the key ``f"{vid}_{nfl_player_id_1}"``. The eight
    base values (``s, dis, dir, o, a, sa, x, y``) at the row's ``step`` are
    emitted, followed by the difference ``value(step) - value(step + j)`` for
    every offset ``j`` in ``[-window_size, window_size)`` except 0.

    Missing data is encoded as 0 (not NaN):
      * unknown player key -> every feature is 0;
      * unknown neighbouring step -> that offset's eight features are 0.
    NOTE(review): if the player is known but the current ``step`` is missing,
    the base values are 0 and the offsets become ``0 - neighbour``. This
    matches the original behaviour; confirm it is intended.

    The rows (features plus ``step, fold, contact, contact_id, frame`` copied
    from ``train_df``) are passed through ``reduce_mem_usage`` and written to
    ``f"{out_prefix}{n}"`` as gzip parquet, at most ``chunk_size`` rows per
    file, so the full table is never held in memory.

    Args:
        train_df: DataFrame with columns ``vid``, ``nfl_player_id_1``,
            ``step``, ``fold``, ``contact``, ``contact_id`` and ``frame``.
        trk_dict: Mapping ``"{vid}_{player_id}" -> {step -> {base key -> value}}``.
        window_size: Half-width of the step window (default 25).
        chunk_size: Maximum number of rows per parquet file (default 300000).
        out_prefix: Path prefix of the chunk files (default ``"tmp_"``).

    Returns:
        List of feature column names in emission order. Its length is
        ``8 + 8 * (2 * window_size - 1)``, i.e. 400 for the default window.
        It is returned even when ``train_df`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    base_keys = ('s', 'dis', 'dir', 'o', 'a', 'sa', 'x', 'y')
    meta_cols = ('step', 'fold', 'contact', 'contact_id', 'frame')
    nan_val = 0
    offsets = [j for j in range(-window_size, window_size) if j != 0]

    # The feature names depend only on the window, so they are derived up
    # front instead of from the first processed row (which left the name
    # unbound for an empty train_df).
    feature_cols = list(base_keys) + [f'{key}_{j}' for j in offsets for key in base_keys]

    results = []
    chunk_idx = 0
    for _, row in tqdm(train_df.iterrows(), total=len(train_df)):
        step = row['step']
        # e.g. "58168_003392_38590"
        player_key = f"{row['vid']}_{row['nfl_player_id_1']}"

        if player_key not in trk_dict:
            item = dict.fromkeys(feature_cols, nan_val)
        else:
            player_trk = trk_dict[player_key]

            if step in player_trk:
                current = player_trk[step]
                item = {key: current[key] for key in base_keys}
            else:
                item = dict.fromkeys(base_keys, nan_val)

            for j in offsets:
                neighbour_step = step + j
                if neighbour_step in player_trk:
                    neighbour = player_trk[neighbour_step]
                    for key in base_keys:
                        item[f'{key}_{j}'] = item[key] - neighbour[key]
                else:
                    for key in base_keys:
                        item[f'{key}_{j}'] = nan_val

        # Bookkeeping columns; these are not part of feature_cols.
        for col in meta_cols:
            item[col] = row[col]

        results.append(item)

        if len(results) >= chunk_size:
            _write_feature_chunk(results, f"{out_prefix}{chunk_idx}")
            results = []
            chunk_idx += 1

    # Flush the remainder; an empty trailing chunk is not written.
    if results:
        _write_feature_chunk(results, f"{out_prefix}{chunk_idx}")

    return feature_cols


def _write_feature_chunk(rows, path):
    """Downcast a list of feature dicts and write it to ``path`` as gzip parquet."""
    chunk = reduce_mem_usage(pd.DataFrame(rows))
    chunk.to_parquet(path, compression="gzip", index=False)
    # Drop the frame before collecting so its memory can be reclaimed
    # (the original used `tmp = np.NaN`, an alias removed in NumPy 2.0).
    del chunk
    gc.collect()

In [11]:
# shutil.copyfile(src, dst, *, follow_symlinks=True) => 
# Copy the contents (no metadata) of the file named src to a file named dst and return dst in the most efficient way possible. 
# copyfile(os.path.basename(__file__), os.path.join(cfg.EXP_MODEL, os.path.basename(__file__)))

In [12]:
# df_train_df = pd.read_parquet('../input/dk-1st-data/kaggle_data/train_folds_G').reset_index(drop=True)
# np_trk_dict = np.load('../input/dk-1st-data/kaggle_data/trk_dict.npy', allow_pickle=True).item()
#     # allow_pickle=True; allow loading pickled object arrays stored in npy files.
#     # .item(); Copy an element of an array to a standard Python scalar and return it.
#     # .item() is necessary for loading dictionary type items.


In [13]:
# df_train_df.groupby(by=['nfl_player_id_1','vid'])['step'].unique()
#     # nfl_player_id_1  vid         
#     # 25511            58224_000998    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
#     #                  58224_002486    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
#     #                  58368_003163    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
#     #                  58418_000608    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
#     #                  58418_000637    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
#     #                                                        ...                        
#     # 53930            58545_000874    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
#     # 53946            58560_001856    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
#     # 53953            58573_000445    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
#     # Name: step, Length: 5279, dtype: object

In [14]:
# df_train_df.info()
# 4721617-302556

In [15]:
# np.info(np_trk_dict)
#     # dict() -> new empty dictionary
#     # dict(mapping) -> new dictionary initialized from a mapping object's
#     #     (key, value) pairs
#     # dict(iterable) -> new dictionary initialized as if via:
#     #     d = {}
#     #     for k, v in iterable:
#     #         d[k] = v
#     # dict(**kwargs) -> new dictionary initialized with the name=value pairs
#     #     in the keyword argument list.  For example:  dict(one=1, two=2)

In [16]:
# feature_cols = feature_engineering(df_train_df, np_trk_dict)


410633it [03:10, 2160.39it/s]


In [17]:
# len(feature_cols)
# ## 400

400

In [18]:
# np.save('feature_cols_G.npy',feature_cols)

In [18]:
# 58200_003925_61_42352_43388

In [19]:
# df_train_df = pd.read_parquet(f'/kaggle/working/tmp_0')
# for i in [1]:
#     df_train_df = pd.concat([df_train_df,pd.read_parquet('/kaggle/working/tmp_{}'.format(i))])
# df_train_df.to_parquet(f"df_xgb_pre_g", compression="gzip", index=False)

0         0.47
1         0.67
2        -1.10
3         0.65
4         0.50
          ... 
110628    0.56
110629    0.54
110630    1.44
110631   -4.01
110632   -3.54
Name: s_-25, Length: 410633, dtype: float32

In [20]:
def fit_xgboost(cfg, params, add_suffix=''):
    """Train one XGBoost model per fold and collect out-of-fold predictions.

    Relies on two module-level names that must be defined before the call:
    ``df_train_df`` (feature table with ``fold``, ``contact``, ``contact_id``
    and ``frame`` columns) and ``feature_cols`` (the model input columns).

    For every fold the model is trained on the other folds, saved to
    ``xgb_fold{fold}{add_suffix}.model`` in the working directory, reloaded,
    and used to predict the held-out fold. Matthews correlation at a range of
    thresholds and the ROC AUC are printed per fold; the threshold sweep is
    repeated on the pooled predictions, which are also written to
    ``xgb_G_oof.csv``.

    Args:
        cfg: Experiment configuration. Currently unused; kept for interface
            compatibility.
        params: Parameter dict forwarded to ``xgb.train``.
        add_suffix: String appended to each saved model's file stem.

    Returns:
        DataFrame of pooled out-of-fold predictions with columns
        ``contact_id, fold, contact, pred, frame``. The original returned
        ``None`` although the caller stores the result.
    """
    fold_thresholds = [0.0002, 0.001, 0.003, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
    oof_thresholds = [0.001, 0.002, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]

    oof_pred = []
    for fold in [2, 1, 0, 3, 4]:
        in_fold = df_train_df.fold == fold
        train_part = df_train_df[~in_fold]
        valid_part = df_train_df[in_fold]

        x_train, y_train = train_part[feature_cols], train_part['contact']
        x_valid, y_valid = valid_part[feature_cols], valid_part['contact']

        print(x_train.shape, x_valid.shape)

        xgb_train = xgb.DMatrix(x_train, label=y_train)
        xgb_valid = xgb.DMatrix(x_valid, label=y_valid)
        evals = [(xgb_train, 'train'), (xgb_valid, 'eval')]

        model = xgb.train(
            params,
            xgb_train,
            num_boost_round=1000,
            early_stopping_rounds=300,
            evals=evals,
            verbose_eval=100,
        )

        # Round-trip through disk so the predictions come from the exact
        # artefact that is persisted.
        model_path = f'xgb_fold{fold}{add_suffix}.model'
        model.save_model(model_path)
        model = xgb.Booster()
        model.load_model(model_path)

        pred_i = model.predict(xgb.DMatrix(x_valid))
        print(pred_i.shape)

        # assign() builds a new frame, so no column is written into a slice
        # of the global df_train_df (avoids SettingWithCopyWarning).
        fold_oof = valid_part[['contact_id', 'fold', 'contact', 'frame']].assign(pred=pred_i)
        oof_pred.append(fold_oof[['contact_id', 'fold', 'contact', 'pred', 'frame']])

        _print_threshold_scores(y_valid.values, pred_i, fold_thresholds)

        score = round(roc_auc_score(y_valid, pred_i), 5)
        print(f'Performance of the prediction: {score}\n')
        del model
        gc.collect()

    oof_df = pd.concat(oof_pred)
    oof_df.to_csv(f'xgb_G_oof.csv', index=False)

    _print_threshold_scores(oof_df.contact.values, oof_df.pred.values, oof_thresholds)

    return oof_df


def _print_threshold_scores(gt, prob, thresholds):
    """Print true positives, predicted positives and Matthews correlation for each threshold."""
    all_pos = np.sum(gt == 1)
    for thres in thresholds:
        pred = 1 * (prob > thres)
        tp = np.sum((gt == 1) * (pred == 1))
        pred_pos = np.sum(pred == 1)

        # `pred` is already binary; the original compared it with the
        # threshold a second time, which gave the same labels.
        score = matthews_corrcoef(gt, pred)

        print(f'thres {thres:.4f} tp {tp} all_pos {all_pos:.4f} pred_pos {pred_pos:.4f}, score {score:.4f}')



In [21]:
def get_oof(config_name):
    """Load the five per-fold OOF CSVs of a run and index predictions by sample key.

    Reads ``{config_name}_oof_f0.csv`` ... ``{config_name}_oof_f4.csv`` (each
    needs ``path`` and ``pred`` columns). For every row the file name of
    ``path`` is split on ``_``: the last token is parsed as the integer step,
    the second-to-last token is dropped, and the remaining tokens form the
    root. The dictionary key is ``"{root}_{step}"``.

    Args:
        config_name: Path prefix shared by the five CSV files.

    Returns:
        Dict mapping ``"{root}_{step}"`` to the row's ``pred`` value; when a
        key occurs more than once the last row wins.
    """
    fold_frames = [pd.read_csv(f'{config_name}_oof_f{fold}.csv') for fold in range(5)]
    df_pred = pd.concat(fold_frames).reset_index(drop=True)

    pred_step_dict = {}
    for path, pred in zip(df_pred['path'], df_pred['pred']):
        tokens = path.split('/')[-1].split('_')
        step = int(tokens[-1])
        root = '_'.join(tokens[:-2])
        pred_step_dict[f'{root}_{step}'] = pred

    return pred_step_dict

In [None]:
# Load the precomputed feature table and the saved feature-column names, then
# run the per-fold training. fit_xgboost() reads `df_train_df` and
# `feature_cols` as module-level names, so both must be assigned first.
# NOTE(review): these files look like the outputs of the commented-out
# preprocessing cells above (df_xgb_pre_g / feature_cols_G.npy) — confirm.
df_train_df = pd.read_parquet("../input/dk-1st-data-1/kaggle_data_1/df_xgb_pre_g").reset_index(drop=True)
# allow_pickle=True lets np.load unpickle arbitrary objects; only use it on trusted files.
feature_cols = np.load('../input/dk-1st-data-1/kaggle_data_1/feature_cols_G.npy', allow_pickle=True)#.item()
oof_pred = fit_xgboost(cfg, cfg.xgb_params, add_suffix="_xgb_1st")

(328397, 401) (82236, 401)


In [None]:
# fold2 =>
# thres 0.2000 tp 1881 all_pos 2619.0000 pred_pos 3668.0000, score 0.5918
# thres 0.3000 tp 1556 all_pos 2619.0000 pred_pos 2466.0000, score 0.5999
# thres 0.4000 tp 1245 all_pos 2619.0000 pred_pos 1719.0000, score 0.5762
# thres 0.5000 tp 926 all_pos 2619.0000 pred_pos 1173.0000, score 0.5190

# fold1 =>
# thres 0.2000 tp 2809 all_pos 3992.0000 pred_pos 4935.0000, score 0.6121
# thres 0.3000 tp 2277 all_pos 3992.0000 pred_pos 3512.0000, score 0.5895
# thres 0.4000 tp 1754 all_pos 3992.0000 pred_pos 2494.0000, score 0.5388
# thres 0.5000 tp 1354 all_pos 3992.0000 pred_pos 1819.0000, score 0.4869