In [25]:
import os
import gc


import numpy as np
import pandas as pd
import cv2


import math
import xgboost as xgb
import torch
import scipy.stats as sss


from sklearn.metrics import roc_auc_score, matthews_corrcoef
from glob import glob
from tqdm import tqdm
from shutil import copyfile

In [26]:
import random

In [27]:
def seed_everything(random_seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    with ``random_seed`` and switches cuDNN to deterministic, non-benchmarking
    mode.

    Args:
        random_seed: Integer seed applied to all generators.
    """
    # Hash randomisation is fixed at interpreter start-up, so this value only
    # influences Python processes launched after this call.
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        # The CUDA seeders are called unconditionally (the availability check
        # was commented out in the original code).
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(random_seed)

    # Disable the cuDNN autotuner and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [28]:
class Config:
    # Experiment name; setup() uses it as the root of the output directories.
    NAME = "xgb_pre_not_g"

    # Seed handed to seed_everything() and number of cross-validation folds.
    seed = 42
    num_fold = 5

    # Booster parameters passed to xgb.train(); the GPU histogram method is
    # selected only when CUDA is available at class-definition time.
    xgb_params = dict(
        objective='binary:logistic',
        eval_metric='auc',
        learning_rate=0.005,
        tree_method='gpu_hist' if torch.cuda.is_available() else 'hist',
    )

In [29]:
def setup(cfg):
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # set dirs
    cfg.EXP = cfg.NAME

    cfg.EXP_MODEL = os.path.join(cfg.EXP, 'model')
    cfg.EXP_FIG = os.path.join(cfg.EXP, 'fig')
    cfg.EXP_PREDS = os.path.join(cfg.EXP, 'preds')

    # make dirs
    for d in [cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
        os.makedirs(d, exist_ok=True)
        
    return cfg


In [30]:
# The Config class itself (not an instance) is passed in: setup() attaches the
# device and output paths as class attributes, creates the directories and
# returns the same object.
cfg = setup(Config)

In [31]:
# Seed all random number generators once, before any data loading or training.
seed_everything(cfg.seed)

### Pre folds setup

In [32]:
# df_train_df = pd.read_csv('../input/dk-1st-data/kaggle_data/train_folds.csv') 
# df_train_df = df_train_df[df_train_df['nfl_player_id_2'] != 'G']
#     # train_df['contact_id'][:2] =>
#     # 0    58168_003392_0_38590_43854
#     # 1    58168_003392_0_38590_41257
#     # Name: contact_id, dtype: object
# df_train_df['step'] = df_train_df['contact_id'].apply(lambda x: int(x.split('_')[2]))
#     # train_df['step'][:2] =>    
#     # 0    0
#     # 1    0
#     # Name: contact_id, dtype: int64 
# df_train_df['vid'] = df_train_df['contact_id'].apply(lambda x: '_'.join(x.split('_')[:2]))
#     # train_df['vid'][:2] =>    
#     # 0    58168_003392
#     # 1    58168_003392
#     # Name: contact_id, dtype: object
# df_train_df = df_train_df[df_train_df['distance']<4.0]

# df_train_df['game_play'] = df_train_df['vid']

# df_train_df.to_parquet("train_folds_not_G", compression="gzip", index=False)

### Train

In [33]:
# https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65
def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` to smaller dtypes.

    * Integer columns are cast to the narrowest unsigned type (when the
      minimum is >= 0) or signed type that can hold the observed min/max.
    * Floating-point columns are cast to ``float32``.
    * Every other column (object/strings, bool, datetime, timedelta, complex
      and pandas extension dtypes such as category or nullable integers) is
      left untouched. Casting those to ``float32`` either raises or makes the
      column larger, which defeats the purpose of this function.

    The frame is modified in place and also returned for convenience.

    Args:
        props: ``pandas.DataFrame`` to shrink.

    Returns:
        The same ``props`` object with downcast columns.
    """
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    signed_types = (np.int8, np.int16, np.int32, np.int64)

    for col in props.columns:
        dtype = props[col].dtype

        # Extension dtypes are not plain NumPy dtypes; skip them entirely.
        if not isinstance(dtype, np.dtype):
            continue

        if dtype.kind in 'iu':
            lo = props[col].min()
            hi = props[col].max()
            # An empty column has no min/max (NaN); nothing to size against.
            if pd.isna(lo) or pd.isna(hi):
                continue
            # Plain Python ints avoid mixed signed/unsigned NumPy comparisons.
            lo, hi = int(lo), int(hi)

            candidates = unsigned_types if lo >= 0 else signed_types
            for candidate in candidates:
                bounds = np.iinfo(candidate)
                # Inclusive bounds: e.g. a max of exactly 255 still fits uint8.
                if bounds.min <= lo and hi <= bounds.max:
                    props[col] = props[col].astype(candidate)
                    break

        elif dtype.kind == 'f':
            props[col] = props[col].astype(np.float32)

    return props

In [34]:
def extract_feat(idx, trk_dict, step, nan_val=0, window_size=10):
    """Return the tracking fields of player key ``idx`` at ``step``.

    Args:
        idx: Key into ``trk_dict``.
        trk_dict: Nested mapping ``idx -> step -> record``; the record must
            contain the keys 's', 'dis', 'dir', 'o', 'a', 'sa', 'x', 'y', 't'.
        step: Step to look up under ``trk_dict[idx]``.
        nan_val: Filler used for every field when ``idx`` or ``step`` is absent.
        window_size: Accepted for interface compatibility; not used here.

    Returns:
        A new dict with the nine fields above, in that order.
    """
    field_names = ('s', 'dis', 'dir', 'o', 'a', 'sa', 'x', 'y', 't')

    # No record for this player/step: every field gets the filler value.
    if idx not in trk_dict or step not in trk_dict[idx]:
        return {name: nan_val for name in field_names}

    record = trk_dict[idx][step]
    return {name: record[name] for name in field_names}

In [35]:
def calc_dist(idx1, idx2, trk_dict, step, nan_val=0):
    """Compare two tracked players at a single step.

    Args:
        idx1, idx2: Keys into ``trk_dict`` for the two players.
        trk_dict: Nested mapping ``idx -> step -> record`` whose records hold
            'x', 'y', 'a' and 'sa'.
        step: Step at which both records are read.
        nan_val: Value returned in all three slots when either player, or the
            step for either player, is missing.

    Returns:
        Tuple ``(dist, a_dif, sa_dif)``: Euclidean distance between the two
        (x, y) positions and the player-1-minus-player-2 differences of 'a'
        and 'sa'.
    """
    fallback = (nan_val, nan_val, nan_val)

    if idx1 not in trk_dict or idx2 not in trk_dict:
        return fallback

    track_a, track_b = trk_dict[idx1], trk_dict[idx2]
    if step not in track_a or step not in track_b:
        return fallback

    rec_a, rec_b = track_a[step], track_b[step]
    dx = rec_a['x'] - rec_b['x']
    dy = rec_a['y'] - rec_b['y']
    dist = math.sqrt(dx*dx + dy*dy)

    a_dif = rec_a['a'] - rec_b['a']
    sa_dif = rec_a['sa'] - rec_b['sa']
    return dist, a_dif, sa_dif


In [36]:
def _write_feature_chunk(rows, chunk_no):
    """Write one batch of feature rows to the gzip parquet file ``tmp_<chunk_no>``."""
    chunk_df = reduce_mem_usage(pd.DataFrame(rows))
    chunk_df.to_parquet("tmp_{}".format(chunk_no), compression="gzip", index=False)


def feature_engineering(train_df, trk_dict, det_dict, chunk_size=300000):
    """Build per-pair features row by row and spill them to parquet chunks.

    For every row of ``train_df`` (one player pair at one step) this computes:

    1. Instantaneous tracking features of both players and their differences
       (via ``extract_feat``), plus sin/cos encodings of the angles 'o' and
       'dir' (treated as degrees), and a same-team flag 't' (0 if equal,
       1 otherwise).
    2. Distance / 'a' / 'sa' differences for the 20 following and the 20
       preceding steps (via ``calc_dist``; NaN where tracking is missing).
    3. Per camera view ('Sideline', 'Endzone'): helmet-centre distances on
       every second frame in a +/-30 frame window and the mean helmet-box
       area of each player (NaN where either helmet is not detected).

    Finished rows are buffered and written to ``tmp_0``, ``tmp_1``, ... in the
    current working directory, ``chunk_size`` rows per file, to bound memory.
    (The earlier counter-based logic wrote one row fewer into every chunk
    after the first; chunks are now uniformly ``chunk_size`` rows.)

    Args:
        train_df: DataFrame with columns 'vid', 'nfl_player_id_1',
            'nfl_player_id_2' (both convertible with ``int``), 'step',
            'distance', 'frame', 'fold', 'contact', 'contact_id'.
        trk_dict: ``"{vid}_{player_id}" -> step -> record`` mapping consumed
            by ``extract_feat`` / ``calc_dist``.
        det_dict: ``"{vid}_{view}" -> frame -> int player id -> {'box': (x, y, w, h)}``.
        chunk_size: Number of rows per parquet chunk (default 300000).

    Returns:
        List of model feature column names, taken from the first row
        (bookkeeping columns such as 'fold' / 'contact' are excluded).
        Empty list when ``train_df`` has no rows.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    nan_val = 0        # filler for instantaneous features without tracking data
    window_size = 2    # forwarded to extract_feat, which does not use it
    n_steps = 20       # temporal look-ahead / look-back horizon in steps

    feature_cols = []
    results = []
    chunk_no = 0

    for _, row in tqdm(train_df.iterrows(), total=len(train_df)):
        vid = row['vid']
        step = row['step']
        key1 = f"{vid}_{row['nfl_player_id_1']}"
        key2 = f"{vid}_{row['nfl_player_id_2']}"

        item1 = extract_feat(key1, trk_dict, step, nan_val=nan_val, window_size=window_size)
        item2 = extract_feat(key2, trk_dict, step, nan_val=nan_val, window_size=window_size)

        # --- 1. instantaneous pair features ---------------------------------
        item = {}
        for k, val in item1.items():
            val2 = item2[k]

            if k == 't':
                # Same-team flag: 0 when both values match, 1 otherwise.
                item[k] = 0 if val == val2 else 1
                continue

            item[k] = val
            item[f'{k}_2'] = val2
            item[f'{k}_dif'] = val - val2

            if k in ('o', 'dir'):
                # Angles are converted from degrees to radians before sin/cos.
                # (A preceding block that applied sin/cos to the raw degree
                # values was dead code: every key it wrote was overwritten here.)
                item[f'{k}_s'] = math.sin(math.pi*val/180)
                item[f'{k}_c'] = math.cos(math.pi*val/180)
                item[f'{k}_s2'] = math.sin(math.pi*val2/180)
                item[f'{k}_c2'] = math.cos(math.pi*val2/180)
                item[f'{k}_sd'] = math.sin(math.pi*(val - val2)/180)
                item[f'{k}_cd'] = math.cos(math.pi*(val - val2)/180)

        item['distance'] = row['distance']
        item['step'] = row['step']

        # --- 2. temporal features: following steps --------------------------
        # NOTE: for j == 2 the keys 'a_2' / 'sa_2' already exist (player 2's
        # 'a' / 'sa' written by the pair loop above) and are overwritten here
        # with the step+3 differences. The collision is kept on purpose:
        # renaming would change the feature names/count that previously saved
        # feature lists and models were built against.
        for j in range(n_steps):
            dist, a_dif, sa_dif = calc_dist(key1, key2, trk_dict, step+1+j, nan_val=np.nan)
            item[f'dist_{j}'] = dist
            item[f'a_{j}'] = a_dif
            item[f'sa_{j}'] = sa_dif

        # --- 2b. temporal features: preceding steps -------------------------
        for j in range(n_steps):
            dist, a_dif, sa_dif = calc_dist(key1, key2, trk_dict, step-1-j, nan_val=np.nan)
            item[f'dist_p{j}'] = dist
            item[f'a_p{j}'] = a_dif
            item[f'sa_p{j}'] = sa_dif

        # --- 3. helmet-detection features per camera view -------------------
        pid1 = int(row['nfl_player_id_1'])
        pid2 = int(row['nfl_player_id_2'])
        # NOTE(review): fixed +6 frame offset; presumably aligns the labelled
        # frame with the video detections, confirm against the data prep.
        frame = int(row['frame'])+6

        for view in ['Sideline', 'Endzone']:
            v_vid = vid + '_' + view
            # A view without any detections yields NaN features instead of a KeyError.
            view_dets = det_dict[v_vid] if v_vid in det_dict else {}
            area1_list = []
            area2_list = []

            for ff in range(-30, 30, 2):
                fr = frame + ff
                if fr in view_dets and pid1 in view_dets[fr] and pid2 in view_dets[fr]:
                    x1, y1, w1, h1 = view_dets[fr][pid1]['box']
                    x2, y2, w2, h2 = view_dets[fr][pid2]['box']

                    # Distance between the centres of the two helmet boxes.
                    cx1 = x1 + w1/2
                    cy1 = y1 + h1/2
                    cx2 = x2 + w2/2
                    cy2 = y2 + h2/2
                    item[f'{view}_{ff}_dist'] = math.sqrt(
                        (cx1 - cx2)*(cx1 - cx2) + (cy1 - cy2)*(cy1 - cy2)
                    )
                    area1_list.append(w1*h1)
                    area2_list.append(w2*h2)
                else:
                    item[f'{view}_{ff}_dist'] = np.nan

            if len(area2_list) > 0:
                item[f'{view}_area1'] = np.mean(area1_list)
                item[f'{view}_area2'] = np.mean(area2_list)
            else:
                item[f'{view}_area1'] = np.nan
                item[f'{view}_area2'] = np.nan

        # Everything written so far is a model feature; capture the names once,
        # before the bookkeeping columns below are added.
        if not feature_cols:
            feature_cols = list(item.keys())

        item['fold'] = row['fold']
        item['contact'] = row['contact']
        item['contact_id'] = row['contact_id']
        item['frame'] = row['frame']
        item['nfl_player_id_1'] = row['nfl_player_id_1']
        item['nfl_player_id_2'] = row['nfl_player_id_2']

        results.append(item)

        if len(results) >= chunk_size:
            _write_feature_chunk(results, chunk_no)
            # Release the buffered rows before building the next chunk.
            results = []
            gc.collect()
            chunk_no += 1

    # Flush the remainder; nothing is written when there are no pending rows.
    if results:
        _write_feature_chunk(results, chunk_no)
        results = []
        gc.collect()

    return feature_cols

In [37]:
# shutil.copyfile(src, dst, *, follow_symlinks=True) => 
# Copy the contents (no metadata) of the file named src to a file named dst and return dst in the most efficient way possible. 
# copyfile(os.path.basename(__file__), os.path.join(cfg.EXP_MODEL, os.path.basename(__file__)))

In [38]:
# df_train_df = pd.read_parquet('../input/dk-1st-data/kaggle_data/train_folds_not_G').reset_index(drop=True)
# np_trk_dict = np.load('../input/dk-1st-data/kaggle_data/trk_dict.npy', allow_pickle=True).item()
# #     # allow_pickle=True; allow loading pickled object arrays stored in npy files.
# #     # .item(); Copy an element of an array to a standard Python scalar and return it.
# #     # .item() is necessary for loading dictionary type items.
# np_det_dict = np.load('../input/dk-1st-data/kaggle_data/det_dict.npy', allow_pickle=True).item()

In [39]:
# df_train_df.info()
# 4721617-302556

In [40]:
# np.info(np_trk_dict)
#     # dict() -> new empty dictionary
#     # dict(mapping) -> new dictionary initialized from a mapping object's
#     #     (key, value) pairs
#     # dict(iterable) -> new dictionary initialized as if via:
#     #     d = {}
#     #     for k, v in iterable:
#     #         d[k] = v
#     # dict(**kwargs) -> new dictionary initialized with the name=value pairs
#     #     in the keyword argument list.  For example:  dict(one=1, two=2)

In [41]:
# feature_cols = feature_engineering(df_train_df, np_trk_dict, np_det_dict)


In [42]:
# len(feature_cols)
##221

In [43]:
# np.save('feature_cols_not_g.npy',feature_cols)

In [44]:
# 58200_003925_61_42352_43388

In [45]:
# df_train_df = pd.read_parquet(f'/kaggle/working/tmp_0')
# for i in [1,2]:
#     df_train_df = pd.concat([df_train_df,pd.read_parquet('/kaggle/working/tmp_{}'.format(i))])
# df_train_df.to_parquet(f"df_xgb_pre_not_g", compression="gzip", index=False)

In [46]:
def _print_threshold_report(gt, probs, thresholds):
    """Print true positives, predicted positives and MCC for each probability threshold."""
    all_pos = np.sum(gt==1)
    for thres in thresholds:
        pred = 1*(probs > thres)
        tp = np.sum((gt==1)*(pred==1))
        pred_pos = np.sum(pred==1)

        # `pred` is already binarised; score it directly instead of
        # thresholding the 0/1 labels a second time.
        score = matthews_corrcoef(gt, pred)

        print(f'thres {thres:.4f} tp {tp} all_pos {all_pos:.4f} pred_pos {pred_pos:.4f}, score {score:.4f}')


def fit_xgboost(cfg, params, add_suffix=''):
    """Train one XGBoost model per fold and collect out-of-fold predictions.

    Relies on two module-level names that must exist before the call:
    ``df_train_df`` (feature frame with 'fold', 'contact', 'contact_id',
    'frame', 'nfl_player_id_1', 'nfl_player_id_2' columns) and
    ``feature_cols`` (names of the model input columns).

    Side effects:
        * saves each booster to ``xgb_not_fold{fold}{add_suffix}.model``;
        * writes the pooled out-of-fold predictions to ``xgb_not_G_oof.csv``;
        * prints shapes, per-threshold MCC reports and the per-fold ROC AUC.

    Args:
        cfg: Config object; ``cfg.num_fold`` gives the number of folds
            (falls back to 5 when the attribute is absent).
        params: Parameter dict passed to ``xgb.train``.
        add_suffix: Suffix appended to the saved model file names.

    Returns:
        DataFrame with the out-of-fold predictions of all folds (columns
        'contact_id', 'fold', 'contact', 'pred', 'frame', 'nfl_player_id_1',
        'nfl_player_id_2'). Earlier versions implicitly returned ``None``
        although the caller assigns the result.
    """
    oof_cols = ['contact_id', 'fold', 'contact', 'pred', 'frame', 'nfl_player_id_1', 'nfl_player_id_2']
    id_cols = [c for c in oof_cols if c != 'pred']
    fold_thresholds = [0.0002, 0.001, 0.003, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

    oof_pred = []
    for fold in range(getattr(cfg, 'num_fold', 5)):
        in_fold = df_train_df.fold == fold
        train_part = df_train_df[~in_fold]
        valid_part = df_train_df[in_fold]

        x_train = train_part[feature_cols]
        y_train = train_part['contact']
        x_valid = valid_part[feature_cols]
        y_valid = valid_part['contact']

        print(x_train.shape, x_valid.shape)

        xgb_train = xgb.DMatrix(x_train, label=y_train)
        xgb_valid = xgb.DMatrix(x_valid, label=y_valid)
        evals = [(xgb_train, 'train'), (xgb_valid, 'eval')]

        model = xgb.train(
            params,
            xgb_train,
            num_boost_round=10_000,
            early_stopping_rounds=300,
            evals=evals,
            verbose_eval=100,
        )

        # Round-trip through disk so the predictions below come from exactly
        # the artefact that was saved.
        model_path = f'xgb_not_fold{fold}{add_suffix}.model'
        model.save_model(model_path)
        model = xgb.Booster()
        model.load_model(model_path)

        # NOTE(review): depending on the XGBoost version, predict() may use all
        # boosted rounds rather than the best early-stopped iteration; confirm.
        pred_i = model.predict(xgb.DMatrix(x_valid))
        print(pred_i.shape)

        # Build the OOF frame from a fresh object instead of assigning a new
        # column onto a filtered slice (avoids SettingWithCopy problems).
        oof_pred.append(valid_part[id_cols].assign(pred=pred_i)[oof_cols])

        _print_threshold_report(y_valid.values, pred_i, fold_thresholds)

        score = round(roc_auc_score(y_valid, pred_i), 5)
        print(f'Performance of the prediction: {score}\n')
        del model; gc.collect()

    oof_df = pd.concat(oof_pred)
    oof_df.to_csv('xgb_not_G_oof.csv', index=False)

    # Finer threshold sweep (0.20 .. 0.59) on the pooled out-of-fold predictions.
    _print_threshold_report(
        oof_df.contact.values,
        oof_df.pred.values,
        [step*0.01 for step in range(20, 60)],
    )

    return oof_df



In [47]:
def get_oof(config_name):
    """Load the five per-fold OOF CSVs of a run and index predictions by sample key.

    Reads ``{config_name}_oof_f0.csv`` ... ``{config_name}_oof_f4.csv``; each
    file must provide 'path' and 'pred' columns.

    The key is derived from the last '/'-separated component of 'path': it is
    split on '_', the final token is parsed as an integer step, the last two
    tokens are dropped and the step is appended again.

    Args:
        config_name: Prefix of the OOF CSV file names.

    Returns:
        Dict mapping the derived key to the 'pred' value of that row (later
        rows win on duplicate keys).
    """
    fold_frames = [
        pd.read_csv(f'{config_name}_oof_f{fold_no}.csv')
        for fold_no in [0, 1, 2, 3, 4]
    ]
    df_pred = pd.concat(fold_frames).reset_index(drop=True)

    pred_step_dict = {}
    for _, rec in df_pred.iterrows():
        tokens = rec['path'].split('/')[-1].split('_')
        step = int(tokens[-1])
        root = '_'.join(tokens[:-2])
        pred_step_dict[f'{root}_{step}'] = rec['pred']

    return pred_step_dict

In [None]:
# Load the pre-computed feature frame and the matching list of feature column
# names. fit_xgboost() reads both `df_train_df` and `feature_cols` as
# module-level globals, so these exact names must be defined before the call.
df_train_df = pd.read_parquet("../input/dk-1st-data-1/kaggle_data_1/df_xgb_pre_not_g").reset_index(drop=True)
# allow_pickle=True is needed to load an object array; only use it on trusted files.
feature_cols = np.load('../input/dk-1st-data-1/kaggle_data_1/feature_cols_not_g.npy', allow_pickle=True)#.item()
# Run the cross-validated training; model files are suffixed with "_xgb_1st".
oof_pred = fit_xgboost(cfg, cfg.xgb_params, add_suffix="_xgb_1st")

(574956, 221) (136427, 221)
[0]	train-auc:0.97445	eval-auc:0.97620


In [None]:
# fold0 =>
# thres 0.3000 tp 8029 all_pos 9501.0000 pred_pos 11339.0000, score 0.7552
# thres 0.4000 tp 7524 all_pos 9501.0000 pred_pos 9941.0000, score 0.7569
# thres 0.5000 tp 6902 all_pos 9501.0000 pred_pos 8660.0000, score 0.7439

# fold1 =>
# thres 0.3000 tp 8239 all_pos 9984.0000 pred_pos 12388.0000, score 0.7219
# thres 0.4000 tp 7715 all_pos 9984.0000 pred_pos 10807.0000, score 0.7250
# thres 0.5000 tp 7169 all_pos 9984.0000 pred_pos 9540.0000, score 0.7174

# fold2 =>
# thres 0.3000 tp 7445 all_pos 8746.0000 pred_pos 10491.0000, score 0.7599
# thres 0.4000 tp 6997 all_pos 8746.0000 pred_pos 9259.0000, score 0.7611
# thres 0.5000 tp 6493 all_pos 8746.0000 pred_pos 8176.0000, score 0.7518

# fold3 =>
# thres 0.2000 tp 9606 all_pos 11053.0000 pred_pos 15470.0000, score 0.7122
# thres 0.3000 tp 9006 all_pos 11053.0000 pred_pos 13202.0000, score 0.7252
# thres 0.4000 tp 8342 all_pos 11053.0000 pred_pos 11425.0000, score 0.7231
# thres 0.5000 tp 7665 all_pos 11053.0000 pred_pos 9951.0000, score 0.7122

# fold4 =>
# thres 0.2000 tp 7292 all_pos 8383.0000 pred_pos 10926.0000, score 0.7422
# thres 0.3000 tp 6819 all_pos 8383.0000 pred_pos 9460.0000, score 0.7474
# thres 0.4000 tp 6357 all_pos 8383.0000 pred_pos 8340.0000, score 0.7426
# thres 0.5000 tp 5897 all_pos 8383.0000 pred_pos 7346.0000, score 0.7345