In [1]:
import warnings
# warnings.simplefilter('ignore')

import pandas as pd
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
import gc,os,random
import time,datetime
from tqdm import tqdm

from utils import *
# root = args.root
# seed = args.seed
# remark = args.remark
# save_dir = args.save_dir

In [2]:
# import graphviz
# from torchview import draw_graph
# graphviz.set_jupyter_format('png')

In [3]:
from sklearn.model_selection import StratifiedKFold

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset,TensorDataset, DataLoader,RandomSampler
from typing import Optional, Callable, List
import math

## Config

In [5]:
# Directory prefix used by the f'{root}/...' parquet/csv paths in this notebook.
root='.'
# Free-text note; stored as nn_config['remark'] and written to experiment_log.csv.
remark=''
# Sub-directory of output_root that NN_train_and_predict writes into when no run_id is passed.
save_dir=''
# Stored as nn_config['seed'] and used as the StratifiedKFold random_state.
seed=42
# Column holding the customer identifier.
id_name = 'customer_ID'
# Column holding the binary training label.
label_name = 'target'
# Worker-process count handed to every DataLoader in NN_train_and_predict.
num_workers = 11
# Selects the mixed-precision branch of the training loop in NN_train_and_predict.
use_amp = False
# When False, NN_train_and_predict skips the epoch loop and only loads existing fold checkpoints.
do_train = True

# NOTE(review): not referenced by any cell visible in this notebook - confirm whether it is still needed.
eps = 1e-3

In [6]:
# Comma-separated GPU ids exposed to this process through the CUDA_VISIBLE_DEVICES
# environment variable. The variable is read when CUDA is initialised, so this cell
# has to run before the first CUDA call.
CUDA_VISIBLE_DEVICES = "0"
os.environ["CUDA_VISIBLE_DEVICES"]=CUDA_VISIBLE_DEVICES

In [7]:
# Indices of the CUDA devices torch can see; NN_train_and_predict wraps the model in
# nn.DataParallel only when this list has more than one entry.
gpus = list(range(torch.cuda.device_count()))
print('available gpus:',gpus)


available gpus: [0]


##  Utils

In [8]:
# https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65
def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` in place and return it.

    Adapted from
    https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65

    * Integer columns (numpy ``int*`` / ``uint*`` dtypes) are cast to the
      narrowest integer type whose range contains the column's min and max:
      unsigned when the minimum is >= 0, signed otherwise. The bounds are
      inclusive, so a column with maximum 255 still fits ``uint8``.
    * Every other numeric column (floats, bools, ...) is cast to ``float32``.
    * Non-numeric columns (object/strings, datetimes, categoricals) are left
      untouched instead of being pushed through ``astype(np.float32)``.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``props`` object, for call chaining.
    """
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    signed_types = (np.int8, np.int16, np.int32, np.int64)

    for col in props.columns:
        dtype = props[col].dtype
        if not pd.api.types.is_numeric_dtype(dtype):
            # Strings, datetimes, categoricals: nothing sensible to downcast.
            continue

        if 'int' not in dtype.name:
            # Floats (and any other non-integer numeric dtype) become 32 bit.
            props[col] = props[col].astype(np.float32)
            continue

        mn = props[col].min()
        mx = props[col].max()
        candidates = unsigned_types if mn >= 0 else signed_types
        for candidate in candidates:
            limits = np.iinfo(candidate)
            if limits.min <= mn and mx <= limits.max:
                props[col] = props[col].astype(candidate)
                break
        # An empty column yields NaN for min/max; every comparison is then False
        # and the column keeps its dtype.

    return props

In [None]:
def one_hot_encoding(df,cols,is_drop=True):
    """Append one-hot indicator columns for ``cols`` to ``df``.

    For every column ``c`` in ``cols`` the indicators produced by
    ``pd.get_dummies`` are appended (in the order of ``cols``) with the
    prefix ``oneHot_<c>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols : iterable of str
        Names of the columns to encode. Any iterable is accepted, including
        one-shot iterators.
    is_drop : bool, default True
        Drop the original ``cols`` from the result.

    Returns
    -------
    pandas.DataFrame
        A new frame with the indicator columns appended.
    """
    cols = list(cols)  # iterated twice below, so materialise one-shot iterables
    dummies = [pd.get_dummies(df[col], prefix='oneHot_%s' % col) for col in cols]
    # One concat for all columns: concatenating inside the loop would copy the
    # whole (large) frame once per encoded column.
    encoded = pd.concat([df] + dummies, axis=1)
    if is_drop:
        encoded = encoded.drop(cols, axis=1)
    return encoded
# Categorical columns that are expanded into one-hot indicator columns.
cat_features = [
    "B_30", "B_38", "D_114", "D_116", "D_117", "D_120",
    "D_126", "D_63", "D_64", "D_66", "D_68",
]

# Load the combined train/test table, discard 'S_2' and one-hot encode the categoricals.
df = one_hot_encoding(
    pd.read_parquet(f'{root}/train_test_data').drop(['S_2'], axis=1),
    cat_features,
    True,
)

# Divide every non-id column by 100, then replace missing values with 0 in all columns.
for col in tqdm(df.columns):
    if col not in ('customer_ID', 'S_2'):
        df[col] /= 100
    df[col] = df[col].fillna(0)

# Shrink dtypes and persist the table consumed by the preprocessing cells.
df = reduce_mem_usage(df)
df.to_parquet(f'{root}/nn_train_test_data', compression='gzip', index=False)

## Preprocessing

In [5]:
# loading and saving series features (polars library).
# Split the per-statement table into train/test parts by whether the customer
# appears in the label file, sort each part by customer_ID and save it.
# Sorting uses the default ascending order; the explicit `reverse=False`
# keyword is omitted because newer polars releases name that parameter
# `descending`, so passing `reverse` ties the cell to old versions.
df = pl.read_parquet(f'{root}/nn_train_test_data')
train_y = pl.read_csv(f'{root}/train_labels.csv').sort(by="customer_ID")

# Expression that is true for rows whose customer has a training label.
in_train = pl.col("customer_ID").is_in(train_y["customer_ID"])

df.filter(~in_train).sort(by="customer_ID").write_parquet(
    f'{root}/nn_series_test_feature', compression='gzip')
df.filter(in_train).sort(by="customer_ID").write_parquet(
    f'{root}/nn_series_train_feature', compression='gzip')
train_y.write_parquet(f'{root}/train_labels', compression='gzip')

In [8]:
# loading and saving manual features (polars library).
# Split the per-customer feature table into test/train parts by whether the
# customer appears in the saved labels, sort each part by customer_ID and save it.
# Sorting uses the default ascending order; the explicit `reverse=False`
# keyword is omitted because newer polars releases name that parameter
# `descending`.
train_y = pl.read_parquet(f'{root}/train_labels')

# Expression that is true for rows whose customer has a training label.
in_train = pl.col("customer_ID").is_in(train_y["customer_ID"])

df = pl.read_parquet(f'{root}/nn_all_feature')
df = df.filter(~in_train)
df = df.sort(by="customer_ID")
df.write_parquet(f'{root}/nn_manual_test_feature', compression='gzip')

# The source file is read a second time so the full table and a filtered copy
# do not have to stay in memory together.
df = pl.read_parquet(f'{root}/nn_all_feature')
df = df.filter(in_train)
df = df.sort(by="customer_ID")
df.write_parquet(f'{root}/nn_manual_train_feature', compression='gzip')

## Model

In [8]:
class Amodel(nn.Module):
    """Bidirectional GRU over the padded series, optionally fused with an MLP over flat features.

    Parameters
    ----------
    series_dim : int
        Size of the last dimension of ``data['batch_series']``.
    feature_dim : int
        Size of the last dimension of ``data['batch_feature']``.
    target_num : int
        Number of outputs per sample.
    hidden_num : int
        The feature branch gets ``hidden_num - 1`` hidden
        (Linear, BatchNorm1d, Dropout, LeakyReLU) groups after its input block.
    hidden_dim : int
        Width of every hidden layer and of each GRU direction.
    drop_rate : float, default 0.5
        Dropout probability in the feature branch.
    use_series_oof : bool, default False
        When True the feature branch is evaluated and concatenated with the
        GRU summary; when False only the GRU summary feeds the output block.

    The module attribute names and the layer creation order are part of the
    checkpoint format (``state_dict`` keys) and must not be changed.
    """

    def __init__(self, series_dim, feature_dim, target_num, hidden_num, hidden_dim, drop_rate=0.5, use_series_oof=False):
        super().__init__()
        self.use_series_oof = use_series_oof

        self.input_series_block = nn.Sequential(
            nn.Linear(series_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
        )
        self.input_feature_block = nn.Sequential(
            nn.Linear(feature_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.LeakyReLU(),
        )
        self.gru_series = nn.GRU(hidden_dim, hidden_dim, batch_first=True, bidirectional=True)

        # Collect the layers in a local list so the attribute is only ever a Module.
        hidden_layers = []
        for _ in range(hidden_num - 1):
            hidden_layers.extend([
                nn.Linear(hidden_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.Dropout(drop_rate),
                nn.LeakyReLU(),
            ])
        self.hidden_feature_block = nn.Sequential(*hidden_layers)

        # GRU summary is 2*hidden_dim (two directions); the feature branch adds hidden_dim.
        fused_dim = 3 * hidden_dim if use_series_oof else 2 * hidden_dim
        self.output_block = nn.Sequential(
            nn.Linear(fused_dim, 1 * hidden_dim),
            nn.LeakyReLU(),
            nn.Linear(1 * hidden_dim, 1 * hidden_dim),
            nn.LeakyReLU(),
            nn.Linear(1 * hidden_dim, target_num),
            nn.Sigmoid(),
        )

    def batch_gru(self, series, mask):
        """Run the GRU on variable-length sequences and return each sample's last valid output.

        Parameters
        ----------
        series : torch.Tensor, (batch, steps, hidden_dim)
            Padded, already projected sequences.
        mask : torch.Tensor, (batch, steps)
            Its row sums give the number of valid leading steps per sample.

        Returns
        -------
        torch.Tensor, (batch, 2 * hidden_dim)
            GRU output (both directions) at step ``length - 1`` of every sample,
            in the original batch order.
        """
        # pack_padded_sequence wants the lengths on the CPU.
        lengths = mask.sum(dim=-1).detach().cpu().to(torch.int64)
        packed = nn.utils.rnn.pack_padded_sequence(series, lengths, batch_first=True, enforce_sorted=False)
        message, _ = self.gru_series(packed)

        # Unpack to (batch, max_len, 2*hidden_dim) in the original sample order and
        # pick, for every sample, the row of its last valid step with one gather
        # instead of walking the packed layout step by step in Python.
        padded, _ = nn.utils.rnn.pad_packed_sequence(message, batch_first=True)
        rows = torch.arange(padded.shape[0], device=padded.device)
        last_step = (lengths - 1).to(padded.device)
        return padded[rows, last_step]

    def forward(self, data):
        """Compute sigmoid outputs for one collated batch.

        ``data`` is a dict with ``'batch_series'`` (batch, steps, series_dim) and
        ``'batch_mask'`` (batch, steps); ``'batch_feature'`` (batch, feature_dim)
        is read only when ``use_series_oof`` is True.
        Returns a tensor of shape (batch, target_num).
        """
        x1 = self.input_series_block(data['batch_series'])
        x1 = self.batch_gru(x1, data['batch_mask'])

        if not self.use_series_oof:
            return self.output_block(x1)

        x2 = self.input_feature_block(data['batch_feature'])
        x2 = self.hidden_feature_block(x2)
        return self.output_block(torch.cat([x1, x2], dim=1))

In [9]:
# train_dataset = TaskDataset(df,f,[series_idx.values[i] for i in range(2)],y)
# train_dataloader = DataLoader(train_dataset,batch_size=2,shuffle=True, drop_last=True, collate_fn=train_dataset.collate_fn,num_workers=11)
# m = Amodel(223,(6372+13)*2,1,3,128,use_series_oof=True)

In [10]:
# for data in train_dataloader:
#     draw_graph(m, input_data = [data], expand_nested=True, save_graph=True, device='cpu').visual_graph
#     m.forward(data)    
#     break

In [9]:
class TaskDataset:
    """Map-style dataset yielding one customer's series rows and flat features.

    ``uidxs`` holds one ``(first_row, last_row, feature_row)`` triple per
    sample: the inclusive positional row range inside ``df_series`` and the
    index label used to look the customer up in ``df_feature`` / ``df_y``.
    The first column of ``df_series`` and of ``df_feature`` is skipped.
    """

    def __init__(self,df_series,df_feature,uidxs,df_y=None):
        self.uidxs = uidxs
        self.df_y = df_y
        self.df_feature = df_feature
        self.df_series = df_series

    def __len__(self):
        return len(self.uidxs)

    def __getitem__(self, index):
        """Return a dict with 'SERIES', 'FEATURE' and, when labels were given, 'LABEL'."""
        first_row, last_row, feature_row = self.uidxs[index]

        # All rows of this customer, without the leading id column.
        series = self.df_series.iloc[first_row:last_row+1,1:].values
        if series.ndim == 1:
            # Promote a flat vector to a single-row matrix.
            series = series.reshape((-1,)+series.shape[-1:])

        feature = self.df_feature.loc[feature_row].values[1:]
        # Companion vector: every non-zero entry x becomes 1 - x + 0.001, zeros stay zero.
        mirrored = feature.copy()
        nonzero = mirrored != 0
        mirrored[nonzero] = 1.0 - mirrored[nonzero] + 0.001

        sample = {
            'SERIES': series,
            'FEATURE': np.concatenate([feature,mirrored]),
        }
        if self.df_y is not None:
            sample['LABEL'] = self.df_y.loc[feature_row,[label_name]].values
        return sample

    def collate_fn(self, batch):
        """
        Stack samples into tensors, zero-padding every series to 13 steps.

        Returns a dict with 'batch_series' (batch, 13, series width),
        'batch_mask' (batch, 13; 1.0 on the real steps), 'batch_feature'
        (batch, feature width) and 'batch_y' (batch, 1; left at zero when the
        dataset has no labels).
        """
        n_items = len(batch)
        series_width = batch[0]['SERIES'].shape[1]
        feature_width = batch[0]['FEATURE'].shape[0]
        has_labels = self.df_y is not None

        batch_series = torch.zeros((n_items, 13, series_width))
        batch_mask = torch.zeros((n_items, 13))
        batch_feature = torch.zeros((n_items, feature_width))
        batch_y = torch.zeros((n_items, 1))

        for row, sample in enumerate(batch):
            steps = sample['SERIES'].astype(np.float32)
            n_steps = steps.shape[0]
            batch_series[row, :n_steps, :] = torch.tensor(steps)
            batch_mask[row, :n_steps] = 1.0
            batch_feature[row] = torch.tensor(sample['FEATURE'].astype(np.float32))
            if has_labels:
                batch_y[row] = torch.tensor(sample['LABEL'].astype(np.float32))

        return {
            'batch_series': batch_series,
            'batch_mask': batch_mask,
            'batch_feature': batch_feature,
            'batch_y': batch_y,
        }

In [10]:
def Metric(labels,preds):
    """Score ``preds`` against ``labels`` with ``amex_metric_mod`` and return the result."""
    score = amex_metric_mod(labels, preds)
    return score

def amex_metric_mod(y_true, y_pred):
    """Return 0.5 * (normalized weighted Gini + top-4% capture rate).

    Rows with label 0 carry weight 20, all other rows weight 1.

    * Capture rate: rank rows by descending prediction, keep the leading rows
      whose cumulative weight is <= int(4% of the total weight), and divide
      the label sum of those rows by the label sum of all rows.
    * Normalized Gini: the weighted Gini obtained when ranking by prediction
      divided by the one obtained when ranking by the true label.

    ``y_true`` and ``y_pred`` are equal-length 1-D sequences of labels and scores.
    """
    # Column 0: true label, column 1: prediction.
    pairs = np.transpose(np.array([y_true, y_pred]))

    def ranked_by(column):
        # Rows ordered by descending value of the given column.
        return pairs[pairs[:, column].argsort()[::-1]]

    by_pred = ranked_by(1)
    pred_weights = np.where(by_pred[:, 0] == 0, 20, 1)
    weight_budget = int(0.04 * np.sum(pred_weights))
    captured = by_pred[np.cumsum(pred_weights) <= weight_budget]
    top_four = np.sum(captured[:, 0]) / np.sum(by_pred[:, 0])

    # gini[1]: ranked by prediction, gini[0]: ranked by true label (the best achievable).
    gini = [0, 0]
    for column in (1, 0):
        ordered = ranked_by(column)
        row_weights = np.where(ordered[:, 0] == 0, 20, 1)
        weight_random = np.cumsum(row_weights / np.sum(row_weights))
        weighted_pos = ordered[:, 0] * row_weights
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        gini[column] = np.sum((lorentz - weight_random) * row_weights)

    return 0.5 * (gini[1] / gini[0] + top_four)

def Write_log(logFile,text,isPrint=False):
    """Write ``text`` followed by a newline to ``logFile``; echo it to stdout when ``isPrint`` is true.

    ``logFile`` is any object with a ``write`` method. Returns None.
    """
    if isPrint:
        print(text)
    for piece in (text, '\n'):
        logFile.write(piece)

In [11]:
class SchedulerBase(object):
    """Base class for epoch-wise learning-rate schedules.

    Subclasses implement :meth:`schedule`, which returns the optimizer and the
    learning rate for an epoch; :meth:`step` writes that rate into every
    parameter group of the optimizer. The remaining methods expose boolean
    flags kept on the instance.
    """

    def __init__(self):
        self._is_load_best_weight = True
        self._is_load_best_optim = True
        self._is_freeze_bn=False
        self._is_adjust_lr = True
        self._lr = 0.01
        self._cur_optimizer = None

    def schedule(self, net, epoch, epochs, **kwargs):
        """Return ``(optimizer, lr)`` for ``epoch``; must be overridden."""
        # NotImplementedError is still an Exception, so existing handlers keep working.
        raise NotImplementedError('Did not implemented')

    def step(self, net, epoch, epochs):
        """Apply the scheduled learning rate and return it once per parameter group."""
        optimizer, lr = self.schedule(net, epoch, epochs)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        return [param_group['lr'] for param_group in optimizer.param_groups]

    def is_load_best_weight(self):
        return self._is_load_best_weight

    def is_load_best_optim(self):
        return self._is_load_best_optim

    def is_freeze_bn(self):
        return self._is_freeze_bn

    def reset(self):
        """Restore the three load/freeze flags to their constructor defaults."""
        self._is_load_best_weight = True
        # Reset the attribute that is_load_best_optim() actually reads; assigning
        # a differently spelled name would leave this flag unchanged.
        self._is_load_best_optim = True
        self._is_freeze_bn = False

    def is_adjust_lr(self):
        return self._is_adjust_lr

In [12]:
class Adam12(SchedulerBase):
    """Step schedule around a single, lazily created Adam optimizer.

    Learning rate by epoch: 100e-5 up to epoch 4, 100e-6 for epochs 5-8 and
    100e-7 afterwards.
    """

    def __init__(self, params_list=None):
        super().__init__()
        self.params_list=params_list
        self._cur_optimizer = None
        self._lr = 100e-6

    def schedule(self, net, epoch, epochs, **kwargs):
        """Return ``(optimizer, lr)`` for ``epoch``; the optimizer is built from ``net`` on first use."""
        if epoch > 8:
            stage_lr = 100e-7
        elif epoch > 4:
            stage_lr = 100e-6
        else:
            stage_lr = 100e-5
        self._lr = stage_lr

        # The optimizer is created only once, with the rate of the first call.
        if self._cur_optimizer is None:
            self._cur_optimizer = optim.Adam(net.parameters(), lr=stage_lr)
        return self._cur_optimizer, self._lr

In [13]:
def _nn_predict(models, dataloader, device, logit=False):
    """Return the predictions of ``models`` over ``dataloader`` as a flat numpy array.

    Every batch is moved to ``device``, evaluated by each model without
    gradients, passed through a sigmoid when ``logit`` is true, and averaged
    across the models. The caller is responsible for putting the models in
    eval mode.
    """
    batches = []
    with torch.no_grad():
        for data in tqdm(dataloader):
            for k in data:
                data[k] = data[k].to(device)
            if logit:
                outputs = torch.stack([m(data).sigmoid() for m in models],0).mean(0)
            else:
                outputs = torch.stack([m(data) for m in models],0).mean(0)
            batches.append(outputs.detach().cpu().numpy())
    return np.concatenate(batches).reshape(-1)


def NN_train_and_predict(train, test, model_class, config, use_series_oof, logit=False, output_root='./output/', run_id=None):
    """Cross-validated training and/or test-time inference for ``model_class``.

    Parameters
    ----------
    train : tuple or None
        ``(train_series, train_feature, train_y, train_series_idx)``. When
        given, one model per stratified fold is trained (or, if the global
        ``do_train`` is False, only reloaded from its checkpoint), evaluated
        on its validation split, and the out-of-fold predictions are written
        to ``oof.csv``. ``train_y`` is indexed with the positional fold
        indices through ``.loc``, so it is expected to carry a default
        0..n-1 index.
    test : tuple or None
        ``(test_series, test_feature, test_series_idx)``. When given, every
        ``fold<k>.ckpt`` found in the output directory is loaded and their
        averaged predictions are written to ``submission.csv.zip``.
    model_class : callable
        Called as ``model_class(*model_args, use_series_oof=use_series_oof)``.
    config : dict
        Required keys: ``obj_max``, ``epochs``, ``patience``, ``batch_size``,
        ``folds``, ``seed`` and (when training) ``remark``. Optional keys:
        ``clipnorm`` (max gradient norm in the mixed-precision branch,
        default 1.0) and ``model_args`` (positional model arguments, default
        ``(223, (6372+13)*2, 1, 3, 128)``).
    use_series_oof : bool
        Forwarded to ``model_class``.
    logit : bool
        Apply a sigmoid to the model outputs at inference time.
    output_root : str
        Parent directory (with trailing slash) for outputs and ``experiment_log.csv``.
    run_id : str or None
        Output sub-directory. When empty, a timestamped id is generated for
        the experiment log and files are written to ``output_root + save_dir``.

    Returns
    -------
    None
        All results are written to disk.

    Notes
    -----
    Reads the notebook globals ``save_dir``, ``id_name``, ``label_name``,
    ``num_workers``, ``use_amp``, ``do_train`` and ``gpus``.
    """
    if not run_id:
        run_id = 'run_nn_' + datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
        while os.path.exists(output_root+run_id+'/'):
            time.sleep(1)
            run_id = 'run_nn_' + datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
        output_path = output_root + f'{save_dir}/'
    else:
        output_path = output_root + run_id + '/'
    # makedirs also creates a missing output_root and tolerates an existing directory.
    os.makedirs(output_path, exist_ok=True)

    obj_max = config['obj_max']
    epochs = config['epochs']
    patience = config['patience']
    batch_size = config['batch_size']
    folds = config['folds']
    seed = config['seed']
    clipnorm = config.get('clipnorm', 1.0)
    # (series_dim, feature_dim, target_num, hidden_num, hidden_dim); defined once
    # instead of being repeated at every model construction.
    model_args = config.get('model_args', (223, (6372+13)*2, 1, 3, 128))

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def build_model(state_dict=None):
        # Fresh model on `device`, optionally restored from a checkpoint and
        # wrapped for multi-GPU execution.
        model = model_class(*model_args, use_series_oof=use_series_oof)
        model.to(device)
        if state_dict is not None:
            model.load_state_dict(state_dict)
        if len(gpus) > 1:
            model = nn.DataParallel(model, device_ids=gpus, output_device=gpus[0])
        return model

    log = None

    if train is not None:
        train_series,train_feature,train_y,train_series_idx = train

        # Independent copy so the new columns are not written through a slice of train_y.
        oof = train_y[[id_name]].copy()
        oof['fold'] = -1
        oof[label_name] = 0.0
        oof[label_name] = oof[label_name].astype(np.float32)

        log = open(output_path + 'train.log','w',buffering=1)
        log.write(str(config)+'\n')

        all_valid_metric = []
        train_folds = []

        skf = StratifiedKFold(n_splits = folds, shuffle=True, random_state=seed)

        for fold, (trn_index, val_index) in enumerate(skf.split(train_y,train_y[label_name])):
            ckpt_path = output_path + 'fold%s.ckpt'%fold

            train_dataset = TaskDataset(train_series,train_feature,[train_series_idx[i] for i in trn_index],train_y)
            train_dataloader = DataLoader(train_dataset,batch_size=batch_size,shuffle=True, drop_last=True, collate_fn=train_dataset.collate_fn,num_workers=num_workers)
            valid_dataset = TaskDataset(train_series,train_feature,[train_series_idx[i] for i in val_index],train_y)
            valid_dataloader = DataLoader(valid_dataset,batch_size=batch_size,shuffle=False, drop_last=False, collate_fn=valid_dataset.collate_fn,num_workers=num_workers)

            model = build_model()
            scheduler = Adam12()
            if use_amp:
                scaler = torch.cuda.amp.GradScaler()
            optimizer = scheduler.schedule(model, 0, epochs)[0]

            loss_tr = nn.BCELoss()
            best_valid_metric = 0 if obj_max == 1 else 1e9
            not_improve_epochs = 0
            if do_train:
                for epoch in range(epochs):
                    np.random.seed(666*epoch)
                    train_loss = 0.0
                    train_num = 0
                    scheduler.step(model,epoch,epochs)
                    model.train()
                    bar = tqdm(train_dataloader)
                    for data in bar:
                        optimizer.zero_grad()
                        for k in data:
                            data[k] = data[k].to(device)
                        y = data['batch_y']
                        if use_amp:
                            with torch.cuda.amp.autocast():
                                outputs = model(data)
                            # BCELoss is not safe under autocast: evaluate it in float32 outside the context.
                            loss = loss_tr(outputs.float(),y)
                            if math.isnan(loss.item()): continue
                            scaler.scale(loss).backward()
                            # Gradients must be unscaled before their norm is clipped.
                            scaler.unscale_(optimizer)
                            torch.nn.utils.clip_grad_norm_(model.parameters(), clipnorm)
                            scaler.step(optimizer)
                            scaler.update()
                        else:
                            outputs = model(data)
                            loss = loss_tr(outputs,y)
                            loss.backward()
                            optimizer.step()
                        batch_n = data['batch_feature'].shape[0]
                        train_num += batch_n
                        train_loss += batch_n * loss.item()
                        bar.set_description('loss: %.4f' % (loss.item()))

                    # Guard against an epoch in which no batch contributed.
                    train_loss /= max(train_num, 1)

                    # eval
                    model.eval()
                    valid_preds = _nn_predict([model], valid_dataloader, device, logit)
                    valid_Y = train_y.loc[val_index,label_name].values # oof train
                    valid_mean = np.mean(valid_preds)
                    valid_metric = Metric(valid_Y,valid_preds)

                    if obj_max*(valid_metric) > obj_max*best_valid_metric:
                        # Save the bare module's weights so they load without DataParallel.
                        if len(gpus) > 1:
                            torch.save(model.module.state_dict(),ckpt_path)
                        else:
                            torch.save(model.state_dict(),ckpt_path)
                        not_improve_epochs = 0
                        best_valid_metric = valid_metric
                        Write_log(log,'[epoch %s] lr: %.6f, train_loss: %.6f, valid_metric: %.6f, valid_mean:%.6f'%(epoch,optimizer.param_groups[0]['lr'],train_loss,valid_metric,valid_mean))
                    else:
                        not_improve_epochs += 1
                        Write_log(log,'[epoch %s] lr: %.6f, train_loss: %.6f, valid_metric: %.6f, valid_mean:%.6f, NIE +1 ---> %s'%(epoch,optimizer.param_groups[0]['lr'],train_loss,valid_metric,valid_mean,not_improve_epochs))
                        if not_improve_epochs >= patience:
                            break

            # Score the fold with its best checkpoint (works with do_train=False too,
            # as nothing here depends on variables of the training loop).
            state_dict = torch.load(ckpt_path, map_location=device)
            model = build_model(state_dict)
            model.eval()

            valid_preds = _nn_predict([model], valid_dataloader, device, logit)
            valid_Y = train_y.loc[val_index,label_name].values # oof train
            valid_mean = np.mean(valid_preds)
            valid_metric = Metric(valid_Y,valid_preds)
            Write_log(log,'[fold %s] best_valid_metric: %.6f, best_valid_mean: %.6f'%(fold,valid_metric,valid_mean))

            all_valid_metric.append(valid_metric)
            oof.loc[val_index,label_name] = valid_preds
            oof.loc[val_index,'fold'] = fold
            train_folds.append(fold)

        mean_valid_metric = np.mean(all_valid_metric)
        Write_log(log,'all valid mean metric:%.6f'%(mean_valid_metric))
        oof.loc[oof['fold'].isin(train_folds)].to_csv(output_path + 'oof.csv',index=False)

        if test is None:
            log.close()
            log = None
            os.rename(output_path + 'train.log', output_path + 'train_%.6f.log'%mean_valid_metric)

        log_df = pd.DataFrame({'run_id':[run_id],'folds':folds,'metric':[round(mean_valid_metric,6)],'lb':[np.nan],'remark':[config['remark']]})
        if not os.path.exists(output_root + 'experiment_log.csv'):
            log_df.to_csv(output_root + 'experiment_log.csv',index=False)
        else:
            log_df.to_csv(output_root + 'experiment_log.csv',index=False,mode='a',header=None)

    if test is not None:
        if train is None:
            log = open(output_path + 'test.log','w', buffering=1)
            Write_log(log,str(config)+'\n')
        test_series,test_feature,test_series_idx = test

        sub = test_feature[-len(test_series_idx):][[id_name]].reset_index(drop=True)

        test_dataset = TaskDataset(test_series,test_feature,test_series_idx)
        test_dataloader = DataLoader(test_dataset,batch_size=batch_size,shuffle=False, drop_last=False, collate_fn=test_dataset.collate_fn,num_workers=num_workers)

        models = []
        for fold in range(folds):
            ckpt_path = output_path + 'fold%s.ckpt'%fold
            if not os.path.exists(ckpt_path):
                continue
            model = build_model(torch.load(ckpt_path, map_location=device))
            model.eval()
            models.append(model)
        print('model count:',len(models))
        if not models:
            log.close()
            raise RuntimeError('no fold checkpoints found in %s' % output_path)

        test_preds = _nn_predict(models, test_dataloader, device, logit)
        test_mean = np.mean(test_preds)
        Write_log(log,'test_mean: %.6f'%(test_mean))
        log.close()
        sub['prediction'] = test_preds
        sub.to_csv(output_path+'submission.csv.zip',index=False, compression='zip')


In [16]:
# Read the training series-feature table.
df = pd.read_parquet('nn_series_train_feature')

# Per customer_ID (groups kept in first-appearance order because sort=False),
# record the smallest and largest index label among that customer's rows.
# The index is attached through a temporary 'idx' column on a copy, so `df`
# itself never carries the helper column.
# Layout illustrated by the original author's note:
#                                                                     min  max
#     customer_ID
#     0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a 0    12
#     00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5 13   25
#     ...
series_idx = (
    df.assign(idx=df.index)
      .groupby('customer_ID', sort=False)['idx']
      .agg(['min', 'max'])
)

# Running counter 0 .. len(series_idx)-1, one value per customer row.
# NOTE(review): the original comment reports len(series_idx) => 1383534;
# not verifiable from this cell — confirm for this input file.
series_idx['feature_idx'] = np.arange(len(series_idx))

In [17]:
# Label table (`y`) and manual-feature table (`f`) for training; both are
# passed to NN_train_and_predict in the training cells that follow.
y = pd.read_parquet('train_labels')
f = pd.read_parquet('nn_manual_train_feature')

In [14]:
# Settings passed as the fourth positional argument of NN_train_and_predict.
# The "prev?" notes carry over the values from the original inline comments;
# presumably earlier/default settings — TODO confirm.
nn_config = dict(
    id_name=id_name,
    feature_name=[],
    label_name=label_name,
    obj_max=1,
    epochs=9,            # prev? 10
    smoothing=0.001,
    clipnorm=1,
    patience=100,
    lr=123e-6,           # prev? 3e-4
    batch_size=256,      # prev? 256
    folds=5,
    seed=seed,
    remark=remark,
)

In [None]:
# Training run with use_series_oof=True; no test bundle is supplied.
# `series_idx.values` hands the series_idx frame over as a plain array whose
# rows are [min, max, feature_idx], e.g.
#     [ 0, 12, 0],
#     [13, 25, 1],
#     [26, 38, 2], ...
test = None
NN_train_and_predict(
    [df, f, y, series_idx.values],
    test,
    Amodel,
    nn_config,
    use_series_oof=True,
    output_root='./output/',
    run_id='NN_with_series_and_manual_feature',
)

In [20]:
# Training run with use_series_oof=False; no test bundle is supplied.
# Results go under a separate run_id so they do not share a folder name with
# the use_series_oof=True run.
test = None
NN_train_and_predict(
    [df, f, y, series_idx.values],
    test,
    Amodel,
    nn_config,
    use_series_oof=False,
    output_root='./output/',
    run_id='NN_with_series_feature',
)

loss: 0.2306: 100%|█████████████████████████| 1434/1434 [25:56<00:00,  1.09s/it]
100%|█████████████████████████████████████████| 359/359 [06:30<00:00,  1.09s/it]
loss: 0.2509: 100%|█████████████████████████| 1434/1434 [25:52<00:00,  1.08s/it]
100%|█████████████████████████████████████████| 359/359 [06:29<00:00,  1.08s/it]
loss: 0.2070: 100%|█████████████████████████| 1434/1434 [25:57<00:00,  1.09s/it]
100%|█████████████████████████████████████████| 359/359 [06:30<00:00,  1.09s/it]
loss: 0.1889: 100%|█████████████████████████| 1434/1434 [25:51<00:00,  1.08s/it]
100%|█████████████████████████████████████████| 359/359 [06:34<00:00,  1.10s/it]
loss: 0.2450: 100%|█████████████████████████| 1434/1434 [26:02<00:00,  1.09s/it]
100%|█████████████████████████████████████████| 359/359 [06:30<00:00,  1.09s/it]
loss: 0.2210: 100%|█████████████████████████| 1434/1434 [25:52<00:00,  1.08s/it]
100%|█████████████████████████████████████████| 359/359 [06:32<00:00,  1.09s/it]
loss: 0.2273: 100%|█████████

## Inference

In [15]:
# Read the test series-feature table (rebinds `df` from the training section).
df = pd.read_parquet('nn_series_test_feature')

# Per customer_ID (groups kept in first-appearance order because sort=False),
# record the smallest and largest index label among that customer's rows.
# The index is attached through a temporary 'idx' column on a copy, so `df`
# itself never carries the helper column.
# Resulting layout: index = customer_ID, columns = ['min', 'max'].
# NOTE(review): the sample IDs and the length 1383534 quoted in the original
# comments match the training cell word for word — confirm they apply to the
# test file as well.
series_idx = (
    df.assign(idx=df.index)
      .groupby('customer_ID', sort=False)['idx']
      .agg(['min', 'max'])
)

# Running counter 0 .. len(series_idx)-1, one value per customer row.
series_idx['feature_idx'] = np.arange(len(series_idx))

In [16]:
# Manual-feature table for the test set; rebinds `f`, which the inference
# cells below pass to NN_train_and_predict.
f = pd.read_parquet('nn_manual_test_feature')

In [17]:
# Inference-only run with use_series_oof=True: train is None, so only the
# test bundle is processed. The bundle order is
# (series frame, feature frame, series index array).
train = None
NN_train_and_predict(
    train,
    [df, f, series_idx.values],
    Amodel,
    nn_config,
    use_series_oof=True,
    output_root='./output/',
    run_id='NN_with_series_and_manual_feature',
)

model count: 5


100%|█████████████████████████████████████| 3612/3612 [1:04:53<00:00,  1.08s/it]


In [18]:
# Inference-only run with use_series_oof=False: train is None, so only the
# test bundle is processed. The bundle order is
# (series frame, feature frame, series index array).
train = None
NN_train_and_predict(
    train,
    [df, f, series_idx.values],
    Amodel,
    nn_config,
    use_series_oof=False,
    output_root='./output/',
    run_id='NN_with_series_feature',
)

model count: 5


100%|█████████████████████████████████████| 3612/3612 [1:05:39<00:00,  1.09s/it]
