In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import sys
import os
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import LambdaLR
import itertools
from bisect import bisect

In [None]:
# http://stackoverflow.com/questions/34950201/pycharm-print-end-r-statement-not-working
class Logger(object):
    """Tee-like writer that sends messages to the terminal and to a log file.

    The terminal stream is whatever ``sys.stdout`` was when the instance was
    created. The log file is optional: until ``open`` is called, messages
    are only written to the terminal.
    """

    def __init__(self):
        self.terminal = sys.stdout  # stdout
        self.file = None

    def open(self, file, mode=None):
        """Open (or re-open) the log file.

        Args:
            file: Path of the log file.
            mode: Mode passed to the built-in ``open``; defaults to ``'w'``.
        """
        if mode is None: mode = 'w'
        # Close a previously opened log so the old handle is not leaked.
        if self.file is not None:
            self.file.close()
        self.file = open(file, mode)

    def close(self):
        """Close the log file if one is open; terminal output keeps working."""
        if self.file is not None:
            self.file.close()
            self.file = None

    def write(self, message, is_terminal=1, is_file=1):
        """Write ``message`` to the terminal and/or the log file.

        Args:
            message: Text to write.
            is_terminal: Write to the terminal only when this equals 1.
            is_file: Write to the log file only when this equals 1.
        """
        # Messages containing a carriage return (in-place progress lines)
        # are never written to the file.
        if '\r' in message: is_file = 0

        if is_terminal == 1:
            self.terminal.write(message)
            self.terminal.flush()

        # Skip the file when none has been opened instead of failing on None.
        if is_file == 1 and self.file is not None:
            self.file.write(message)
            self.file.flush()

    def flush(self):
        # Present so the object can stand in for a file-like stream.
        # Nothing to do here: write() already flushes both targets.
        pass

In [None]:
class AverageMeter(object):
    """Keeps the latest value together with a running weighted mean.

    Attributes are ``val`` (last value passed to ``update``), ``sum``
    (weighted sum of all values), ``count`` (total weight) and ``avg``
    (``sum / count``).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val, self.avg, self.sum = 0., 0., 0.
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean.

        Args:
            val: The new value.
            n: Weight of the value, e.g. a batch size. Defaults to 1.
        """
        weighted = val * n
        self.val = val
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count

In [None]:
def save_model(model, save_path, model_name):
    """Save the model's ``state_dict`` to ``<save_path>/<model_name>.pth.tar``.

    The checkpoint is a dict with the single key ``'state_dict'``.

    Args:
        model: Object exposing ``state_dict()`` (e.g. an ``nn.Module``).
        save_path: Target directory; it is created when it does not exist.
        model_name: File name without the ``.pth.tar`` extension.
    """
    # An empty save_path means "current directory"; os.makedirs('') would fail.
    if save_path:
        os.makedirs(save_path, exist_ok=True)
    filename = os.path.join(save_path, model_name + '.pth.tar')
    torch.save({'state_dict': model.state_dict()}, filename)


In [None]:
def load_model(model, load_path, model_name):
    """Load weights from ``<load_path>/<model_name>.pth.tar`` into ``model``.

    The checkpoint must be a dict holding the weights under ``'state_dict'``.

    Args:
        model: Module whose ``load_state_dict`` receives the stored weights.
        load_path: Directory containing the checkpoint.
        model_name: File name without the ``.pth.tar`` extension.

    Returns:
        The same ``model`` object, with the weights loaded.

    Raises:
        FileNotFoundError: If the checkpoint file does not exist.
    """
    filename = os.path.join(load_path, model_name + '.pth.tar')
    # Loading must never create directories; fail with a clear message instead.
    if not os.path.isfile(filename):
        raise FileNotFoundError('checkpoint not found: ' + filename)
    # map_location='cpu' lets a checkpoint saved on GPU be read on a CPU-only
    # machine; load_state_dict then copies into the model's own parameters.
    # NOTE: torch.load unpickles the file - only load trusted checkpoints.
    checkpoint = torch.load(filename, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    return model


In [None]:
def adjust_learning_rate(optimizer, epoch, args):
    """Set each group's LR to its initial LR scaled by ``0.3 ** (epoch // 10)``.

    The initial LR of every parameter group is remembered under the
    ``'initial_lr'`` key the first time the group is seen. This makes the
    function idempotent: calling it repeatedly (e.g. once per epoch) does not
    compound the decay.

    Args:
        optimizer: Object exposing ``param_groups``, a list of dicts with an
            ``'lr'`` entry.
        epoch: Current epoch index.
        args: Unused; kept so existing call sites keep working.
    """
    decay = 0.3 ** (epoch // 10)
    for param_group in optimizer.param_groups:
        # Decay from the recorded starting LR, not from the current LR.
        base_lr = param_group.setdefault('initial_lr', param_group['lr'])
        param_group['lr'] = base_lr * decay


In [1]:
def worker_init_fn(worker_id):
    """
    Handles PyTorch x Numpy seeding issues.

    Re-seeds NumPy's global generator with the first word of its current
    state plus ``worker_id``, wrapped into the 32-bit range that
    ``np.random.seed`` accepts.

    Args:
        worker_id (int): Id of the worker.
    """
    # Convert to a Python int first so the addition cannot overflow uint32.
    base_seed = int(np.random.get_state()[1][0])
    np.random.seed((base_seed + worker_id) % (2 ** 32))

In [None]:
class MyLoss(nn.Module):
    """Masked mean absolute error, averaged per row and then over rows."""

    def __init__(self):
        super().__init__()

    def forward(self, inputs, rank, rank_mask):
        """Compute the masked L1 loss.

        Args:
            inputs: Predicted values.
            rank: Target values, broadcastable against ``inputs``.
            rank_mask: Multiplicative weights; entries with weight 0 do not
                contribute to the loss.

        Returns:
            Scalar tensor: for every row the masked absolute errors are
            summed over ``dim=1`` and divided by that row's mask sum, then
            the per-row results are averaged.
        """
        masked_abs_err = (inputs - rank).abs() * rank_mask
        # Normalise every row by its own amount of unmasked entries.
        per_row = masked_abs_err.sum(dim=1) / rank_mask.sum(dim=1)
        return per_row.mean()

In [None]:
class MyBCELoss(nn.Module):
    """Masked binary cross-entropy, averaged per row and then over rows."""

    def __init__(self, class_weight=False):
        super().__init__()
        # Stored only; forward() does not read it.
        self.class_weight = class_weight

    def forward(self, inputs, targets, mask, sample_weight=None):
        """Compute the masked BCE loss.

        Args:
            inputs: Probabilities, as required by ``F.binary_cross_entropy``.
            targets: Target values used to blend the positive and the
                negative cost element-wise.
            mask: Multiplicative weights; entries with weight 0 do not
                contribute to the loss.
            sample_weight: Accepted but currently not used.

        Returns:
            Scalar tensor: per-row masked sum over ``dim=1`` divided by the
            row's mask sum, averaged over rows.
        """
        # Cost of each element if its label were 1, respectively 0.
        positive_cost = F.binary_cross_entropy(
            inputs, torch.ones_like(inputs), reduction='none')
        negative_cost = F.binary_cross_entropy(
            inputs, torch.zeros_like(inputs), reduction='none')

        blended = positive_cost * targets + negative_cost * (1 - targets)
        blended = blended * mask

        per_row = blended.sum(dim=1) / mask.sum(dim=1)
        return per_row.mean()

In [None]:
class FGM():
    """Adversarial-training helper that perturbs selected parameters.

    ``attack`` nudges every matching parameter along its normalised gradient
    and keeps a copy of the original values; ``restore`` puts the originals
    back. (The class name suggests the Fast Gradient Method.)
    """

    def __init__(self, model):
        self.model = model
        # name -> original parameter values saved by attack()
        self.backup = {}

    def attack(self, epsilon=1., emb_name='emb'):
        """Perturb parameters whose name contains ``emb_name``.

        Only parameters that require grad and currently hold a gradient are
        touched. Each one is backed up and then shifted by
        ``epsilon * grad / max(||grad||, 0.001)``; the shift is skipped when
        the gradient norm is zero or NaN.

        Args:
            epsilon: Scale of the perturbation.
            emb_name: Substring identifying the embedding parameters; replace
                it with the embedding parameter name used in your model.
        """
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name and param.grad is not None:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    r_at = epsilon * param.grad / max(norm, 0.001)
                    param.data.add_(r_at)

    def restore(self, emb_name='emb'):
        """Restore every parameter that ``attack`` backed up.

        The backup itself decides what is restored, so the original weights
        come back even if gradients were cleared (set to ``None``) after the
        attack.

        Args:
            emb_name: Unused; kept so existing call sites keep working.
        """
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}

In [None]:
# from https://www.kaggle.com/code/ryanholbrook/competition-metric-kendall-tau-correlation
# Actually O(N^2), but fast in practice for our data
def count_inversions(a):
    """Count the pairs ``(i, j)`` with ``i < j`` and ``a[i] > a[j]``.

    Elements are inserted one by one into a sorted list; each new element
    contributes as many inversions as there are earlier elements strictly
    greater than it.

    Args:
        a: Iterable of mutually comparable items.

    Returns:
        The number of inversions as an int.
    """
    seen_sorted = []
    total = 0
    for value in a:
        # bisect gives the slot to the right of any equal items, so equal
        # values are not counted as inversions.
        slot = bisect(seen_sorted, value)
        total += len(seen_sorted) - slot
        seen_sorted.insert(slot, value)
    return total

In [None]:
def kendall_tau(ground_truth, predictions):
    """Kendall-tau style score pooled over several orderings.

    For every (ground truth, prediction) pair the predicted order is mapped
    to positions in the ground truth and its inversions are counted. The
    result is ``1 - 4 * total_inversions / sum(n * (n - 1))``.

    Args:
        ground_truth: Iterable of sequences of hashable items giving the
            correct order.
        predictions: Iterable of sequences with the predicted order; each
            must have the same length as its ground truth and contain only
            items present in it.

    Returns:
        The pooled score as a float.

    Raises:
        ValueError: If a predicted item is missing from its ground truth.
        ZeroDivisionError: If no sequence has more than one item.
    """
    total_inversions = 0  # total inversions in predicted ranks across all instances
    total_2max = 0  # maximum possible inversions across all instances
    for gt, pred in zip(ground_truth, predictions):
        assert len(gt) == len(pred)
        # One dict lookup per item instead of a linear gt.index() scan.
        # setdefault keeps the first occurrence, matching list.index().
        position = {}
        for idx, item in enumerate(gt):
            position.setdefault(item, idx)
        try:
            ranks = [position[x] for x in pred]  # rank predicted order in terms of ground truth
        except KeyError as err:
            raise ValueError('%r is not in list' % (err.args[0],)) from err
        total_inversions += count_inversions(ranks)
        n = len(gt)
        total_2max += n * (n - 1)
    return 1 - 4 * total_inversions / total_2max

In [None]:
def get_score(df, masks, rank_pred, code_df_valid, data_dir):
    """Turn per-cell predictions into cell orders and score them.

    Steps:
      1. Explode ``df`` to one row per (id, cell) and attach the predictions
         selected by ``masks``.
      2. Average the predictions of cells that occur more than once.
      3. Re-assign the predicted values of the cells listed in
         ``code_df_valid`` so that those cells keep the order given by
         ``code_df_valid['rank2']``.
      4. Sort every notebook by the final value and compare the resulting
         order with ``train_orders.csv`` via ``kendall_tau``.

    The caller's ``df`` and ``code_df_valid`` are not modified.

    Args:
        df: DataFrame with an ``'id'`` column and a ``'cell_id'`` column
            holding one list of cell ids per row; null entries inside the
            lists are dropped (presumably padding - confirm with the caller).
        masks: Array; positions equal to 1 (after flattening) mark the
            predictions to use.
        rank_pred: Array of predicted values, flattened the same way as
            ``masks``. The number of selected values must equal the number of
            non-null exploded cells, otherwise the column assignment raises.
        code_df_valid: DataFrame with columns ``'id'``, ``'cell_id'`` and
            ``'rank2'`` (the known relative position of each listed cell).
        data_dir: Directory containing ``train_orders.csv``, which must have
            an ``'id'`` column and a space-separated ``'cell_order'`` column.

    Returns:
        The score returned by ``kendall_tau``.
    """
    # --- 1. one row per (id, cell) ---------------------------------------
    # Work on a renamed selection so no helper column is added to the
    # caller's frame.
    cells = (df[['id', 'cell_id']]
             .rename(columns={'cell_id': 'cell_id2'})
             .explode('cell_id2'))
    # .copy() gives an independent frame so the assignment below is safe.
    cells = cells[~pd.isnull(cells['cell_id2'])].copy()

    # Keep only the predictions whose mask is 1, in flattened order; they
    # are attached positionally to the exploded rows.
    preds = rank_pred.flatten()[np.where(masks.flatten() == 1)]
    cells['rank2'] = preds

    # --- 2. average duplicated (id, cell) predictions --------------------
    pred_df = (cells
               .groupby(by=['id', 'cell_id2'], as_index=False)['rank2']
               .agg('mean')
               .rename(columns={'cell_id2': 'cell_id'}))

    # --- 3. force the known order onto the cells in code_df_valid --------
    code_cells = code_df_valid[code_df_valid['id'].isin(pred_df['id'])].copy()
    # rank3: position of each cell within its notebook per the known order.
    code_cells['rank3'] = code_cells.groupby(by=['id'])['rank2'].rank(
        ascending=True, method='first')

    # After the merge, 'rank2' is the *predicted* value of each listed cell.
    matched = code_cells[['id', 'cell_id', 'rank3']].merge(
        pred_df, how='inner', on=['id', 'cell_id'])
    # rank4: position of each listed cell within its notebook per prediction.
    matched['rank4'] = matched.groupby(by=['id'])['rank2'].rank(
        ascending=True, method='first')

    # Self-join "known position k" with "k-th smallest predicted value":
    # the cell known to be k-th receives the k-th smallest prediction.
    # E.g. a cell with rank3 == 1 whose own prediction ranked 6th ends up
    # with the smallest predicted value of its notebook.
    matched = matched[['id', 'cell_id', 'rank3']].merge(
        matched[['id', 'rank4', 'rank2']].rename(columns={'rank4': 'rank3'}),
        how='inner', on=['id', 'rank3'])

    # 'rank3' now carries the re-assigned value for listed cells and is
    # null for all other cells, which keep their own prediction.
    pred_df = pred_df.merge(
        matched[['id', 'cell_id', 'rank2']].rename(columns={'rank2': 'rank3'}),
        how='left', on=['id', 'cell_id'])
    pred_df['rank2'] = np.where(pd.isnull(pred_df['rank3']),
                                pred_df['rank2'], pred_df['rank3'])

    # --- 4. build predicted orders and score them ------------------------
    # Sort by notebook, then by final value within each notebook.
    pred_df = pred_df.sort_values(by=['id', 'rank2'], ascending=True)
    res = pred_df.groupby(by=['id'], sort=False, as_index=False)['cell_id'].agg(list)

    # os.path.join works with or without a trailing separator on data_dir.
    train_orders = pd.read_csv(os.path.join(data_dir, 'train_orders.csv'))
    train_orders['cell_order'] = train_orders['cell_order'].str.split()
    res = res.merge(train_orders, how='left', on='id')

    score = kendall_tau(res['cell_order'], res['cell_id'])
    return score

In [1]:
def get_model_path(model_name, res='../input/'):
    """Return the dataset path of a pretrained model.

    Args:
        model_name: Short model identifier, e.g. ``'roberta-base'``.
        res: Prefix the model-specific suffix is appended to.

    Returns:
        ``res`` followed by the model's suffix. Whether the suffix ends with
        a slash differs per model and is part of the returned value.

    Raises:
        ValueError: If ``model_name`` is not a known model.
    """
    # Models whose suffix is a fixed string.
    fixed_suffixes = {
        'bart-base': 'bartbase/',
        'bart-large': 'bartlarge/',
        'deberta-v3-base': 'deberta-v3-base/deberta-v3-base',
        'deberta-v3-large': 'deberta-v3-large/deberta-v3-large',
        'funnel-large': 'funnel-large/',
        'xlnet-base': 'xlnet-pretrained/xlnet-pretrained/',
        'deberta-base-mnli': 'huggingface-deberta-variants/deberta-base-mnli/deberta-base-mnli/',
        'deberta-xlarge': 'huggingface-deberta-variants/deberta-xlarge/deberta-xlarge/',
        'codebert-base': 'codebert-base/codebert-base/',
        'CodeBERTa-small-v1': 'huggingface-code-models/CodeBERTa-small-v1/',
        'mdeberta-v3-base': 'mdeberta-v3-base/',
    }

    if model_name in ('distilroberta-base', 'roberta-base', 'roberta-large'):
        suffix = 'roberta-transformers-pytorch/' + model_name
    elif model_name in ('deberta-base', 'deberta-large', 'deberta-v2-xlarge', 'deberta-v2-xxlarge'):
        # The directory name drops the 'deberta-' prefix, e.g. 'deberta/v2-xlarge'.
        suffix = 'deberta/' + model_name.replace('deberta-', '')
    elif model_name in ('electra-base', 'electra-large'):
        suffix = 'electra/' + model_name + '-discriminator'
    elif model_name in fixed_suffixes:
        suffix = fixed_suffixes[model_name]
    elif 'albert' in model_name:
        # Any name containing 'albert' maps into the ALBERT dataset.
        suffix = 'pretrained-albert-pytorch/' + model_name
    else:
        raise ValueError(model_name)
    return res + suffix