In [None]:
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
sys.path.append('../input/shopee-pytorch-models')
import numpy as np 
import pandas as pd 

import math
import random 
import os 
import cv2
import timm

from tqdm import tqdm 

import albumentations as A 
from albumentations.pytorch.transforms import ToTensorV2

import torch 
from torch.utils.data import Dataset, DataLoader 
from torch import nn
import torch.nn.functional as F 

import gc
import cudf
import cuml
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors



#--------------
#torch
import torch
import torch.nn as nn
from torch.nn import Parameter

import transformers
from transformers import (BertTokenizer, BertModel,
                          DistilBertTokenizer, DistilBertModel,
                          RobertaTokenizer, RobertaModel,
                          AutoTokenizer, AutoModel)

from cuml import PCA
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelEncoder

## Imagenet-Config

In [13]:
GET_CV = False #True  # set to False when committing the notebook to Kaggle
CHECK_SUB = False

df = cudf.read_csv('../input/shopee-product-matching/test.csv')
# If the test set has more than 3 rows we are in a real submission run
# (presumably the hidden test set -- the run recorded below saw 3 rows),
# so cross-validation on the train set is switched off.
if len(df) > 3:
    GET_CV = False
del df

print(GET_CV)

class CFG:
    """Settings shared by the image-embedding models."""

    # Runtime
    device = 'cuda'
    seed = 2020

    # Input pipeline
    img_size = 512
    batch_size = 12

    # Margin-head settings (used as defaults by ShopeeModel_Two)
    classes = 11014
    scale = 30
    margin = 0.5

    # Backbones and the checkpoints that go with them
    model_name1 = 'eca_nfnet_l1'
    model_path1 = '../input/nf-45epochs/Curr_Arc_512x512_eca_nfnet_l1(mish)_45EpochStep_adamw.pt'

    model_name2 = 'efficientnet_b3'
    model_path2 = '../input/effb3-cur-arc-30epoch-weight/Curr_Arc_512x512_efficientnet_b3(mish)_29EpochStep_adamw (1).pt'

    model_name3 = 'eca_nfnet_l0'
    model_path3 = '../input/curr-arc-nfnet-weight/Curr_Arc_512x512_nfnet_l0(mish)_29EpochStep_adamw.pt'

False


In [14]:
class CFG2:
    """Settings shared by the BERT-family text-embedding models."""

    # Runtime
    device = "cuda"
    SEED = 42

    # Data loading / tokenisation
    batch_size = 16  # 64 and 32 were also considered
    num_workers = 4
    max_length = 30

    # Model dimensions
    bert_hidden_size = 768  # BERT-base
    NUM_CLASSES = 11014

    # Fine-tuned checkpoints
    TEXT_MODEL_PATH1 = '../input/robertdistilbertweights/model_100epochs.pt'
    TEXT_MODEL_PATH2 = '../input/wholeroberta/whole_roberta_150epochs.pt_147'
    TEXT_MODEL_PATH3 = '../input/distilmerge-20epochs/Mergeface_adamw_20epochStep_model_100epochs.pt'

In [15]:
def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of space-separated id strings.

    Each entry is split on whitespace and treated as a set; the score for a
    row is ``2 * |true & pred| / (|true| + |pred|)``.

    Returns:
        numpy array with one F1 value per row.
    """
    true_sets = y_true.apply(lambda ids: set(ids.split()))
    pred_sets = y_pred.apply(lambda ids: set(ids.split()))

    overlap = np.array([len(truth & guess) for truth, guess in zip(true_sets, pred_sets)])
    pred_sizes = pred_sets.apply(lambda s: len(s)).values
    true_sizes = true_sets.apply(lambda s: len(s)).values

    return 2 * overlap / (pred_sizes + true_sizes)

In [16]:
def read_dataset():
    """Load the working dataframe for the current mode.

    With ``GET_CV`` set, the train CSV is read and a ``matches`` column
    (space-joined posting ids sharing the row's ``label_group``) is added;
    ``CHECK_SUB`` additionally stacks the frame on itself. Otherwise the
    test CSV is read unchanged.

    Returns:
        (pandas frame, cudf copy of it, Series of image file paths)
    """
    if not GET_CV:
        frame = pd.read_csv('../input/shopee-product-matching/test.csv')
        frame_gpu = cudf.DataFrame(frame)
        paths = '../input/shopee-product-matching/test_images/' + frame['image']
        return frame, frame_gpu, paths

    frame = pd.read_csv('../input/shopee-product-matching/train.csv')
    ids_by_group = frame.groupby(['label_group'])['posting_id'].unique().to_dict()
    frame['matches'] = frame['label_group'].map(ids_by_group)
    frame['matches'] = frame['matches'].apply(lambda ids: ' '.join(ids))

    if CHECK_SUB:
        # Double the data to rehearse a larger run.
        frame = pd.concat([frame, frame], axis = 0)
        frame.reset_index(drop = True, inplace = True)

    frame_gpu = cudf.DataFrame(frame)
    paths = '../input/shopee-product-matching/train_images/' + frame['image']
    return frame, frame_gpu, paths

In [17]:
def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch (CPU + CUDA) RNGs with ``seed``.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
    
# Seed all RNGs once, up front, using the seed from the image config.
seed_torch(CFG.seed)

## Augmentation

In [18]:
def get_test_transforms():
    """Inference-time pipeline: resize to CFG.img_size, normalise, to tensor."""
    steps = [
        A.Resize(CFG.img_size, CFG.img_size, always_apply=True),
        A.Normalize(),
        ToTensorV2(p=1.0),
    ]
    return A.Compose(steps)

## Imagenet-Dataset

In [19]:
class ShopeeDataset(Dataset):
    """Image dataset that loads files from disk for embedding extraction.

    Args:
        image_paths: array of file paths; its ``.shape[0]`` defines the
            dataset length, so it must be array-like (e.g. ``Series.values``).
        transforms: optional callable invoked as ``transforms(image=img)``
            and expected to return a mapping with an ``'image'`` entry.
    """

    def __init__(self, image_paths, transforms=None):
        self.image_paths = image_paths
        self.augmentations = transforms

    def __len__(self):
        return self.image_paths.shape[0]

    def __getitem__(self, index):
        """Return ``(image, dummy_label)``; the label is always ``tensor(1)``.

        Raises:
            FileNotFoundError: if OpenCV cannot read the file at this index.
        """
        image_path = self.image_paths[index]

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None; fail here with the
            # offending path instead of an opaque error inside cvtColor.
            raise FileNotFoundError(f'Could not read image: {image_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR

        if self.augmentations:
            image = self.augmentations(image=image)['image']

        return image, torch.tensor(1)

## Bert-Dataset

In [20]:
class ShopeeDataset2(Dataset):
    """Tokenised-title dataset for the text models.

    All titles are tokenised once in the constructor; items are dicts of
    tensors keyed like the tokenizer output. Outside ``mode="test"`` a
    ``'labels'`` entry taken from the ``label_code`` column is added.
    """

    def __init__(self, dataframe, tokenizer, mode="test", max_length=None):
        self.mode = mode
        self.dataframe = dataframe
        if mode != "test":
            self.targets = dataframe['label_code'].values

        # Force every title to str so missing/non-string values tokenise too.
        titles = [str(title) for title in dataframe['title']]
        self.encodings = tokenizer(
            titles,
            padding=True,
            truncation=True,
            max_length=max_length,
        )

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        item = {}
        for key, values in self.encodings.items():
            item[key] = torch.tensor(values[idx])
        if self.mode != "test":
            item['labels'] = torch.tensor(self.targets[idx]).long()
        return item

## Curr + Arc

In [21]:
class ArcMarginProduct(nn.Module):
    """ArcFace additive-angular-margin classification head.

    credit : https://github.com/HuangYG123/CurricularFace/blob/8b2f47318117995aa05490c05b455b113489917e/head/metrics.py#L70

    Args:
        in_features: size of the incoming embedding.
        out_features: number of classes.
        scale: factor the final cosine logits are multiplied by.
        margin: angular margin (radians) added to the target-class angle.
        easy_margin: if True, apply the margin only where cosine > 0.
        ls_eps: label-smoothing strength applied to the one-hot targets.
    """

    def __init__(self, in_features, out_features, scale=30.0, margin=0.50, easy_margin=False, ls_eps=0.0):
        super(ArcMarginProduct, self).__init__()

        print('Using ArcFace')

        self.in_features = in_features
        self.out_features = out_features
        self.scale = scale
        self.margin = margin
        self.ls_eps = ls_eps  # label smoothing
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(margin)
        self.sin_m = math.sin(margin)
        self.th = math.cos(math.pi - margin)
        self.mm = math.sin(math.pi - margin) * margin

    def forward(self, input, label):
        """Return scaled logits of shape ``(batch, out_features)``.

        The margin is applied to the column selected by ``label`` in each row;
        all other columns are the plain scaled cosine similarity.
        """
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        # Rounding can push cosine^2 marginally above 1; clamp so sqrt never
        # sees a negative number and returns NaN.
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(min=0.0))
        phi = cosine * self.cos_m - sine * self.sin_m  # cos(theta + margin)
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # Allocate on the same device as the logits rather than a fixed GPU
        # index, so the head works on CPU and on any single- or multi-GPU host.
        one_hot = torch.zeros(cosine.size(), device=cosine.device)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # Blend: margin logit on the target column, plain cosine elsewhere.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.scale
        return output



def l2_norm(input, axis = 1):
    """Divide ``input`` by its L2 norm taken along ``axis`` (dimension kept)."""
    magnitude = input.norm(p=2, dim=axis, keepdim=True)
    return input / magnitude

class CurricularFace(nn.Module):
    """CurricularFace margin head producing scaled cosine logits.

    Args:
        in_features: size of the incoming embedding.
        out_features: number of classes (columns of ``kernel``).
        s: factor the final logits are multiplied by.
        m: angular margin (radians) added to the target-class angle.
    """
    def __init__(self, in_features, out_features, s = 30, m = 0.50):
        super(CurricularFace, self).__init__()

        print('Using Curricular Face')

        self.in_features = in_features
        self.out_features = out_features
        self.m = m
        self.s = s
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.threshold = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m
        # Class weights, stored as (in_features, out_features).
        self.kernel = nn.Parameter(torch.Tensor(in_features, out_features))
        # Running statistic updated in forward(); a buffer, so it is saved in
        # the state dict but is not a trainable parameter.
        self.register_buffer('t', torch.zeros(1))
        nn.init.normal_(self.kernel, std=0.01)

    def forward(self, embbedings, label):
        """Return logits ``(batch, out_features)`` scaled by ``s``.

        NOTE: every call updates the buffer ``t`` and modifies ``cos_theta``
        in place, so the order of the statements below matters.
        """
        # Cosine similarity between unit-norm embeddings and unit-norm class columns.
        embbedings = l2_norm(embbedings, axis = 1)
        kernel_norm = l2_norm(self.kernel, axis = 0)
        cos_theta = torch.mm(embbedings, kernel_norm)
        cos_theta = cos_theta.clamp(-1, 1)  # for numerical stability
        with torch.no_grad():
            origin_cos = cos_theta.clone()  # not used afterwards in this method
        # Cosine at each row's labelled class, as a (batch, 1) column.
        target_logit = cos_theta[torch.arange(0, embbedings.size(0)), label].view(-1, 1)

        sin_theta = torch.sqrt(1.0 - torch.pow(target_logit, 2))
        cos_theta_m = target_logit * self.cos_m - sin_theta * self.sin_m # cos(target + margin)
        # Entries whose cosine exceeds the row's margin-adjusted target cosine.
        mask = cos_theta > cos_theta_m
        # Use cos(theta + m) above the threshold, else the linear fallback.
        final_target_logit = torch.where(target_logit > self.threshold, cos_theta_m, target_logit - self.mm)

        hard_example = cos_theta[mask]
        with torch.no_grad():
            # Exponential moving average (weight 0.01) of the mean target cosine.
            self.t = target_logit.mean() * 0.01 + (1 - 0.01) * self.t
        # Re-weight the masked entries by (t + cos), in place.
        cos_theta[mask] = hard_example * (self.t + hard_example)
        # Write the margin-adjusted logit into each row's labelled column.
        cos_theta.scatter_(1, label.view(-1, 1).long(), final_target_logit)
        output = cos_theta * self.s
        return output # loss, e.g. nn.CrossEntropyLoss()(output, label), is left to the caller

class MergeLossLayer(nn.Module):
    """Average two logit tensors and compute cross-entropy on the mean."""

    def __init__(self):
        super(MergeLossLayer, self).__init__()
        print('Using Merge Face')

    def forward(self, embbedings1, embbedings2, label):
        """Return ``(mean_logits, cross_entropy_loss)``.

        The mean is computed with a scalar divisor, so it stays on the inputs'
        device and in the inputs' dtype (no hard-coded GPU index and no
        NumPy-built float64 tensor).
        """
        output = (embbedings1 + embbedings2) / 2
        return output, nn.CrossEntropyLoss()(output,label)

## Imagenet-Model

In [22]:
class ShopeeModel_Two(nn.Module):
    """timm backbone + optional FC/BN neck that outputs image embeddings.

    The CurricularFace / ArcFace / merge heads are constructed but not used
    by ``forward`` (presumably kept so checkpoints saved with those heads
    load with strict ``load_state_dict`` -- TODO confirm).

    Args:
        model_name: timm model name; must contain ``'efficientnet'`` or ``'nfnet'``.
        n_classes: number of classes for the margin heads.
        fc_dim: output size of the neck when ``use_fc`` is True.
        margin, scale: margin-head hyper-parameters.
        use_fc: add dropout -> linear -> batch-norm after pooling.
        pretrained: forwarded to ``timm.create_model``.

    Raises:
        ValueError: if ``model_name`` is not a supported backbone family.
    """

    def __init__(
        self,model_name,
        n_classes = CFG.classes,
        fc_dim = 512,
        margin = CFG.margin,
        scale = CFG.scale,
        use_fc = True,
        pretrained = False):

        super(ShopeeModel_Two,self).__init__()
        print('Building Model Backbone for {} model'.format(model_name))

        self.backbone = timm.create_model(model_name, pretrained=pretrained)

        # Strip the classifier and the built-in pooling so the backbone
        # returns a feature map; pooling is done by self.pooling below.
        if 'efficientnet' in model_name:
            final_in_features = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        elif 'nfnet' in model_name:
            final_in_features = self.backbone.head.fc.in_features
            self.backbone.head.fc = nn.Identity()
            self.backbone.head.global_pool = nn.Identity()

        else:
            # Previously fell through and crashed later with an unbound local.
            raise ValueError(
                "Unsupported backbone {!r}: expected a name containing "
                "'efficientnet' or 'nfnet'".format(model_name))

        self.pooling =  nn.AdaptiveAvgPool2d(1)

        self.use_fc = use_fc

        if use_fc:
            self.dropout = nn.Dropout(p=0.0)
            self.fc = nn.Linear(final_in_features, fc_dim)
            self.bn = nn.BatchNorm1d(fc_dim)
            self._init_params()
            final_in_features = fc_dim

        self.final = CurricularFace(final_in_features,
                                           n_classes,
                                           s=scale,
                                           m=margin)

        self.final2 = ArcMarginProduct(final_in_features,
                                       n_classes,
                                       scale = scale,
                                       margin = margin,
                                       easy_margin = False,
                                       ls_eps = 0.0)

        self.Merge_final = MergeLossLayer()

    def _init_params(self):
        """Initialise the neck: Xavier-normal FC weights, identity batch-norm."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, image, label=None):
        """Return the embedding for ``image``.

        ``label`` is accepted for call compatibility but is not used: the
        margin heads are bypassed here.
        """
        return self.extract_feat(image)

    def extract_feat(self, x):
        """Backbone -> global average pool -> (dropout, FC, BN if ``use_fc``)."""
        batch_size = x.shape[0]
        x = self.backbone(x)
        x = self.pooling(x).view(batch_size, -1)

        if self.use_fc:
            x = self.dropout(x)
            x = self.fc(x)
            x = self.bn(x)
        return x

## Bert-Model

In [23]:
class ShopeeNet1(nn.Module): #DistilBert-Indonesian
    """Wraps a transformer encoder and returns L2-normalised first-token embeddings.

    Args:
        bert_model: encoder called with ``input_ids`` / ``attention_mask``
            whose output exposes ``last_hidden_state``.
        num_classes, last_hidden_size: accepted for signature compatibility;
            not used by this inference-only wrapper.
    """
    def __init__(self,
                 bert_model,
                 num_classes=CFG2.NUM_CLASSES,
                 last_hidden_size=CFG2.bert_hidden_size):

        super().__init__()
        self.bert_model = bert_model

    def get_bert_features(self, batch):
        """Return the hidden state at sequence position 0 for every row."""
        output = self.bert_model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        # Position 0 of every sequence; the former `last_hidden_state[0]`
        # assignment was immediately overwritten and has been removed.
        return output.last_hidden_state[:, 0, :]

    def forward(self, batch):
        CLS_hidden_state = self.get_bert_features(batch)
        return F.normalize(CLS_hidden_state)

In [24]:
class ShopeeNet2(nn.Module): #Roberta
    """Wraps a RoBERTa encoder and returns L2-normalised first-token embeddings.

    Args:
        roberta_model: encoder called with ``input_ids`` / ``attention_mask``
            whose output exposes ``last_hidden_state``.
        num_classes, last_hidden_size: accepted for signature compatibility;
            not used by this inference-only wrapper.
    """
    def __init__(self,
                 roberta_model,
                 num_classes=CFG2.NUM_CLASSES,
                 last_hidden_size=CFG2.bert_hidden_size):

        super().__init__()
        self.roberta_model = roberta_model

    def get_bert_features(self, batch):
        """Return the hidden state at sequence position 0 for every row."""
        output = self.roberta_model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        # Position 0 of every sequence; the former `last_hidden_state[0]`
        # assignment was immediately overwritten and has been removed.
        return output.last_hidden_state[:, 0, :]

    def forward(self, batch):
        CLS_hidden_state = self.get_bert_features(batch)
        return F.normalize(CLS_hidden_state)

In [25]:
class Mish_func(torch.autograd.Function):

    """Mish activation, x * tanh(softplus(x)), with a hand-written backward.

    Only the input is saved for backward; intermediates are recomputed.

    from: https://github.com/tyunist/memory_efficient_mish_swish/blob/master/mish.py
    """

    @staticmethod
    def forward(ctx, i):
        result = i * torch.tanh(F.softplus(i))
        ctx.save_for_backward(i)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        # `saved_tensors` is the supported accessor; `saved_variables` is deprecated.
        (i,) = ctx.saved_tensors

        # softplus(x) = log(1 + exp(x)), computed without overflowing exp().
        softplus = F.softplus(i)

        # d/dh tanh(h) = 1 / cosh(h)^2, evaluated at h = softplus(x)
        grad_gh = 1. / softplus.cosh().pow_(2)

        # d/dx softplus(x) = sigmoid(x)
        grad_hx = i.sigmoid()

        grad_gx = grad_gh * grad_hx

        # Product rule on x * tanh(softplus(x)).
        grad_f = torch.tanh(softplus) + i * grad_gx

        return grad_output * grad_f


class Mish(nn.Module):
    """Module wrapper around ``Mish_func``; keyword arguments are ignored."""

    def __init__(self, **kwargs):
        super().__init__()

    def forward(self, input_tensor):
        activated = Mish_func.apply(input_tensor)
        return activated


def replace_activations(model, existing_layer, new_layer):
    """Swap every sub-module of exact type ``existing_layer`` for ``new_layer``.

    Walks the module tree recursively and edits it in place; the same
    ``new_layer`` instance is installed at every match. Returns ``model``.
    """
    children = list(model._modules.items())[::-1]
    for child_name, child in children:
        has_children = next(child.children(), None) is not None
        if has_children:
            model._modules[child_name] = replace_activations(child, existing_layer, new_layer)

        # Exact type comparison: subclasses of existing_layer are left alone.
        if type(child) == existing_layer:
            model._modules[child_name] = new_layer
    return model

## Imagenet get_image_predictions

In [26]:
def get_image_predictions(df, embeddings,threshold = 0.0):
    """Match postings by cosine distance between image embeddings.

    Args:
        df: frame with a ``posting_id`` column (and ``matches`` when GET_CV).
        embeddings: one embedding row per row of ``df``.
        threshold: distance cut-off used when ``GET_CV`` is False. When
            ``GET_CV`` is True it is ignored and the cut-off that maximises
            mean F1 on ``df`` is used instead.

    Returns:
        list with, for each row, the array of posting ids closer than the cut-off.

    Side effects:
        With ``GET_CV`` the columns ``pred_matches`` and ``f1`` of ``df`` are
        overwritten during the sweep.
    """
    # Ask for at most 50 neighbours, but never more than there are samples.
    KNN = min(50, len(df), embeddings.shape[0])
    print(f'KNN={KNN}')
    model = NearestNeighbors(n_neighbors = KNN, metric = 'cosine')
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    n_rows = embeddings.shape[0]
    posting_ids = df['posting_id'].values  # hoisted: avoids a df lookup per row

    def row_matches(k, cutoff):
        """Posting ids of row k's neighbours that lie closer than ``cutoff``."""
        idx = np.where(distances[k,] < cutoff)[0]
        return posting_ids[indices[k, idx]]

    if GET_CV:
        # Sweep candidate cut-offs on the train set and keep the best one.
        thresholds = list(np.arange(0.2, 3.0, 0.01))
        scores = []
        for candidate in thresholds:
            df['pred_matches'] = [' '.join(row_matches(k, candidate)) for k in range(n_rows)]
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()
            print(f'Our f1 score for threshold {candidate} is {score}')
            scores.append(score)
        thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')
        threshold = best_threshold

    # Final predictions: best CV cut-off, or the caller-supplied one.
    predictions = [row_matches(k, threshold) for k in tqdm(range(n_rows))]

    del model, distances, indices
    gc.collect()
    return predictions

## Get image embeddings

In [27]:
def get_image_embeddings(image_paths, model_name, model_path):
    """Run one image model over ``image_paths`` and return its embeddings.

    Args:
        image_paths: array of image file paths.
        model_name: timm backbone name selecting the architecture.
        model_path: checkpoint file holding the model's state dict.

    Returns:
        numpy array with one embedding row per image.
    """
    embeds = []

    # The three Mish-trained backbones share one construction path.
    if model_name in ('eca_nfnet_l0', 'eca_nfnet_l1', 'efficientnet_b3'):
        model = ShopeeModel_Two(model_name=model_name)
        model.eval()
        # Swap SiLU for Mish before loading the weights.
        model = replace_activations(model, torch.nn.SiLU, Mish())
        model.load_state_dict(torch.load(model_path, map_location="cuda:0"))
        model = model.to(CFG.device)

    else:
        # NOTE(review): ShopeeModel is not defined in the visible part of this
        # notebook; confirm it exists before relying on this branch.
        model = ShopeeModel(model_name=model_name, LossTypes='ArcFace')
        model.eval()
        model.load_state_dict(torch.load(model_path))
        model = model.to(CFG.device)

    image_dataset = ShopeeDataset(image_paths=image_paths,transforms=get_test_transforms())
    image_loader = torch.utils.data.DataLoader(
        image_dataset,
        batch_size=CFG.batch_size,
        pin_memory=True,
        drop_last=False,
        num_workers=4
    )

    with torch.no_grad():
        for img,label in tqdm(image_loader):
            # Move the batch to the device the model was placed on.
            img = img.to(CFG.device)
            label = label.to(CFG.device)
            feat = model(img,label)
            embeds.append(feat.detach().cpu().numpy())

    del model
    image_embeddings = np.concatenate(embeds)
    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del embeds
    gc.collect()
    return image_embeddings

## TFIDF embeddings

In [28]:
def get_text_predictions(df, max_features=25_000, threshold=0.75):
    """Match postings by TF-IDF cosine similarity of their titles.

    Args:
        df: pandas frame with ``title`` and ``posting_id`` columns (plus
            ``matches`` when ``GET_CV`` is True).
        max_features: vocabulary size passed to the TF-IDF vectorizer.
        threshold: similarity cut-off used when ``GET_CV`` is False. When
            ``GET_CV`` is True it is ignored and the cut-off that maximises
            mean F1 on ``df`` is used instead.

    Returns:
        list with, for each row, the array of posting ids whose title
        similarity exceeds the cut-off.

    Side effects:
        With ``GET_CV`` the columns ``pred_matches`` and ``f1`` of ``df`` are
        overwritten during the sweep.
    """
    model = TfidfVectorizer(stop_words='english', binary=True, max_features=max_features)
    # Vectorise the titles of the frame that was passed in (moved to the GPU
    # here) instead of silently reading a notebook-level global.
    text_embeddings = model.fit_transform(cudf.from_pandas(df['title'])).toarray()

    n_rows = len(df)
    posting_ids = df['posting_id'].values  # hoisted: avoids a df lookup per row
    CHUNK = 1024 * 4

    def matches_above(cutoff, joined):
        """Per-row posting ids with similarity > cutoff, computed chunk by chunk."""
        found = []
        for a in range(0, n_rows, CHUNK):
            b = min(a + CHUNK, n_rows)
            print('chunk', a, 'to', b)

            # COSINE SIMILARITY DISTANCE
            cts = cupy.matmul(text_embeddings, text_embeddings[a:b].T).T
            for k in range(b - a):
                IDX = cupy.where(cts[k,] > cutoff)[0]
                o = posting_ids[cupy.asnumpy(IDX)]
                found.append(' '.join(o) if joined else o)
        return found

    print('Finding similar titles...')
    if GET_CV:
        # Sweep candidate cut-offs on the train set and keep the best one.
        thresholds = list(np.arange(0.55,0.8,0.025))
        scores = []
        for candidate in thresholds:
            df['pred_matches'] = matches_above(candidate, joined=True)
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()
            print(f'Our f1 score for threshold {candidate} is {score}')
            scores.append(score)
        thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')
        print(' ')
        print(f'Final text embeddings prediction using best threshold {best_threshold}')
        threshold = best_threshold

    # Final predictions: best CV cut-off, or the caller-supplied one.
    preds = matches_above(threshold, joined=False)

    del model,text_embeddings
    gc.collect()
    return preds

## Bert One_epoch function

In [29]:
def one_epoch(model,
              loader,
              optimizer=None,
              lr_scheduler=None):
    """Run ``model`` over every batch of ``loader`` and collect its outputs.

    Args:
        model: callable taking a dict of tensors and returning a tensor.
        loader: iterable of dict batches.
        optimizer, lr_scheduler: accepted for signature compatibility; unused
            (this is an inference-only pass).

    Returns:
        list of numpy arrays, one per batch.
    """
    embeds = []

    for batch in tqdm(loader, total=len(loader)):
        # Use the text config's device: that is where
        # get_text_embeddings_bert places the model.
        batch = {k: v.to(CFG2.device) for k, v in batch.items()}
        feat = model(batch)
        embeds.append(feat.detach().cpu().numpy())
    return embeds

## Bert cosine sim

In [30]:
def get_neighbours_cos_sim(df, embeddings, threshold=0.8):
    """Match postings by dot-product similarity between embeddings.

    When using cos_sim use normalized features else use normal features.

    Args:
        df: pandas frame with a ``posting_id`` column (plus ``matches`` when
            ``GET_CV`` is True).
        embeddings: one embedding row per row of ``df``.
        threshold: similarity cut-off used when ``GET_CV`` is False. When
            ``GET_CV`` is True it is ignored and the cut-off that maximises
            mean F1 on ``df`` is used instead.

    Returns:
        ``(df, preds)`` where ``preds`` holds, for each row, the array of
        posting ids whose similarity exceeds the cut-off. In both modes the
        entries are id arrays (previously CV mode returned the space-joined
        strings of the last swept threshold).

    Side effects:
        With ``GET_CV`` the columns ``pred_matches`` and ``f1`` of ``df`` are
        overwritten during the sweep.
    """
    embeddings = cupy.array(embeddings)

    n_rows = len(embeddings)
    posting_ids = df['posting_id'].values  # hoisted: avoids a df lookup per row
    CHUNK = 1024*4

    def matches_above(cutoff, joined=False, verbose=False):
        """Per-row posting ids with similarity > cutoff, computed chunk by chunk."""
        found = []
        for a in range(0, n_rows, CHUNK):
            b = min(a + CHUNK, n_rows)
            if verbose:
                print('chunk',a,'to',b)

            cts = cupy.matmul(embeddings,embeddings[a:b].T).T

            for k in range(b-a):
                IDX = cupy.where(cts[k,]>cutoff)[0]
                o = posting_ids[cupy.asnumpy(IDX)]
                found.append(' '.join(o) if joined else o)
        return found

    if GET_CV:
        # Sweep candidate cut-offs on the train set and keep the best one.
        thresholds = list(np.arange(0.5,0.7,0.05))

        scores = []
        for candidate in thresholds:
            print('Finding similar titles...for threshold :',candidate)
            df['pred_matches'] = matches_above(candidate, joined=True)
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()
            print(f'Our f1 score for threshold {candidate} is {score}')
            scores.append(score)

        thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')
        threshold = best_threshold

    # Final predictions: best CV cut-off, or the caller-supplied one.
    print('Finding similar texts...for threshold :',threshold)
    preds = matches_above(threshold, verbose=True)

    return df, preds

## Bert text embeddings

In [31]:
def get_text_embeddings_bert(df, model):
    """Embed ``df['title']`` with one of the fine-tuned text encoders.

    Args:
        df: frame with a ``title`` column.
        model: one of ``'distil'``, ``'roberta'``, ``'distil_mergeface'``.

    Returns:
        numpy array with one embedding row per row of ``df``.

    Raises:
        ValueError: if ``model`` is not one of the supported keys.
    """
    # key -> (encoder class, tokenizer class, pretrained dir, wrapper class,
    #         checkpoint path, number of trailing checkpoint entries to drop)
    variants = {
        'distil': (DistilBertModel, DistilBertTokenizer,
                   '../input/distilbert-base-indonesian',
                   ShopeeNet1, CFG2.TEXT_MODEL_PATH1, 1),
        'roberta': (RobertaModel, RobertaTokenizer,
                    '../input/roberta-base',
                    ShopeeNet2, CFG2.TEXT_MODEL_PATH2, 1),
        'distil_mergeface': (DistilBertModel, DistilBertTokenizer,
                             '../input/distilbert-base-indonesian',
                             ShopeeNet1, CFG2.TEXT_MODEL_PATH3, 3),
    }
    if model not in variants:
        raise ValueError(
            f'Unknown text model {model!r}; expected one of {sorted(variants)}')

    (encoder_cls, tokenizer_cls, pretrained_dir,
     wrapper_cls, weights_path, n_dropped) = variants[model]

    encoder = encoder_cls.from_pretrained(pretrained_dir)
    net = wrapper_cls(encoder)
    # The checkpoint's last entries are not part of the inference wrapper
    # (presumably training-head weights -- TODO confirm), so they are cut off
    # before loading.
    checkpoint = torch.load(weights_path)
    net.load_state_dict(dict(list(checkpoint.items())[:-n_dropped]))
    net.eval()
    net = net.to(CFG2.device)

    tokenizer = tokenizer_cls.from_pretrained(pretrained_dir)

    text_dataset = ShopeeDataset2(df, tokenizer, max_length=CFG2.max_length)
    text_loader = torch.utils.data.DataLoader(
        text_dataset,
        batch_size=CFG2.batch_size,
        pin_memory=True,
        drop_last=False,
        num_workers=CFG2.num_workers
    )

    with torch.no_grad():
        embeds = one_epoch(net,
                           text_loader,
                           optimizer=None,
                           lr_scheduler=None)

    del net
    text_embeddings = np.concatenate(embeds)
    print(f'Our text embeddings shape is {text_embeddings.shape}')
    del embeds
    gc.collect()
    return text_embeddings

## Bert Inference

In [32]:
# Load the working frame (train in CV mode, test otherwise), its cudf copy
# and the matching image paths.
df,df_cu, image_paths = read_dataset()

# Integer-encode label_group; only done in CV mode, where read_dataset loads
# the train CSV.
if GET_CV :
    lbl_encoder = LabelEncoder()
    df['label_code'] = lbl_encoder.fit_transform(df['label_group'])

In [33]:
# Title embeddings from the fine-tuned DistilBERT ('distil') encoder; the
# RoBERTa and concatenated variants below are currently disabled.
%time finetuned_emb1 = get_text_embeddings_bert(df,'distil')
# %time finetuned_emb2 = get_text_embeddings_bert(df,'roberta')
# %time finetuned_emb3 = np.concatenate([finetuned_emb1,finetuned_emb2],axis=1)

100%|██████████| 1/1 [00:01<00:00,  1.06s/it]

Our text embeddings shape is (3, 768)
CPU times: user 3.77 s, sys: 971 ms, total: 4.75 s
Wall time: 11.7 s





In [34]:
# Neighbour search on the DistilBERT embeddings. The banner says "CV score",
# but the same call also runs in test mode, where no score is computed.
print("***** DistillBert-Indonesian CV score *****")
df_t1,t1 = get_neighbours_cos_sim(df,finetuned_emb1)
print("-"*80)
# print("***** RoBerta CV score *****")
# df_t2,t2 = get_neighbours_cos_sim(df,finetuned_emb2)
# print("-"*80)

# print("***** Concat Embedding CV score *****")
# df_t3,t3 = get_neighbours_cos_sim(df,finetuned_emb3)
# print("-"*80)

***** DistillBert-Indonesian CV score *****
Finding similar texts...for threshold : 0.8
chunk 0 to 3
--------------------------------------------------------------------------------


## Image & TF-IDF final inference

In [35]:
image_embeddings1 = get_image_embeddings(image_paths.values, CFG.model_name1, CFG.model_path1) # eca_nfnet_l1 (Mish)
image_embeddings2 = get_image_embeddings(image_paths.values, CFG.model_name2, CFG.model_path2) # efficientnet_b3 (Mish)
image_embeddings3 = get_image_embeddings(image_paths.values, CFG.model_name3, CFG.model_path3) # eca_nfnet_l0 (Mish)

# Ensemble by concatenating the three embeddings along the feature axis.
image_embeddings = np.concatenate([image_embeddings1,image_embeddings2, image_embeddings3],axis=1)

# Check the shape of the final image embeddings.
print(f'final embeddings ensemble shape : {image_embeddings.shape}')

Building Model Backbone for eca_nfnet_l1 model
Using Curricular Face
Using ArcFace
Using Merge Face


100%|██████████| 1/1 [00:00<00:00,  2.12it/s]


Our image embeddings shape is (3, 512)
Building Model Backbone for efficientnet_b3 model
Using Curricular Face
Using ArcFace
Using Merge Face


100%|██████████| 1/1 [00:00<00:00,  2.81it/s]


Our image embeddings shape is (3, 512)
Building Model Backbone for eca_nfnet_l0 model
Using Curricular Face
Using ArcFace
Using Merge Face


100%|██████████| 1/1 [00:00<00:00,  2.77it/s]

Our image embeddings shape is (3, 512)
final embeddings ensemble shape : (3, 1536)





In [36]:
# `threshold` is only used when GET_CV is False; find the best value on the
# train set first (GET_CV = True) and enter it here by hand.
image_predictions = get_image_predictions(df, image_embeddings, threshold = 0.36)#0.36

KNN=3


100%|██████████| 3/3 [00:00<00:00, 2726.52it/s]


In [37]:
# TF-IDF title matches.
text_predictions = get_text_predictions(df, max_features = 21_500) # tfidf_predictions

Finding similar titles...
Finding similar titles...
chunk 0 to 3


## Combine function

In [38]:
# Variant that leaves the image-hash matches out.
def combine_predictions(row):
    """Union of a row's image, TF-IDF and DistilBERT matches as one string.

    Returns the unique posting ids, sorted, joined by single spaces.
    """
    sources = ('image_predictions', 'text_predictions', 'distilbert_predictions')
    merged = np.concatenate([row[source] for source in sources])
    unique_ids = np.unique(merged)
    return ' '.join(unique_ids)

In [39]:
import numpy as np
import pandas as pd
from functools import reduce

def intersect(*args):
    """Values common to every array in ``args`` (pairwise ``np.intersect1d`` fold)."""
    return reduce(lambda common, arr: np.intersect1d(common, arr), args)

def concat(*args):
    """Sorted unique values across all arrays in ``args``."""
    merged = np.concatenate(args)
    return np.unique(merged)

def higher(f,*args):
    """Values that occur at least ``f`` times across all arrays in ``args``.

    Returns a sorted array of the qualifying values. One pass with
    ``np.unique(..., return_counts=True)`` replaces re-concatenating and
    re-counting the inputs once per distinct value.
    """
    keys, counts = np.unique(np.concatenate(args), return_counts=True)
    return keys[counts >= f]

def count(*args):
    """Map each distinct value across ``args`` to its number of occurrences.

    Counts are plain Python ints. One pass with
    ``np.unique(..., return_counts=True)`` replaces re-concatenating and
    re-counting the inputs once per distinct value.
    """
    keys, counts = np.unique(np.concatenate(args), return_counts=True)
    return {key: int(n) for key, n in zip(keys, counts)}

## Submission

In [40]:
# Concatenate image predictions with text predictions.
# (Matching on image_phash is intentionally left out of this variant.)

df['image_predictions'] = image_predictions
df['text_predictions'] = text_predictions
# combine_predictions concatenates sequences of ids; get_neighbours_cos_sim
# may hand back space-joined strings, so split those into id lists first.
df['distilbert_predictions'] = [p.split() if isinstance(p, str) else p for p in t1]

combined_matches = df.apply(combine_predictions, axis = 1)

if GET_CV:
    # Score the combined prediction against the ground truth before the
    # 'matches' column is overwritten for the submission file.
    df['pred_matches'] = combined_matches
    df['f1'] = f1_score(df['matches'], df['pred_matches'])
    score = df['f1'].mean()
    print(f'Our final f1 cv score is {score}')

df['matches'] = combined_matches
df[['posting_id', 'matches']].to_csv('submission.csv', index = False)
