* https://www.kaggle.com/tanulsingh077/reaching-0-612-with-text-only-shopee?scriptVersionId=59862390

위 SBERT의 Inference코드를 참조하여 짜보려고한다

In [1]:
# Preliminaries
from tqdm import tqdm
import math
import random
import os
import pandas as pd
import numpy as np

#torch
import torch
import torch.nn as nn
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader

import transformers
from transformers import (BertTokenizer, BertModel,
                          DistilBertTokenizer, DistilBertModel)

import gc
import matplotlib.pyplot as plt
import cudf
import cuml
import cupy
from cuml import PCA
from cuml.neighbors import NearestNeighbors
from sklearn.preprocessing import Normalizer


In [2]:
data_path = '/mnt/hdd1/wearly/kaggle/shopee/'
train= pd.read_csv(data_path+'train.csv')

In [3]:
NUM_WORKERS = 4
BATCH_SIZE = 32
SEED = 42
DistilBERT = True

device = torch.device('cuda')

################################################  ADJUSTING FOR CV OR SUBMIT ##############################################

CHECK_SUB = False
GET_CV = True

test = pd.read_csv(data_path+'test.csv')
if len(test)>3: GET_CV = False
else: print('this submission notebook will compute CV score, but commit notebook will not')


################################################# MODEL ####################################################################

model_name='cahya/distilbert-base-indonesian'
TOKENIZER = DistilBertTokenizer.from_pretrained(model_name)

################################################ MODEL PATH ###############################################################

TEXT_MODEL_PATH = '/mnt/hdd1/wearly/ethan/shopee/Train/weights/model_100epochs.pt'

model_params = {
    'n_classes':11014,
    'model_name':model_name,
    'use_fc':False,
    'dropout':0.3,
}

this submission notebook will compute CV score, but commit notebook will not


## Reading Data

In [4]:
def read_dataset():
    if GET_CV:
        df = pd.read_csv(data_path+'train.csv')
        tmp = df.groupby(['label_group'])['posting_id'].unique().to_dict()
        df['matches'] = df['label_group'].map(tmp)
        df['matches'] = df['matches'].apply(lambda x: ' '.join(x))
        if CHECK_SUB:
            df = pd.concat([df, df], axis = 0)
            df.reset_index(drop = True, inplace = True)
        df_cu = cudf.DataFrame(df)
    else:
        df = pd.read_csv(data_path+'test.csv')
        df_cu = cudf.DataFrame(df)
        
    return df, df_cu

## Utils

In [5]:
def seed_torch(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_torch(SEED)

In [6]:
def f1_score(y_true, y_pred):
    y_true = y_true.apply(lambda x: set(x.split()))
    y_pred = y_pred.apply(lambda x: set(x.split()))
    intersection = np.array([len(x[0] & x[1]) for x in zip(y_true, y_pred)])
    len_y_pred = y_pred.apply(lambda x: len(x)).values
    len_y_true = y_true.apply(lambda x: len(x)).values
    f1 = 2 * intersection / (len_y_pred + len_y_true)
    return f1

In [7]:
def get_neighbors_knn(df, embeddings, KNN = 50):
    '''
    https://www.kaggle.com/ragnar123/unsupervised-baseline-arcface?scriptVersionId=57121538
    '''

    model = NearestNeighbors(n_neighbors = KNN)
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)
    # Iterate through different thresholds to maximize cv, run this in interactive mode, then replace else clause with a solid threshold
    if GET_CV:
        thresholds = list(np.arange(0.6,0.8,0.05))
        
        scores = []
        for threshold in thresholds:
            predictions = []
            for k in range(embeddings.shape[0]):
                idx = np.where(distances[k,] < threshold)[0]
                ids = indices[k,idx]
                posting_ids = ' '.join(df['posting_id'].iloc[ids].values)
                predictions.append(posting_ids)
            df['pred_matches'] = predictions
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()
            print(f'Our f1 score for threshold {threshold} is {score}')
            scores.append(score)
        thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')
        
        # Use threshold
        predictions = []
        for k in range(embeddings.shape[0]):
            # Because we are predicting the test set that have 70K images and different label groups, confidence should be smaller
            idx = np.where(distances[k,] < 0.60)[0]
            ids = indices[k,idx]
            posting_ids = ' '.join(df['posting_id'].iloc[ids].values)
            predictions.append(posting_ids)
    
    # Because we are predicting the test set that have 70K images and different label groups, confidence should be smaller
    else:
        predictions = []
        for k in tqdm(range(embeddings.shape[0])):
            idx = np.where(distances[k,] < 0.60)[0]
            ids = indices[k,idx]
            posting_ids = df['posting_id'].iloc[ids].values
            predictions.append(posting_ids)
        
    del model, distances, indices
    gc.collect()
    return df, predictions

In [8]:
def get_neighbours_cos_sim(df,embeddings):
    '''
    When using cos_sim use normalized features else use normal features
    '''
    embeddings = cupy.array(embeddings)
    
    if GET_CV:
        thresholds = list(np.arange(0.5,0.7,0.05))

        scores = []
        for threshold in thresholds:
            
################################################# Code for Getting Preds #########################################
            preds = []
            CHUNK = 1024*4

            print('Finding similar titles...for threshold :',threshold)
            CTS = len(embeddings)//CHUNK
            if len(embeddings)%CHUNK!=0: CTS += 1

            for j in range( CTS ):
                a = j*CHUNK
                b = (j+1)*CHUNK
                b = min(b,len(embeddings))

                cts = cupy.matmul(embeddings,embeddings[a:b].T).T

                for k in range(b-a):
                    IDX = cupy.where(cts[k,]>threshold)[0]
                    o = df.iloc[cupy.asnumpy(IDX)].posting_id.values
                    o = ' '.join(o)
                    preds.append(o)
######################################################################################################################
            df['pred_matches'] = preds
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()
            print(f'Our f1 score for threshold {threshold} is {score}')
            scores.append(score)
            
        thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')
            
    else:
        preds = []
        CHUNK = 1024*4
        threshold = 0.7

        print('Finding similar texts...for threshold :',threshold)
        CTS = len(embeddings)//CHUNK
        if len(embeddings)%CHUNK!=0: CTS += 1

        for j in range( CTS ):
            a = j*CHUNK
            b = (j+1)*CHUNK
            b = min(b,len(embeddings))
            print('chunk',a,'to',b)

            cts = cupy.matmul(embeddings,embeddings[a:b].T).T

            for k in range(b-a):
                IDX = cupy.where(cts[k,]>threshold)[0]
                o = df.iloc[cupy.asnumpy(IDX)].posting_id.values
                preds.append(o)
                    
    return df, preds

## Generating Embeddings

In [9]:
class ShopeeDataset(Dataset):
    def __init__(self, csv):
        self.csv = csv.reset_index()

    def __len__(self):
        return self.csv.shape[0]

    def __getitem__(self, index):
        row = self.csv.iloc[index]
        
        text = row.title
        
        text = TOKENIZER(text, padding='max_length', truncation=True, max_length=30, return_tensors="pt")
        input_ids = text['input_ids'][0]
        attention_mask = text['attention_mask'][0]  
        
        return input_ids, attention_mask

In [10]:
a = DistilBertModel.from_pretrained(model_name)

In [11]:
a.config.hidden_size

768

In [15]:
class ShopeeNet(nn.Module):

    def __init__(self,
                 n_classes,
                 model_name=model_name,
                 use_fc=False,
                 fc_dim=512,
                 dropout=0.0):
        """
        :param n_classes:
        :param model_name: name of model from pretrainedmodels
            e.g. resnet50, resnext101_32x4d, pnasnet5large
        :param pooling: One of ('SPoC', 'MAC', 'RMAC', 'GeM', 'Rpool', 'Flatten', 'CompactBilinearPooling')
        :param loss_module: One of ('arcface', 'cosface', 'softmax')
        """
        super(ShopeeNet, self).__init__()

        self.bert_model= DistilBertTokenizer.from_pretrained(model_name) #TOKENIZER
        final_in_features = 768 #self.bert_model.config.hidden_size
        
#         self.transformer = transformers.AutoModel.from_pretrained(model_name)
#         final_in_features = self.transformer.config.hidden_size
        
        self.use_fc = use_fc
    
        if use_fc: #사용안함
            self.dropout = nn.Dropout(p=dropout)
            self.fc = nn.Linear(final_in_features, fc_dim)
            self.bn = nn.BatchNorm1d(fc_dim)
            self._init_params()
            final_in_features = fc_dim


    def _init_params(self):
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, input_ids,attention_mask):
        feature = self.extract_feat(input_ids,attention_mask)
        return F.normalize(feature)

    def extract_feat(self, input_ids,attention_mask):
        x = self.bert_model(input_ids=input_ids,attention_mask=attention_mask)
        
        features = x[0]
        features = features[:,0,:]

        if self.use_fc:
            features = self.dropout(features)
            features = self.fc(features)
            features = self.bn(features)

        return features

In [16]:
model_params

{'n_classes': 11014,
 'model_name': 'cahya/distilbert-base-indonesian',
 'use_fc': False,
 'dropout': 0.3}

In [17]:
model = ShopeeNet(**model_params)

In [19]:
def get_text_embeddings(df):
    embeds = []
    
    model = ShopeeNet(**model_params)
    model.eval()
    
    model.load_state_dict(dict(list(torch.load(TEXT_MODEL_PATH).items())[:-1])) #여기서 에러
    model = model.to(device)

    text_dataset = ShopeeDataset(df)
    text_loader = torch.utils.data.DataLoader(
        text_dataset,
        batch_size=BATCH_SIZE,
        pin_memory=True,
        drop_last=False,
        num_workers=NUM_WORKERS
    )
    
    
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(text_loader): 
            input_ids = input_ids.cuda()
            attention_mask = attention_mask.cuda()
            feat = model(input_ids, attention_mask)
            text_embeddings = feat.detach().cpu().numpy()
            embeds.append(text_embeddings)
    
    
    del model
    text_embeddings = np.concatenate(embeds)
    print(f'Our text embeddings shape is {text_embeddings.shape}')
    del embeds
    gc.collect()
    return text_embeddings

In [20]:
df,df_cu = read_dataset()
df.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,matches
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,train_129225211 train_2278313361
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,train_3386243561 train_3423213080
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,train_2288590299 train_3803689425
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,train_2406599165 train_3342059966
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,train_3369186413 train_921438619


____

In [21]:
text_embeddings = get_text_embeddings(df)

RuntimeError: Error(s) in loading state_dict for ShopeeNet:
	Unexpected key(s) in state_dict: "bert_model.embeddings.word_embeddings.weight", "bert_model.embeddings.position_embeddings.weight", "bert_model.embeddings.LayerNorm.weight", "bert_model.embeddings.LayerNorm.bias", "bert_model.transformer.layer.0.attention.q_lin.weight", "bert_model.transformer.layer.0.attention.q_lin.bias", "bert_model.transformer.layer.0.attention.k_lin.weight", "bert_model.transformer.layer.0.attention.k_lin.bias", "bert_model.transformer.layer.0.attention.v_lin.weight", "bert_model.transformer.layer.0.attention.v_lin.bias", "bert_model.transformer.layer.0.attention.out_lin.weight", "bert_model.transformer.layer.0.attention.out_lin.bias", "bert_model.transformer.layer.0.sa_layer_norm.weight", "bert_model.transformer.layer.0.sa_layer_norm.bias", "bert_model.transformer.layer.0.ffn.lin1.weight", "bert_model.transformer.layer.0.ffn.lin1.bias", "bert_model.transformer.layer.0.ffn.lin2.weight", "bert_model.transformer.layer.0.ffn.lin2.bias", "bert_model.transformer.layer.0.output_layer_norm.weight", "bert_model.transformer.layer.0.output_layer_norm.bias", "bert_model.transformer.layer.1.attention.q_lin.weight", "bert_model.transformer.layer.1.attention.q_lin.bias", "bert_model.transformer.layer.1.attention.k_lin.weight", "bert_model.transformer.layer.1.attention.k_lin.bias", "bert_model.transformer.layer.1.attention.v_lin.weight", "bert_model.transformer.layer.1.attention.v_lin.bias", "bert_model.transformer.layer.1.attention.out_lin.weight", "bert_model.transformer.layer.1.attention.out_lin.bias", "bert_model.transformer.layer.1.sa_layer_norm.weight", "bert_model.transformer.layer.1.sa_layer_norm.bias", "bert_model.transformer.layer.1.ffn.lin1.weight", "bert_model.transformer.layer.1.ffn.lin1.bias", "bert_model.transformer.layer.1.ffn.lin2.weight", "bert_model.transformer.layer.1.ffn.lin2.bias", "bert_model.transformer.layer.1.output_layer_norm.weight", "bert_model.transformer.layer.1.output_layer_norm.bias", "bert_model.transformer.layer.2.attention.q_lin.weight", "bert_model.transformer.layer.2.attention.q_lin.bias", "bert_model.transformer.layer.2.attention.k_lin.weight", "bert_model.transformer.layer.2.attention.k_lin.bias", "bert_model.transformer.layer.2.attention.v_lin.weight", "bert_model.transformer.layer.2.attention.v_lin.bias", "bert_model.transformer.layer.2.attention.out_lin.weight", "bert_model.transformer.layer.2.attention.out_lin.bias", "bert_model.transformer.layer.2.sa_layer_norm.weight", "bert_model.transformer.layer.2.sa_layer_norm.bias", "bert_model.transformer.layer.2.ffn.lin1.weight", "bert_model.transformer.layer.2.ffn.lin1.bias", "bert_model.transformer.layer.2.ffn.lin2.weight", "bert_model.transformer.layer.2.ffn.lin2.bias", "bert_model.transformer.layer.2.output_layer_norm.weight", "bert_model.transformer.layer.2.output_layer_norm.bias", "bert_model.transformer.layer.3.attention.q_lin.weight", "bert_model.transformer.layer.3.attention.q_lin.bias", "bert_model.transformer.layer.3.attention.k_lin.weight", "bert_model.transformer.layer.3.attention.k_lin.bias", "bert_model.transformer.layer.3.attention.v_lin.weight", "bert_model.transformer.layer.3.attention.v_lin.bias", "bert_model.transformer.layer.3.attention.out_lin.weight", "bert_model.transformer.layer.3.attention.out_lin.bias", "bert_model.transformer.layer.3.sa_layer_norm.weight", "bert_model.transformer.layer.3.sa_layer_norm.bias", "bert_model.transformer.layer.3.ffn.lin1.weight", "bert_model.transformer.layer.3.ffn.lin1.bias", "bert_model.transformer.layer.3.ffn.lin2.weight", "bert_model.transformer.layer.3.ffn.lin2.bias", "bert_model.transformer.layer.3.output_layer_norm.weight", "bert_model.transformer.layer.3.output_layer_norm.bias", "bert_model.transformer.layer.4.attention.q_lin.weight", "bert_model.transformer.layer.4.attention.q_lin.bias", "bert_model.transformer.layer.4.attention.k_lin.weight", "bert_model.transformer.layer.4.attention.k_lin.bias", "bert_model.transformer.layer.4.attention.v_lin.weight", "bert_model.transformer.layer.4.attention.v_lin.bias", "bert_model.transformer.layer.4.attention.out_lin.weight", "bert_model.transformer.layer.4.attention.out_lin.bias", "bert_model.transformer.layer.4.sa_layer_norm.weight", "bert_model.transformer.layer.4.sa_layer_norm.bias", "bert_model.transformer.layer.4.ffn.lin1.weight", "bert_model.transformer.layer.4.ffn.lin1.bias", "bert_model.transformer.layer.4.ffn.lin2.weight", "bert_model.transformer.layer.4.ffn.lin2.bias", "bert_model.transformer.layer.4.output_layer_norm.weight", "bert_model.transformer.layer.4.output_layer_norm.bias", "bert_model.transformer.layer.5.attention.q_lin.weight", "bert_model.transformer.layer.5.attention.q_lin.bias", "bert_model.transformer.layer.5.attention.k_lin.weight", "bert_model.transformer.layer.5.attention.k_lin.bias", "bert_model.transformer.layer.5.attention.v_lin.weight", "bert_model.transformer.layer.5.attention.v_lin.bias", "bert_model.transformer.layer.5.attention.out_lin.weight", "bert_model.transformer.layer.5.attention.out_lin.bias", "bert_model.transformer.layer.5.sa_layer_norm.weight", "bert_model.transformer.layer.5.sa_layer_norm.bias", "bert_model.transformer.layer.5.ffn.lin1.weight", "bert_model.transformer.layer.5.ffn.lin1.bias", "bert_model.transformer.layer.5.ffn.lin2.weight", "bert_model.transformer.layer.5.ffn.lin2.bias", "bert_model.transformer.layer.5.output_layer_norm.weight", "bert_model.transformer.layer.5.output_layer_norm.bias". 

## 여기서부터 custom

In [None]:
test_df = df.copy()
test_dataset = test_df['title'].tolist()

In [None]:
model_name='cahya/distilbert-base-indonesian'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
bert_model = DistilBertModel.from_pretrained(model_name)

In [None]:
batch = tokenizer(test_dataset, padding=True)

In [None]:
pretrained_model_path = '/mnt/hdd1/wearly/ethan/shopee/Train/weights/model_100epochs.pt'

In [None]:
revised_state_dict = dict()

for k,v in torch.load(pretrained_model_path).items() : 
    key = k.replace("bert_model.",'')
    revised_state_dict[key] = v

In [None]:
bert_model.load_state_dict(revised_state_dict, strict=False)
bert_model.eval()


In [None]:
class ShopeeDataset(Dataset):
    def __init__(self, batch):
        self.batch = batch

    def __len__(self):
        return len(self.batch)

    def __getitem__(self, idx):
        tmp_ids = torch.tensor(self.batch['input_ids'])[idx]
        tmp_mask = torch.tensor(self.batch['attention_mask'])[idx]
        
        return tmp_ids, tmp_mask

In [None]:
id_tensor = torch.tensor(batch['input_ids'])
mask_tensor = torch.tensor(batch['attention_mask'])

In [None]:
from tqdm import tqdm_notebook

In [None]:
device = 'cuda:1'

In [None]:
bert_model.to(device)

In [None]:
mem = []
batch_size = 10

for batch_idx in tqdm_notebook(range(0, df.shape[0], batch_size)) : 
    batch_id = id_tensor[batch_idx:batch_idx+batch_size].to(device)
    batch_mask = mask_tensor[batch_idx:batch_idx+batch_size].to(device)
    
    output = bert_model(input_ids=batch_id, attention_mask=batch_mask)
    last_hidden_state = output.last_hidden_state # shape: (batch_size, seq_length, bert_hidden_dim)
    mem.append(last_hidden_state.cpu().data.numpy())

In [None]:
embeddings = torch.tensor(np.vstack(mem))
embeddings = embeddings[:,0,:]

### pre-trained performance ( did not anything...)

In [None]:
_ = get_neighbors_knn(test_df, embeddings.numpy())

### fine-tuned performance (arcface metric-learning)

In [None]:
_ = get_neighbors_knn(test_df, embeddings.numpy())