In [1]:
import math
from tqdm import tqdm
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Visuals and CV2
import cv2

# albumentations for augs
import albumentations
from albumentations.pytorch.transforms import ToTensorV2

from sklearn.model_selection import KFold, train_test_split

#torch
import torch
import torch.nn as nn
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim import Adam, lr_scheduler

import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup,get_cosine_schedule_with_warmup
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup

# Day 3- 
# paraphrase-xlm-r-multilingual-v1 (done)
# roberta-base-nli-stsb-mean-tokens
# bert-base-nli-cls-token_25_epochs.pt
# quora-distilbert-multilingual
# ce-distilroberta-base-stsb
# stsb-xlm-r-multilingual

ImportError: cannot import name 'Adam' from 'transformers' (unknown location)

In [2]:
# ------------------------------------------------ Run configuration ------------------------------------------------
NUM_WORKERS = 4          # worker processes used by the training DataLoader
TRAIN_BATCH_SIZE = 32    # samples per optimisation step
EPOCHS = 25              # full passes over the training set
SEED = 2020              # seed applied to the random number generators just below
LR = 5e-5                # learning rate handed to the optimizer

# Apply the seed: without this SEED was only declared, so weight initialisation
# (and any other random draw) differed on every kernel restart.
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device('cuda')

################################################# MODEL ####################################################################

# transformer_model = 'sentence-transformers/paraphrase-xlm-r-multilingual-v1'
# Short name used when building the checkpoint file name.
my_model_name = 'roberta-base-nli-stsb-mean-tokens'
# NOTE(review): absolute, machine-specific path to the pretrained weights; adjust before running elsewhere.
transformer_model = '/home/watts/lal/Kaggle/kshop/roberta-base-nli-stsb-mean-tokens'
# Tokenizer shared by every dataset item (read as a global by ShopeeDataset.__getitem__).
TOKENIZER = transformers.AutoTokenizer.from_pretrained(transformer_model)

################################################ Metric Loss and its params #######################################################
loss_module = 'arcface'#'softmax'
s = 30.0             # ArcFace logit scale
m = 0.5              # ArcFace additive angular margin (radians)
ls_eps = 0.0         # label-smoothing epsilon (0 disables smoothing)
easy_margin = False  # NOTE(review): not read by any cell visible in this notebook

In [3]:
# Keyword arguments unpacked into ShopeeNet(**model_params).
model_params = {
    # Size of the classification head; every encoded label must be < n_classes.
    'n_classes':11014,
    'model_name':transformer_model,
    'pooling':'clf',
    'use_fc':False,
    'fc_dim':512,
    'dropout':0.0,
    'loss_module':loss_module,
    # Take the ArcFace settings from the configuration cell rather than repeating
    # the literals, so that editing them there actually changes the model.
    's':s,
    'margin':m,
    'ls_eps':ls_eps,
    'theta_zero':0.785
}

In [4]:
class AverageMeter(object):
    """Running tally of a scalar metric.

    After each ``update`` the instance exposes:
        val   -- the most recent value supplied,
        sum   -- the weighted sum of all values since the last ``reset``,
        count -- the total weight (sum of ``n``) since the last ``reset``,
        avg   -- ``sum / count``.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as having been observed ``n`` times and refresh the mean."""
        self.count += n
        self.sum += val * n
        self.val = val
        self.avg = self.sum / self.count

In [5]:
def fetch_loss():
    """Build the training criterion: an ``nn.CrossEntropyLoss`` with default settings."""
    return nn.CrossEntropyLoss()


In [6]:
class ShopeeDataset(Dataset):
    """Map-style dataset that turns one table row into model inputs.

    Each item is the tuple ``(input_ids, attention_mask, label)``: the first
    two come from running the module-level ``TOKENIZER`` on the row's
    ``title`` and the last is the row's ``label_group`` wrapped in a tensor.
    """

    def __init__(self, csv):
        # reset_index() keeps the previous index as a column and renumbers rows from 0.
        self.csv = csv.reset_index()

    def __len__(self):
        return len(self.csv)

    def __getitem__(self, index):
        record = self.csv.iloc[index]

        # Pad / truncate every title to exactly 64 tokens.
        encoded = TOKENIZER(
            record.title,
            padding='max_length',
            truncation=True,
            max_length=64,
            return_tensors="pt",
        )

        # [0] takes the first row of the returned tensors (presumably the only
        # one, since a single title is tokenised).
        input_ids = encoded['input_ids'][0]
        attention_mask = encoded['attention_mask'][0]
        label = torch.tensor(record.label_group)

        return input_ids, attention_mask, label

In [7]:
class ArcMarginProduct(nn.Module):
    r"""ArcFace head: scaled cosine logits with an additive angular margin.

    The logit of the target class is ``s * cos(theta + m)`` and the logit of
    every other class is ``s * cos(theta)``, where ``theta`` is the angle
    between the L2-normalised input and the L2-normalised class weight.

        Args:
            in_features: size of each input sample
            out_features: size of each output sample (number of classes)
            s: scale applied to the final logits
            m: angular margin, in radians
            easy_margin: if True, apply the margin only where cos(theta) > 0
            ls_eps: label-smoothing epsilon applied to the one-hot targets
        """
    def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False, ls_eps=0.0):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.ls_eps = ls_eps  # label smoothing
        # torch.empty replaces the legacy torch.FloatTensor constructor; the
        # values are overwritten by the Xavier initialisation right after.
        self.weight = Parameter(torch.empty(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # Beyond theta = pi - m the angle theta + m would exceed pi, so a
        # linear penalty (cosine - mm) is used instead of cos(theta + m).
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        """Return margin-adjusted logits of shape ``(batch, out_features)``.

        Args:
            input: feature batch, one row per sample.
            label: integer class index for each sample.
        """
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        # Clamp before the square root: rounding can push cosine**2 slightly
        # above 1, which would otherwise produce NaN.
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # Allocate on the same device / dtype as the logits instead of a
        # hard-coded 'cuda', so the layer also works on CPU or another GPU.
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1).long().to(cosine.device), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # ------------- target class takes phi, every other class keeps cosine -------------
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s

        return output

In [8]:
class ShopeeNet(nn.Module):
    """Transformer text encoder followed by a classification head.

    A Hugging Face ``AutoModel`` encodes the tokens; the hidden state at
    sequence position 0 is used as the sentence feature. That feature is
    optionally passed through dropout -> linear -> batch-norm -> ReLU and
    then fed to either an ``ArcMarginProduct`` head or a plain linear layer.
    """

    def __init__(self,
                 n_classes,
                 model_name='bert-base-uncased',
                 pooling='mean_pooling',
                 use_fc=False,
                 fc_dim=512,
                 dropout=0.0,
                 loss_module='softmax',
                 s=30.0,
                 margin=0.50,
                 ls_eps=0.0,
                 theta_zero=0.785):
        """
        :param n_classes: number of output classes of the final head
        :param model_name: model name or local path passed to
            ``transformers.AutoModel.from_pretrained``
        :param pooling: stored on the instance but not consulted; features are
            always taken from sequence position 0
        :param use_fc: if True, add dropout -> linear -> batch-norm -> ReLU
            between the encoder and the head
        :param fc_dim: output width of that linear layer
        :param dropout: dropout probability used when ``use_fc`` is True
        :param loss_module: 'arcface' selects ``ArcMarginProduct``; any other
            value selects ``nn.Linear``
        :param s: scale forwarded to ``ArcMarginProduct``
        :param margin: angular margin forwarded to ``ArcMarginProduct``
        :param ls_eps: label-smoothing epsilon forwarded to ``ArcMarginProduct``
        :param theta_zero: accepted for interface compatibility; unused
        """
        super(ShopeeNet, self).__init__()

        # Honour the constructor argument; previously the module-level
        # ``transformer_model`` was loaded no matter what was passed in.
        self.transformer = transformers.AutoModel.from_pretrained(model_name)
        final_in_features = self.transformer.config.hidden_size

        self.pooling = pooling
        self.use_fc = use_fc
        if use_fc:
            self.dropout = nn.Dropout(p=dropout)
            self.fc = nn.Linear(final_in_features, fc_dim)
            self.bn = nn.BatchNorm1d(fc_dim)
            self.relu = nn.ReLU()
            self._init_params()
            final_in_features = fc_dim

        self.loss_module = loss_module
        if loss_module == 'arcface':
            self.final = ArcMarginProduct(final_in_features, n_classes,
                                          s=s, m=margin, easy_margin=False, ls_eps=ls_eps)
        else:
            self.final = nn.Linear(final_in_features, n_classes)

    def _init_params(self):
        """Initialise the optional projection: Xavier weights, zero bias, identity batch-norm."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, input_ids,attention_mask, label):
        """Return class logits; ``label`` is only consumed by the ArcFace head."""
        feature = self.extract_feat(input_ids,attention_mask)
        if self.loss_module == 'arcface':
            logits = self.final(feature, label)
        else:
            logits = self.final(feature)
        return logits

    def extract_feat(self, input_ids,attention_mask):
        """Return one feature vector per input sequence (the embedding fed to the head)."""
        x = self.transformer(input_ids=input_ids,attention_mask=attention_mask)

        # First element of the encoder output, then the vector at sequence position 0.
        features = x[0]
        features = features[:,0,:]

        if self.use_fc:
            features = self.dropout(features)
            features = self.fc(features)
            features = self.bn(features)
            features = self.relu(features)

        return features

In [9]:
def train_fn(dataloader,model,criterion,optimizer,device,scheduler,epoch):
    """Train ``model`` for one pass over ``dataloader``.

    Each batch is moved to ``device``, run through the model (which also
    receives the targets), scored with ``criterion`` and used for one
    optimizer step. When a scheduler is supplied it is stepped once per
    batch, after the progress bar has been refreshed, so the displayed LR is
    the one that was just used.

    Returns the ``AverageMeter`` holding the sample-weighted mean loss.
    """
    model.train()
    running_loss = AverageMeter()

    progress = tqdm(enumerate(dataloader), total=len(dataloader))
    for _step, batch in progress:
        n_samples = batch[0].shape[0]

        # Batch layout: (input_ids, attention_mask, targets).
        input_ids, attention_mask, targets = (batch[i].to(device) for i in range(3))

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask, targets)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()

        running_loss.update(loss.detach().item(), n_samples)
        current_lr = optimizer.param_groups[0]['lr']
        progress.set_postfix(Train_Loss=running_loss.avg, Epoch=epoch, LR=current_lr)

        if scheduler is None:
            continue
        scheduler.step()

    return running_loss

In [10]:
# Load the training table and attach the path of each row's image file.
data = pd.read_csv('../data/train.csv')
data['filepath'] = data['image'].map(
    lambda image_name: os.path.join('../data/', 'train_images', image_name)
)

In [11]:
# Replace the raw label_group values with integer class indices; the fitted
# encoder is kept so the original values can be recovered later.
encoder = LabelEncoder()
encoder.fit(data['label_group'])
data['label_group'] = encoder.transform(data['label_group'])

In [12]:
def run():
    """Train ShopeeNet on the full ``data`` table and keep the best checkpoint.

    Builds the dataset / loader, model, loss, AdamW optimizer (weight decay
    disabled for biases and LayerNorm parameters) and a linear warm-up
    schedule, then trains for ``EPOCHS`` epochs. The model state is saved
    whenever the epoch's mean training loss improves on the best seen so far.
    """
    # Defining DataSet
    train_dataset = ShopeeDataset(
        csv=data
    )

    # shuffle=True: without it every epoch replays identical batches and, with
    # drop_last=True, the rows of the final partial batch are never trained on.
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        shuffle=True,
        pin_memory=True,
        drop_last=True,
        num_workers=NUM_WORKERS
    )

    # Defining Device
    device = torch.device("cuda")

    # Defining Model
    model = ShopeeNet(**model_params)
    model.to(device)

    # Defining criterion
    criterion = fetch_loss()
    criterion.to(device)

    # Defining Optimizer with weight decay to params other than bias and layer norms
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.0001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
            ]

    optimizer = AdamW(optimizer_parameters, lr=LR)

    # Defining LR Scheduler: two epochs of warm-up, then linear decay; stepped per batch.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=len(train_loader)*2,
        num_training_steps=len(train_loader)*EPOCHS
    )

    # Create the checkpoint directory up front so a missing folder cannot
    # throw away a finished epoch when torch.save is first called.
    checkpoint_path = f'../cache/text_model_{my_model_name}_best_loss_num_epochs_{EPOCHS}_{loss_module}.bin'
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    # THE ENGINE LOOP
    best_loss = float('inf')
    for epoch in range(EPOCHS):
        train_loss = train_fn(train_loader, model,criterion, optimizer, device,scheduler=scheduler,epoch=epoch)

        if train_loss.avg < best_loss:
            best_loss = train_loss.avg
            torch.save(model.state_dict(), checkpoint_path)

In [13]:
# Launch the full training loop; run() writes the best checkpoint itself and returns nothing.
run()

100%|██████████| 1070/1070 [03:54<00:00,  4.57it/s, Epoch=0, LR=2.5e-5, Train_Loss=24.1] 
100%|██████████| 1070/1070 [04:15<00:00,  4.19it/s, Epoch=1, LR=5e-5, Train_Loss=22.4]   
100%|██████████| 1070/1070 [04:11<00:00,  4.26it/s, Epoch=2, LR=4.78e-5, Train_Loss=19.8]
100%|██████████| 1070/1070 [04:13<00:00,  4.23it/s, Epoch=3, LR=4.57e-5, Train_Loss=17.1]
100%|██████████| 1070/1070 [04:01<00:00,  4.43it/s, Epoch=4, LR=4.35e-5, Train_Loss=14.9]
100%|██████████| 1070/1070 [03:51<00:00,  4.63it/s, Epoch=5, LR=4.13e-5, Train_Loss=13.1]
100%|██████████| 1070/1070 [03:50<00:00,  4.64it/s, Epoch=6, LR=3.91e-5, Train_Loss=11.5]
100%|██████████| 1070/1070 [04:07<00:00,  4.32it/s, Epoch=7, LR=3.7e-5, Train_Loss=10.1] 
100%|██████████| 1070/1070 [04:11<00:00,  4.26it/s, Epoch=8, LR=3.48e-5, Train_Loss=8.8] 
100%|██████████| 1070/1070 [03:48<00:00,  4.68it/s, Epoch=9, LR=3.26e-5, Train_Loss=7.72]
 76%|███████▋  | 816/1070 [02:55<00:54,  4.66it/s, Epoch=10, LR=3.1e-5, Train_Loss=6.86] 


KeyboardInterrupt: 