# Foursquare XLM-RoBERTa training

In [1]:
import gc
import math
import time
import torch
import itertools
import numpy as np
import tqdm as tqdm
import pandas as pd
import torch.nn as nn
from joblib import dump, load
from sklearn.metrics import f1_score
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from transformers import XLMRobertaTokenizer, XLMRobertaModel
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel, AutoConfig, XLMRobertaConfig, XLMRobertaModel

# CFG

In [2]:
class CFG:
    """Central place for the settings read by the cells below."""

    # --- backbone / tokenisation ---
    model = "xlm-roberta-base"   # checkpoint name passed to from_pretrained
    max_len = 200                # max_length handed to the tokenizer

    # --- optimisation ---
    batch_size = 18
    epochs = 5
    encoder_lr = 2e-5
    decoder_lr = 2e-5

    # --- learning-rate schedule ---
    use_scheduler = True
    scheduler = 'cosine'         # the training loop recognises 'linear' and 'cosine'
    num_warmup_steps = 0

    # --- logging / debugging ---
    print_freq = 5000            # log every `print_freq` steps
    debug = False                # when True the data is cut down to a small slice


# Directory prefix for every artefact written by the notebook.
OUTPUT_DIR = './'

# Helper functions 

In [3]:
def get_results(predictions, th = 0.5):
    """Turn predicted probabilities into hard 0/1 labels.

    Args:
        predictions: sequence of arrays (or a 2-D array, which is treated as a
            sequence of rows) that is joined with ``np.concatenate``.
        th: decision threshold; values ``>= th`` map to 1, everything else to 0.

    Returns:
        Integer ndarray of 0/1 labels.
    """
    preds = np.concatenate(predictions)
    # Honour the caller's threshold (it used to be ignored in favour of a literal 0.5).
    return np.where(preds >= th, 1, 0)

def euclidianDistance(lat1,long1,lat2,long2):
    """Straight-line distance between (lat1, long1) and (lat2, long2), rounded to 6 decimals."""
    d_lat = lat2 - lat1
    d_long = long2 - long1
    squared = d_lat ** 2 + d_long ** 2
    return round(squared ** 0.5, 6)

# Tokenizer & device

In [4]:
# ====================================================
# tokenizer
# ====================================================
# Load the tokenizer for CFG.model, publish it on CFG so the dataset code can
# reach it, and write a copy under OUTPUT_DIR.
CFG.tokenizer = XLMRobertaTokenizer.from_pretrained(CFG.model)
tokenizer = CFG.tokenizer
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/sentencepiece.bpe.model',
 './tokenizer/added_tokens.json')

In [5]:
# Use CUDA when torch reports it as available, otherwise run on the CPU.
CFG.device = device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


# Import dataset

In [6]:
# Read the fold-0 train/test pair files and drop the 'poi1'/'poi2' columns right away.
train = pd.read_csv('../folds/Fold_0_train.csv').drop(columns=['poi1','poi2'])
test = pd.read_csv('../folds/Fold_0_test.csv').drop(columns=['poi1','poi2'])

In [7]:
# Debug mode: keep only a slice of each split so a full pass finishes quickly.
if CFG.debug:
    train, test = train.iloc[:100000], test.iloc[100001:130000]

In [8]:
# Replace every missing value in both splits with the placeholder string 'unknown'.
train = train.fillna('unknown')
test = test.fillna('unknown')

In [9]:
# Rows per value of `match`, first for train and then for test.
for split in (train, test):
    display(split.groupby('match')['id_1'].count())

match
False    774993
True     640370
Name: id_1, dtype: int64

match
False    292315
True     310133
Name: id_1, dtype: int64

In [10]:
# equalize match/nonmatch
def equalizeMatches(df):
    """Balance the number of matching and non-matching rows.

    The surplus rows of the larger class are removed starting from the top of
    the frame, and the result gets a fresh 0..n-1 index. When both classes are
    already the same size the input frame is returned unchanged.

    Rows are selected by position rather than by index label, so the result is
    still balanced when the index contains duplicate labels (dropping by label
    would remove every row sharing a label).

    Args:
        df: DataFrame with a boolean ``match`` column.

    Returns:
        DataFrame with equally many ``match == True`` and ``match == False`` rows.
    """
    match_pos = np.flatnonzero((df['match'] == True).to_numpy())
    non_match_pos = np.flatnonzero((df['match'] == False).to_numpy())

    surplus = len(match_pos) - len(non_match_pos)
    if surplus == 0:
        return df

    # Positive surplus -> too many matches; negative -> too many non-matches.
    to_drop = match_pos[:surplus] if surplus > 0 else non_match_pos[:-surplus]

    keep = np.ones(len(df), dtype=bool)
    keep[to_drop] = False
    return df[keep].reset_index(drop = True)

train, test = equalizeMatches(train), equalizeMatches(test)

# Show the per-class row counts again, now for the balanced splits.
for split in (train, test):
    display(split.groupby('match')['id_1'].count())

match
False    640370
True     640370
Name: id_1, dtype: int64

match
False    292315
True     292315
Name: id_1, dtype: int64

In [11]:
#scaling data
# One scaler, fitted on the first point's training coordinates, is reused for
# the second point and for the test split so all coordinates share one scale.
scaler = StandardScaler()
num_features = [['latitude_1','longitude_1'],['latitude_2','longitude_2']]
# Fit and transform on plain arrays: a scaler fitted on a DataFrame remembers the
# column names ('latitude_1', 'longitude_1'), and scikit-learn then checks them on
# transform -- recent releases reject the '*_2' columns with a ValueError.
scaler.fit(train[num_features[0]].to_numpy())
# save scaler for new data
# (it carries no feature names, so pass arrays -- or expect a warning -- when reusing it)
dump(scaler, 'std_scaler_main.bin', compress=True)
for frame in (train, test):
    for cols in num_features:
        frame[cols] = scaler.transform(frame[cols].to_numpy())

In [12]:
# calculate euclidian distance and add new feature
def _with_distance(df):
    """Add a 'distance' column computed from the two coordinate pairs, then drop the coordinates.

    The distance is computed column-wise in one vectorised call instead of a
    Python-level row-by-row ``apply``, which is far faster on frames of this size.
    NOTE: pandas rounds via NumPy, which in rare half-way cases may differ from
    Python's ``round`` in the 6th decimal.
    """
    coord_cols = ['latitude_1','longitude_1','latitude_2','longitude_2']
    df['distance'] = euclidianDistance(df['latitude_1'], df['longitude_1'],
                                       df['latitude_2'], df['longitude_2'])
    return df.drop(columns = coord_cols)


train = _with_distance(train)
test = _with_distance(test)


In [13]:
# Preview the first rows of the prepared training frame.
train.head()

Unnamed: 0,id_1,name_1,categories_1,id_2,name_2,categories_2,match,distance
0,E_594e4698d8d09b,Çırağan Sarayı,unknown,E_944324f1dcfe52,Çırağan Palace Kempinski Istanbul,Hotels,True,2.5e-05
1,E_b1fe6e102949c7,MK Live,"Hotpot Restaurants, Chinese Restaurants",E_b7f2f3e748ee2a,MK Restaurant @ Central Festival Phuket,Diners,True,1e-05
2,E_6ac370df3121eb,Ikea Alam Sutera Tangerang,unknown,E_8d3c176a5bf81a,IKEA,unknown,True,0.002272
3,E_50c2f4fbe37487,Morrison Dental,Dentist's Offices,E_aa31f5f5be7994,Morrison Dental Care,Dentist's Offices,True,0.033926
4,E_0c12d1ffe7f5eb,Baskin-Robbins,Ice Cream Shops,E_a18a2a08db2c6c,Baskin Robbins,Miscellaneous Shops,True,1.1e-05


# Dataset

In [14]:
# ====================================================
# Dataset
# ====================================================
def prepareInputs(line):
    """Build the tokenized model input for one pair of places.

    The text is ``name_1, categories_1, distance, name_2, categories_2`` joined
    with the literal string '[SEP]', tokenized with ``CFG.tokenizer`` and padded
    or truncated to exactly ``CFG.max_len`` tokens.

    Args:
        line: mapping/row providing the five fields listed above.

    Returns:
        The tokenizer output with every value converted to a ``torch.long`` tensor.
    """
    # NOTE(review): '[SEP]' is a BERT-style marker; XLM-RoBERTa's own separator
    # token is '</s>', so this literal is presumably tokenized as ordinary text.
    # Left unchanged to stay compatible with anything already trained on this format.
    fields = (line['name_1'], line['categories_1'], line['distance'],
              line['name_2'], line['categories_2'])
    text = '[SEP]'.join(str(field) for field in fields)

    inputs = CFG.tokenizer(text,
                           add_special_tokens=True,
                           max_length=CFG.max_len,
                           padding="max_length",
                           # Without truncation an over-long text yields more than
                           # max_len tokens and batches can no longer be stacked.
                           truncation=True,
                           return_offsets_mapping=False)

    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs

def prepareLabels(value):
    """Map a match flag to a float target: 1.0 when it equals True, otherwise 0.0."""
    return 1. if value == True else 0.

class PairsDataset(Dataset):
    """Dataset over a frame of place pairs; each item is (tokenized inputs, float label)."""

    def __init__(self, pairs):
        # Cache the label column once so item access does not go through the frame.
        self.labels = pairs['match'].values
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        row = self.pairs.iloc[idx]
        return prepareInputs(row), prepareLabels(self.labels[idx])

# Model

In [15]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    """XLM-RoBERTa encoder followed by dropout and a single-logit linear head.

    Args:
        config_path: optional path to a config object saved with ``torch.save``;
            when ``None`` the config of ``CFG.model`` is loaded instead.
        pretrained: load the pretrained ``CFG.model`` weights when True, otherwise
            build the encoder from the config with freshly initialised weights
            (e.g. before loading a fine-tuned state dict).
    """
    def __init__(self, config_path=None, pretrained=False):
        super().__init__()

        if config_path is None:
            self.config = XLMRobertaConfig.from_pretrained(CFG.model, output_hidden_states=True)
        else:
            # NOTE: torch.load unpickles the file -- only pass config files you trust.
            self.config = torch.load(config_path)

        # Build the encoder exactly once. The previous code called AutoModel(config),
        # which is not a valid constructor call, and then unconditionally reloaded the
        # pretrained weights, so `pretrained` had no effect.
        if pretrained:
            self.model = XLMRobertaModel.from_pretrained(CFG.model, config=self.config)
        else:
            self.model = XLMRobertaModel(self.config)

        self.fc_dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Initialise `module` in place using the config's ``initializer_range``."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the encoder and return the last-layer hidden state at sequence position 0."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs.last_hidden_state[:,0,:]
        return last_hidden_states

    def forward(self, inputs):
        """Return one raw logit per example (no sigmoid applied)."""
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

# Training

In [16]:
class AverageMeter(object):
    """Keeps the most recent value and a running weighted average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as the latest value, weighted by `n` in the running average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Return 'elapsed (remain remaining)' given a start time and the fraction completed.

    `percent` is the completed fraction (must be non-zero); the total duration is
    extrapolated linearly from the time elapsed so far.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / percent
    remaining = estimated_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))
        
        
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """Train `model` for one pass over `train_loader`.

    Args:
        train_loader: iterable yielding ``(inputs, labels)`` where ``inputs`` is a
            mapping of tensors and ``labels`` a tensor.
        model: module called as ``model(inputs)``.
        criterion: loss called on ``(N, 1)``-shaped predictions and labels.
        optimizer: optimizer stepped once per batch.
        epoch: zero-based epoch index (only used for logging).
        scheduler: LR scheduler stepped once per batch when ``CFG.use_scheduler``
            is set; may be ``None``.
        device: device the batch is moved to.

    Returns:
        Sample-weighted average loss over the pass.
    """
    model.train()
    start = time.time()
    losses = AverageMeter()
    num_steps = len(train_loader)
    for step, (inputs, labels) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        y_preds = model(inputs)
        loss = criterion(y_preds.view(-1,1),labels.view(-1,1))
        losses.update(loss.item(), batch_size)
        loss.backward()
        optimizer.step()
        if CFG.use_scheduler and scheduler is not None:
            scheduler.step()
        optimizer.zero_grad()

        if step % CFG.print_freq == 0 or step == (num_steps-1):
            # Read the LR from the optimizer: it is what the next update will use,
            # it works with or without a scheduler, and it avoids the deprecated
            # scheduler.get_lr() call.
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, num_steps,
                          remain=timeSince(start, float(step+1)/num_steps),
                          loss=losses,
                          lr=optimizer.param_groups[0]['lr']))
    return losses.avg

def valid_fn(valid_loader, model, criterion, device):
    """Evaluate `model` on `valid_loader` without gradient tracking.

    Returns:
        ``(average_loss, probabilities)`` where the probabilities are the sigmoid
        of the model outputs for all batches, concatenated into one ndarray.
    """
    model.eval()
    loss_meter = AverageMeter()
    batch_probs = []
    total_steps = len(valid_loader)
    started = time.time()

    for step, (inputs, labels) in enumerate(valid_loader):
        # Move the whole batch onto the target device.
        for key in list(inputs.keys()):
            inputs[key] = inputs[key].to(device)
        targets = labels.to(device)
        n_samples = targets.size(0)

        with torch.no_grad():
            logits = model(inputs)
        batch_loss = criterion(logits.view(-1, 1), targets.view(-1, 1))
        loss_meter.update(batch_loss.item(), n_samples)
        batch_probs.append(logits.sigmoid().to('cpu').numpy())

        is_last_step = step == (total_steps-1)
        if step % CFG.print_freq == 0 or is_last_step:
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, total_steps,
                          loss=loss_meter,
                          remain=timeSince(started, float(step+1)/total_steps)))

    return loss_meter.avg, np.concatenate(batch_probs)

In [17]:
def trainigLoop(train,test, epochs):
    """Fine-tune CustomModel on `train`, evaluate on `test` after every epoch, keep the best checkpoint.

    Args:
        train: DataFrame of training pairs (needs the columns read by PairsDataset).
        test: DataFrame of evaluation pairs with a boolean ``match`` column.
        epochs: number of passes over `train`.

    Returns:
        The best F1 score reached on `test` (0.0 if no epoch scored above zero).

    Side effects:
        Writes ``config.pth`` and ``<model>_best.pth`` under ``OUTPUT_DIR``.
    """
    # ====================================================
    # loader
    # ====================================================
    train_dataset = PairsDataset(train)
    test_dataset = PairsDataset(test)
    test_labels = np.where(test['match'].to_numpy() == True, 1,0)

    train_dataloader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True, pin_memory = True)
    test_dataloader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False, pin_memory = True)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(CFG.device)

    # Separate parameter groups so CFG.decoder_lr is actually applied to the head
    # (it used to be ignored; with equal rates the updates are unchanged).
    optimizer = AdamW([
        {'params': model.model.parameters(), 'lr': CFG.encoder_lr},
        {'params': model.fc.parameters(), 'lr': CFG.decoder_lr},
    ])

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        """Build the schedule named by ``cfg.scheduler`` ('linear' or 'cosine')."""
        if cfg.scheduler=='linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler=='cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        else:
            # Fail with a clear message instead of an UnboundLocalError below.
            raise ValueError(f"Unsupported CFG.scheduler: {cfg.scheduler!r}")
        return scheduler

    # The scheduler is stepped once per batch, so count batches (the final partial
    # batch included) rather than dividing the dataset size by the batch size.
    num_train_steps = len(train_dataloader) * epochs
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================

    criterion = nn.BCEWithLogitsLoss()
    best_score = 0.
    best_path = OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_best.pth"

    # Use the `epochs` argument (it used to be ignored in favour of CFG.epochs).
    for epoch in range(epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(train_dataloader, model, criterion, optimizer, epoch, scheduler, CFG.device)

        # eval
        avg_val_loss, predictions = valid_fn(test_dataloader, model, criterion, CFG.device)
        results = get_results(predictions, th = 0.5)
        score = f1_score(test_labels,results)

        elapsed = time.time() - start_time
        print(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        print(f'Epoch {epoch+1} - Score: {score:.4f}')

        if best_score < score:
            best_score = score
            print(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            print(best_path)
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        best_path)

    # Drop the references first so the collector and the CUDA cache can release the memory.
    del model, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()
    return best_score


In [18]:
# Start the full training run with the epoch count from CFG.
if __name__ == "__main__":
    trainigLoop(train, test, epochs=CFG.epochs)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/71153] Elapsed 0m 11s (remain 13115m 1s) Loss: 0.6715(0.6715) LR: 0.00002000  
Epoch: [1][5000/71153] Elapsed 32m 15s (remain 426m 37s) Loss: 0.0603(0.2370) LR: 0.00001999  
Epoch: [1][10000/71153] Elapsed 64m 19s (remain 393m 19s) Loss: 0.2160(0.2131) LR: 0.00001996  
Epoch: [1][15000/71153] Elapsed 96m 23s (remain 360m 50s) Loss: 0.1270(0.2000) LR: 0.00001991  
Epoch: [1][20000/71153] Elapsed 128m 27s (remain 328m 32s) Loss: 0.0243(0.1916) LR: 0.00001984  
Epoch: [1][25000/71153] Elapsed 160m 31s (remain 296m 20s) Loss: 0.0376(0.1855) LR: 0.00001976  
Epoch: [1][30000/71153] Elapsed 192m 57s (remain 264m 40s) Loss: 0.0461(0.1800) LR: 0.00001965  
Epoch: [1][35000/71153] Elapsed 225m 1s (remain 232m 24s) Loss: 0.0529(0.1759) LR: 0.00001953  
Epoch: [1][40000/71153] Elapsed 257m 4s (remain 200m 12s) Loss: 0.0141(0.1725) LR: 0.00001938  
Epoch: [1][45000/71153] Elapsed 289m 8s (remain 168m 1s) Loss: 0.1082(0.1696) LR: 0.00001922  
Epoch: [1][50000/71153] Elapsed 321m 12s (r

KeyboardInterrupt: 