In [None]:
# set up the config
class Config:
    """Central place for the notebook's hyperparameters, column names and data paths.

    Every public attribute (name not starting with ``_``) is also sent to
    W&B as the run configuration by the cell that calls ``wandb.init``.
    """

    # --- batching / tokenization -------------------------------------------
    #     BATCH_SIZE = 16
    BATCH_SIZE = 8   # batch size of every DataLoader built by process_dataset
    MAX_LEN = 128    # max_length handed to the tokenizer (pad/truncate target)

    # --- column names in the *.jsonl files ---------------------------------
    TARGET = 'label'
    TEXT1 = 'sentence1'
    TEXT2 = 'sentence2'

    # --- model / optimisation ----------------------------------------------
    MODEL = 'distilbert-base-multilingual-cased'
    LEARNING_RATE = 1e-5
    EPOCHS = 5
    EPS = 1e-8
    random_seed = 0xFEEDBEEF

    # --- data ---------------------------------------------------------------
    dataset = "EENLP.ParaphraseDetection"
    dataset_version = "v4"
    # file that is split into the train set and the english eval set
    full_data = "paraphrase_detection/english.jsonl"
    # language name -> evaluation file; uncomment an entry to evaluate on it
    eval_data = dict(
        armenian="paraphrase_detection/armenian.jsonl",
        # belarusian="paraphrase_detection/belarusian.jsonl",
        # bulgarian="paraphrase_detection/bulgarian.jsonl",
        # croatian="paraphrase_detection/croatian.jsonl",
        # czech="paraphrase_detection/czech.jsonl",
        # english="paraphrase_detection/english.jsonl",
        # estonian="paraphrase_detection/estonian.jsonl",
        # hungarian="paraphrase_detection/hungarian.jsonl",
        # lithuanian="paraphrase_detection/lithuanian.jsonl",
        # macedonian="paraphrase_detection/macedonian.jsonl",
        polish="paraphrase_detection/polish.jsonl",
        romanian="paraphrase_detection/romanian.jsonl",
        # russian="paraphrase_detection/russian.jsonl",
        serbian="paraphrase_detection/serbian.jsonl",
        # slovenian="paraphrase_detection/slovenian.jsonl",
        # ukrainian="paraphrase_detection/ukrainian.jsonl",
    )


In [None]:
# Check if we have GPU: run the nvidia-smi shell command and show its output.
# The training device itself is chosen later via torch.cuda.is_available().
!nvidia-smi

In [None]:
# prepare env

!pip install transformers
!pip install wget
!pip install urllib2
!pip install wandb -qqq
!pip install jsonlines

In [None]:
import json
import random

import torch
import wandb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification 
from transformers import get_linear_schedule_with_warmup, AdamW


In [None]:
# Log in to your W&B account
wandb.login()

# Track hyperparameters and run metadata: every public attribute of Config
# (name not starting with an underscore) becomes part of the run config.
run_config = {
    key: value
    for key, value in vars(Config).items()
    if not key.startswith('_')
}
wandb.init(
    entity="eenlp",
    project="paraphrase_detection",
    config=run_config,
    reinit=True,
)

# Make the model name visible in the auto-generated run name.
wandb.run.name += f'_{Config.MODEL}'
wandb.run.save()

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
# Declare the dataset artifact (version taken from Config.dataset_version) as an
# input of the current run and download its files into ./paraphrase_detection.
artifact_ref = f"eenlp/paraphrase_detection/paraphrase_detection-dataset:{Config.dataset_version}"
wandb.run.use_artifact(artifact_ref).download("paraphrase_detection")

In [None]:
# limit number of examples per dataset, because otherwise training runs for a day
#
# Keep at most the first 10,000 rows of every `source` group and write the
# reduced frame back over the downloaded english file.

df = pd.read_json("paraphrase_detection/english.jsonl", lines=True)
df = pd.concat(
    [source_rows.head(10_000) for _, source_rows in df.groupby("source")]
)
df.to_json("paraphrase_detection/english.jsonl", orient="records", lines=True)

In [None]:
# Shared label-encoding state, filled in lazily by the first process_dataset() call:
#   label_encoder - the fitted sklearn LabelEncoder (None until then)
#   labels_codes  - dict mapping each original label value to its encoded id;
#                   its length is later used as the model's num_labels
label_encoder = None
labels_codes = None

def load_dataset(fn):
    """Read a JSON-lines file (one JSON record per line) into a DataFrame.

    Args:
        fn: path of the ``.jsonl`` file.

    Returns:
        A ``pandas.DataFrame`` with one row per line of the file.
    """
    frame = pd.read_json(fn, lines=True)
    return frame

def load_dataset_and_split(fn, fraction=.8):
    """Read a JSON-lines file and split its rows into a train and a test frame.

    The train part is a random sample of ``fraction`` of the rows drawn with a
    fixed ``random_state`` of 200, so the split is the same on every call; the
    test part is every row that was not sampled.

    Args:
        fn: path of the ``.jsonl`` file.
        fraction: share of rows that goes into the train frame.

    Returns:
        ``(df_train, df_test)``, both with a fresh 0..n-1 index.
    """
    everything = pd.read_json(fn, lines=True)
    sampled = everything.sample(frac=fraction, random_state=200)
    # Rows are dropped by their original index labels, so this must happen
    # before either part gets its index reset.
    remainder = everything.drop(index=sampled.index)
    return sampled.reset_index(drop=True), remainder.reset_index(drop=True)

def process_dataset(data, tokenizer, seq=False):
    """Tokenize the sentence pairs of ``data`` and wrap them in a ``DataLoader``.

    On the first call the module-level ``label_encoder`` is fitted on this
    frame's ``Config.TARGET`` column and ``labels_codes`` (original label ->
    encoded id) is filled in; later calls reuse that encoder, so the first
    frame passed in defines the label set.

    Unlike the earlier version, ``data`` is not modified: the encoded labels
    are kept in a local array instead of overwriting the caller's column, so
    calling this twice on the same frame no longer re-encodes encoded labels.

    Args:
        data: DataFrame with the columns named by ``Config.TEXT1``,
            ``Config.TEXT2`` and ``Config.TARGET``.
        tokenizer: Hugging Face tokenizer; called once on the whole batch of
            sentence pairs.
        seq: if true, batches are produced in row order
            (``SequentialSampler``); otherwise rows are shuffled
            (``RandomSampler``).

    Returns:
        A ``DataLoader`` yielding ``(input_ids, attention_mask, labels)``
        batches of size ``Config.BATCH_SIZE``.
    """
    global label_encoder, labels_codes
    if label_encoder is None:
        print('init of label encoder')
        label_encoder = LabelEncoder().fit(data[Config.TARGET])
        keys = list(sorted(set(data[Config.TARGET])))
        labels_codes = dict(zip(keys, label_encoder.transform(keys)))

    # Encode into a local array rather than writing back into `data`.
    encoded_labels = label_encoder.transform(data[Config.TARGET])

    # Tokenize all pairs in one call. This replaces a per-row torch.cat, which
    # re-copied the whole accumulated tensor for every row and, because the
    # accumulator started as a float tensor, stored the ids as floats.
    # https://github.com/huggingface/transformers/blob/364a5ae1f0dc0f9098ff1ad4f5ede4a424813095/docs/source/task_summary.rst#sequence-classification
    encoded = tokenizer(
        data[Config.TEXT1].tolist(),
        data[Config.TEXT2].tolist(),
        add_special_tokens=True,
        max_length=Config.MAX_LEN,
        padding='max_length',  # the deprecated pad_to_max_length flag was redundant with this
        truncation=True,
        return_tensors='pt',
    )
    input_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']
    labels = torch.tensor(encoded_labels)

    dataset = TensorDataset(input_ids, attention_masks, labels)
    sampler = SequentialSampler(dataset) if seq else RandomSampler(dataset)
    return DataLoader(dataset, sampler=sampler, batch_size=Config.BATCH_SIZE)


In [None]:
# fix PRNG: seed Python, NumPy and torch (CPU and all CUDA devices) alike
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(Config.random_seed)

# init tokenizer
tokenizer = AutoTokenizer.from_pretrained(Config.MODEL, truncation=True, do_lower_case=False)

# split english dataset
# The train frame is processed first on purpose: the first process_dataset()
# call is the one that fits the shared label encoder.
print('parsing and preparing data, it will take a while.')
print('english', end='... \t')
train_df, test_df = load_dataset_and_split(Config.full_data, .8)
train_loader = process_dataset(train_df, tokenizer, seq=False)
eval_loaders = {}
eval_loaders['english'] = process_dataset(test_df, tokenizer, seq=True)
print('done.')

# prepare eval for other languages
for lang, filename in Config.eval_data.items():
    print(lang, end='... \t')
    lang_frame = load_dataset(filename)
    eval_loaders[lang] = process_dataset(lang_frame, tokenizer, seq=True)
    print('done.')



In [None]:
# Init model: pretrained encoder plus a classification head with one output
# per label seen by the label encoder.
classifier_options = dict(
    num_labels=len(labels_codes),
    output_attentions=False,
    output_hidden_states=False,
)
model = AutoModelForSequenceClassification.from_pretrained(Config.MODEL, **classifier_options)
model.to(device)

In [None]:
def train(model, epoch, loader, scheduler=None):
    """Run one training epoch and log the epoch's metrics to W&B.

    Uses the module-level ``optimizer`` and ``device``.

    Args:
        model: sequence-classification model; called with ``labels=`` it must
            return the loss at index 0 and the logits at index 1.
        epoch: epoch number, logged alongside the metrics.
        loader: iterable of ``(input_ids, attention_mask, labels)`` batches.
        scheduler: optional learning-rate scheduler, stepped once per batch.

    Logs ``train/loss`` (mean loss per batch), ``train/acc``,
    ``train/f1_micro``, ``train/f1_macro`` and ``epoch``.
    """
    model.train()

    train_loss_accum = 0.0
    n_batches = 0
    fin_targets = []
    fin_outputs = []

    # tqdm wraps the loader itself (not enumerate(...)) so it can read the
    # loader's length and show a proper progress bar.
    for sentence, attention_mask, label in tqdm(loader):
        model.zero_grad()

        sentence = sentence.to(device).long()
        attention_mask = attention_mask.to(device).long()
        label = label.to(device).long()

        output = model(sentence, attention_mask=attention_mask, labels=label)
        loss_value, logits = output[0], output[1]

        train_loss_accum += loss_value.item()
        n_batches += 1
        fin_targets.extend(label.detach().cpu().numpy().tolist())
        fin_outputs.extend(np.argmax(logits.detach().cpu().numpy(), axis=1).tolist())

        loss_value.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

    # Divide by the number of batches. The previous code divided by the last
    # zero-based index (one less than the batch count), which inflated the
    # average and raised ZeroDivisionError for a single-batch loader.
    avg_loss = train_loss_accum / n_batches if n_batches else float("nan")
    train_accuracy = metrics.accuracy_score(fin_targets, fin_outputs)
    train_f1_micro = metrics.f1_score(fin_targets, fin_outputs, average='micro')
    train_f1_macro = metrics.f1_score(fin_targets, fin_outputs, average='macro')

    wandb.log({"train/loss": avg_loss,
               "train/acc": train_accuracy,
               "train/f1_micro": train_f1_micro,
               "train/f1_macro": train_f1_macro,
               "epoch": epoch,
               })



In [None]:
def validation(model, testing_loader):
    """Run ``model`` over ``testing_loader`` without gradients.

    Args:
        model: model whose output exposes a ``logits`` attribute.
        testing_loader: iterable of ``(input_ids, attention_mask, targets)``
            batches.

    Returns:
        ``(targets, predictions)``: two flat lists with one entry per example,
        where each prediction is the argmax over that example's logits.
    """
    model.eval()
    gold, predicted = [], []
    with torch.no_grad():
        for token_ids, mask, batch_targets in testing_loader:
            token_ids = token_ids.to(device).long()
            mask = mask.to(device).long()
            result = model(token_ids, attention_mask=mask)
            gold += batch_targets.cpu().detach().numpy().tolist()
            batch_logits = result.logits.cpu().detach().numpy()
            predicted += list(np.argmax(batch_logits, axis=1))
    return gold, predicted

def eval_model(model, epoch=-1):
    """Evaluate ``model`` on every loader in ``eval_loaders`` and log the scores.

    For each language the accuracy, micro-F1 and macro-F1 are computed from
    the output of ``validation``; the scores are printed and sent to W&B as
    one log entry per language.

    Args:
        model: the model to evaluate.
        epoch: epoch number stored with the scores (defaults to -1).
    """
    for lang in eval_loaders:
        gold, predicted = validation(model, eval_loaders[lang])
        scores = {
            f'valid/acc/{lang}': metrics.accuracy_score(gold, predicted),
            f'valid/f1_micro/{lang}': metrics.f1_score(gold, predicted, average='micro'),
            f'valid/f1_macro/{lang}': metrics.f1_score(gold, predicted, average='macro'),
            'epoch': epoch,
        }
        print(scores)
        wandb.log(scores)


In [None]:
optimizer = AdamW(model.parameters(), lr = Config.LEARNING_RATE, eps = Config.EPS)

# train() steps the scheduler once per batch, so the schedule length is
# (epochs * batches per epoch). len(train_loader) is an integer and includes the
# final partial batch; the previous len(train_df)/BATCH_SIZE was a float that
# dropped it, so the linear decay reached zero before training finished.
total_training_steps = Config.EPOCHS * len(train_loader)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0,
                                            num_training_steps = total_training_steps)

# Baseline scores of the not-yet-fine-tuned model are logged as epoch -1.
eval_model(model, epoch=-1)
for epoch in range(Config.EPOCHS):
    train(model, epoch, train_loader, scheduler)
    eval_model(model, epoch)


In [None]:
# Close the W&B run that was opened by wandb.init() at the top of the notebook.
wandb.run.finish()