In [None]:
# Experiment configuration: every tunable value of the run lives on this class.
class Config:
    """Static hyper-parameters and data settings.

    Every attribute whose name does not start with an underscore is sent to
    Weights & Biases as the run configuration.
    """

    BATCH_SIZE = 8            # batch size of every DataLoader
    MAX_LEN = 128             # tokenizer max_length (pad / truncate target)
    TARGET = "label"          # name of the label column
    TEXT = "text"             # name of the text column
    MODEL = "distilbert-base-multilingual-cased"  # checkpoint for tokenizer and model
    # Alternative checkpoints:
    #   MODEL = 'bert-base-multilingual-cased'
    #   MODEL = 'xlm-roberta-base'
    LEARNING_RATE = 1e-5      # AdamW learning rate
    EPOCHS = 5                # number of passes over the training loader
    EPS = 1e-8                # AdamW epsilon
    random_seed = 0xFEEDBEEF  # seed for random / numpy / torch
    dataset = "EENLP.SentimentTweets"
    train_on = "Russian"      # rows with this `language` value form the training set


In [None]:
%%capture
# prepare env

!pip install transformers
!pip install wget
!pip install urllib2
!pip install wandb -qqq
!pip install jsonlines

In [None]:
import json
import random

import torch
import wandb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification 
from transformers import get_linear_schedule_with_warmup, AdamW

In [None]:
# TWEETS/COMMENTS DATASET
# Rows whose language equals Config.train_on form the training pool; every
# other row goes to the test pool. 15% of the (capped) training pool is then
# moved to the test pool so the training language is evaluated as well.

df = pd.read_csv('SENTIMENT_TWITTER.csv')

test_df = df[df['language'] != Config.train_on]
train_df = df[df['language'] == Config.train_on]

# Cap the training pool at 10k rows. The sampling is seeded explicitly because
# this cell runs before the global PRNG seeding cell.
if len(train_df) > 10_000:
    train_df = train_df.sample(10_000, random_state=Config.random_seed)

# Held-out part of the training language (15% of the training pool).
add_to_test = train_df.sample(round(len(train_df) * .15),
                              random_state=Config.random_seed)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
test_df = pd.concat([test_df, add_to_test])
train_df = train_df.drop(add_to_test.index)


# in case there are only two sentiments in training dataset
if len(train_df.label.drop_duplicates()) == 2:
    test_df = test_df[test_df.label != 'neutral']

In [None]:
# Authenticate against Weights & Biases, then open a fresh run.
wandb.login()
wandb.init(
    project="twit_sentiment",
    # Hyperparameters / run metadata: every Config attribute whose name does
    # not begin with an underscore.
    config={key: value for key, value in Config.__dict__.items() if key[0] != '_'},
    reinit=True,
)
# Extend the generated run name with the training language and the model name.
wandb.run.name += f"{Config.train_on.upper()}-TRAIN_{Config.MODEL}"
wandb.run.save()

# Use CUDA when it is available, the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnorest[0m (use `wandb login --relogin` to force relogin)




device(type='cuda')

In [None]:
# Shared state filled in by the first process_dataset() call: the fitted label
# encoder and the {label: code} mapping. Both start out unset.
label_encoder = labels_codes = None

def load_dataset(fn):
    """Read a JSON-lines source (one record per line) into a DataFrame."""
    frame = pd.read_json(fn, lines=True)
    return frame

def load_dataset_and_split(fn, fraction=.8, random_state=200):
    """Read a JSON-lines source and split it randomly into train and test frames.

    Args:
        fn: path or file-like object handed to ``pd.read_json(..., lines=True)``.
        fraction: share of the rows sampled into the training frame.
        random_state: seed for the sampling; the default (200) reproduces the
            split this function has always produced.

    Returns:
        ``(df_train, df_test)``: the sampled rows and the remaining rows, each
        with a fresh 0..n-1 index.
    """
    data = pd.read_json(fn, lines=True)
    df_train = data.sample(frac=fraction, random_state=random_state)
    # Everything that was not sampled becomes the test frame.
    df_test = data.drop(df_train.index).reset_index(drop=True)
    df_train = df_train.reset_index(drop=True)
    return df_train, df_test

def process_dataset(data, tokenizer, seq=False):
    """Tokenize a DataFrame and wrap the result in a ``DataLoader``.

    The first call fits the module-level ``label_encoder`` on
    ``data[Config.TARGET]`` and fills ``labels_codes`` with the
    ``{label: code}`` mapping; later calls reuse that encoder.

    ``data`` itself is not modified: the encoded labels are kept in a local
    array, so the same frame can be processed more than once.

    Args:
        data: DataFrame with the ``Config.TEXT`` and ``Config.TARGET`` columns.
        tokenizer: Hugging Face tokenizer; called once on the whole list of texts.
        seq: if true the loader iterates in order (``SequentialSampler``),
            otherwise in random order (``RandomSampler``).

    Returns:
        A ``DataLoader`` over ``(input_ids, attention_mask, label)`` batches of
        size ``Config.BATCH_SIZE``. ``input_ids`` / ``attention_mask`` are
        integer tensors padded or truncated to ``Config.MAX_LEN``.
    """
    global label_encoder, labels_codes
    if label_encoder is None:
        print('init of label encoder')
        label_encoder = LabelEncoder().fit(data[Config.TARGET])
        keys = list(sorted(set(data[Config.TARGET])))
        labels_codes = dict(zip(keys, label_encoder.transform(keys)))

    # Encode into a local array instead of writing back into `data`; writing
    # into a frame that is a slice of another frame triggers pandas'
    # SettingWithCopyWarning and makes a second call on the same frame fail.
    encoded_labels = label_encoder.transform(data[Config.TARGET])

    texts = data[Config.TEXT].tolist()
    if texts:
        # One batched call replaces per-sentence encode_plus + torch.cat.
        # padding='max_length' already pads every row to MAX_LEN.
        encoded = tokenizer(texts,
                            add_special_tokens=True,
                            max_length=Config.MAX_LEN,
                            padding='max_length',
                            truncation=True,
                            return_tensors='pt')
        input_ids = encoded['input_ids']
        attention_masks = encoded['attention_mask']
    else:
        # Empty frame: keep the (rows, MAX_LEN) layout with zero rows.
        input_ids = torch.empty((0, Config.MAX_LEN), dtype=torch.long)
        attention_masks = torch.empty((0, Config.MAX_LEN), dtype=torch.long)

    labels = torch.tensor(encoded_labels)
    dataset = TensorDataset(input_ids, attention_masks, labels)
    sampler = SequentialSampler(dataset) if seq else RandomSampler(dataset)
    return DataLoader(dataset, sampler=sampler, batch_size=Config.BATCH_SIZE)


In [None]:
# fix PRNG
random.seed(Config.random_seed)
np.random.seed(Config.random_seed)
torch.manual_seed(Config.random_seed)
torch.cuda.manual_seed_all(Config.random_seed)

# init tokenizer
tokenizer = AutoTokenizer.from_pretrained(Config.MODEL, truncation=True, do_lower_case=False)

# training loader (random order) for the Config.train_on language
print('parsing and preparing data, it will take a while.')
print(train_df.language.iloc[0], end='... \t')
# A copy is passed so that train_df keeps its original labels and this cell
# can be re-run.
train_loader = process_dataset(train_df.copy(), tokenizer, seq=False)

# one sequential eval loader per language present in the data
eval_loaders = dict()
for lang in df.language.drop_duplicates().tolist():
    print(lang, end='... \t')
    eval_lang_df = test_df[test_df.language == lang]
    if len(eval_lang_df) > 10_000:
        eval_lang_df = eval_lang_df.sample(10_000)
    # eval_lang_df is a slice of test_df; hand over an independent copy so no
    # write can land on a slice (pandas SettingWithCopyWarning).
    eval_loaders[lang] = process_dataset(eval_lang_df.copy(), tokenizer, seq=True)
    print('done.')

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

parsing and preparing data, it will take a while.
Chezh... 	init of label encoder
Slovak... 	

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


done.
Chezh... 	done.
Latvian... 	done.
Russian... 	done.


In [None]:
# Init model
# The classification head gets one output per entry of labels_codes; attention
# maps and hidden states are not returned.
model = AutoModelForSequenceClassification.from_pretrained(
    Config.MODEL,
    num_labels=len(labels_codes),
    output_attentions=False,
    output_hidden_states=False,
)
# Move the weights to the selected device and display the module.
model = model.to(device)
model

Downloading:   0%|          | 0.00/517M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.we

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
      

In [None]:
def train(model, epoch, loader, scheduler=None):
    """Run one training epoch and log its metrics to Weights & Biases.

    Relies on the module-level ``optimizer`` and ``device``.

    Args:
        model: sequence-classification model; called with ``labels`` so that
            it returns the loss first and the logits second.
        epoch: epoch number, only used as the logged ``epoch`` value.
        loader: iterable of ``(input_ids, attention_mask, label)`` batches.
        scheduler: optional LR scheduler, stepped once per batch.

    Raises:
        ValueError: if ``loader`` yields no batches.
    """
    model.train()

    train_loss_accum = 0.0
    num_batches = 0
    fin_targets = []
    fin_outputs = []

    # Wrapping the loader itself lets tqdm show the total number of batches.
    for sentence, attention_mask, label in tqdm(loader):
        model.zero_grad()

        sentence = sentence.to(device).long()
        attention_mask = attention_mask.to(device).long()
        label = label.to(device).long()

        output = model(sentence, attention_mask=attention_mask, labels=label)
        loss_value, logits = output[0], output[1]
        train_loss_accum += loss_value.item()
        num_batches += 1

        # Keep gold labels and arg-max predictions for the epoch-level metrics.
        fin_targets.extend(label.cpu().detach().numpy().tolist())
        logits = logits.cpu().detach().numpy()
        fin_outputs.extend(np.argmax(logits, axis=1))

        loss_value.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

    if num_batches == 0:
        raise ValueError('train() received an empty loader')

    # Mean loss over the batches actually processed (count, not last index).
    avg_loss = train_loss_accum / num_batches
    train_accuracy = metrics.accuracy_score(fin_targets, fin_outputs)
    train_f1_micro = metrics.f1_score(fin_targets, fin_outputs, average='micro')
    train_f1_macro = metrics.f1_score(fin_targets, fin_outputs, average='macro')

    wandb.log({"train/loss": avg_loss,
               "train/acc": train_accuracy,
               "train/f1_micro": train_f1_micro,
               "train/f1_macro": train_f1_macro,
               "epoch": epoch,
               })



In [None]:
def validation(model, testing_loader):
    """Predict every batch of ``testing_loader`` with gradients disabled.

    Returns:
        ``(targets, predictions)``: the gold labels as a flat list and the
        arg-max class index of the logits for each example, in loader order.
    """
    model.eval()
    gold, predicted = [], []
    with torch.no_grad():
        for batch in testing_loader:
            token_ids, mask, targets = batch
            token_ids = token_ids.to(device).long()
            mask = mask.to(device).long()
            result = model(token_ids, attention_mask=mask)
            gold.extend(targets.cpu().detach().numpy().tolist())
            batch_logits = result.logits.cpu().detach().numpy()
            predicted.extend(np.argmax(batch_logits, axis=1))
    return gold, predicted

def eval_model(model, epoch=-1):
    """Score ``model`` on every loader in ``eval_loaders``.

    For each language the accuracy, micro-F1 and macro-F1 are printed and
    logged to Weights & Biases together with ``epoch``.
    """
    for lang in eval_loaders:
        targets, preds = validation(model, eval_loaders[lang])
        scores = {
            f'valid/acc/{lang}': metrics.accuracy_score(targets, preds),
            f'valid/f1_micro/{lang}': metrics.f1_score(targets, preds, average='micro'),
            f'valid/f1_macro/{lang}': metrics.f1_score(targets, preds, average='macro'),
            'epoch': epoch,
        }
        print(scores)
        wandb.log(scores)


In [None]:
optimizer = AdamW(model.parameters(), lr=Config.LEARNING_RATE, eps=Config.EPS)
# train() steps the scheduler once per batch, so the schedule must span
# (batches per epoch) x (epochs). len(train_loader) counts the final partial
# batch and is an integer, unlike len(train_df) / BATCH_SIZE.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                            num_training_steps=Config.EPOCHS * len(train_loader))

# Baseline scores of the untrained classification head, logged as epoch -1.
eval_model(model, epoch=-1)
for epoch in range(Config.EPOCHS):
    train(model, epoch, train_loader, scheduler)
    eval_model(model, epoch)


{'valid/acc/Slovak': 0.44017632241813603, 'valid/f1_micro/Slovak': 0.44017632241813603, 'valid/f1_macro/Slovak': 0.2108554336049743, 'epoch': -1}
{'valid/acc/Chezh': 0.198, 'valid/f1_micro/Chezh': 0.198, 'valid/f1_macro/Chezh': 0.11159337943522868, 'epoch': -1}
{'valid/acc/Latvian': 0.14310344827586208, 'valid/f1_micro/Latvian': 0.14310344827586208, 'valid/f1_macro/Latvian': 0.0849662766762867, 'epoch': -1}
{'valid/acc/Russian': 0.4915, 'valid/f1_micro/Russian': 0.4915, 'valid/f1_macro/Russian': 0.329534026148173, 'epoch': -1}


1063it [04:04,  4.34it/s]


{'valid/acc/Slovak': 0.371536523929471, 'valid/f1_micro/Slovak': 0.371536523929471, 'valid/f1_macro/Slovak': 0.3814202685487879, 'epoch': 0}
{'valid/acc/Chezh': 0.7233333333333334, 'valid/f1_micro/Chezh': 0.7233333333333334, 'valid/f1_macro/Chezh': 0.6620300321777063, 'epoch': 0}
{'valid/acc/Latvian': 0.6008620689655172, 'valid/f1_micro/Latvian': 0.6008620689655172, 'valid/f1_macro/Latvian': 0.4604770448836688, 'epoch': 0}
{'valid/acc/Russian': 0.4066, 'valid/f1_micro/Russian': 0.4066, 'valid/f1_macro/Russian': 0.3567040024622969, 'epoch': 0}


1063it [04:05,  4.33it/s]


{'valid/acc/Slovak': 0.47795969773299746, 'valid/f1_micro/Slovak': 0.47795969773299746, 'valid/f1_macro/Slovak': 0.47754266664266165, 'epoch': 1}
{'valid/acc/Chezh': 0.7313333333333333, 'valid/f1_micro/Chezh': 0.7313333333333333, 'valid/f1_macro/Chezh': 0.7004385121196868, 'epoch': 1}
{'valid/acc/Latvian': 0.6, 'valid/f1_micro/Latvian': 0.6, 'valid/f1_macro/Latvian': 0.4891504927327644, 'epoch': 1}
{'valid/acc/Russian': 0.4546, 'valid/f1_micro/Russian': 0.4546, 'valid/f1_macro/Russian': 0.3994486911211356, 'epoch': 1}


1063it [04:05,  4.33it/s]


{'valid/acc/Slovak': 0.43198992443324935, 'valid/f1_micro/Slovak': 0.43198992443324935, 'valid/f1_macro/Slovak': 0.4439268431623336, 'epoch': 2}
{'valid/acc/Chezh': 0.7366666666666667, 'valid/f1_micro/Chezh': 0.7366666666666667, 'valid/f1_macro/Chezh': 0.6889250012082794, 'epoch': 2}
{'valid/acc/Latvian': 0.5844827586206897, 'valid/f1_micro/Latvian': 0.5844827586206897, 'valid/f1_macro/Latvian': 0.4449422907316231, 'epoch': 2}
{'valid/acc/Russian': 0.3296, 'valid/f1_micro/Russian': 0.3296, 'valid/f1_macro/Russian': 0.3159172260482535, 'epoch': 2}


1063it [04:04,  4.34it/s]


{'valid/acc/Slovak': 0.44710327455919396, 'valid/f1_micro/Slovak': 0.44710327455919396, 'valid/f1_macro/Slovak': 0.4570480090164333, 'epoch': 3}
{'valid/acc/Chezh': 0.7373333333333333, 'valid/f1_micro/Chezh': 0.7373333333333333, 'valid/f1_macro/Chezh': 0.6942542886007196, 'epoch': 3}
{'valid/acc/Latvian': 0.5775862068965517, 'valid/f1_micro/Latvian': 0.5775862068965517, 'valid/f1_macro/Latvian': 0.4377956743785704, 'epoch': 3}
{'valid/acc/Russian': 0.316, 'valid/f1_micro/Russian': 0.316, 'valid/f1_macro/Russian': 0.30553529822920106, 'epoch': 3}


1063it [04:05,  4.34it/s]


{'valid/acc/Slovak': 0.4716624685138539, 'valid/f1_micro/Slovak': 0.4716624685138539, 'valid/f1_macro/Slovak': 0.4745673370287155, 'epoch': 4}
{'valid/acc/Chezh': 0.7433333333333333, 'valid/f1_micro/Chezh': 0.7433333333333333, 'valid/f1_macro/Chezh': 0.7126493639928366, 'epoch': 4}
{'valid/acc/Latvian': 0.5827586206896552, 'valid/f1_micro/Latvian': 0.5827586206896552, 'valid/f1_macro/Latvian': 0.45392225412751946, 'epoch': 4}
{'valid/acc/Russian': 0.358, 'valid/f1_micro/Russian': 0.35800000000000004, 'valid/f1_macro/Russian': 0.33337144813052183, 'epoch': 4}
