In [1]:
# set up the config
class Config:
    BATCH_SIZE = 8
    MAX_LEN = 32
    #MAX_LEN = 256
    TARGET = 'label'
    TEXT = 'text'
    #MODEL = 'distilbert-base-multilingual-cased'
    #MODEL = 'bert-base-multilingual-cased'
    MODEL = 'xlm-roberta-base'
    LEARNING_RATE = 1e-05
    EPOCHS = 10
    EPS = 1e-08
    random_seed = 0xfeedbeef
    dataset = "EENLP.SentimentTweets"
    #dataset = "EENLP.SentimentNews"


In [2]:
%%capture
# prepare env

!pip install transformers
!pip install wget
!pip install urllib2
!pip install wandb -qqq
!pip install jsonlines

In [3]:
%%capture
!wget "https://drive.google.com/uc?export=download&id=19D-NSU78NntBXXd__a06aBl4tbu08-Wy" -O twitter.7z
!wget "https://drive.google.com/uc?export=download&id=10IqFyWwqto6R8bz2F0Mj81GXxU4AA6ux" -O news.7z

!7z x twitter.7z
!7z x news.7z

In [4]:
import json
import random

import torch
import wandb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification 
from transformers import get_linear_schedule_with_warmup, AdamW

In [5]:
# NEWS DATASET
train_df = pd.read_csv('/content/news/Russian_sentiment_news_8200.csv')

from glob import glob
dflist = []
for path in glob('news/*'):
    df = pd.read_csv(path)
    dflist.append(df)
df = pd.concat(dflist)

df = df.reset_index(drop=True)
df = df.drop(df[df.language == 'Russian'].index)
test_df = df.reset_index(drop=True)

In [5]:
# TWEETS/COMMENTS DATASET
train_df = pd.read_csv('/content/twitter/Chezh_sentiment_facebook_10000.csv')

from glob import glob
dflist = []
for path in glob('twitter/*'):
    df = pd.read_csv(path)
    if len(df)> 3000:
        df = df.sample(3000)
    dflist.append(df)
df = pd.concat(dflist)

df = df.reset_index(drop=True)
df = df.drop(df[df.language == 'Chezh'].index)
test_df = df.reset_index(drop=True)

In [6]:
# Log in to your W&B account
wandb.login()
wandb.init(
      entity="eenlp",
      project="twit_sentiment", 
      # Track hyperparameters and run metadata
      config=dict([(k,v) for k,v in Config.__dict__.items() if k[0]!='_']),
      reinit=True
)
wandb.run.name += 'CHEZH-TRAIN'+f'_{Config.MODEL}'
wandb.run.save()    

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnorest[0m (use `wandb login --relogin` to force relogin)




device(type='cuda')

In [7]:
label_encoder = None
labels_codes = None

def load_dataset(fn):
    return pd.read_json(fn, lines=True) 

def load_dataset_and_split(fn, fraction=.8):
    data = pd.read_json(fn, lines=True)
    df_train=data.sample(frac=fraction,random_state=200)
    df_test=data.drop(df_train.index).reset_index(drop=True)
    df_train = df_train.reset_index(drop=True)
    return df_train, df_test

def process_dataset(data, tokenizer, seq=False):
    global label_encoder, labels_codes
    if label_encoder is None:
        print('init of label encoder')
        label_encoder = LabelEncoder().fit(data[Config.TARGET])
        keys = list(sorted(set(data[Config.TARGET])))
        labels = label_encoder.transform(keys)
        labels_codes = dict(zip(keys, labels))
    data[Config.TARGET] = label_encoder.transform(data[Config.TARGET])

    input_ids = torch.tensor([])
    attention_masks = torch.tensor([])

    for sent in data.loc[:, Config.TEXT]:
        encoded_sent = tokenizer.encode_plus(sent, add_special_tokens = True,
                                             max_length = Config.MAX_LEN, 
                                             padding = 'max_length',
                                             pad_to_max_length=True,
                                             truncation = True,
                                             return_tensors = 'pt')
        input_ids = torch.cat([input_ids, encoded_sent['input_ids']])
        attention_masks = torch.cat([attention_masks, encoded_sent['attention_mask']])
    labels = torch.tensor(data[Config.TARGET].values)
    dataset = TensorDataset(input_ids, attention_masks, labels)
    if seq:
        return DataLoader(dataset, sampler = SequentialSampler(dataset), batch_size = Config.BATCH_SIZE)
    else:
        return DataLoader(dataset, sampler = RandomSampler(dataset), batch_size = Config.BATCH_SIZE)


In [8]:
# fix PRNG
random.seed(Config.random_seed)
np.random.seed(Config.random_seed)
torch.manual_seed(Config.random_seed)
torch.cuda.manual_seed_all(Config.random_seed)

# init tokenizer
tokenizer = AutoTokenizer.from_pretrained(Config.MODEL, truncation=True, do_lower_case=False)

# split english dataset
print('parsing and preparing data, it will take a while.')
print(train_df.language.iloc[0], end='... \t')
#train_df, test_df = load_dataset_and_split(Config.full_data, .8)
train_loader = process_dataset(train_df, tokenizer, seq=False)

#eval_loaders['english'] = process_dataset(test_df, tokenizer, seq=True)
#print('done.')
# prepare eval for other language
eval_loaders = dict()
for lang in df.language.drop_duplicates().tolist():
    print(lang, end='... \t')
    eval_lang_df = test_df[test_df.language==lang]
    eval_loaders[lang] = process_dataset(eval_lang_df, tokenizer, seq=True)
    print('done.')



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=512.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=9096718.0, style=ProgressStyle(descript…


parsing and preparing data, it will take a while.
Chezh... 	init of label encoder
Slovak... 	

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


done.
Russian... 	done.
Latvian... 	done.


In [9]:
# Init model
model = AutoModelForSequenceClassification.from_pretrained(
   Config.MODEL,
   num_labels = len(labels_codes),
   output_attentions = False,
   output_hidden_states = False    
)
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1115590446.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

In [10]:
def train(model, epoch, loader, scheduler=None):
    model.train()

    train_loss_accum = 0
    fin_targets=[]
    fin_outputs=[]
    
    for index, (sentence, attention_mask, label) in tqdm(enumerate(loader)):
        model.zero_grad()

        sentence = sentence.to(device).long()
        attention_mask = attention_mask.to(device).long()
        label = label.to(device).long()

        output = model(sentence, attention_mask = attention_mask, labels = label)
        loss_value, logits = output[0], output[1]
        train_loss_accum += loss_value.item()
        fin_targets.extend(label.cpu().detach().numpy().tolist())
        logits = logits.cpu().detach().numpy()
        fin_outputs.extend(np.argmax(logits, axis=1))
        
        loss_value.backward()
        optimizer.step()
        if scheduler:
            scheduler.step()

    avg_loss = train_loss_accum / index
    train_accuracy = metrics.accuracy_score( fin_targets, fin_outputs )
    train_f1_micro = metrics.f1_score(fin_targets, fin_outputs, average='micro')
    train_f1_macro = metrics.f1_score(fin_targets, fin_outputs, average='macro')

    wandb.log({"train/loss": avg_loss, 
               "train/acc":  train_accuracy,
               "train/f1_micro" : train_f1_micro,
               "train/f1_macro" : train_f1_macro,
               "epoch":epoch,
              })



In [11]:
def validation(model, testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for sentence, attention_mask, targets in testing_loader:
            sentence = sentence.to(device).long()
            attention_mask = attention_mask.to(device).long()
            outputs = model(sentence, attention_mask = attention_mask)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            logits = outputs.logits.cpu().detach().numpy()
            fin_outputs.extend(np.argmax(logits, axis=1))
#             break
    return fin_targets, fin_outputs

def eval_model(model, epoch=-1):
    for lang, eval_loader in eval_loaders.items():
        targets, preds = validation(model, eval_loader)
        scores = dict()
        scores[f'valid/acc/{lang}'] = metrics.accuracy_score( targets, preds )
        scores[f'valid/f1_micro/{lang}'] = metrics.f1_score(targets, preds, average='micro')
        scores[f'valid/f1_macro/{lang}'] = metrics.f1_score(targets, preds, average='macro')
        scores['epoch'] = epoch
        print(scores)
        wandb.log(scores)


In [12]:
optimizer = AdamW(model.parameters(), lr = Config.LEARNING_RATE, eps = Config.EPS)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, 
                                            num_training_steps = Config.EPOCHS*len(train_df)/Config.BATCH_SIZE)

eval_model(model, epoch=-1)
for epoch in range(Config.EPOCHS):
    train(model, epoch, train_loader, scheduler)
    eval_model(model, epoch)


{'valid/acc/Slovak': 0.3608312342569269, 'valid/f1_micro/Slovak': 0.3608312342569269, 'valid/f1_macro/Slovak': 0.17677001388246183, 'epoch': -1}
{'valid/acc/Russian': 0.49633333333333335, 'valid/f1_micro/Russian': 0.49633333333333335, 'valid/f1_macro/Russian': 0.33169971040320784, 'epoch': -1}


1it [00:00,  6.43it/s]

{'valid/acc/Latvian': 0.32413793103448274, 'valid/f1_micro/Latvian': 0.32413793103448274, 'valid/f1_macro/Latvian': 0.16319444444444445, 'epoch': -1}


1250it [02:59,  6.98it/s]


{'valid/acc/Slovak': 0.5170025188916877, 'valid/f1_micro/Slovak': 0.5170025188916877, 'valid/f1_macro/Slovak': 0.5266218692903845, 'epoch': 0}
{'valid/acc/Russian': 0.41633333333333333, 'valid/f1_micro/Russian': 0.4163333333333334, 'valid/f1_macro/Russian': 0.3543330375628512, 'epoch': 0}


1it [00:00,  7.70it/s]

{'valid/acc/Latvian': 0.656896551724138, 'valid/f1_micro/Latvian': 0.656896551724138, 'valid/f1_macro/Latvian': 0.5680025542212067, 'epoch': 0}


1250it [02:59,  6.97it/s]


{'valid/acc/Slovak': 0.5982367758186398, 'valid/f1_micro/Slovak': 0.5982367758186398, 'valid/f1_macro/Slovak': 0.5843503154117711, 'epoch': 1}
{'valid/acc/Russian': 0.5053333333333333, 'valid/f1_micro/Russian': 0.5053333333333333, 'valid/f1_macro/Russian': 0.4095829099433228, 'epoch': 1}


1it [00:00,  7.78it/s]

{'valid/acc/Latvian': 0.6922413793103448, 'valid/f1_micro/Latvian': 0.6922413793103448, 'valid/f1_macro/Latvian': 0.6436546980662448, 'epoch': 1}


1250it [02:59,  6.97it/s]


{'valid/acc/Slovak': 0.5667506297229219, 'valid/f1_micro/Slovak': 0.5667506297229219, 'valid/f1_macro/Slovak': 0.5614278734938484, 'epoch': 2}
{'valid/acc/Russian': 0.433, 'valid/f1_micro/Russian': 0.433, 'valid/f1_macro/Russian': 0.34912520136996356, 'epoch': 2}


1it [00:00,  7.75it/s]

{'valid/acc/Latvian': 0.6310344827586207, 'valid/f1_micro/Latvian': 0.6310344827586207, 'valid/f1_macro/Latvian': 0.5376152363276827, 'epoch': 2}


1250it [02:59,  6.97it/s]


{'valid/acc/Slovak': 0.5818639798488665, 'valid/f1_micro/Slovak': 0.5818639798488665, 'valid/f1_macro/Slovak': 0.5777557558944516, 'epoch': 3}
{'valid/acc/Russian': 0.39166666666666666, 'valid/f1_micro/Russian': 0.39166666666666666, 'valid/f1_macro/Russian': 0.34923513923470995, 'epoch': 3}


1it [00:00,  7.70it/s]

{'valid/acc/Latvian': 0.6775862068965517, 'valid/f1_micro/Latvian': 0.6775862068965517, 'valid/f1_macro/Latvian': 0.6108277165407413, 'epoch': 3}


1250it [02:59,  6.97it/s]


{'valid/acc/Slovak': 0.5976070528967254, 'valid/f1_micro/Slovak': 0.5976070528967254, 'valid/f1_macro/Slovak': 0.5839330033347846, 'epoch': 4}
{'valid/acc/Russian': 0.416, 'valid/f1_micro/Russian': 0.416, 'valid/f1_macro/Russian': 0.3516109004073606, 'epoch': 4}


1it [00:00,  7.80it/s]

{'valid/acc/Latvian': 0.6689655172413793, 'valid/f1_micro/Latvian': 0.6689655172413793, 'valid/f1_macro/Latvian': 0.600884296085864, 'epoch': 4}


1250it [02:59,  6.98it/s]


{'valid/acc/Slovak': 0.6032745591939547, 'valid/f1_micro/Slovak': 0.6032745591939547, 'valid/f1_macro/Slovak': 0.5889986140118828, 'epoch': 5}
{'valid/acc/Russian': 0.4136666666666667, 'valid/f1_micro/Russian': 0.4136666666666666, 'valid/f1_macro/Russian': 0.3571840431926268, 'epoch': 5}


1it [00:00,  7.67it/s]

{'valid/acc/Latvian': 0.6862068965517242, 'valid/f1_micro/Latvian': 0.6862068965517242, 'valid/f1_macro/Latvian': 0.6308047413361949, 'epoch': 5}


1250it [02:59,  6.97it/s]


{'valid/acc/Slovak': 0.5642317380352645, 'valid/f1_micro/Slovak': 0.5642317380352645, 'valid/f1_macro/Slovak': 0.5663481846934956, 'epoch': 6}
{'valid/acc/Russian': 0.37066666666666664, 'valid/f1_micro/Russian': 0.37066666666666664, 'valid/f1_macro/Russian': 0.33598002066041194, 'epoch': 6}


1it [00:00,  7.78it/s]

{'valid/acc/Latvian': 0.6827586206896552, 'valid/f1_micro/Latvian': 0.6827586206896552, 'valid/f1_macro/Latvian': 0.6124778630259219, 'epoch': 6}


1250it [02:59,  6.98it/s]


{'valid/acc/Slovak': 0.6076826196473551, 'valid/f1_micro/Slovak': 0.6076826196473551, 'valid/f1_macro/Slovak': 0.5883368514523388, 'epoch': 7}
{'valid/acc/Russian': 0.46366666666666667, 'valid/f1_micro/Russian': 0.46366666666666667, 'valid/f1_macro/Russian': 0.3755305074713217, 'epoch': 7}


1it [00:00,  7.74it/s]

{'valid/acc/Latvian': 0.6672413793103448, 'valid/f1_micro/Latvian': 0.6672413793103448, 'valid/f1_macro/Latvian': 0.6131147030270944, 'epoch': 7}


1250it [02:59,  6.98it/s]


{'valid/acc/Slovak': 0.577455919395466, 'valid/f1_micro/Slovak': 0.577455919395466, 'valid/f1_macro/Slovak': 0.5749012989627139, 'epoch': 8}
{'valid/acc/Russian': 0.38766666666666666, 'valid/f1_micro/Russian': 0.38766666666666666, 'valid/f1_macro/Russian': 0.3465887401788803, 'epoch': 8}


1it [00:00,  7.72it/s]

{'valid/acc/Latvian': 0.6758620689655173, 'valid/f1_micro/Latvian': 0.6758620689655173, 'valid/f1_macro/Latvian': 0.6116252233071008, 'epoch': 8}


1250it [02:59,  6.98it/s]


{'valid/acc/Slovak': 0.5812342569269522, 'valid/f1_micro/Slovak': 0.5812342569269522, 'valid/f1_macro/Slovak': 0.5762964032106064, 'epoch': 9}
{'valid/acc/Russian': 0.403, 'valid/f1_micro/Russian': 0.403, 'valid/f1_macro/Russian': 0.35452148622599683, 'epoch': 9}
{'valid/acc/Latvian': 0.675, 'valid/f1_micro/Latvian': 0.675, 'valid/f1_macro/Latvian': 0.6132714817363962, 'epoch': 9}
