In [2]:
!pip install tweet-preprocessor



In [3]:
import pandas as pd
import numpy as np
import re
import time
import random
import json
from pprint import pprint
import preprocessor as p
from functions import *
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

import tensorflow as tf
import torch
from torchmetrics import F1Score
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from keras_preprocessing.sequence import pad_sequences

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig, BertPreTrainedModel, BertModel
from transformers import get_linear_schedule_with_warmup

import nlpaug.augmenter.word as naw
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word.context_word_embs as nawcwe
import nlpaug.augmenter.word.word_embs as nawwe
import nlpaug.augmenter.sentence as nas
from nlpaug.util.file.download import DownloadUtil

import nlpaug.augmenter.word.spelling as naws

import nltk
nltk.download('wordnet')

from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler


# seed_val = 42

# random.seed(seed_val)
# np.random.seed(seed_val)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bharathia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
    np.random.seed(seed)  # Numpy module.
    random.seed(seed)  # Python random module.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.use_deterministic_algorithms(True)
    
seed_everything(42)

In [5]:
import warnings
warnings.filterwarnings('ignore')

### UTILITY FUNCTIONS

In [6]:
def process_tweet(text):
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.SMILEY, p.OPT.NUMBER)
    # Remvoing Retweet symbols
    z = lambda x: re.compile('RT @').sub('@', x, count=1).strip()
    text = z(text)
    text = p.clean(text)
    #Don't remove the hashtag entirely, just remove the # symbol but keep the keyword
    text = text.replace("#", "")
    text = text.replace(':', "")
    return text


MAX_LEN = 128 # max sequences length
batch_size = 32

labels_encoding = {
    "negative": 0,
    "neutral": 1,
    "positive": 2
}

def preprocessing(df):
    sentences = df.processed_tweet.values
    labels = np.array([labels_encoding[l] for l in df.stance.values])

    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case=True)
    
    encoded_sentences = []
    for sent in sentences:
        encoded_sent = tokenizer.encode(
                            sent,
                            add_special_tokens = True,
                            truncation=True,
                            max_length = MAX_LEN
                    )
        
        encoded_sentences.append(encoded_sent)
    encoded_sentences = pad_sequences(encoded_sentences, maxlen=MAX_LEN, dtype="long", 
                            value=0, truncating="post", padding="post")
    return encoded_sentences, labels

def attention_masks(encoded_sentences):
    # attention masks, 0 for padding, 1 for actual token
    attention_masks = []
    for sent in encoded_sentences:
        att_mask = [int(token_id > 0) for token_id in sent]
        attention_masks.append(att_mask)
    return attention_masks

def train_test_split(df, frac=0.1):
    
    # get random sample 
    test = df.sample(frac=frac, axis=0, random_state = 42)

    # get everything but the test sample
    train = df.drop(index=test.index)

    return train, test

https://stackoverflow.com/questions/71174306/expected-in-usr-lib-libc-1-dylib-installing-tensorflow-on-m1-macbook-pro

### Combining the English datasets

### 1. VADet

In [7]:
#vad_hydrated_tweets = [json.loads(line) for line in open('hydrated_tweets/vad_ids.json', 'r')]
vad_hydrated_tweets = pd.read_csv('hydrator-tweets/vad_ids.csv')
vad_class_labels = pd.read_csv('datasets/VADet/annotated_twids.csv')
vad_ids = []
vad_tweets = []

for index, element in vad_hydrated_tweets.iterrows():
    vad_ids.append(element['id'])
    vad_tweets.append(element['text'])
    
vad_data = pd.DataFrame(list(zip(vad_ids, vad_tweets)), columns = ['id', 'tweet'])
vad_data = vad_data.merge(vad_class_labels[['ID', 'stance']], how = 'inner', left_on = 'id', right_on = 'ID')[['id', 'tweet', 'stance']]
vad_data.head()

Unnamed: 0,id,tweet,stance
0,1372900215620001795,"1/1Today, I visited CPG Hospital to take the C...",positive
1,1372148534506573826,@notjustamummy2 @LozzaFox Well there is a viru...,negative
2,1373396702116376579,"@paulreiddublin @HSELive Mr Reid , may I ask i...",negative
3,1373276419527208966,@is_salsu @NphcdaNG 1. 2nd March 2021\n\n2. Th...,positive
4,1372964450437840906,@drsanjaygupta @ChrisCuomo @DaveedDiggs @Lin_M...,positive


In [10]:
len(vad_hydrated_tweets)

2046

In [8]:
vad_data['stance'].value_counts()

positive    1367
negative     517
neutral      162
Name: stance, dtype: int64

In [12]:
vad_data.to_csv('experimental_datasets/vad_data.csv')

In [9]:
vad_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2046 entries, 0 to 2045
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      2046 non-null   int64 
 1   tweet   2046 non-null   object
 2   stance  2046 non-null   object
dtypes: int64(1), object(2)
memory usage: 63.9+ KB


### 2. Vaccine Stance dataset (2021)

In [11]:
#vac_stance_hydrated_tweets = [json.loads(line) for line in open('hydrated_tweets/vac_stance_ids.json', 'r')]
vac_stance_hydrated_tweets = pd.read_csv('hydrator-tweets/vac_stance_ids.csv')
vac_stance_class_labels = pd.read_csv('datasets/VaccineStance/Data.csv')
vac_stance_ids = []
vac_stance_tweets = []
label_mapping = {1:'positive', 2:'negative', 3:'neutral'}

for index, element in vac_stance_hydrated_tweets.iterrows():
    vac_stance_ids.append(element['id'])
    vac_stance_tweets.append(element['text'])
    
vac_stance_data = pd.DataFrame(list(zip(vac_stance_ids, vac_stance_tweets)), columns = ['id', 'tweet'])
vac_stance_data = vac_stance_data.merge(vac_stance_class_labels[['ID', 'Code']], how = 'inner', left_on = 'id', right_on = 'ID')[['id', 'tweet', 'Code']]
vac_stance_data['stance'] = vac_stance_data['Code'].replace(label_mapping)
vac_stance_data.drop(columns=['Code'], inplace = True)
vac_stance_data.head()

Unnamed: 0,id,tweet,stance
0,1301859457274535936,หรือช้อปออนไลน์ได้ที่~\n\nShopee: https://t.co...,negative
1,1303711210622255104,@_rafaellugh HAHAHAHSHA PWDE MAN MAG ZUMBA SA ...,neutral
2,1303057753171202048,@djlavoie @chrislhayes big pharma says they al...,negative
3,1301867614302547968,"""Whether intentional or not, the response to #...",negative
4,1306393902899920896,@GovAbbott don't you dare mandate the coronavi...,negative


In [12]:
len(vac_stance_data)

55

In [13]:
vac_stance_data['stance'].value_counts()

positive    30
neutral     13
negative    12
Name: stance, dtype: int64

### 3. Opinions regarding COVID-19 vaccine in the first month (Nov 9th 2020 - Dec 8th 2020)

In [14]:
#vac_opinions_hydrated_tweets = [json.loads(line) for line in open('hydrated_tweets/vac_opinions_first_month_ids.json', 'r')]
vac_opinions_hydrated_tweets = pd.read_csv('hydrator-tweets/vac_opinions_first_month_ids.csv')
vac_opinions_class_labels = pd.read_csv('datasets/FirstMonthStance/vaccination_stance_first_month.csv')
vac_opinions_ids = []
vac_opinions_tweets = []
label_mapping = {2.0:'positive', 0.0:'negative', 1.0:'neutral'}

for index, element in vac_opinions_hydrated_tweets.iterrows():
    vac_opinions_ids.append(element['id'])
    vac_opinions_tweets.append(element['text'])
    
vac_opinions_data = pd.DataFrame(list(zip(vac_opinions_ids, vac_opinions_tweets)), columns = ['id', 'tweet'])
vac_opinions_data = vac_opinions_data.merge(vac_opinions_class_labels[['id', 'category']], how = 'inner', left_on = 'id', right_on = 'id')[['id', 'tweet', 'category']]
vac_opinions_data['stance'] = vac_opinions_data['category'].replace(label_mapping)
vac_opinions_data.drop(columns=['category'], inplace = True)
vac_opinions_data.head()

Unnamed: 0,id,tweet,stance
0,1325774333504073728,Great job by the #Pfizer #vaccine scientists w...,positive
1,1325997637640876032,Bother is Profs.of different sections who tell...,negative
2,1325892752467488770,They say the vaccine will be tested on animals...,negative
3,1325895792054427648,I don’t care who tf you voted for. If this goe...,positive
4,1329215336344809472,This is the vaccine that's so unstable that it...,positive


In [15]:
vac_opinions_data['stance'].value_counts()

neutral     901
positive    879
negative    612
Name: stance, dtype: int64

In [16]:
vac_opinions_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2392 entries, 0 to 2391
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      2392 non-null   int64 
 1   tweet   2392 non-null   object
 2   stance  2392 non-null   object
dtypes: int64(1), object(2)
memory usage: 74.8+ KB


In [20]:
combined_df = pd.concat([vad_data, vac_stance_data, vac_opinions_data], axis = 0)

In [23]:
combined_df['processed_tweet'] = combined_df['tweet'].apply(process_tweet)
combined_df = combined_df[combined_df['processed_tweet'].notna()]
combined_df.to_csv('processed_datasets/combined_english_tweets.csv')

### AUGMENTING DATA

In [10]:
combined_df = pd.read_csv('processed_datasets/combined_english_tweets.csv')
combined_df.head()

Unnamed: 0.1,Unnamed: 0,id,tweet,stance,processed_tweet
0,0,1372900215620001795,"1/1Today, I visited CPG Hospital to take the C...",positive,"/1Today, I visited CPG Hospital to take the Co..."
1,1,1372148534506573826,@notjustamummy2 @LozzaFox Well there is a viru...,negative,Well there is a virus but all of this is nothi...
2,2,1373396702116376579,"@paulreiddublin @HSELive Mr Reid , may I ask i...",negative,"Mr Reid , may I ask if you have received a vac..."
3,3,1373276419527208966,@is_salsu @NphcdaNG 1. 2nd March 2021\n\n2. Th...,positive,". nd March . The Head of the EMA, Emer Cooke, ..."
4,4,1372964450437840906,@drsanjaygupta @ChrisCuomo @DaveedDiggs @Lin_M...,positive,I got my AstraZeneca shot this week and feel g...


In [12]:
len(combined_df)

4493

In [6]:
final_training_data = combined_df[['processed_tweet', 'stance']]
train_data, test_data = train_test_split(final_training_data, 0.1)
train_data = train_data.dropna().reset_index(drop=True)
test_data = test_data.dropna().reset_index(drop=True)
train_data = train_data.sample(frac=1, random_state=42)

In [17]:
target_variable = 'stance'
minority_classes = ['negative', 'neutral']

majority_df = train_data[train_data[target_variable].isin(minority_classes)==False]
minority_df = train_data[train_data[target_variable].isin(minority_classes)]

le = LabelEncoder()
minority_df[target_variable] = le.fit_transform(minority_df[target_variable])
#print(minority_df.head())

oversampler = RandomOverSampler(sampling_strategy = {0:2031, 1:2031}, random_state = 42)
X_resampled, y_resampled = oversampler.fit_resample(minority_df.drop(target_variable, axis=1), minority_df[target_variable])
y_resampled = le.inverse_transform(y_resampled)

print(y_resampled)

resampled_df = pd.DataFrame({'processed_tweet':X_resampled['processed_tweet'], 'stance': y_resampled})
# X_resampled_df = pd.DataFrame(X_resampled, columns=minority_df.drop(target_variable, axis=1).columns)
# y_resampled_df = pd.DataFrame(y_resampled, columns=[target_variable])
train_data = pd.concat([majority_df, resampled_df])

# Print value counts of target variable to check if upsampling worked
print(train_data[target_variable].value_counts())

['negative' 'negative' 'negative' ... 'neutral' 'neutral' 'neutral']
positive    2031
negative    2031
neutral     2031
Name: stance, dtype: int64


In [9]:
test_data['stance'].value_counts(normalize=True)

positive    0.543430
neutral     0.240535
negative    0.216036
Name: stance, dtype: float64

In [33]:
# sample = combined_df.iloc[7]['processed_tweet']
# sample_original = combined_df.iloc[7]['tweet']
aug = naw.ContextualWordEmbsAug(
        model_path='distilbert-base-uncased')
# aug = naw.SynonymAug(aug_src='wordnet')
# generated_sentences = aug.augment(sample)
# print(sample_original)
# print(sample)
# print(generated_sentences)

original_sentences = []

augmented_sentences = []
augmented_labels = []
#final_training_data = final_training_data.dropna().reset_index(drop=True)

for index, row in train_data.iterrows():
    if row['stance'] == 'negative' or row['stance'] == 'neutral':
        generated_sentences = aug.augment(row['processed_tweet'])
        original_sentences.append(row['processed_tweet'])
        for sentence in generated_sentences:
            augmented_sentences.append(sentence)
            augmented_labels.append(row['stance'])
            
original_augmented_comparison_df = pd.DataFrame(list(zip(original_sentences, augmented_sentences)), columns = ['original', 'augmented'])
original_augmented_comparison_df.to_csv('experimental_datasets/CWEA_augments_inspect.csv')
augmented_training_data = train_data.append(pd.DataFrame(list(zip(augmented_sentences, augmented_labels)), columns = ['processed_tweet', 'stance']), ignore_index=True)

  augmented_training_data = train_data.append(pd.DataFrame(list(zip(augmented_sentences, augmented_labels)), columns = ['processed_tweet', 'stance']), ignore_index=True)


In [None]:
#TODO
Augment positive
Augment negative alone
Check precision, recall per class 

In [11]:
train_data['stance'].value_counts()

positive    2031
negative    1044
neutral      967
Name: stance, dtype: int64

In [34]:
train_data.to_csv('processed_datasets/final_training_data.csv')

In [8]:
print(len(final_training_data))

4493


In [30]:
augmented_training_data['stance'].value_counts()

negative    2088
positive    2031
neutral      967
Name: stance, dtype: int64

In [35]:
augmented_training_data.to_csv('processed_datasets/final_augmented_training_data.csv')

In [3]:
augmented_training_data = pd.read_csv('processed_datasets/final_augmented_training_data.csv')

### Loading Non-english datasets

### 1. VaccinEU

In [15]:
vaccin_eu_french = pd.read_csv('./datasets/VaccinEU/labeled_tweets_french.csv', sep = '\t', header = 0, names = ['tweet', 'stance'])
vaccin_eu_german = pd.read_csv('./datasets/VaccinEU/labeled_tweets_german.csv', sep = '\t', header = 0, names = ['tweet', 'stance'])
vaccin_eu_italian = pd.read_csv('./datasets/VaccinEU/labeled_tweets_italian.csv', sep = '\t', header = 0, names = ['tweet', 'stance'])

label_mapping = {1:'positive', 2:'negative', 3:'neutral'}

vaccin_eu_french = vaccin_eu_french[vaccin_eu_french['stance'] != 4]
vaccin_eu_german = vaccin_eu_german[vaccin_eu_german['stance'] != 4]
vaccin_eu_italian = vaccin_eu_italian[vaccin_eu_italian['stance'] != 4]

vaccin_eu_french['stance'] = vaccin_eu_french['stance'].replace(label_mapping)
vaccin_eu_german['stance'] = vaccin_eu_german['stance'].replace(label_mapping)
vaccin_eu_italian['stance'] = vaccin_eu_italian['stance'].replace(label_mapping)

vaccin_eu_french['processed_tweet'] = vaccin_eu_french['tweet'].apply(process_tweet)
vaccin_eu_german['processed_tweet'] = vaccin_eu_german['tweet'].apply(process_tweet)
vaccin_eu_italian['processed_tweet'] = vaccin_eu_italian['tweet'].apply(process_tweet)

vaccin_eu_french.to_csv('processed_datasets/VaccinEU/french_tweets.csv')
vaccin_eu_german.to_csv('processed_datasets/VaccinEU/german_tweets.csv')
vaccin_eu_italian.to_csv('processed_datasets/VaccinEU/italian_tweets.csv')


In [16]:
vaccin_eu_french['stance'].value_counts()

positive    419
neutral     279
negative    135
Name: stance, dtype: int64

In [17]:
vaccin_eu_german['stance'].value_counts()

positive    547
neutral     169
negative    108
Name: stance, dtype: int64

In [18]:
vaccin_eu_italian['stance'].value_counts()

neutral     458
positive    314
negative    151
Name: stance, dtype: int64

In [23]:
vaccin_eu_french.iloc[0]['processed_tweet']

'Info pratique  Je viens de bouger ma deuxime injection du juillet pour le juillet sur Doctolib. Profitez-en. jemevaccine'

### Finetuning MBERT on the English tweets

In [8]:
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print(f'Found GPU at: {device_name}')

Metal device set to: Apple M1
Found GPU at: /device:GPU:0


2023-03-20 22:06:25.874309: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-03-20 22:06:25.874979: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [9]:
if torch.has_mps:    
    device = torch.device("mps")
else:
    print('using the CPU')
    device = torch.device("cpu")

In [10]:
#augmented_df = pd.read_csv('processed_datasets/final_augmented_training_data.csv')
# print(train_data['stance'].value_counts(normalize=True))
# print(test_data['stance'].value_counts(normalize=True))
# print(train_data[~train_data['processed_tweet'].notna()])

train_encoded_sentences, train_labels = preprocessing(train_data)
train_attention_masks = attention_masks(train_encoded_sentences)

test_encoded_sentences, test_labels = preprocessing(test_data)
test_attention_masks = attention_masks(test_encoded_sentences)

train_inputs = torch.tensor(train_encoded_sentences)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_attention_masks)

validation_inputs = torch.tensor(test_encoded_sentences)
validation_labels = torch.tensor(test_labels)
validation_masks = torch.tensor(test_attention_masks)

# data loader for training
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = SequentialSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# data loader for validation
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=len(validation_data))

In [37]:
augmented_df['stance'].value_counts(normalize=True)

negative    0.340242
positive    0.339198
neutral     0.320561
Name: stance, dtype: float64

In [32]:
len(validation_data)

671

In [11]:
torch.manual_seed(seed_val)
#torch.cuda.manual_seed_all(seed_val)

model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-uncased",
    num_labels = 3,   
    output_attentions = False, 
    output_hidden_states = True, 
)

model.to(device)

optimizer = AdamW(model.parameters(),
                  lr = 3e-5, 
                  eps = 1e-8, 
                  weight_decay = 0.01
                )

epochs = 4
lambd = 0.9
temperature = 0.3
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # 10% * datasetSize/batchSize
                                            num_training_steps = total_steps)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

In [17]:

def compute_accuracy(preds, labels):
    p = np.argmax(preds, axis=1).flatten()
    l = labels.flatten()
    return np.sum(p==l)/len(l)

def compute_f1(preds, labels):
    p = np.argmax(preds, axis=1).tolist()
    l = labels.tolist()
    f1_macro = f1_score(l, p, average='macro')
    f1_per_class = f1_score(l, p, average=None)
    return f1_macro, f1_per_class

def compute_precision_recall(preds, labels):
    p = np.argmax(preds, axis=1).tolist()
    l = labels.tolist()
    precision_per_class = precision_score(l, p, average=None)
    recall_per_class = recall_score(l, p, average=None)
    return precision_per_class, recall_per_class

def compute_contrastive_loss(temp, embedding, label):
    """calculate the contrastive loss
    """
    # cosine similarity between embeddings
    cosine_sim = cosine_similarity(embedding, embedding)
    # remove diagonal elements from matrix
    dis = cosine_sim[~np.eye(cosine_sim.shape[0], dtype=bool)].reshape(cosine_sim.shape[0], -1)
    # apply temprature to elements
    dis = dis / temp
    cosine_sim = cosine_sim / temp
    # apply exp to elements
    dis = np.exp(dis)
    cosine_sim = np.exp(cosine_sim)

    # calculate row sum
    row_sum = []
    for i in range(len(embedding)):
        row_sum.append(sum(dis[i]))
    # calculate outer sum
    contrastive_loss = 0
    for i in range(len(embedding)):
        n_i = label.tolist().count(label[i]) - 1
        inner_sum = 0
        # calculate inner sum
        for j in range(len(embedding)):
            if label[i] == label[j] and i != j:
                inner_sum = inner_sum + np.log(cosine_sim[i][j] / row_sum[i])
        if n_i != 0:
            contrastive_loss += (inner_sum / (-n_i))
        else:
            contrastive_loss += 0
    return contrastive_loss
    

def run_train(epochs):
    losses = []
    for e in range(epochs):
        print('======== Epoch {:} / {:} ========'.format(e + 1, epochs))
        start_train_time = time.time()
        total_loss = 0
        model.train()
        for step, batch in enumerate(train_dataloader):

            if step%10 == 0:
                elapsed = time.time()-start_train_time
                print(f'{step}/{len(train_dataloader)} --> Time elapsed {elapsed}')

            # input_data, input_masks, input_labels = batch
            input_data = batch[0].to(device)
            input_masks = batch[1].to(device)
            input_labels = batch[2].to(device)

            model.zero_grad()

            # forward propagation
            out = model(input_data,
                        token_type_ids = None, 
                        attention_mask = input_masks,
                        labels = input_labels)
            
            cross_loss = out[0]
            hidden_states = out[2]
            last_hidden_state = hidden_states[-1][:, 0, :]
            contrastive_loss = compute_contrastive_loss(temperature, last_hidden_state.cpu().detach().numpy(), input_labels)
            new_loss = (lambd * contrastive_loss) + (1-lambd)*(cross_loss)
            total_loss = total_loss + new_loss.item()

            # backward propagation
            new_loss.backward()
            
            torch.nn.utils.clip_grad_norm(model.parameters(), 1)

            optimizer.step()
        
        epoch_loss = total_loss/len(train_dataloader)
        losses.append(epoch_loss)
        print(f"Training took {time.time()-start_train_time}")

        # Validation
        start_validation_time = time.time()
        model.eval()
        eval_loss, eval_acc = 0,0
        
        for step, batch in enumerate(validation_dataloader):
            batch = tuple(t.to(device) for t in batch)
            eval_data, eval_masks, eval_labels = batch
            with torch.no_grad():
                out = model(eval_data,
                            token_type_ids = None, 
                            attention_mask=eval_masks)
            logits = out[0]

            #  Uncomment for GPU execution
            logits = logits.detach().cpu().numpy()
            eval_labels = eval_labels.to('cpu').numpy()
            batch_acc = compute_accuracy(logits, eval_labels)
            batch_f1_macro, batch_f1_per_class = compute_f1(logits, eval_labels)
            batch_precision, batch_recall = compute_precision_recall(logits, eval_labels)

            # Uncomment for CPU execution
            # batch_acc = compute_accuracy(logits.numpy(), eval_labels.numpy())
#             eval_f1 += batch_f1_macro
            
#             eval_acc += batch_acc
            
        print(f"Accuracy: {batch_acc}, Time elapsed: {time.time()-start_validation_time}")
        print(f"F1 score (Macro): {batch_f1_macro}")
        print(f"F1 score (Per class): {batch_f1_per_class}")
        print(f"Precision score (Per class): {batch_precision}")
        print(f"Recall score (Per class): {batch_recall}")
        
    
    torch.save(model.state_dict(), 'model/model_params.pth')
    
    return losses

In [13]:
losses = run_train(epochs)

0/127 --> Time elapsed 0.004640817642211914
10/127 --> Time elapsed 54.2857871055603
20/127 --> Time elapsed 104.97831702232361
30/127 --> Time elapsed 152.73749208450317
40/127 --> Time elapsed 200.0746431350708
50/127 --> Time elapsed 247.9450421333313
60/127 --> Time elapsed 293.7600841522217
70/127 --> Time elapsed 341.88068199157715
80/127 --> Time elapsed 389.26674604415894
90/127 --> Time elapsed 436.22749304771423
100/127 --> Time elapsed 484.44323110580444
110/127 --> Time elapsed 532.0291759967804
120/127 --> Time elapsed 580.200798034668
Training took 610.9949510097504
Accuracy: 0.7060133630289532, Time elapsed: 16.912190914154053
F1 score (Macro): 0.66513706606238
F1 score (Per class): [0.56179775 0.65714286 0.77647059]
Precision score (Per class): [0.61728395 0.67647059 0.7443609 ]
Recall score (Per class): [0.51546392 0.63888889 0.81147541]
0/127 --> Time elapsed 0.006830692291259766
10/127 --> Time elapsed 49.36410880088806
20/127 --> Time elapsed 96.47195172309875
30/12

In [14]:
def run_test(df_test):

    test_encoded_sentences, test_labels = preprocessing(df_test)
    test_attention_masks = attention_masks(test_encoded_sentences)

    test_inputs = torch.tensor(test_encoded_sentences)
    test_labels = torch.tensor(test_labels)
    test_masks = torch.tensor(test_attention_masks)

    test_data = TensorDataset(test_inputs, test_masks, test_labels)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=len(test_data))
    
    model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-uncased",
    num_labels = 3,   
    output_attentions = False, 
    output_hidden_states = False,)
    model.to(device)
    
    model.load_state_dict(torch.load('model/model_params.pth'))

    model.eval()
    eval_loss, eval_acc = 0,0
    for step, batch in enumerate(test_dataloader):
        batch = tuple(t.to(device) for t in batch)
        eval_data, eval_masks, eval_labels = batch
        with torch.no_grad():
            out = model(eval_data,
                        token_type_ids = None,
                        attention_mask=eval_masks)
        logits = out[0]
        logits = logits.detach().cpu().numpy()
        eval_labels = eval_labels.to('cpu').numpy()
        batch_acc = compute_accuracy(logits, eval_labels)
        f1_macro, f1_classwise = compute_f1(logits, eval_labels)
        precision_test, recall_test = compute_precision_recall(logits, eval_labels)
        #eval_acc += batch_acc
    print(f"Accuracy: {batch_acc}")
    print(f"F1 Score (Macro): {f1_macro}")
    print(f"F1 Score (Classwise): {f1_classwise}")
    print(f"Precision score (Per class): {precision_test}")
    print(f"Recall score (Per class): {recall_test}")
    

In [16]:
run_test(vaccin_eu_french)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

RuntimeError: MPS backend out of memory (MPS allocated: 9.17 GB, other allocations: 9.35 GB, max allowed: 18.13 GB). Tried to allocate 1.22 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [17]:
run_test(vaccin_eu_german)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

Accuracy: 0.5752427184466019
F1 Score (Macro): 0.5109529812368666
F1 Score (Classwise): [0.33125    0.51073986 0.69086909]
Precision score (Per class): [0.25       0.428      0.86740331]
Recall score (Per class): [0.49074074 0.63313609 0.57404022]


In [18]:
run_test(vaccin_eu_italian)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

Accuracy: 0.49295774647887325
F1 Score (Macro): 0.4707186596868637
F1 Score (Classwise): [0.35698925 0.60201511 0.45315162]
Precision score (Per class): [0.26433121 0.71130952 0.48717949]
Recall score (Per class): [0.54966887 0.52183406 0.42356688]
