In [1]:
import os
os.environ["WANDB_DISABLED"] = "true"
import evaluate

import pandas as pd

df_train = pd.read_csv('/home/aflah20082/NLP_Project/Data/PreprocessedData/train_preprocessed.csv')
df_test = pd.read_csv('/home/aflah20082/NLP_Project/Data/PreprocessedData/test_preprocessed.csv')
df_val = pd.read_csv('/home/aflah20082/NLP_Project/Data/PreprocessedData/val_preprocessed.csv')

# Add Dummy Labels to Test
df_test['label'] = 0

df_train = df_train[['preprocessed_text', 'label']]
df_test = df_test[['preprocessed_text', 'label']]
df_val = df_val[['preprocessed_text', 'label']]

In [2]:
df_train = df_train.rename(columns={'preprocessed_text': 'text', 'label': 'label'})
df_test = df_test.rename(columns={'preprocessed_text': 'text', 'label': 'label'})
df_val = df_val.rename(columns={'preprocessed_text': 'text', 'label': 'label'})

In [3]:
# Change Label to 0 and 1
df_train['label'] = df_train['label'].replace({'NOT': 1, 'OFF': 0})
df_val['label'] = df_val['label'].replace({'NOT': 1, 'OFF': 0})

In [4]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import AutoModelForSequenceClassification

bertweet_tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", force_download=True)
bertbase_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", force_download=True)
hatebert_tokenizer = AutoTokenizer.from_pretrained("GroNLP/hateBERT", force_download=True)

Downloading:   0%|          | 0.00/558 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/824k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/151 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/151 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/151 [00:00<?, ?B/s]

In [5]:
from datasets import load_dataset,Dataset,DatasetDict
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, AutoModel, AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
import torch
import torch.nn as nn
import pandas as pd

dataset_hf = DatasetDict({
    'train': Dataset.from_pandas(df_train),
    'test': Dataset.from_pandas(df_test),
    'validation': Dataset.from_pandas(df_val),
    'train_val': Dataset.from_pandas(pd.concat([df_train, df_val]))
})

bertweet_tokenized_dataset = dataset_hf.map(lambda examples: bertweet_tokenizer(examples['text'], truncation=True, padding='max_length'), batched=True, batch_size=16)
bertbase_tokenized_dataset = dataset_hf.map(lambda examples: bertbase_tokenizer(examples['text'], truncation=True, padding='max_length'), batched=True, batch_size=16)
hatebert_tokenized_dataset = dataset_hf.map(lambda examples: hatebert_tokenizer(examples['text'], truncation=True, padding='max_length'), batched=True, batch_size=16)

  0%|          | 0/662 [00:00<?, ?ba/s]

  0%|          | 0/54 [00:00<?, ?ba/s]

  0%|          | 0/166 [00:00<?, ?ba/s]

  0%|          | 0/828 [00:00<?, ?ba/s]

  0%|          | 0/662 [00:00<?, ?ba/s]

  0%|          | 0/54 [00:00<?, ?ba/s]

  0%|          | 0/166 [00:00<?, ?ba/s]

  0%|          | 0/828 [00:00<?, ?ba/s]

  0%|          | 0/662 [00:00<?, ?ba/s]

  0%|          | 0/54 [00:00<?, ?ba/s]

  0%|          | 0/166 [00:00<?, ?ba/s]

  0%|          | 0/828 [00:00<?, ?ba/s]

In [None]:
# Combine the two datasets
dataset_hf = DatasetDict({
    'train': Dataset.from_pandas(df_train),
    'test': Dataset.from_pandas(df_test),
    'validation': Dataset.from_pandas(df_val),
    'train_val': Dataset.from_pandas(pd.concat([df_train, df_val]))
})

bertweet_tokenized_dataset = dataset_hf.map(lambda examples: bertweet_tokenizer(examples['text'], truncation=True, padding='max_length'), batched=True)
bertbase_tokenized_dataset = dataset_hf.map(lambda examples: bertbase_tokenizer(examples['text'], truncation=True, padding='max_length'), batched=True)

bertweet_tokenized_dataset.set_format("torch",columns=["input_ids", "attention_mask", "label"])
data_collator_bertweet = DataCollatorWithPadding(tokenizer=bertweet_tokenizer)

bertbase_tokenized_dataset.set_format("torch",columns=["input_ids", "attention_mask", "label"])
data_collator_bertbase = DataCollatorWithPadding(tokenizer=bertbase_tokenizer)

hatebert_tokenized_dataset.set_format("torch",columns=["input_ids", "attention_mask", "label"])
data_collator_hatebert = DataCollatorWithPadding(tokenizer=hatebert_tokenizer)

# Combine the two datasets
class ConcatDataset(torch.utils.data.Dataset):
    def __init__(self, bertweet_dataset, bertbase_dataset, hatebert_dataset):
        self.dataset1 = bertweet_dataset
        self.dataset2 = bertbase_dataset
        self.dataset3 = hatebert_dataset

    def __getitem__(self, index):
        return {
            'input_ids_bertweet': self.dataset1[index]['input_ids'],
            'attention_mask_bertweet': self.dataset1[index]['attention_mask'],
            'input_ids_bertbase': self.dataset2[index]['input_ids'],
            'attention_mask_bertbase': self.dataset2[index]['attention_mask'],
            'input_ids_hatebert': self.dataset3[index]['input_ids'],
            'attention_mask_hatebert': self.dataset3[index]['attention_mask'],
            'label': self.dataset1[index]['label']
        }

    def __len__(self):
        return len(self.dataset1)

train_dataset = ConcatDataset(bertweet_tokenized_dataset['train'], bertbase_tokenized_dataset['train'], hatebert_tokenized_dataset['train'])
test_dataset = ConcatDataset(bertweet_tokenized_dataset['test'], bertbase_tokenized_dataset['test'], hatebert_tokenized_dataset['test'])
val_dataset = ConcatDataset(bertweet_tokenized_dataset['validation'], bertbase_tokenized_dataset['validation'], hatebert_tokenized_dataset['validation'])
train_val_dataset = ConcatDataset(bertweet_tokenized_dataset['train_val'], bertbase_tokenized_dataset['train_val'], hatebert_tokenized_dataset['train_val'])

  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

In [None]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=16)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=16)
train_val_dataloader = DataLoader(train_val_dataset, shuffle=True, batch_size=16)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import torch
from torch import nn
class ConcatModel(torch.nn.Module):
    def __init__(self):
        super(ConcatModel, self).__init__()
        self.bertweet_model = AutoModel.from_pretrained("vinai/bertweet-base", config = AutoConfig.from_pretrained("vinai/bertweet-base", 
                                                                                                       output_attention = True, 
                                                                                                       output_hidden_state = True ) )
                                            
        self.bertbase_model = AutoModel.from_pretrained("bert-base-uncased", config = AutoConfig.from_pretrained("bert-base-uncased",
                                                                                                         output_attention = True, 
                                                                                                         output_hidden_state = True ) )  

        self.hatebert_model = AutoModel.from_pretrained("GroNLP/hateBERT", config = AutoConfig.from_pretrained("GroNLP/hateBERT",
                                                                                                         output_attention = True, 
                                                                                                         output_hidden_state = True ) )
        # Freeze first 10 layers of bertweet
        for param in self.bertweet_model.base_model.encoder.layer[:8].parameters():
            param.requires_grad = False
        
        # Freeze first 10 layers of bertbase
        for param in self.bertbase_model.base_model.encoder.layer[:8].parameters():
            param.requires_grad = False
            
        # Freeze first 10 layers of hatebert
        for param in self.hatebert_model.base_model.encoder.layer[:8].parameters():
            param.requires_grad = False

        self.classifier = nn.Sequential(
            # nn.Dropout(p=0.1),
            nn.Linear(768*3, 128),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(128, 2)
            # nn.Softmax(dim=1)
        )
    
    def forward(self, input_ids_bertweet, attention_mask_bertweet, input_ids_bertbase, attention_mask_bertbase, input_ids_hatebert, attention_mask_hatebert, label=None):
        logits_a = self.bertweet_model(input_ids_bertweet, attention_mask=attention_mask_bertweet).last_hidden_state[:, 0, :].view(-1, 768)
        logits_b = self.bertbase_model(input_ids_bertbase, attention_mask=attention_mask_bertbase).last_hidden_state[:, 0, :].view(-1, 768)
        logits_c = self.hatebert_model(input_ids_hatebert, attention_mask=attention_mask_hatebert).last_hidden_state[:, 0, :].view(-1, 768)
        # summed_vectors = torch.mean(torch.stack([logits_a, logits_b, logits_c]), dim=0)
        concat_vectors = torch.cat((logits_a, logits_b, logits_c), dim=1)
        output = self.classifier(concat_vectors)
        # print(output.shape)
        # Compute Loss
        loss = None
        if label is not None:
            loss_fct = nn.CrossEntropyLoss()
            # print(output.shape)
            # print(label.shape)
            # print(label.view(-1).shape)
            # print(output.view(-1, 2).shape)
            loss = loss_fct(output.view(-1, 2), label.view(-1))
            
            return TokenClassifierOutput(loss=loss, logits=output, hidden_states=None, attentions=None)

        
model = ConcatModel().to(device)

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNor

In [None]:
from transformers import AdamW, get_scheduler

optimizer = AdamW(model.parameters())

num_epoch = 2

num_training_steps = num_epoch * len(train_dataloader)

lr_scheduler = get_scheduler(
    'linear',
    optimizer = optimizer,
    num_warmup_steps = 10000,
    num_training_steps = num_training_steps,
    
)



In [None]:
from datasets import load_metric
metric = load_metric("accuracy")
f1 = load_metric("f1")

In [None]:
# from tqdm.auto import tqdm
# num_training_steps = num_epoch * len(train_dataloader)
# warmup_steps = 0
# num_epochs = 2

# progress_bar_train = tqdm(range(num_training_steps))
# progress_bar_eval = tqdm(range(num_epochs * len(val_dataloader)))

# step = 0

# for epoch in range(num_epochs):
  
#   for batch in train_dataloader:
#       model.train()
#       # print([type(v) for v in batch.values()])
#       batch = {k: v.to(device) for k, v in batch.items()}
      
#       outputs = model(**batch)
#       loss = outputs.loss
#       loss.backward()

#       optimizer.step()
#       lr_scheduler.step()
#       optimizer.zero_grad()
#       progress_bar_train.update(1)
#       step += 1

#       if step % 100 == 0:
#           print(f"Step {step} of {num_training_steps}: loss = {loss.item()}")
#           # Save model
#           torch.save(model.state_dict(), f"/home/aflah20082/NLP_Project/Models/CustomModelSaves/model_{step}.pt")
          

#   model.eval()
#   for batch in val_dataloader:
#     # print(batch.keys())
#     batch = {k: v.to(device) for k, v in batch.items()}
#     with torch.no_grad():
#         outputs = model(**batch)

#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)
#     metric.add_batch(predictions=predictions, references=batch["label"])
#     progress_bar_eval.update(1)
    
#   print(metric.compute())

  0%|          | 0/1324 [00:00<?, ?it/s]

  0%|          | 0/332 [00:00<?, ?it/s]

Step 100 of 1324: loss = 0.6314172744750977
Step 200 of 1324: loss = 0.5211493372917175
Step 300 of 1324: loss = 0.5171315670013428
Step 400 of 1324: loss = 0.3491620719432831
Step 500 of 1324: loss = 0.48275768756866455
Step 600 of 1324: loss = 0.286018043756485
{'accuracy': 0.8032477341389728}
Step 700 of 1324: loss = 0.5556113123893738
Step 800 of 1324: loss = 0.24925510585308075
Step 900 of 1324: loss = 0.2936851680278778
Step 1000 of 1324: loss = 0.4179897606372833
Step 1100 of 1324: loss = 0.34476351737976074
Step 1200 of 1324: loss = 0.2903992831707001
Step 1300 of 1324: loss = 0.356641560792923
{'accuracy': 0.7794561933534743}


In [None]:
# Save the model
# torch.save(model.state_dict(), "model.pt")

# Load the model
model2 = ConcatModel().to(device)
model2.load_state_dict(torch.load("/home/aflah20082/NLP_Project/Models/CustomModelSaves/model_600.pt"))

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNor

<All keys matched successfully>

In [None]:
model2.eval()

for batch in val_dataloader:
    # print(batch.keys())
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model2(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["label"])
    f1.add_batch(predictions=predictions, references=batch["label"])

print(metric.compute(), f1.compute(average="macro"))

{'accuracy': 0.7945619335347432} {'f1': 0.7594238914051331}


In [None]:
model2.eval()

for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model2(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

In [None]:
# pickle the model
import pickle
with open('firstconcatmodel.pkl', 'wb') as f:
    pickle.dump(model, f)

In [None]:
# load the model
import pickle
with open('firstconcatmodel.pkl', 'rb') as f:
    model = pickle.load(f)

In [None]:
model.eval()
ls_preds = []
for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    ls_preds.append(predictions)



In [None]:
ls_preds

[tensor([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0], device='cuda:0'),
 tensor([0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([1, 