In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import json
from tqdm import tqdm
import random
import pickle
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import SequentialSampler, TensorDataset, RandomSampler
from torch.cuda.amp import GradScaler
from torch.cuda.amp import autocast
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
import torch
import torch.nn as nn
from datasets import load_dataset
import time
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Value taxonomy used by the cells below: it is indexed as val_cats[label][level1_value]
# and each such entry is iterated to obtain phrasing strings.
# The file is opened in a context manager so the handle is closed deterministically,
# and read as UTF-8 explicitly instead of relying on the platform default encoding.
with open("/Users/aayush/Documents/IIITD/Assignments/NLP/Project/Final_Project/Project_Data/value-categories.json", encoding="utf-8") as val_cats_file:
    val_cats = json.load(val_cats_file)

# Dataset splits to process; each name is spliced into the TSV file names.
tags = ["training", "validation"]
# Filled per split by the data-building loop: data_dict[tag][argument_id] -> record dict.
data_dict = {}
# Fraction of the sampled negative options that should be "hard" negatives.
ratio_hard = 0.5

In [3]:
# Build data_dict[tag] for every split in `tags`.
# For each split: read the three TSV files, inner-join them on "Argument ID",
# work out which level-1 values are tagged under each tagged label, and store one
# record per argument with its text, positive options and sampled negative options.
for tag in tags:
    data_dict[tag] = {}
    arg_df = pd.read_csv("/Users/aayush/Documents/IIITD/Assignments/NLP/Project/Final_Project/Project_Data/arguments-"+tag+".tsv", sep="\t")
    label_df = pd.read_csv("/Users/aayush/Documents/IIITD/Assignments/NLP/Project/Final_Project/Project_Data/labels-"+tag+".tsv", sep="\t")
    level1_label_df = pd.read_csv("/Users/aayush/Documents/IIITD/Assignments/NLP/Project/Final_Project/Project_Data/level1-labels-"+tag+".tsv", sep="\t")
    # Inner joins keep only arguments present in all three files; the printed shapes
    # make it visible whether any rows were dropped by the joins.
    merged_df = arg_df.merge(label_df, how="inner", on ="Argument ID").merge(level1_label_df, 
                                                                             how="inner", 
                                                                             on ="Argument ID").reset_index(drop=True)
    print(arg_df.shape, label_df.shape, level1_label_df.shape, merged_df.shape)
#     merged_df.head(2)

    # Column names of the label file and of the level-1 file (everything but the join key).
    labels = [i for i in label_df.columns if i != 'Argument ID']
    level_1 = [i for i in level1_label_df.columns if i != 'Argument ID']
#     len(labels), len(level_1)

    # option_map[argument_id][label] -> {level1_value: val_cats[label][level1_value]}
    # restricted to labels and level-1 values whose column equals 1 for that argument.
    option_map = {}
    for ix, row in merged_df.iterrows():
        options = {}
        used = []
        for l in labels:
            tmp = {}
            if row[l] == 1:
                for l1 in val_cats[l].keys():
                    if row[l1] == 1:
                        tmp[l1] = val_cats[l][l1]
                        # A label is only recorded as "used" together with one of its
                        # tagged level-1 values.
                        used.extend([l, l1])
                options[l] = tmp
        # Consistency check: every column tagged 1 (label or level-1) must have been
        # reached through val_cats. This also fails for a tagged label that has no
        # tagged level-1 value, or a tagged level-1 value whose label is not tagged.
        all_tagged = set([c for c in labels + level_1 if row[c] == 1])
        assert len(all_tagged.difference(set(used))) == 0
        option_map[row["Argument ID"]] = options
#     len(option_map)

    for ix, row in tqdm(merged_df.iterrows()):
        dct = {"id": row["Argument ID"], "stance": row["Stance"], "premise": row["Premise"], 
               "conclusion": row["Conclusion"], "labels": list(option_map[row["Argument ID"]].keys())}
        # Any stance other than the exact string "against" is rendered as " in favor of. ".
        stance = " against. " if dct["stance"] == "against" else " in favor of. "
        # Model input text: premise, stance phrase, conclusion concatenated.
        dct["sent"] = dct["premise"] + stance + dct["conclusion"]
        # Positive options: "<level-1 value> by <phrasing>" for every tagged level-1 value.
        # Built through a set, so duplicates are removed and list order is not guaranteed.
        dct["opts"] = list(set([k2 + " by " + i for k, v in option_map[dct["id"]].items() for k2, v2 in v.items() for i in v2]))

        na_options_hard, na_options_easy = [], []
        # Hard negatives: level-1 values that belong to a tagged label but are not
        # themselves tagged for this argument.
        for k, v in option_map[dct["id"]].items():
            l1_present = set(v.keys())
            l1_all = set(val_cats[k].keys())
            assert len(l1_all) >= len(l1_present)
            l1_not_present = l1_all.difference(l1_present)
            na_options_hard.extend([i + " by " + j for i in list(l1_not_present) for j in val_cats[k][i]])

        # Easy negatives: every option under labels that are not tagged at all.
        na_options_easy = [k + " by " + j for l in set(labels).difference(set(dct["labels"])) 
                           for k, v in val_cats[l].items() 
                           for j in v]
        # NOTE(review): no random seed is set in the visible cells, so the sampled
        # negatives presumably differ between runs -- confirm if reproducibility matters.
        random.shuffle(na_options_hard)
        random.shuffle(na_options_easy)

        # Take at most int(len(opts) * ratio_hard) hard negatives, then fill up with easy
        # negatives so that the number of negatives equals the number of positives.
        hard_opts = na_options_hard[:int(len(dct["opts"])*ratio_hard)]
        easy_opts = na_options_easy[:(len(dct["opts"]) - len(hard_opts))]
        # Fails when there are not enough negatives available to match the positives.
        assert len(hard_opts) + len(easy_opts) == len(dct["opts"])
        dct["adverse_hard_opts"], dct["adverse_easy_opts"] = hard_opts, easy_opts
        data_dict[tag][row["Argument ID"]] = dct
#     break
        

(5393, 4) (5393, 21) (5393, 55) (5393, 78)


5393it [00:00, 14507.22it/s]


(1896, 4) (1896, 21) (1896, 55) (1896, 78)


1896it [00:00, 16978.31it/s]


In [4]:
# pickle.dump(data_dict, open("/Users/aayush/Documents/IIITD/Assignments/NLP/Project/Final_Project/Project_Data/data_dict_raw.pkl", "wb"))

In [16]:
data_dict

{'training': {'A01002': {'id': 'A01002',
   'stance': 'in favor of',
   'premise': 'we should ban human cloning as it will only cause huge issues when you have a bunch of the same humans running around all acting the same.',
   'conclusion': 'We should ban human cloning',
   'labels': ['Security: societal'],
   'sent': 'we should ban human cloning as it will only cause huge issues when you have a bunch of the same humans running around all acting the same. in favor of. We should ban human cloning',
   'opts': ['Have a stable society by promoting the social order',
    'Have a stable society by preventing chaos and disorder',
    'Have a stable society by accepting or maintaining the existing social structure',
    'Have a stable society by resulting in a country that is more stable'],
   'adverse_hard_opts': ['Have a safe country by resulting in a stronger state',
    'Have a safe country by resulting in a state that can better act on crimes'],
   'adverse_easy_opts': ['Have wealth by 

In [5]:
# Fine-grained label names are the keys of val_cats; the reduced (coarse) name of a
# label is the text before its first ":".
all_labels = sorted(val_cats)
all_labels_reduced = sorted({name.split(":")[0] for name in val_cats})

# index -> name lookups, following the sorted order above
id_2_class = dict(enumerate(all_labels))
id_2_class_reduced = dict(enumerate(all_labels_reduced))

# name -> index lookups (inverse of the two dicts above)
class_2_id = {name: idx for idx, name in id_2_class.items()}
class_2_id_reduced = {name: idx for idx, name in id_2_class_reduced.items()}

len(all_labels), len(all_labels_reduced)

(20, 12)

In [6]:
class_2_id

{'Achievement': 0,
 'Benevolence: caring': 1,
 'Benevolence: dependability': 2,
 'Conformity: interpersonal': 3,
 'Conformity: rules': 4,
 'Face': 5,
 'Hedonism': 6,
 'Humility': 7,
 'Power: dominance': 8,
 'Power: resources': 9,
 'Security: personal': 10,
 'Security: societal': 11,
 'Self-direction: action': 12,
 'Self-direction: thought': 13,
 'Stimulation': 14,
 'Tradition': 15,
 'Universalism: concern': 16,
 'Universalism: nature': 17,
 'Universalism: objectivity': 18,
 'Universalism: tolerance': 19}

In [7]:
# Group every fine-grained label under its coarse family (the text before ":").
reduced_to_full_mapping = {name: [] for name in all_labels_reduced}
for full_label in all_labels:
    reduced_label = full_label.partition(":")[0]
    reduced_to_full_mapping[reduced_label].append(full_label)

# Same grouping expressed with class indices, which is the form the model code consumes.
reduced_to_full_indices = {
    class_2_id_reduced[family]: [class_2_id[member] for member in members]
    for family, members in reduced_to_full_mapping.items()
}

# Show the index-based grouping.
print(reduced_to_full_indices)

{0: [0], 1: [1, 2], 2: [3, 4], 3: [5], 4: [6], 5: [7], 6: [8, 9], 7: [10, 11], 8: [12, 13], 9: [14], 10: [15], 11: [16, 17, 18, 19]}


In [8]:
reduced_to_full_mapping

{'Achievement': ['Achievement'],
 'Benevolence': ['Benevolence: caring', 'Benevolence: dependability'],
 'Conformity': ['Conformity: interpersonal', 'Conformity: rules'],
 'Face': ['Face'],
 'Hedonism': ['Hedonism'],
 'Humility': ['Humility'],
 'Power': ['Power: dominance', 'Power: resources'],
 'Security': ['Security: personal', 'Security: societal'],
 'Self-direction': ['Self-direction: action', 'Self-direction: thought'],
 'Stimulation': ['Stimulation'],
 'Tradition': ['Tradition'],
 'Universalism': ['Universalism: concern',
  'Universalism: nature',
  'Universalism: objectivity',
  'Universalism: tolerance']}

In [9]:
all_labels_reduced

['Achievement',
 'Benevolence',
 'Conformity',
 'Face',
 'Hedonism',
 'Humility',
 'Power',
 'Security',
 'Self-direction',
 'Stimulation',
 'Tradition',
 'Universalism']

In [10]:
# example_dict[tag] is a list of [argument_id, sentence, full_multi_hot, reduced_multi_hot]
# rows, where the multi-hot vectors follow the order of all_labels / all_labels_reduced.
example_dict = {}
for tag in tags:
    split_rows = example_dict.setdefault(tag, [])

    for arg_id, record in data_dict[tag].items():
        fine_tagged = set(record["labels"])
        coarse_tagged = {name.split(":")[0] for name in record["labels"]}

        full_vec = [int(name in fine_tagged) for name in all_labels]
        reduced_vec = [int(name in coarse_tagged) for name in all_labels_reduced]

        split_rows.append([arg_id, record["sent"], full_vec, reduced_vec])

In [11]:
# Load the pretrained "roberta-base" tokenizer. get_encodings() reads this
# module-level name, so this cell must run before any call to it.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# Disabled alternative: reload a previously pickled example_dict instead of rebuilding it.
# example_dict = pickle.load(open("/Users/aayush/Documents/IIITD/Assignments/NLP/Project/Final_Project/Project_Data/example_dict_standard_raw.pkl", "rb"))

In [12]:
def get_encodings(lst):
    """Tokenize a list of examples and pack inputs and labels into tensors.

    Args:
        lst: sequence of examples shaped like the rows of ``example_dict``:
            element 1 is the input sentence, the second-to-last element is the
            full multi-hot label vector and the last element is the reduced one.

    Returns:
        Tuple ``(input_ids, attention_mask, full_labels, reduced_labels)`` of
        tensors, one row per example. Inputs are padded to the longest
        sequence in ``lst``.
    """
    # padding=True pads to the longest sequence in this call. truncation=True caps
    # sequences at the tokenizer's maximum model length, so an unusually long
    # argument cannot produce more positions than the encoder supports.
    all_toks = tokenizer([example[1] for example in lst], padding=True, truncation=True)
    all_lbl = [example[-2] for example in lst]
    red_lbl = [example[-1] for example in lst]
    return torch.tensor(all_toks.input_ids), torch.tensor(all_toks.attention_mask), \
            torch.tensor(all_lbl), torch.tensor(red_lbl)

In [13]:
# Encode both splits into (input ids, attention mask, full labels, reduced labels).
train_input_ids, train_attention_mask, train_labels, train_red_labels = get_encodings(
    example_dict["training"]
)
valid_input_ids, valid_attention_mask, valid_labels, valid_red_labels = get_encodings(
    example_dict["validation"]
)

# One output line per split listing the four tensor shapes in the order above.
for split_tensors in (
    (train_input_ids, train_attention_mask, train_labels, train_red_labels),
    (valid_input_ids, valid_attention_mask, valid_labels, valid_red_labels),
):
    print(*(t.shape for t in split_tensors))

torch.Size([5393, 166]) torch.Size([5393, 166]) torch.Size([5393, 20]) torch.Size([5393, 12])
torch.Size([1896, 159]) torch.Size([1896, 159]) torch.Size([1896, 20]) torch.Size([1896, 12])


In [14]:


# Define the model
class BaselineModel(nn.Module):
    """Pretrained RoBERTa encoder followed by one linear classification layer.

    The classifier reads the encoder's ``pooler_output``. The notebook's load
    warning reports that the pooler weights are newly initialised for
    ``roberta-base``, so they are learned during fine-tuning.

    Args:
        n_classes: number of output logits (one per label).
        model_name: checkpoint passed to ``RobertaModel.from_pretrained``.
            Defaults to ``"roberta-base"``, the checkpoint used originally.
    """

    def __init__(self, n_classes, model_name="roberta-base"):
        super().__init__()
        self.base_model = RobertaModel.from_pretrained(model_name)
        # Take the input width from the encoder config instead of hard-coding 768,
        # so the head matches whichever checkpoint is loaded.
        self.classifier = nn.Linear(self.base_model.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return raw (pre-sigmoid) logits with one column per class."""
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        pooled_output = outputs.pooler_output
        return self.classifier(pooled_output)

# Initialize models
# Pick the best available accelerator and fall back to CPU.
# Previously "mps" was selected whenever CUDA was missing, which fails on machines
# that have neither CUDA nor Apple's MPS backend.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
# One classifier per label granularity: 20 fine-grained labels and 12 reduced labels.
model_full = BaselineModel(n_classes=20).to(device)
model_reduced = BaselineModel(n_classes=12).to(device)

# Datasets share the same inputs and differ only in the label tensor.
train_full_data = TensorDataset(train_input_ids, train_attention_mask, train_labels)
valid_full_data = TensorDataset(valid_input_ids, valid_attention_mask, valid_labels)
train_reduced_data = TensorDataset(train_input_ids, train_attention_mask, train_red_labels)
valid_reduced_data = TensorDataset(valid_input_ids, valid_attention_mask, valid_red_labels)

# Training loaders draw batches in random order (shuffle=True builds a RandomSampler);
# validation loaders keep dataset order (the default is a SequentialSampler), which
# keeps predictions aligned with valid_labels.
train_full_loader = DataLoader(train_full_data, batch_size=16, shuffle=True)
valid_full_loader = DataLoader(valid_full_data, batch_size=16, shuffle=False)
train_reduced_loader = DataLoader(train_reduced_data, batch_size=16, shuffle=True)
valid_reduced_loader = DataLoader(valid_reduced_data, batch_size=16, shuffle=False)

### Expand the reduced predictions to full predictions
def expand_predictions(preds_reduced, mapping, n_full=20):
    """Broadcast reduced-label predictions onto the full label space.

    Args:
        preds_reduced: 2-D tensor with one row per example and one column per
            reduced label.
        mapping: dict from a reduced-label column index to the list of
            full-label column indices that should receive that column's values.
        n_full: number of columns in the result. Defaults to 20, the number of
            full labels used in this notebook.

    Returns:
        Float tensor with ``preds_reduced.size(0)`` rows and ``n_full`` columns,
        on the same device as ``preds_reduced``. Columns not listed in
        ``mapping`` stay 0.
    """
    # Allocate on the input's device rather than a module-level `device`, so the
    # function works for CPU and accelerator tensors alike and has no hidden global.
    expanded_preds = torch.zeros(preds_reduced.size(0), n_full, device=preds_reduced.device)
    for reduced_idx, full_indices in mapping.items():
        for full_idx in full_indices:
            expanded_preds[:, full_idx] = preds_reduced[:, reduced_idx]
    return expanded_preds


# Training function
def train(model, dataloader, optimizer, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Each batch is expected to unpack into (input_ids, attention_mask, labels).
    The loss is binary cross-entropy on the raw logits, with labels cast to float.
    """
    loss_fn = nn.BCEWithLogitsLoss()
    running_loss = 0
    model.train()

    for batch in tqdm(dataloader, desc="Training"):
        input_ids, attention_mask, labels = (t.to(device) for t in batch)

        optimizer.zero_grad()
        batch_loss = loss_fn(model(input_ids, attention_mask), labels.float())
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(dataloader)

# Evaluation function
def evaluate(model, dataloader, device):
    """Score ``model`` on ``dataloader`` without gradient tracking.

    Returns:
        Tuple ``(mean_batch_loss, probabilities, labels)`` where ``probabilities``
        is the sigmoid of the logits and both tensors are the per-batch results
        concatenated along dim 0 in loader order.
    """
    loss_fn = nn.BCEWithLogitsLoss()
    prob_chunks, label_chunks = [], []
    loss_sum = 0
    model.eval()

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids, attention_mask, labels = (t.to(device) for t in batch)
            logits = model(input_ids, attention_mask)
            loss_sum += loss_fn(logits, labels.float()).item()
            prob_chunks.append(logits.sigmoid())
            label_chunks.append(labels)

    probs = torch.cat(prob_chunks, dim=0)
    targets = torch.cat(label_chunks, dim=0)
    return loss_sum / len(dataloader), probs, targets

# Optimization
# One AdamW optimizer per model; the two models are trained independently.
optimizer_full = torch.optim.AdamW(model_full.parameters(), lr=1e-5)
optimizer_reduced = torch.optim.AdamW(model_reduced.parameters(), lr=1e-5)
num_epochs = 10
# Each epoch: train both models for one pass, score both on the validation split,
# then report metrics of the averaged (ensembled) predictions.
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    print("Training Full Model:")
    # The mean training losses are kept in variables but not printed.
    train_loss_full = train(model_full, train_full_loader, optimizer_full, device)
    print("Training Reduced Model:")
    train_loss_reduced = train(model_reduced, train_reduced_loader, optimizer_reduced, device)

    # Only the predicted probabilities are used; validation loss and labels are discarded.
    print("Evaluating Full Model:")
    _, preds_full, _ = evaluate(model_full, valid_full_loader, device)
    print("Evaluating Reduced Model:")
    _, preds_reduced, _ = evaluate(model_reduced, valid_reduced_loader, device)

    # Expand reduced predictions and ensemble
    # Every full label receives the probability of its reduced family, and the two
    # probability matrices are averaged element-wise.
    preds_reduced_expanded = expand_predictions(preds_reduced, reduced_to_full_indices)
    ensemble_preds = (preds_full + preds_reduced_expanded) / 2
    # Rounding turns the averaged probabilities into 0/1 decisions.
    rounded_preds = ensemble_preds.round()
    # valid_labels is compared row by row; this relies on the validation loaders
    # iterating in dataset order (they use SequentialSampler).
    f1 = f1_score(valid_labels.numpy(), rounded_preds.cpu().numpy(), average='macro')
    # NOTE(review): for multi-label inputs accuracy_score is presumably exact-match
    # (subset) accuracy, which would explain the low values -- confirm in sklearn docs.
    accuracy = accuracy_score(valid_labels.numpy(), rounded_preds.cpu().numpy())

    print(f"Validation F1 Score: {f1}")
    print(f"Validation Accuracy: {accuracy}")

# Save the models
# Runs once after the loop, so the files hold the weights from the last epoch.
torch.save(model_full.state_dict(), "model_full.pt")
torch.save(model_reduced.state_dict(), "model_reduced.pt")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Training Full Model:


Training: 100%|██████████| 338/338 [03:34<00:00,  1.58it/s]


Training Reduced Model:


Training: 100%|██████████| 338/338 [03:35<00:00,  1.57it/s]


Evaluating Full Model:


Evaluating: 100%|██████████| 119/119 [00:19<00:00,  6.01it/s]


Evaluating Reduced Model:


Evaluating: 100%|██████████| 119/119 [00:19<00:00,  6.02it/s]


Validation F1 Score: 0.1841966895245501
Validation Accuracy: 0.031118143459915613
Epoch 2/10
Training Full Model:


Training: 100%|██████████| 338/338 [03:31<00:00,  1.60it/s]


Training Reduced Model:


Training: 100%|██████████| 338/338 [03:31<00:00,  1.60it/s]


Evaluating Full Model:


Evaluating: 100%|██████████| 119/119 [00:19<00:00,  5.97it/s]


Evaluating Reduced Model:


Evaluating: 100%|██████████| 119/119 [00:19<00:00,  5.96it/s]


Validation F1 Score: 0.2988598652378924
Validation Accuracy: 0.04219409282700422
Epoch 3/10
Training Full Model:


Training: 100%|██████████| 338/338 [03:33<00:00,  1.58it/s]


Training Reduced Model:


Training: 100%|██████████| 338/338 [03:33<00:00,  1.58it/s]


Evaluating Full Model:


Evaluating: 100%|██████████| 119/119 [00:19<00:00,  5.96it/s]


Evaluating Reduced Model:


Evaluating: 100%|██████████| 119/119 [00:19<00:00,  5.97it/s]


Validation F1 Score: 0.3450756118272699
Validation Accuracy: 0.03850210970464135
Epoch 4/10
Training Full Model:


Training: 100%|██████████| 338/338 [03:33<00:00,  1.58it/s]


Training Reduced Model:


Training: 100%|██████████| 338/338 [03:34<00:00,  1.58it/s]


Evaluating Full Model:


Evaluating: 100%|██████████| 119/119 [00:19<00:00,  5.97it/s]


Evaluating Reduced Model:


Evaluating: 100%|██████████| 119/119 [00:19<00:00,  5.99it/s]


Validation F1 Score: 0.3907449526021393
Validation Accuracy: 0.04272151898734177
Epoch 5/10
Training Full Model:


Training: 100%|██████████| 338/338 [03:34<00:00,  1.58it/s]


Training Reduced Model:


Training: 100%|██████████| 338/338 [03:34<00:00,  1.58it/s]


Evaluating Full Model:


Evaluating: 100%|██████████| 119/119 [00:20<00:00,  5.87it/s]


Evaluating Reduced Model:


Evaluating: 100%|██████████| 119/119 [00:20<00:00,  5.87it/s]


Validation F1 Score: 0.39137378503068065
Validation Accuracy: 0.03691983122362869
Epoch 6/10
Training Full Model:


Training: 100%|██████████| 338/338 [03:33<00:00,  1.58it/s]


Training Reduced Model:


Training: 100%|██████████| 338/338 [03:32<00:00,  1.59it/s]


Evaluating Full Model:


Evaluating: 100%|██████████| 119/119 [00:19<00:00,  5.98it/s]


Evaluating Reduced Model:


Evaluating: 100%|██████████| 119/119 [00:19<00:00,  5.98it/s]


Validation F1 Score: 0.3932262748591634
Validation Accuracy: 0.03270042194092827
Epoch 7/10
Training Full Model:


Training: 100%|██████████| 338/338 [03:34<00:00,  1.58it/s]


Training Reduced Model:


Training: 100%|██████████| 338/338 [03:33<00:00,  1.58it/s]


Evaluating Full Model:


Evaluating: 100%|██████████| 119/119 [00:19<00:00,  5.98it/s]


Evaluating Reduced Model:


Evaluating: 100%|██████████| 119/119 [00:19<00:00,  5.97it/s]


Validation F1 Score: 0.4100395940866327
Validation Accuracy: 0.030063291139240507
Epoch 8/10
Training Full Model:


Training: 100%|██████████| 338/338 [03:32<00:00,  1.59it/s]


Training Reduced Model:


Training: 100%|██████████| 338/338 [03:34<00:00,  1.58it/s]


Evaluating Full Model:


Evaluating: 100%|██████████| 119/119 [00:20<00:00,  5.94it/s]


Evaluating Reduced Model:


Evaluating: 100%|██████████| 119/119 [00:19<00:00,  6.00it/s]


Validation F1 Score: 0.4093011941392081
Validation Accuracy: 0.031118143459915613
Epoch 9/10
Training Full Model:


Training: 100%|██████████| 338/338 [03:34<00:00,  1.58it/s]


Training Reduced Model:


Training: 100%|██████████| 338/338 [03:33<00:00,  1.58it/s]


Evaluating Full Model:


Evaluating: 100%|██████████| 119/119 [00:19<00:00,  5.96it/s]


Evaluating Reduced Model:


Evaluating: 100%|██████████| 119/119 [00:20<00:00,  5.88it/s]


Validation F1 Score: 0.40277887125808104
Validation Accuracy: 0.03270042194092827
Epoch 10/10
Training Full Model:


Training: 100%|██████████| 338/338 [03:32<00:00,  1.59it/s]


Training Reduced Model:


Training: 100%|██████████| 338/338 [03:31<00:00,  1.60it/s]


Evaluating Full Model:


Evaluating: 100%|██████████| 119/119 [00:19<00:00,  5.98it/s]


Evaluating Reduced Model:


Evaluating: 100%|██████████| 119/119 [00:19<00:00,  5.98it/s]


Validation F1 Score: 0.40832106273167257
Validation Accuracy: 0.026371308016877638


In [18]:
def evaluate_valid(model, dataloader, device):
    """Return ``(probabilities, labels)`` for ``model`` on ``dataloader``.

    Thin wrapper around ``evaluate``: the previous body was a line-for-line copy
    of that function which accumulated a loss only to discard it. Delegating
    keeps a single evaluation loop to maintain.

    Returns:
        Tuple of two tensors concatenated over all batches in loader order:
        sigmoid probabilities and the corresponding labels.
    """
    _, all_preds, all_labels = evaluate(model, dataloader, device)
    return all_preds, all_labels

In [20]:
# Score the validation split with the current weights of both models.
preds_full, labels_full = evaluate_valid(model_full, valid_full_loader, device)
preds_reduced, _ = evaluate_valid(model_reduced, valid_reduced_loader, device)

# Spread reduced-label probabilities over the full label space, average them with the
# full model's probabilities, and keep labels whose averaged probability exceeds 0.5.
preds_reduced_expanded = expand_predictions(preds_reduced, reduced_to_full_indices)
ensemble_preds = preds_full.add(preds_reduced_expanded).div(2)
rounded_ensemble_preds = ensemble_preds.gt(0.5).int()

# Convert once to NumPy for scikit-learn.
y_true = labels_full.cpu().numpy()
y_pred = rounded_ensemble_preds.cpu().numpy()

# average=None yields one F1 value per label column.
f1_scores = f1_score(y_true, y_pred, average=None)
accuracy = accuracy_score(y_true, y_pred)
print(f"Validation Accuracy: {accuracy}")
print("Label-wise F1 Scores:")
for i, score in enumerate(f1_scores):
    print(f"Label {id_2_class[i]} F1-score: {score:.4f}")

Evaluating: 100%|██████████| 119/119 [00:21<00:00,  5.66it/s]
Evaluating: 100%|██████████| 119/119 [00:19<00:00,  5.97it/s]

Validation Accuracy: 0.026371308016877638
Label-wise F1 Scores:
Label Achievement F1-score: 0.6531
Label Benevolence: caring F1-score: 0.6016
Label Benevolence: dependability F1-score: 0.3249
Label Conformity: interpersonal F1-score: 0.2000
Label Conformity: rules F1-score: 0.4733
Label Face F1-score: 0.0559
Label Hedonism F1-score: 0.4156
Label Humility F1-score: 0.0933
Label Power: dominance F1-score: 0.2473
Label Power: resources F1-score: 0.4494
Label Security: personal F1-score: 0.6865
Label Security: societal F1-score: 0.5773
Label Self-direction: action F1-score: 0.5648
Label Self-direction: thought F1-score: 0.5007
Label Stimulation F1-score: 0.2130
Label Tradition F1-score: 0.3793
Label Universalism: concern F1-score: 0.6480
Label Universalism: nature F1-score: 0.4018
Label Universalism: objectivity F1-score: 0.4357
Label Universalism: tolerance F1-score: 0.2447





In [15]:
# len(data_dict['validation'].keys())