#FINAL

# Final_RL

In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, T5ForConditionalGeneration, T5Tokenizer, AdamW
from sklearn.model_selection import train_test_split

# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "mps")

# Load Data
DATA_PATH = "/Users/aayush/Documents/IIITD/Assignments/NLP/Project/Final_Project/Project_Data"
arguments_data = pd.read_csv(f'{DATA_PATH}/arguments-training.tsv', delimiter='\t')
labels_data = pd.read_csv(f'{DATA_PATH}/labels-training.tsv', delimiter='\t')
full_data = pd.merge(arguments_data, labels_data, on="Argument ID")

# Initialize Tokenizers and Models
classification_model_name = 'pepa/roberta-base-snli'
classification_tokenizer = AutoTokenizer.from_pretrained(classification_model_name)
classification_model = AutoModelForSequenceClassification.from_pretrained(classification_model_name)
classification_model.to(device)

generation_model_name = 't5-small'
generation_tokenizer = T5Tokenizer.from_pretrained(generation_model_name)
generation_model = T5ForConditionalGeneration.from_pretrained(generation_model_name)
generation_model.to(device)




  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "mps")

# Assume 'full_data' is loaded as per your dataset structure
label_columns = full_data.columns[4:]  # all label columns

# Dataset class
class ArgumentDataset(Dataset):
    def __init__(self, dataframe, class_tokenizer, gen_tokenizer, max_length=512):
        self.class_tokenizer = class_tokenizer
        self.gen_tokenizer = gen_tokenizer
        self.data = dataframe
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data_row = self.data.iloc[idx]
        premise = data_row['Premise']
        stance = data_row['Stance']
        conclusion = data_row['Conclusion']
        labels = data_row[label_columns].astype(int).to_numpy()

        # Tokenization for classification
        class_inputs = self.class_tokenizer(stance + " " + premise, return_tensors="pt", max_length=self.max_length, padding="max_length", truncation=True)

        # Prepare labels as string for generative model input
        labels_str = " ".join([f"{label_col}:{label}" for label_col, label in zip(label_columns, labels)])
        gen_input = f"{stance} {premise} {labels_str}"

        # Tokenization for generation
        gen_inputs = self.gen_tokenizer(gen_input, return_tensors="pt", max_length=self.max_length, padding="max_length", truncation=True)
        gen_labels = self.gen_tokenizer.encode(conclusion, max_length=self.max_length, truncation=True, padding="max_length", return_tensors="pt").squeeze()

        return {
            'input_ids': class_inputs['input_ids'].squeeze(0),
            'attention_mask': class_inputs['attention_mask'].squeeze(0),
            'labels': torch.tensor(labels, dtype=torch.float),
            'gen_input_ids': gen_inputs['input_ids'].squeeze(0),
            'gen_attention_mask': gen_inputs['attention_mask'].squeeze(0),
            'gen_labels': gen_labels
        }

# Load Tokenizers and Models
classification_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
classification_model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_columns))
generation_tokenizer = T5Tokenizer.from_pretrained('t5-small')
generation_model = T5ForConditionalGeneration.from_pretrained('t5-small')

classification_model.to(device)
generation_model.to(device)

# Data split
train_data, val_data = train_test_split(full_data, test_size=0.1, random_state=42)
train_dataset = ArgumentDataset(train_data, classification_tokenizer, generation_tokenizer)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# Optimizers
optimizer = AdamW(list(classification_model.parameters()) + list(generation_model.parameters()), lr=5e-5)

# Training loop
for epoch in range(1):  # Adjust the number of epochs as necessary
    classification_model.train()
    generation_model.train()
    all_f1_scores = []  # To store the F1 scores for all labels from all batches

    for i, batch in enumerate(train_loader):
        input_ids, attention_mask, labels, gen_input_ids, gen_attention_mask, gen_labels = \
            batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device), \
            batch['gen_input_ids'].to(device), batch['gen_attention_mask'].to(device), batch['gen_labels'].to(device)

        # Forward passes
        optimizer.zero_grad()
        class_outputs = classification_model(input_ids, attention_mask=attention_mask, labels=labels)
        class_loss = class_outputs.loss

        # Predictions for F1 score
        predictions = torch.sigmoid(class_outputs.logits).round().detach().cpu().numpy()
        true_labels = labels.detach().cpu().numpy()

        gen_outputs = generation_model(input_ids=gen_input_ids, attention_mask=gen_attention_mask, labels=gen_labels)
        gen_loss = gen_outputs.loss

        # Combine losses and perform backpropagation
        total_loss = class_loss + gen_loss
        total_loss.backward()
        optimizer.step()

        # Calculate F1 scores for each label using zero_division=1 to handle undefined metric cases
        label_f1_scores = f1_score(true_labels, predictions, average=None, zero_division=1)
        all_f1_scores.append(label_f1_scores)

        # Output batch results
        if(i%50==0):
            print(f"Batch {i+1}/{len(train_loader)} - Epoch {epoch+1}:")
            print(f"Classification Loss: {class_loss.item()}, Generation Loss: {gen_loss.item()}")
            print(f"Batch F1-Scores per Label: {label_f1_scores}")

    # Aggregate F1-score over all batches
    all_f1_scores = np.vstack(all_f1_scores)  # Stack all F1-scores from each batch
    aggregate_f1_scores = np.mean(all_f1_scores, axis=0)  # Mean F1-score for each label
    overall_f1_score = np.mean(aggregate_f1_scores)  # Overall average F1-score across all labels

    # Output epoch results
    print(f"End of Epoch {epoch+1}:")
    print(f"Aggregate F1-Scores per Label: {aggregate_f1_scores}")
    print(f"Overall F1-Score: {overall_f1_score}")

print('Finished Training')



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


RuntimeError: MPS backend out of memory (MPS allocated: 20.21 GB, other allocations: 205.66 MB, max allowed: 20.40 GB). Tried to allocate 8.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
import numpy as np


arguments_data_val = pd.read_csv(f'{DATA_PATH}/arguments-validation.tsv', delimiter='\t')
labels_data_val = pd.read_csv(f'{DATA_PATH}/labels-validation.tsv', delimiter='\t')
full_data_val = pd.merge(arguments_data_val, labels_data_val, on="Argument ID")
val_data = full_data_val

# Assuming all initialization and imports are done as before

# Function to evaluate the model and calculate F1-score per label
def evaluate_model(data_loader, classification_model, device):
    classification_model.eval()
    true_labels = []
    predictions = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids, attention_mask, labels = \
                batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)

            # Classification predictions
            class_outputs = classification_model(input_ids, attention_mask=attention_mask)
            predicted_labels = torch.sigmoid(class_outputs.logits).round().detach().cpu().numpy()
            actual_labels = labels.cpu().numpy()

            predictions.append(predicted_labels)
            true_labels.append(actual_labels)

    predictions = np.vstack(predictions)
    true_labels = np.vstack(true_labels)

    # Calculate F1 scores and accuracy
    f1_scores_per_label = f1_score(true_labels, predictions, average=None, zero_division=1)
    overall_f1 = np.mean(f1_scores_per_label)
    
    # Calculate accuracy for each label
    accuracy_per_label = []
    for i in range(true_labels.shape[1]):  # Iterate over each label column
        accuracy_per_label.append(accuracy_score(true_labels[:, i], predictions[:, i]))
    
    overall_accuracy = np.mean(accuracy_per_label)  # Average accuracy over all labels
    
    return f1_scores_per_label, overall_f1, overall_accuracy

# Load validation data into DataLoader
val_dataset = ArgumentDataset(val_data, classification_tokenizer, generation_tokenizer, max_length=512)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

# Evaluate the model on the validation dataset
f1_scores_per_label, overall_f1_validation, acc = evaluate_model(val_loader, classification_model, device)

# Output F1 scores for each label and the overall average
print("F1 Scores per Label:", f1_scores_per_label)
print("Overall Average F1 Score on Validation Set:", overall_f1_validation)
print("Overall Accuracy on Val set: ", acc)


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np

arguments_data_test = pd.read_csv(f'{DATA_PATH}/arguments-test.tsv', delimiter='\t')
labels_data_test = pd.read_csv(f'{DATA_PATH}/labels-test.tsv', delimiter='\t')
full_data_test = pd.merge(arguments_data_test, labels_data_test, on="Argument ID")
test_data = full_data_test

# Assuming all initialization and imports are done as before

# Function to evaluate the model and calculate F1-score per label
# def evaluate_model(data_loader, classification_model, device):
#     classification_model.eval()  # Set the model to evaluation mode
#     true_labels = []
#     predictions = []

#     with torch.no_grad():  # Disable gradient computation
#         for batch in data_loader:
#             input_ids, attention_mask, labels = \
#                 batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)

#             # Get classification outputs
#             class_outputs = classification_model(input_ids, attention_mask=attention_mask)
#             predicted_labels = torch.sigmoid(class_outputs.logits).round().cpu().numpy()  # Get predicted labels
#             actual_labels = labels.cpu().numpy()  # Get actual labels

#             # Store predictions and actual labels
#             predictions.append(predicted_labels)
#             true_labels.append(actual_labels)

#     # Convert lists to NumPy arrays for evaluation
#     predictions = np.vstack(predictions)
#     true_labels = np.vstack(true_labels)

#     # Calculate F1 scores for each label
#     f1_scores_per_label = f1_score(true_labels, predictions, average=None, zero_division=1)
    
#     # Calculate the overall average F1 score across all labels
#     overall_f1 = np.mean(f1_scores_per_label)
    
#     return f1_scores_per_label, overall_f1

# Load validation data into DataLoader
# test_data = 
test_dataset = ArgumentDataset(test_data, classification_tokenizer, generation_tokenizer, max_length=512)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Evaluate the model on the validation dataset
f1_scores_per_label, overall_f1_validation, acc = evaluate_model(test_loader, classification_model, device)

# Output F1 scores for each label and the overall average
print("F1 Scores per Label:", f1_scores_per_label)
print("Overall Average F1 Score on Test Set:", overall_f1_validation)
print("Overall Accuracy on the test set: ", acc)
