In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from transformers import XLMRobertaTokenizer, XLMRobertaModel
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import CosineAnnealingLR

In [None]:
# Make Google Drive visible inside the Colab runtime; the data and checkpoint
# paths used below are relative paths under this mount.
from google.colab import drive

_drive_mount_point = '/content/drive'
drive.mount(_drive_mount_point)

In [3]:
# Task selector: 1 -> "orientation" files read from the `text_en` column,
#                2 -> "power" files read from the `text` column.
task = 1

data_path = "drive/MyDrive/463_hw2_data/"

if task == 1:
    train_path = data_path + "train_data_orientation.tsv"
    test_path = data_path + "test_data_orientation.tsv"
    text_type = "text_en"
elif task == 2:
    train_path = data_path + "train_data_power.tsv"
    test_path = data_path + "test_data_power.tsv"
    text_type = "text"
else:
    raise ValueError("Invalid task number")


def _drop_blank_text(frame, column):
    """Return `frame` without rows whose `column` is NaN, empty or whitespace-only.

    The index is reset so that rows are numbered 0..n-1 afterwards.
    """
    frame = frame.dropna(subset=[column])
    frame = frame[frame[column].str.strip() != ""]
    return frame.reset_index(drop=True)


train_df = pd.read_csv(train_path, sep='\t')
test_df = pd.read_csv(test_path, sep='\t')

# Clean on the column the model actually reads (`text_type`), not a hard-coded
# "text_en": for task 2 the input column is "text", so filtering "text_en"
# would both keep blank inputs and drop usable rows.
train_df = _drop_blank_text(train_df, text_type)
test_df = _drop_blank_text(test_df, text_type)

In [4]:
class CustomXLMRobertaModel(nn.Module):
    """XLM-RoBERTa encoder followed by a small classification head.

    The first-token representation of the encoder is concatenated with
    `num_additional_features` extra per-sample features and passed through
    fc1 -> sigmoid -> dropout -> multi-head self-attention -> fc2 -> tanh ->
    fc3 -> relu -> fc4.

    Args:
        pretrained_model_name: Hugging Face checkpoint name for the tokenizer
            and encoder.
        num_additional_features: Number of extra features concatenated to the
            encoder representation (width of `additional_features` in
            `forward`).
        output_size: Number of output classes.
    """

    def __init__(self, pretrained_model_name="FacebookAI/xlm-roberta-base", num_additional_features=1, output_size=2):
        super(CustomXLMRobertaModel, self).__init__()
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(pretrained_model_name)
        self.xlm_roberta = XLMRobertaModel.from_pretrained(pretrained_model_name)

        # Freeze the whole encoder, then unfreeze only its last 8 transformer layers.
        for param in self.xlm_roberta.parameters():
            param.requires_grad = False
        for param in self.xlm_roberta.encoder.layer[-8:].parameters():
            param.requires_grad = True

        # Input width of the head = encoder hidden size + extra features.
        hidden_size = self.xlm_roberta.config.hidden_size + num_additional_features

        self.fc1 = nn.Linear(hidden_size, 512)
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(0.5)

        num_heads = 8
        self.self_attention = nn.MultiheadAttention(embed_dim=512, num_heads=num_heads, batch_first=True)

        # Fully connected layers
        self.fc2 = nn.Linear(512, 128)
        self.tanh = nn.Tanh()
        self.fc3 = nn.Linear(128, 64)
        self.relu = nn.ReLU()
        self.fc4 = nn.Linear(64,output_size)  # One logit per class

    def forward(self, input_ids, attention_mask, additional_features):
        """Compute class logits for a batch.

        Args:
            input_ids: Token ids forwarded unchanged to the encoder.
            attention_mask: Attention mask forwarded unchanged to the encoder.
            additional_features: 2-D tensor (batch, num_additional_features);
                it is concatenated to the encoder representation along dim 1.

        Returns:
            Tensor of shape (batch, output_size) holding raw, unnormalized
            logits. No softmax is applied here: `nn.CrossEntropyLoss` applies
            log-softmax internally, so feeding it probabilities would
            normalize twice and flatten the gradients. `argmax` over dim 1
            gives the same predictions for logits and probabilities; call
            `torch.softmax(logits, dim=1)` when probabilities are needed.
        """
        # Step 1: encoder pass; keep the hidden state of the first token.
        outputs = self.xlm_roberta(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]

        # Step 2: append the extra per-sample features.
        combined_features = torch.cat((cls_output, additional_features), dim=1)

        # Step 3: first fully connected layer with sigmoid activation and dropout.
        fc1_output = self.fc1(combined_features)
        fc1_output = self.sigmoid(fc1_output)
        fc1_output = self.dropout(fc1_output)

        # Step 4: add a sequence dimension of length 1 -> (batch, 1, 512).
        fc1_output = fc1_output.unsqueeze(1)

        # Step 5: self-attention. The sequence has a single position, so each
        # sample attends only to itself.
        attn_output, _ = self.self_attention(fc1_output, fc1_output, fc1_output)

        # Remove the sequence dimension again -> (batch, 512).
        attn_output = attn_output.squeeze(1)

        # Step 6: remaining fully connected layers.
        x = self.fc2(attn_output)
        x = self.tanh(x)

        x = self.fc3(x)
        x = self.relu(x)

        # Raw logits (see Returns above for why no softmax is applied).
        return self.fc4(x)

class ParliamentaryDataset(Dataset):
    """Torch dataset that tokenizes one speech per item and encodes speaker sex.

    The dataframe must contain the text column plus `sex` and `label` columns.

    Args:
        dataframe: Source rows; any index is accepted because rows are
            addressed by position.
        tokenizer: Callable with the Hugging Face tokenizer call signature
            (`text, padding=, truncation=, max_length=, return_tensors=`)
            returning a mapping with `input_ids` and `attention_mask`.
        max_length: Length every sequence is padded / truncated to.
        text_column: Name of the column holding the text. When None, the
            module-level `text_type` variable is read at item access time.
    """

    def __init__(self, dataframe, tokenizer, max_length=128, text_column=None):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text_column = text_column

    def __len__(self):
        """Number of rows in the underlying dataframe."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tensors for the row at position `idx`.

        Returns:
            dict with `input_ids` and `attention_mask` (first dimension of the
            tokenizer output squeezed away), `additional_features` (float
            tensor of shape (1,) with the encoded sex) and `label` (float
            scalar tensor).
        """
        column = self.text_column if self.text_column is not None else text_type

        # Positional access (`iat`): DataLoader indices run 0..len-1 and must
        # not be interpreted as index labels of the dataframe.
        text = self.data[column].iat[idx]
        sex = self.data["sex"].iat[idx]
        label = self.data["label"].iat[idx]

        # Tokenize text to a fixed length.
        tokenized = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # "M" -> 1, "F" -> 0, anything else (unknown / missing) -> -1.
        if sex == "M":
            sex_encoded = 1
        elif sex == "F":
            sex_encoded = 0
        else:
            sex_encoded = -1

        extra_features = torch.tensor([sex_encoded], dtype=torch.float)

        return {
            "input_ids": tokenized["input_ids"].squeeze(0),
            "attention_mask": tokenized["attention_mask"].squeeze(0),
            "additional_features": extra_features,
            "label": torch.tensor(label, dtype=torch.float)
        }

def initialize_weights(layer):
    """Xavier-initialize an `nn.Linear` layer in place; ignore every other module.

    Intended for use with `module.apply(initialize_weights)`. The weight gets
    Xavier-uniform values and the bias, if the layer has one, is zeroed.

    Args:
        layer: Any `nn.Module`; only `nn.Linear` instances (including
            subclasses) are modified.
    """
    if isinstance(layer, nn.Linear):
        nn.init.xavier_uniform_(layer.weight)
        # Linear(bias=False) has `bias is None`; zeroing it would raise.
        if layer.bias is not None:
            nn.init.zeros_(layer.bias)


In [None]:
# Fraction of the cleaned training data to use (1.0 keeps every row).
sampling_percentage = 1.0
batch_size = 64
max_length = 512
max_length = min(max_length,512)  # never request more than 512 tokens per sequence
pretrained_model_name="FacebookAI/xlm-roberta-base"

# Randomly sample a percentage of the training data, then hold out 9% of it
# for validation. Both steps are seeded for reproducibility.
sampled_train_df = train_df.sample(frac=sampling_percentage, random_state=42).reset_index(drop=True)
sampled_train_df, val_df = train_test_split(sampled_train_df, test_size=0.09, random_state=42)
# Renumber rows 0..n-1 (assignment instead of inplace so the split results
# are never mutated through a view).
sampled_train_df = sampled_train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

# Load the tokenizer once and share it between the three datasets.
tokenizer = XLMRobertaTokenizer.from_pretrained(pretrained_model_name)

# Only the training loader is shuffled; evaluation metrics do not depend on
# batch order, and a fixed order keeps validation/test runs reproducible.
train_dataset = ParliamentaryDataset(sampled_train_df, tokenizer, max_length=max_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = ParliamentaryDataset(test_df, tokenizer, max_length=max_length)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

val_dataset = ParliamentaryDataset(val_df, tokenizer, max_length=max_length)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Count occurrences of each class (labels 0 and 1) in the training split.
label_counts = sampled_train_df["label"].value_counts()
class_counts = [int(label_counts[0]), int(label_counts[1])]
total_samples = class_counts[0] + class_counts[1]

# Weights inversely related to class frequency: each class is weighted by the
# share of the *other* class, so the rarer class gets the larger weight.
# Computed element-wise; dividing a Python list by a scalar only works when
# the scalar happens to be a numpy type.
class_weights = [class_counts[1] / total_samples, class_counts[0] / total_samples]

# Convert to tensor on the device the loss will be evaluated on.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class_weights_tensor = torch.tensor([class_weights[0], class_weights[1]], dtype=torch.float).to(device)
print(class_weights_tensor)
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

In [None]:
# reload == 0: start from the pretrained encoder with a freshly initialized head.
# anything else: continue from the checkpoint stored at `model_name`.
reload = 1
num_epochs = 30
model_name = "drive/MyDrive/463_hw2_data/model_task" + str(task) + ".pth"

if reload == 0:
    model = CustomXLMRobertaModel()
    # Re-initialize only the classification head. Calling `model.apply(...)`
    # on the whole model would also overwrite every nn.Linear inside the
    # pretrained encoder and throw the pretrained weights away.
    for child_name, child in model.named_children():
        if child_name != "xlm_roberta":
            child.apply(initialize_weights)
else:
    # The checkpoint is a fully pickled module (written with
    # `torch.save(model, ...)`), so it needs `weights_only=False`.
    # SECURITY: unpickling executes arbitrary code - only load checkpoints
    # you created yourself.
    # `map_location="cpu"` lets a GPU-saved checkpoint load on any runtime;
    # the model is moved to the target device before training.
    model = torch.load(model_name, map_location="cpu", weights_only=False)

# Build the optimizer only after `model` is final. Creating it before the
# checkpoint is loaded would bind it to the parameters of a discarded model,
# and the loaded model would never be updated.
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-5,weight_decay=0.01)
scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs)


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop: per epoch, one optimization pass over `train_loader`, one
# gradient-free pass over `val_loader`, one scheduler step and a checkpoint.
for epoch in range(num_epochs):
    # ---- Training phase ----
    model.train()
    total_train_loss = 0
    all_train_preds = []
    all_train_labels = []

    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_loader_tqdm = tqdm(train_loader, desc="Training", unit="batch")

    for batch in train_loader_tqdm:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        additional_features = batch["additional_features"].to(device)
        # The dataset yields float labels; CrossEntropyLoss needs class indices.
        labels = batch["label"].long().to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, additional_features)
        loss = criterion(outputs, labels)
        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_train_loss += loss.item()
        preds = torch.argmax(outputs, dim=1).detach().cpu().numpy()
        all_train_preds.extend(preds)
        all_train_labels.extend(labels.detach().cpu().numpy())

        train_loader_tqdm.set_postfix({"Batch Loss": loss.item()})

    # Mean of the per-batch losses.
    avg_train_loss = total_train_loss / len(train_loader)
    train_balanced_acc = balanced_accuracy_score(all_train_labels, all_train_preds)

    print(f"Training Loss: {avg_train_loss:.4f}")
    print(f"Training Balanced Accuracy: {train_balanced_acc:.4f}")
    print(classification_report(all_train_labels, all_train_preds, zero_division=0))

    # ---- Validation phase ----
    model.eval()
    total_val_loss = 0
    all_val_preds = []
    all_val_labels = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            additional_features = batch["additional_features"].to(device)
            labels = batch["label"].long().to(device)

            outputs = model(input_ids, attention_mask, additional_features)
            loss = criterion(outputs, labels)

            total_val_loss += loss.item()
            preds = torch.argmax(outputs, dim=1).detach().cpu().numpy()
            all_val_preds.extend(preds)
            all_val_labels.extend(labels.detach().cpu().numpy())

    avg_val_loss = total_val_loss / len(val_loader)
    val_balanced_acc = balanced_accuracy_score(all_val_labels, all_val_preds)

    print(f"Validation Loss: {avg_val_loss:.4f}")
    print(f"Validation Balanced Accuracy: {val_balanced_acc:.4f}")
    print(classification_report(all_val_labels, all_val_preds, zero_division=0))

    # Step the scheduler once per epoch. CosineAnnealingLR follows a fixed
    # schedule and takes no metric: a value passed to step() is treated as the
    # (deprecated) epoch index, so passing the validation loss would pin the
    # learning rate near its initial value instead of annealing it.
    scheduler.step()
    print(f"Learning Rate after Epoch {epoch+1}: {scheduler.get_last_lr()[0]:.3e}")
    print("---------------------------------------------------------------------------")

    # Checkpoint the whole module after every epoch (overwrites the previous file).
    torch.save(model, model_name)

In [None]:
# Testing loop: predict every batch of `test_loader` without gradients, then
# report a confusion matrix and a classification report.
model.eval()  # Set the model to evaluation mode (disables dropout)
all_preds = []
all_labels = []

print("Evaluating...")
test_loader_tqdm = tqdm(test_loader, desc="Processing Batches", unit="batch")  # Wrap test_loader with tqdm

with torch.no_grad():  # No need to compute gradients during testing
    for batch in test_loader_tqdm:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        additional_features = batch["additional_features"].to(device)
        # Integer class ids, as in the training loop, so that true and
        # predicted labels share one dtype in the sklearn reports.
        labels = batch["label"].long()

        # Model output is (batch, num_classes); argmax over dim 1 picks the class.
        outputs = model(input_ids, attention_mask, additional_features)
        preds = torch.argmax(outputs, dim=1).detach().cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.numpy())

# Fix the label order so the matrix is always 2x2 and lines up with
# `class_names`, even if one class never occurs in the labels or predictions.
conf_matrix = confusion_matrix(all_labels, all_preds, labels=[0, 1])

# Plot confusion matrix
if task == 1:
    class_names = ["Left-0", "Right-1"]
elif task == 2:
    class_names = ["Governing-0", "Opposition-1"]
else:
    raise ValueError("Invalid task number")

disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=class_names)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.show()

print(classification_report(all_labels, all_preds, zero_division=0))