In [3]:
import pandas as pd

# Load the training split; the path is relative to the notebook's working directory.
df = pd.read_csv('./data/train.csv')
# Bare last expression so the notebook renders the frame as the cell output.
df

Unnamed: 0.1,Unnamed: 0,text_id,generated_text,suddenness,familiarity,predict_event,pleasantness,unpleasantness,goal_relevance,chance_responsblt,...,self_control,other_control,chance_control,accept_conseq,standards,social_norms,attention,not_consider,effort,emotion
0,0,3573,I was told “I love you” by a special person,2,2,4,5,1,5,3,...,5,5,5,4,1,1,4,4,3,joy
1,1,51199,I gave birth to my son.,2,3,5,3,3,5,4,...,1,2,4,4,1,1,5,1,5,joy
2,2,6601,When a student told me to ‘fuck off’ to my face,5,1,1,1,5,1,1,...,1,5,1,1,3,5,5,3,5,no-emotion
3,3,3727,When I’m doing my day to day routine.,4,2,2,1,3,2,1,...,4,1,4,1,1,1,1,2,4,no-emotion
4,4,52291,i failed a subject.,2,3,4,1,5,5,3,...,4,4,4,3,4,1,4,4,4,shame
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4315,4315,760,I had to go into my brothers flooded basement ...,3,1,1,1,5,1,4,...,4,1,5,1,4,1,5,4,3,fear
4316,4316,5121,When my dad cut me from his life,2,1,1,1,5,5,1,...,1,5,1,1,4,5,5,5,5,anger
4317,4317,4506,I felt fear when walking home alone on Wednesd...,2,4,4,1,5,1,1,...,3,3,1,3,5,1,5,5,4,fear
4318,4318,52508,my husband and my daughter came to meet me fro...,3,4,2,4,1,2,1,...,1,5,1,2,1,1,1,1,1,surprise


In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from transformers import RobertaTokenizer

# Define the custom dataset class
class EmotionDataset(Dataset):
    """Map-style dataset pairing tokenized text with integer-encoded emotion labels.

    Each item is ``(inputs, label)``: ``inputs`` is a dict of the tokenizer's
    output tensors with the leading batch dimension squeezed away, and
    ``label`` is a scalar ``torch.long`` tensor.
    """

    def __init__(self, dataframe, tokenizer, max_length, label_encoder,
                 text_column='generated_text', label_column='emotion'):
        """
        Args:
            dataframe (pd.DataFrame): DataFrame containing the text and label columns.
            tokenizer: Tokenizer for text data; called once per item in ``__getitem__``.
            max_length (int): Maximum sequence length for tokenization.
            label_encoder: Fitted encoder whose ``transform`` converts labels to numeric ids.
            text_column (str): Name of the column holding the input text.
                Defaults to ``'generated_text'``.
            label_column (str): Name of the column holding the emotion label.
                Defaults to ``'emotion'``.
        """
        self.sentences = dataframe[text_column].tolist()
        self.labels = dataframe[label_column].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_encoder = label_encoder
        # Encode every label once up front so __getitem__ only has to index.
        self.encoded_labels = label_encoder.transform(self.labels)

    def __len__(self):
        """Return the number of examples."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return ``(inputs, label)``."""
        # Tokenize the sentence, padding/truncating to a fixed length
        inputs = self.tokenizer(
            self.sentences[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # Get the label
        label = torch.tensor(self.encoded_labels[idx], dtype=torch.long)
        # Flatten tokenizer outputs (remove batch dimension)
        inputs = {key: val.squeeze(0) for key, val in inputs.items()}
        return inputs, label

# Prepare label encoder
def prepare_label_encoder(dataframe, label):
    """Fit a fresh ``LabelEncoder`` on one column of ``dataframe`` and return it.

    Args:
        dataframe (pd.DataFrame): Frame that contains the label column.
        label (str): Name of the column whose values define the classes.

    Returns:
        The fitted ``LabelEncoder``.
    """
    target_values = dataframe[label]
    # ``fit`` returns the encoder itself, so it can be returned directly.
    return LabelEncoder().fit(target_values)

# Data preparation function
def prepare_dataloaders(train_path, val_path, tokenizer, batch_size=32, max_length=128):
    """Build training and validation dataloaders from two CSV files.

    The label encoder is fitted on the training file's ``'emotion'`` column
    only and then shared by both datasets.

    Args:
        train_path (str): Path to the training CSV file.
        val_path (str): Path to the validation CSV file.
        tokenizer: Tokenizer for text data.
        batch_size (int): Batch size for both dataloaders.
        max_length (int): Maximum sequence length for tokenization.

    Returns:
        tuple: ``(train_dataloader, val_dataloader, label_encoder)``. The
        training loader shuffles; the validation loader does not.
    """
    emo_label = 'emotion'

    # Read both splits up front (training first, then validation).
    train_df, val_df = (pd.read_csv(csv_path) for csv_path in (train_path, val_path))

    # Classes are defined by the training labels alone.
    label_encoder = prepare_label_encoder(train_df, emo_label)

    def _make_loader(frame, shuffle):
        # Wrap one split in a dataset and hand it to a DataLoader.
        split_dataset = EmotionDataset(frame, tokenizer, max_length, label_encoder)
        return DataLoader(split_dataset, batch_size=batch_size, shuffle=shuffle)

    train_dataloader = _make_loader(train_df, True)
    val_dataloader = _make_loader(val_df, False)

    return train_dataloader, val_dataloader, label_encoder

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
import torch
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
import torch.nn.functional as F

# Validation function
def evaluate(model, dataloader, criterion, device):
    """Compute the mean per-batch loss and the accuracy of ``model`` on ``dataloader``.

    Both batch/model styles are supported:

    * mapping batches (e.g. tokenizer outputs) are unpacked as keyword
      arguments, ``model(**batch)``;
    * any other batch is treated as a single tensor and passed positionally,
      ``model(batch)``.

    Likewise the model may return either an object exposing ``.logits`` or the
    logits tensor itself.

    Args:
        model: Module to evaluate; it is switched to eval mode.
        dataloader: Iterable of ``(batch, labels)`` pairs that supports ``len()``.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batch and labels are moved to.

    Returns:
        tuple: ``(avg_loss, accuracy)`` where ``avg_loss`` is the sum of batch
        losses divided by the number of batches and ``accuracy`` is the
        fraction of correctly classified samples.

    Raises:
        ZeroDivisionError: If ``dataloader`` contains no batches.
    """
    from collections.abc import Mapping  # local import keeps this cell self-contained

    model.eval()
    total_loss = 0
    correct = 0
    total_samples = 0

    with torch.no_grad():
        for batch, labels in dataloader:
            is_mapping = isinstance(batch, Mapping)
            if is_mapping:
                batch = {k: v.to(device) for k, v in batch.items()}
            else:
                batch = batch.to(device)
            labels = labels.to(device)

            with autocast():
                outputs = model(**batch) if is_mapping else model(batch)
                # Transformer-style outputs wrap the logits; plain modules return them directly.
                logits = getattr(outputs, 'logits', outputs)
                loss = criterion(logits, labels)
                total_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            correct += (preds == labels).sum().item()
            total_samples += labels.size(0)

    avg_loss = total_loss / len(dataloader)
    accuracy = correct / total_samples
    return avg_loss, accuracy

# Training loop with mixed precision and gradient checkpointing
def train_model(model, dataloader, optimizer, scheduler, criterion, device, num_epochs=5, val_loader=None):
    """Train ``model`` with mixed precision and report validation metrics each epoch.

    Args:
        model: Module called as ``model(**batch)`` whose output exposes ``.logits``.
        dataloader: Iterable of ``(batch, labels)`` pairs, ``batch`` being a dict of tensors.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: Learning-rate scheduler; stepped once per batch.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the model and batches are moved to.
        num_epochs (int): Number of passes over ``dataloader``.
        val_loader: Validation dataloader handed to ``evaluate``. When omitted,
            the notebook-level ``val_dataloader`` is used, as in earlier calls.

    Raises:
        NameError: If ``val_loader`` is omitted and no notebook-level
            ``val_dataloader`` exists.
    """
    # Resolve the validation loader before training so a missing loader is
    # reported immediately rather than after a full epoch of work.
    eval_loader = val_dataloader if val_loader is None else val_loader

    # Own the gradient scaler here, like the other training loops in this
    # notebook, instead of depending on a `scaler` variable from another cell.
    scaler = GradScaler()

    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        for batch, labels in dataloader:
            optimizer.zero_grad()
            batch = {k: v.to(device) for k, v in batch.items()}
            labels = labels.to(device)

            with autocast():
                outputs = model(**batch)
                loss = criterion(outputs.logits, labels)

            # Scale the loss before backward; the scaler then steps the optimizer.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

        val_loss, val_accuracy = evaluate(model, eval_loader, criterion, device)
        print(f"Epoch {epoch+1}/{num_epochs}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

if __name__ == '__main__':
    # Number of target classes, taken from the training labels loaded in the
    # first cell so the classifier head always matches the data.
    NUM_EMOTIONS = df['emotion'].nunique()

    # Model and tokenizer setup
    tokenizer = AutoTokenizer.from_pretrained('roberta-base')
    model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=NUM_EMOTIONS)
    model.gradient_checkpointing_enable()  # Enable gradient checkpointing

    # Mixed precision training setup
    scaler = GradScaler()
    

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import DataLoader, Dataset

# Define the Appraisal MLP model
class AppraisalMLP(nn.Module):
    """Small feed-forward classifier over appraisal features.

    Architecture: ``Linear(input_dim, 128) -> ReLU -> Linear(128, num_emotions)``.
    """

    def __init__(self, input_dim, num_emotions):
        """
        Args:
            input_dim (int): Size of the input feature vector.
            num_emotions (int): Number of output classes (logits).
        """
        super().__init__()
        hidden_units = 128
        # Layers are listed in forward order; they are wrapped in a Sequential
        # stored under the attribute name `fc`.
        stack = [
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, num_emotions),
        ]
        self.fc = nn.Sequential(*stack)

    def forward(self, x):
        """Return the class logits for input ``x``."""
        logits = self.fc(x)
        return logits

# Training loop with mixed precision (the appraisal MLP does not use gradient checkpointing)
def _evaluate_appraisal(model, dataloader, criterion, device):
    """Return (mean per-batch loss, accuracy) for a model mapping a feature tensor to logits."""
    model.eval()
    total_loss = 0.0
    correct = 0
    total_samples = 0

    with torch.no_grad():
        for appraisal_inputs, labels in dataloader:
            appraisal_inputs = appraisal_inputs.to(device)
            labels = labels.to(device)

            with autocast():
                logits = model(appraisal_inputs)
                total_loss += criterion(logits, labels).item()

            correct += (logits.argmax(dim=1) == labels).sum().item()
            total_samples += labels.size(0)

    return total_loss / len(dataloader), correct / total_samples


def train_appraisal_model(model, dataloader, optimizer, criterion, device, num_epochs=5, val_loader=None):
    """Train a tensor-in / logits-out model with mixed precision.

    After every epoch the mean training loss and the validation loss/accuracy
    are printed. Validation is done by a helper that feeds tensor batches
    straight to the model, because the notebook's generic ``evaluate`` expects
    dict batches and an ``outputs.logits`` attribute.

    Args:
        model: Module called as ``model(appraisal_inputs)`` and returning logits.
        dataloader: Iterable of ``(appraisal_inputs, labels)`` tensor pairs that
            supports ``len()``.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the model and batches are moved to.
        num_epochs (int): Number of passes over ``dataloader``.
        val_loader: Validation loader yielding ``(appraisal_inputs, labels)``.
            When omitted, the notebook-level ``val_dataloader`` is used, as in
            earlier calls.

    Raises:
        NameError: If ``val_loader`` is omitted and no notebook-level
            ``val_dataloader`` exists.
    """
    # Resolve the validation loader first so a missing one fails before training starts.
    eval_loader = val_dataloader if val_loader is None else val_loader

    model.to(device)
    scaler = GradScaler()

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0

        for appraisal_inputs, labels in dataloader:
            optimizer.zero_grad()
            appraisal_inputs = appraisal_inputs.to(device)
            labels = labels.to(device)

            with autocast():  # Mixed precision context
                outputs = model(appraisal_inputs)
                loss = criterion(outputs, labels)
                total_train_loss += loss.item()

            # Scale the loss before backward; the scaler then steps the optimizer.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        avg_train_loss = total_train_loss / len(dataloader)
        val_loss, val_accuracy = _evaluate_appraisal(model, eval_loader, criterion, device)

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

# Example of creating the optimizer and criterion
input_dim = 21  # Change to match your dataset's number of appraisal dimensions
num_emotions = 6  # Change to match the number of emotion classes
model = AppraisalMLP(input_dim=input_dim, num_emotions=num_emotions)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Use the GPU when one is available; otherwise run on the CPU instead of
# failing on a hard-coded 'cuda' device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Assume `train_dataloader` and `val_dataloader` are already defined.
# NOTE: they must yield (appraisal_inputs, labels) tensor batches; the loaders
# returned by `prepare_dataloaders` yield (token dict, label) and do not fit here.
train_appraisal_model(model, train_dataloader, optimizer, criterion, device=device, num_epochs=10)


In [None]:
class CombinedModel(nn.Module):
    """Late-fusion classifier over text logits and appraisal logits.

    A ``roberta-base`` sequence classifier and an ``AppraisalMLP`` each emit
    ``num_emotions`` logits; the two vectors are concatenated and passed
    through a small MLP that produces the final ``num_emotions`` logits.
    """

    def __init__(self, num_emotions, appraisal_input_dim):
        """
        Args:
            num_emotions (int): Number of emotion classes.
            appraisal_input_dim (int): Size of the appraisal feature vector.
        """
        super().__init__()

        # Text branch, with gradient checkpointing switched on.
        roberta_classifier = AutoModelForSequenceClassification.from_pretrained(
            'roberta-base', num_labels=num_emotions
        )
        self.text_model = roberta_classifier
        self.text_model.gradient_checkpointing_enable()

        # Appraisal branch.
        self.appraisal_model = AppraisalMLP(input_dim=appraisal_input_dim, num_emotions=num_emotions)

        # Fusion head: consumes both logit vectors side by side.
        fused_width = num_emotions * 2
        self.fc_combined = nn.Sequential(
            nn.Linear(fused_width, 128),
            nn.ReLU(),
            nn.Linear(128, num_emotions),
        )

    def forward(self, text_inputs, appraisal_inputs):
        """Return fused logits for a dict of text inputs and an appraisal tensor."""
        text_logits = self.text_model(**text_inputs).logits
        appraisal_logits = self.appraisal_model(appraisal_inputs)
        fused = torch.cat((text_logits, appraisal_logits), dim=1)
        return self.fc_combined(fused)

# Combined training with mixed precision
def _appraisal_features(appraisal_batch):
    """Return the feature tensor of an appraisal batch, dropping labels if the loader yields (inputs, labels)."""
    if isinstance(appraisal_batch, (list, tuple)):
        return appraisal_batch[0]
    return appraisal_batch


def _evaluate_combined(model, text_dataloader, appraisal_dataloader, criterion, device):
    """Return (mean per-batch loss, accuracy) of a two-input model over position-paired loaders."""
    model.eval()
    total_loss = 0.0
    correct = 0
    total_samples = 0
    num_batches = 0

    with torch.no_grad():
        for (text_batch, labels), appraisal_batch in zip(text_dataloader, appraisal_dataloader):
            text_batch = {k: v.to(device) for k, v in text_batch.items()}
            labels = labels.to(device)
            appraisal_inputs = _appraisal_features(appraisal_batch).to(device)

            with autocast():
                logits = model(text_batch, appraisal_inputs)
                total_loss += criterion(logits, labels).item()

            correct += (logits.argmax(dim=1) == labels).sum().item()
            total_samples += labels.size(0)
            num_batches += 1

    # max(..., 1) keeps empty loaders from dividing by zero.
    return total_loss / max(num_batches, 1), correct / max(total_samples, 1)


def train_combined_model(model, text_dataloader, appraisal_dataloader, optimizer, scheduler, criterion, device, num_epochs=5,
                         val_text_dataloader=None, val_appraisal_dataloader=None):
    """Train a two-input (text + appraisal) model with mixed precision.

    Text and appraisal batches are paired by position with ``zip``, so both
    loaders must iterate the same rows in the same order (for example, do not
    shuffle them independently); iteration stops at the shorter loader.

    Validation needs both inputs as well, so it is done by a paired helper
    rather than the notebook's single-input ``evaluate``, which cannot call
    ``model(text_inputs, appraisal_inputs)``.

    Args:
        model: Module called as ``model(text_batch, appraisal_inputs)`` and
            returning logits.
        text_dataloader: Iterable of ``(text_batch, labels)`` with ``text_batch``
            a dict of tensors.
        appraisal_dataloader: Iterable of appraisal feature tensors, or of
            ``(appraisal_inputs, labels)`` pairs whose labels are ignored.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: Learning-rate scheduler; stepped once per batch.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the model and batches are moved to.
        num_epochs (int): Number of passes over the paired loaders.
        val_text_dataloader: Validation counterpart of ``text_dataloader``.
        val_appraisal_dataloader: Validation counterpart of ``appraisal_dataloader``.
            Validation runs only when both validation loaders are given;
            otherwise the mean training loss is reported instead.
    """
    model.to(device)
    scaler = GradScaler()
    run_validation = val_text_dataloader is not None and val_appraisal_dataloader is not None

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0.0
        num_batches = 0

        for (text_batch, text_labels), appraisal_batch in zip(text_dataloader, appraisal_dataloader):
            optimizer.zero_grad()
            text_batch = {k: v.to(device) for k, v in text_batch.items()}
            text_labels = text_labels.to(device)
            appraisal_inputs = _appraisal_features(appraisal_batch).to(device)

            with autocast():
                outputs = model(text_batch, appraisal_inputs)
                loss = criterion(outputs, text_labels)

            # Scale the loss before backward; the scaler then steps the optimizer.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            total_train_loss += loss.item()
            num_batches += 1

        if run_validation:
            val_loss, val_accuracy = _evaluate_combined(
                model, val_text_dataloader, val_appraisal_dataloader, criterion, device
            )
            print(f"Epoch {epoch+1}/{num_epochs}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")
        else:
            avg_train_loss = total_train_loss / max(num_batches, 1)
            print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f} (validation skipped: no validation loaders given)")


In [None]:
# NOTE(review): `evaluate` is also defined in an earlier cell of this notebook;
# this later definition is the one in effect once this cell has run.
def evaluate(model, dataloader, criterion, device):
    """Compute the mean per-batch loss and the accuracy of ``model`` on ``dataloader``.

    Both batch/model styles are supported:

    * mapping batches (e.g. tokenizer outputs) are unpacked as keyword
      arguments, ``model(**batch)``;
    * any other batch is treated as a single tensor and passed positionally,
      ``model(batch)``.

    Likewise the model may return either an object exposing ``.logits`` or the
    logits tensor itself.

    Args:
        model: Module to evaluate; it is switched to eval mode.
        dataloader: Iterable of ``(batch, labels)`` pairs that supports ``len()``.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batch and labels are moved to.

    Returns:
        tuple: ``(avg_loss, accuracy)`` where ``avg_loss`` is the sum of batch
        losses divided by the number of batches and ``accuracy`` is the
        fraction of correctly classified samples.

    Raises:
        ZeroDivisionError: If ``dataloader`` contains no batches.
    """
    from collections.abc import Mapping  # local import keeps this cell self-contained

    model.eval()
    total_loss = 0
    correct = 0
    total_samples = 0

    with torch.no_grad():
        for batch, labels in dataloader:
            is_mapping = isinstance(batch, Mapping)
            if is_mapping:
                batch = {k: v.to(device) for k, v in batch.items()}
            else:
                batch = batch.to(device)
            labels = labels.to(device)

            with autocast():
                outputs = model(**batch) if is_mapping else model(batch)
                # Transformer-style outputs wrap the logits; plain modules return them directly.
                logits = getattr(outputs, 'logits', outputs)
                loss = criterion(logits, labels)
                total_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            correct += (preds == labels).sum().item()
            total_samples += labels.size(0)

    avg_loss = total_loss / len(dataloader)
    accuracy = correct / total_samples
    return avg_loss, accuracy