In [2]:
pip install datasets transformers nltk bert-score

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/510.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/510.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m501.8/510.5 kB[0m [31m7.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[

In [6]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class EFRDataset(torch.utils.data.Dataset):
    def __init__(self, data_file, seq_len=5):
        import json
        with open(data_file, 'r') as f:
            self.data = json.load(f)
        self.seq_len = seq_len  # Add seq_len attribute

    def len(self):
        return len(self.data)

    def __getitem__(self, idx):
        instance = self.data[idx]
        utterances = instance['utterances']
        triggers = instance['triggers']

        input_text = ' '.join(utterances)
        input_ids = tokenizer.encode(input_text, add_special_tokens=True, truncation=True, padding="max_length")
        attention_mask = [1] * len(input_ids)

        # Pad labels according to input_ids length
        padding_length = max(0, self.seq_len - len(triggers))
        labels = triggers + [0.0] * padding_length

        return {'input_ids': torch.tensor(input_ids), 'attention_mask': torch.tensor(attention_mask), 'labels': torch.tensor(labels, dtype=torch.float)}



# Modify the training loop to sort the dataset by sequence length
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=lambda batch: sorted(batch, key=lambda x: x['input_ids'].size(0), reverse=True))

# Model M3: BERT-based Transformer Architecture
class M3(nn.Module):
    def __init__(self):
        super(M3, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(768, 1)  # Binary classification for trigger/non-trigger

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state
        pooled_output = self.dropout(last_hidden_state[:, 0])  # Take the [CLS] token representation
        logits = self.fc(pooled_output)
        return logits.squeeze(-1)  # Remove the last dimension for binary classification

# Training and evaluation code
train_dataset = EFRDataset('/content/train_file.json')
val_dataset = EFRDataset('/content/val_file.json')

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)

# Initialize the model
model_m3 = M3()

# Define the loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model_m3.parameters(), lr=1e-5)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    # Training
    model_m3.train()
    train_loss = 0.0
    for inputs in train_loader:
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        labels = inputs['labels']

        optimizer.zero_grad()
        outputs = model_m3(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * input_ids.size(0)

    train_loss /= len(train_dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}')

    # Evaluation
    model_m3.eval()
    val_loss = 0.0
    val_acc = 0.0
    with torch.no_grad():
        for inputs in val_loader:
            input_ids = inputs['input_ids']
            attention_mask = inputs['attention_mask']
            labels = inputs['labels']

            outputs = model_m3(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * input_ids.size(0)

            preds = (outputs > 0).float()
            val_acc += torch.sum(preds == labels).item()

    val_loss /= len(val_dataset)
    val_acc /= len(val_dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

# Save the best model
torch.save(model_m3.state_dict(), 'model_m3.pth')

TypeError: object of type 'EFRDataset' has no len()

In [1]:
#Optimized approach


import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class EFRDataset(torch.utils.data.Dataset):
    def __init__(self, data_file, seq_len=5):
        import json
        with open(data_file, 'r') as f:
            self.data = json.load(f)
        self.seq_len = seq_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        instance = self.data[idx]
        utterances = instance['utterances']
        triggers = instance['triggers']

        # Pad or truncate utterances
        if len(utterances) >= self.seq_len:
            utterances = utterances[-self.seq_len:]
            triggers = triggers[-self.seq_len:]
        else:
            padding_length = self.seq_len - len(utterances)
            utterances = ['<pad>'] * padding_length + utterances
            triggers = [0.0] * padding_length + triggers

        input_text = ' '.join(utterances)
        input_ids = tokenizer.encode(input_text, add_special_tokens=True, truncation=True)
        input_ids = input_ids[:512]  # Truncate if longer than 512 tokens
        padding_length = 512 - len(input_ids)
        input_ids += [tokenizer.pad_token_id] * padding_length  # Pad to max length

        attention_mask = [1] * len(input_ids)
        labels = triggers + [0.0] * (self.seq_len - len(triggers))  # Pad labels

        # Pad labels to match the batch size
        labels += [0.0] * (8 - len(labels))

        return {'input_ids': torch.tensor(input_ids), 'attention_mask': torch.tensor(attention_mask), 'labels': torch.tensor(labels, dtype=torch.float)}

# Model M3: BERT-based Transformer Architecture
class M3(nn.Module):
    def __init__(self):
        super(M3, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(768, 1)  # Binary classification for trigger/non-trigger

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state
        pooled_output = self.dropout(last_hidden_state[:, 0])  # Take the [CLS] token representation
        logits = self.fc(pooled_output)
        return logits.squeeze(-1)  # Remove the last dimension for binary classification

# Training and evaluation code
train_dataset = EFRDataset('/content/train_file.json', seq_len=5)
val_dataset = EFRDataset('/content/val_file.json', seq_len=5)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=4, shuffle=False)

# Initialize the model
model_m3 = M3()

# Define the loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model_m3.parameters(), lr=1e-5)

import gc  # Import garbage collector module

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    # Training
    model_m3.train()
    train_loss = 0.0
    for batch_idx, inputs in enumerate(train_loader):
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        labels = inputs['labels']

        optimizer.zero_grad()
        outputs = model_m3(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * input_ids.size(0)

        # Manually release GPU memory to prevent memory leaks
        del input_ids, attention_mask, labels, outputs, loss
        torch.cuda.empty_cache()
        gc.collect()  # Force garbage collection

    train_loss /= len(train_dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}')

    # Evaluation
    model_m3.eval()
    val_loss = 0.0
    val_acc = 0.0
    with torch.no_grad():
        for inputs in val_loader:
            input_ids = inputs['input_ids']
            attention_mask = inputs['attention_mask']
            labels = inputs['labels']

            outputs = model_m3(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * input_ids.size(0)

            preds = (outputs > 0).float()
            val_acc += torch.sum(preds == labels).item()

            # Manually release GPU memory
            del input_ids, attention_mask, labels, outputs, loss
            torch.cuda.empty_cache()
            gc.collect()  # Force garbage collection

    val_loss /= len(val_dataset)
    val_acc /= len(val_dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

# Save the best model
torch.save(model_m3.state_dict(), 'model_m3.pth')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


ValueError: Target size (torch.Size([4, 8])) must be the same as input size (torch.Size([4]))