In [1]:
import torch

# Check available CUDA devices
print(torch.cuda.is_available())  # Should return True if CUDA is available
print(torch.cuda.device_count())  # Number of available GPUs

# Check which GPU is being used
print(torch.cuda.current_device())  # Get the index of the currently selected GPU
print(torch.cuda.get_device_name(torch.cuda.current_device()))  # Name of the current GPU

True
1
0
NVIDIA GeForce RTX 3070 Ti Laptop GPU


### Load DialogSum Dataset

In [2]:
from datasets import load_dataset

ds_train = load_dataset("knkarthick/dialogsum", split="train")
ds_validation = load_dataset("knkarthick/dialogsum", split="validation")
ds_test = load_dataset("knkarthick/dialogsum", split="test")

In [3]:
# Preview the dataset
print(ds_train.features)


{'id': Value(dtype='string', id=None), 'dialogue': Value(dtype='string', id=None), 'summary': Value(dtype='string', id=None), 'topic': Value(dtype='string', id=None)}


In [57]:
from transformers import BertTokenizer

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization function
def tokenize_function(examples):
    dialogue_encodings = tokenizer(
        examples['dialogue'],
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )
    
    summary_encodings = tokenizer(
        examples['summary'],
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )
    
    return {
        'input_ids_dialogue': dialogue_encodings['input_ids'],
        'attention_mask_dialogue': dialogue_encodings['attention_mask'],
        'input_ids_summary': summary_encodings['input_ids'],
        'attention_mask_summary': summary_encodings['attention_mask'],
        'topic': examples['topic']
    }

# Apply tokenization to the dataset
ds_train_tokenized = ds_train.map(tokenize_function, batched=True)


In [None]:
# from sklearn.preprocessing import LabelEncoder

# # Initialize label encoder
# label_encoder = LabelEncoder()

# # Fit label encoder on the 'topic' column of the dataset
# label_encoder.fit(ds_train_tokenized['topic'])

# # Apply the label encoder to transform topics in the dataset
# ds_train_tokenized = ds_train_tokenized.map(lambda e: {'label': label_encoder.transform([e['topic']])[0]}, batched=False)


In [59]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from datasets import load_dataset

# Load pre-trained model and tokenizer for text classification
model_name = "bhadresh-savani/distilbert-base-uncased-emotion"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Create a pipeline for text classification
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Test the classifier with a sample dialogue
sample_dialogue = "I am feeling very happy today!"
result = classifier(sample_dialogue)
print(result)  # Ensure the classifier works as expected

# Function to classify dialogues
def classify_dialogues(batch):
    dialogues = batch['dialogue']
    results = classifier(dialogues)
    labels = [result['label'] for result in results]
    return {'job_label': labels}

# Apply classification to each dialogue in the dataset using batched processing
ds_train_tokenized_job = ds_train_tokenized.map(classify_dialogues, batched=True)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'joy', 'score': 0.999061644077301}]


Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (617 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: The size of tensor a (617) must match the size of tensor b (512) at non-singleton dimension 1

In [None]:
ds_train_tokenized_job

In [44]:
import torch
from torch.utils.data import Dataset, DataLoader

class DialogSumDataset(Dataset):
    def __init__(self, hf_dataset):
        self.dataset = hf_dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        input_ids_dialogue = torch.tensor(item['input_ids_dialogue'], dtype=torch.long)
        attention_mask_dialogue = torch.tensor(item['attention_mask_dialogue'], dtype=torch.long)
        input_ids_summary = torch.tensor(item['input_ids_summary'], dtype=torch.long)
        attention_mask_summary = torch.tensor(item['attention_mask_summary'], dtype=torch.long)
        
        # Use the encoded topic labels
        labels = item['label']
        
        return {
            'input_ids_dialogue': input_ids_dialogue,
            'attention_mask_dialogue': attention_mask_dialogue,
            'input_ids_summary': input_ids_summary,
            'attention_mask_summary': attention_mask_summary,
            'labels': torch.tensor(labels, dtype=torch.long)
        }

# Convert Hugging Face dataset to custom PyTorch dataset
train_dataset = DialogSumDataset(ds_train_tokenized)

# Create DataLoader for training
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)


In [None]:
print(train_dataset[0]['labels'])

3663


In [45]:
from transformers import BertForSequenceClassification, AdamW

# Initialize the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import torch

# Set device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Training loop
for epoch in range(3):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids_dialogue = batch['input_ids_dialogue'].to(device)
        attention_mask_dialogue = batch['attention_mask_dialogue'].to(device)
        input_ids_summary = batch['input_ids_summary'].to(device)
        attention_mask_summary = batch['attention_mask_summary'].to(device)
        
        # Labels are the encoded topic labels
        labels = batch['labels'].to(device)
        outputs = model(
            input_ids=input_ids_dialogue,
            attention_mask=attention_mask_dialogue,
            labels=labels
        )
        
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f'Epoch: {epoch}, Loss: {loss.item()}')


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
