In [26]:
import json
import os
import torch
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision.models import resnet50
from torchvision import models
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

import torch.nn as nn
import torch.optim as optim

from torchtext.data.utils import get_tokenizer
import torchtext


In [27]:
torch.cuda.empty_cache()
batch_size=32

In [29]:
from torchtext.legacy.data import TabularDataset, Field

tokenizer = get_tokenizer('basic_english')


# Build the vocabulary for the text data
text_field = torchtext.data.Field(
    tokenize=tokenizer, include_lengths=True, lower=True)
label_field = torchtext.data.LabelField(dtype=torch.float)




class HatefulMemeDataset(Dataset):
    def __init__(self, json_file, tokenizer=get_tokenizer('basic_english')):
        self.tokenizer = tokenizer
        if tokenizer is None:
            self.tokenizer = get_tokenizer('basic_english')

        with open(json_file, "r") as f:
            self.data = [json.loads(line) for line in f]
            
        # Define the fields for the dataset
        self.text_field = Field(
            tokenize=tokenizer, use_vocab=True, sequential=True)
        self.label_field = Field(sequential=False, use_vocab=False)

        # Load the dataset from the data dictionary
        fields = [('text', self.text_field), ('label', self.label_field)]
        examples = [torchtext.data.Example.fromdict(
            d, fields) for d in self.data]
        self.dataset = torchtext.data.Dataset(examples, fields)

        # Build the vocabulary for the text field
        self.text_field.build_vocab(self.dataset)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # img_path = os.path.join(self.img_dir, self.data[index]["img"])
        # image = Image.open(img_path).convert("RGB")

        # if self.transform is not None:
        #     image = self.transform(image)
        text = self.data[index]['text']
        label = self.data[index]['label']

        # Tokenize the text using the tokenizer
        tokens = self.tokenizer(text)

        # Convert the tokenized text into a tensor
        text_tensor = self.text_field.process([text]).squeeze(0)


        return text, label


ModuleNotFoundError: No module named 'torchtext.legacy'

In [None]:
# Load the datasets
train_dataset = HatefulMemeDataset(
    json_file="data/train.jsonl",  tokenizer=tokenizer)
val_dataset = HatefulMemeDataset(
    json_file="data/dev_seen.jsonl",  tokenizer=tokenizer)
valu_dataset = HatefulMemeDataset(
    json_file="data/dev_unseen.jsonl",  tokenizer=tokenizer)
test_dataset = HatefulMemeDataset(
    json_file="data/test_seen.jsonl", tokenizer=tokenizer)
testu_dataset = HatefulMemeDataset(
    json_file="data/test_unseen.jsonl", tokenizer=tokenizer)

train_loader = torchtext.data.BucketIterator(
    train_dataset, batch_size=batch_size, shuffle=True)
val_loader = torchtext.data.BucketIterator(val_dataset, batch_size=batch_size)
test_loader = torchtext.data.BucketIterator(
    test_dataset, batch_size=batch_size)
# Define the dataloaders
train_loader = DataLoader(train_dataset, batch_size=32,
                          shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=32,
                        shuffle=False, num_workers=4)
valu_loader = DataLoader(valu_dataset, batch_size=32,
                        shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=32,
                         shuffle=False, num_workers=4)
testu_loader = DataLoader(testu_dataset, batch_size=32,
                          shuffle=False, num_workers=4)


In [None]:
# for inputs, masks, labels in train_loader:
#     print(inputs.shape)
#     print(masks.shape)
#     print(labels.shape)
#     break

In [None]:
class GRUModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super(GRUModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        output, hidden = self.gru(x)
        last_hidden = hidden[-1]
        out = self.fc(last_hidden)
        return out


# Define hyperparameters
batch_size = 32
num_epochs = 5
learning_rate = 1e-3
vocab_size = 5000
embedding_dim = 100
hidden_dim = 256
num_classes = 2

# Instantiate the model and optimizer
model = GRUModel(vocab_size, embedding_dim, hidden_dim, num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    train_loss = 0
    train_acc = 0
    num_train_steps = 0

    model.train()
    for i, (text, labels) in enumerate(train_loader):
        text = text.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(text)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_acc += (outputs.argmax(1) == labels).sum().item()
        num_train_steps += 1

    train_loss = train_loss / num_train_steps
    train_acc = train_acc / len(train_dataset)

    # Evaluate the model on the validation set
    val_loss = 0
    val_acc = 0
    num_val_steps = 0

    model.eval()
    with torch.no_grad():
        for text, labels in val_loader:
            text = text.to(device)
            labels = labels.to(device)

            outputs = model(text)
            loss = nn.CrossEntropyLoss()(outputs, labels)

            val_loss += loss.item()
            val_acc += (outputs.argmax(1) == labels).sum().item()
            num_val_steps += 1

    val_loss = val_loss / num_val_steps
    val_acc = val_acc / len(val_dataset)

    print(f'Epoch {epoch+1}/{num_epochs}:')
    print(
        f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')




AttributeError: Caught AttributeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/aayush/a/dl_assignments/.venv/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/aayush/a/dl_assignments/.venv/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/aayush/a/dl_assignments/.venv/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 51, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/tmp/ipykernel_1512/3773070321.py", line 28, in __getitem__
    text_tensor = torch.tensor([self.vocab.stoi[token]
  File "/tmp/ipykernel_1512/3773070321.py", line 28, in <listcomp>
    text_tensor = torch.tensor([self.vocab.stoi[token]
AttributeError: 'HatefulMemeDataset' object has no attribute 'vocab'


In [None]:
# model = BertForSequenceClassification.from_pretrained(
#     'bert-base-uncased', num_labels=2)
# optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# train_batch_size = 32
# val_batch_size = 32
# num_epochs = 10

# for epoch in range(num_epochs):
#     model.train()
#     train_loss = 0
#     train_acc = 0
#     num_train_steps = 0
#     for inputs, attention_masks, labels in train_loader:
#         optimizer.zero_grad()
#         inputs, attention_masks, labels = inputs.to(device), attention_masks.to(device), labels.to(device)

        
#         outputs = model(
#             inputs, attention_mask=attention_masks, labels=labels)
#         loss = outputs[0]
#         logits = outputs[1]
#         loss.backward()

#         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#         optimizer.step()
#         scheduler.step()

#         train_loss += loss.item()
#         train_acc += (logits.argmax(1) == labels).sum().item()
#         num_train_steps += 1

#     # Evaluate the model on the validation set after each epoch
#     model.eval()
#     val_loss = 0
#     val_acc = 0
#     num_val_steps = 0

#     with torch.no_grad():
#         for inputs, attention_masks, labels in val_loader:
#             inputs, attention_masks, labels = inputs.to(
#                 device), attention_masks.to(device), labels.to(device)

#             outputs = model(inputs, attention_mask=attention_masks, labels=labels)
#             loss = outputs[0]
#             logits = outputs[1]

#             val_loss += loss.item()
#             val_acc += (logits.argmax(1) == labels).sum().item()
#             num_val_steps += 1
#         train_loss = train_loss / num_train_steps
#         train_acc = train_acc / len(train_dataset)
#         val_loss = val_loss / num_val_steps
#         val_acc = val_acc / len(val_dataset)

#         print('Epoch {}/{}'.format(epoch+1, num_epochs))
#         print('Train loss: {:.4f}, Train accuracy: {:.4f}'.format(
#             train_loss, train_acc))
#         print('Val loss: {:.4f}, Val accuracy: {:.4f}'.format(val_loss, val_acc))
        
    

    


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 1/10
Train loss: 0.7134, Train accuracy: 0.4320
Val loss: 0.6996, Val accuracy: 0.4860
Epoch 2/10
Train loss: 0.7137, Train accuracy: 0.4264
Val loss: 0.6996, Val accuracy: 0.4860
Epoch 3/10
Train loss: 0.7135, Train accuracy: 0.4334
Val loss: 0.6996, Val accuracy: 0.4860
Epoch 4/10
Train loss: 0.7137, Train accuracy: 0.4356
Val loss: 0.6996, Val accuracy: 0.4860


KeyboardInterrupt: 

In [None]:
model.eval()
test_loss = 0
test_acc = 0
num_test_steps = 0

with torch.no_grad():
    for inputs, attention_masks, labels in testA_loader:
        inputs, attention_masks, labels = inputs.to(
            device), attention_masks.to(device), labels.to(device)

        outputs = model(
            inputs, attention_mask=attention_masks, labels=labels)
        loss = outputs[0]
        logits = outputs[1]

        test_loss += loss.item()
        test_acc += (logits.argmax(1) == labels).sum().item()
        num_test_steps += 1
        
    test_loss = test_loss / num_test_steps
    test_acc = test_acc / len(testA_dataset)

    print('Test loss: {:.4f}, Test accuracy: {:.4f}'.format(test_loss, test_acc))


Test loss: 0.6985, Test accuracy: 0.4990
