In [9]:
import os
import pandas as pd
import torch 
from PIL import Image
from torch.utils.data import Dataset
import clip
from tqdm import tqdm 
import torch.nn as nn
import torch.nn.functional as F
from loguru import logger
import json
import numpy as np
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
import time

In [10]:
CLIP_MODELS = [
 'RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

class VizWizDataset(Dataset):
    def __init__(self, 
                df_path, 
                image_dir, 
                clip_model,
                label2id=None,
                id2label=None,
                device='cpu',
                transform=None):
        assert os.path.exists(df_path), f"{df_path} does not exists"
        self.df = pd.read_csv(df_path)

        # using only Answerable Instances
        self.df = self.df[self.df.answerable == 1]
        

        answer_candidates = self.df.final_answer.unique().tolist()
        if label2id is None:
            self.label2id = {ans:idx for idx,ans in enumerate(answer_candidates)}
            self.id2label = {v:k for k,v in self.label2id.items()}
        else:
            self.label2id = label2id
            self.id2label = id2label

        self.df = self.df.iloc[:100]
        self.n_samples = self.df.shape[0]
        self.image_path = self.df["image"].apply(lambda x : os.path.join(image_dir, x))
        self.transform = transform
        self.clip_model = clip_model 
        # Initalizing Tensor to Store [Image, Text] Emneddings
        self.X = torch.empty((len(self.df), 2048), dtype=torch.float32)
        self.device=device
        for index in tqdm(range(len(self.df))):
            image = Image.open(self.image_path.iloc[index]).convert('RGB')
            if self.transform is not None:
                image = self.transform(image).unsqueeze(0).to(self.device)
            question = clip.tokenize(self.df['question'].iloc[index]).to(self.device)
            with torch.no_grad():
                    image_features = self.clip_model.encode_image(image)
                    text_features = self.clip_model.encode_text(question)
            self.X[index] = torch.cat((image_features, text_features), 1).to(torch.float32)
                
    def __getitem__(self, index):
        return index, self.X[index], self.label2id[self.df['final_answer'].iloc[index]], self.df['answerable'].iloc[index]
        
    def __len__(self):
        return self.n_samples

class VQAModelV1(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(VQAModelV1, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.ln1 = nn.LayerNorm(hidden_dim)
        self.dropout1 = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.ln2 = nn.LayerNorm(output_dim)
        self.dropout2 = nn.Dropout(p=0.5)

    def forward(self, x):
        # Layer 1
        x = self.fc1(x)
        x = self.ln1(x)
        x = F.relu(x)
        x = self.dropout1(x)
        
        # Layer 2
        x = self.fc2(x)
        x = self.ln2(x)
        x = self.dropout2(x)
        return x

def load_clip(model_name='RN50', device='cpu'):
    logger.info("Loading CLIP.....")
    assert model_name in CLIP_MODELS, f"clip models available {CLIP_MODELS}"
    clip_model, preprocess = clip.load(model_name, device=device)
    return clip_model, preprocess

In [11]:
class Config:
    base_dir = os.path.dirname(os.getcwd())
    data_dir = os.path.join(base_dir, 'vizviz/vqa')
    train_image_dir = os.path.join(data_dir, 'train')
    val_image_dir = os.path.join(data_dir, 'val')
    train_file_path = os.path.join(data_dir, 'train_df.csv')
    val_file_path = os.path.join(data_dir, 'eval_df.csv')
    batch_size = 32

cfg = Config()

In [22]:
train_df = pd.read_csv(cfg.train_file_path)
val_df = pd.read_csv(cfg.val_file_path)

In [30]:
ans_list = []
for ans in val_df.final_answer.unique():
    if ans not in train_df.final_answer:
        ans_list.append(ans)
len(ans_list)

1482

In [32]:
train_df.shape

(20523, 6)

In [26]:
'dog years: memoir' in train_df.final_answer

False

In [12]:
clip_model, clip_preprocess = load_clip(device='cuda:0')

[32m2024-04-20 20:45:04.436[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_clip[0m:[36m83[0m - [1mLoading CLIP.....[0m


In [13]:
train_dataset = VizWizDataset(df_path=cfg.train_file_path,
                              image_dir=cfg.train_image_dir,
                              clip_model=clip_model,
                              device='cuda:0',
                              transform=clip_preprocess)

val_dataset = VizWizDataset(df_path=cfg.val_file_path,
                              image_dir=cfg.val_image_dir,
                              clip_model=clip_model,
                              device='cuda:0',
                              transform=clip_preprocess)

100%|██████████| 100/100 [00:06<00:00, 15.07it/s]
100%|██████████| 100/100 [00:07<00:00, 12.52it/s]


In [14]:
train_dataloader = DataLoader(dataset=train_dataset, batch_size=cfg.batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=cfg.batch_size, shuffle=False)

In [15]:
model = VQAModelV1(input_dim=2048, hidden_dim=2048, output_dim=len(train_dataset.label2id))

In [16]:
def vqa_accuracy(train_df, id2label, train_index, pred_index):
    pred_label = id2label[int(pred_index)]
    train_row = train_df.iloc[int(train_index)]
    answer_set = np.array([ans['answer'] for ans in json.loads(train_row['answers'].replace("\'", "\""))])

    score = min(1, len(np.where(answer_set == pred_label)[0]) / 3)
    return score

In [17]:
criterion = nn.CrossEntropyLoss()
LEARNING_RATE=0.0001
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
lr_scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=4, verbose=False)
device='cuda:0'



In [20]:
def train_vqa(model, data_loader, id2label, criterion, optimizer):
    model.train()
    train_loss = 0
    accuracy = 0
    for i, (index, x, answers, _) in enumerate(data_loader):
        x = x.to(device) 
        answers = answers.to(device)
        # Forward Pass
        outputs = model(x).squeeze(1)
        print(outputs.dtype, answers.dtype, outputs.shape, answers.shape)
        loss = criterion(outputs, answers)
        
        # Backward Pass
        optimizer.zero_grad()
        loss.backward()
        
        # Update Weights
        optimizer.step()

        # Loss and Accuracy Calculations
        train_loss += loss.item()
        _, predicted = outputs.max(1)
        for ip, idx in enumerate(index):
            accuracy += vqa_accuracy(data_loader.dataset.df, id2label, int(idx), predicted[ip])

    # id2label, train_index, pred_index
    train_loss /= len(data_loader.dataset)
    accuracy /= len(data_loader.dataset)
    
    return train_loss, accuracy

def validate_vqa(model, data_loader, id2label, criterion):
    model.eval()
    val_loss = 0
    accuracy = 0
    with torch.no_grad():
        for i, (index, x, answers, _) in enumerate(data_loader):
            x = x.to(device)
            answers = answers.to(device)
            # Forward Pass
            outputs = model(x).squeeze(1)
            loss = criterion(outputs, answers)
            
            # Loss and Accuracy Calculations
            val_loss += loss.item()
            _, predicted = outputs.max(1)
            for ip, idx in enumerate(index):
                accuracy += vqa_accuracy(data_loader.dataset.df, id2label, int(idx), predicted[ip])

    val_loss /= len(data_loader.dataset)
    accuracy /= len(data_loader.dataset)

    return val_loss, accuracy

In [21]:
# Defining Lists to store training and validation accuracies and losses
train_vqa_acc_history = []
train_vqa_loss_history = []
val_vqa_acc_history = []
val_vqa_loss_history = []

counter = 0
NUM_EPOCHS=10
patience=5
model.to(device)
best_val_loss=float('inf')
for epoch in range(NUM_EPOCHS):
    print(f"Epoch [{epoch + 1}/{NUM_EPOCHS}]:")
    start_time = time.perf_counter()
    
    train_loss, train_acc = train_vqa(model, train_dataloader, train_dataset.id2label, criterion, optimizer)
    val_loss, val_acc = validate_vqa(model, val_dataloader, train_dataset.id2label, criterion)
    
    epoch_time = time.perf_counter() - start_time
    avg_step_time = epoch_time / (len(train_dataloader) + len(val_dataloader))
        
    train_vqa_acc_history.append(train_acc)
    train_vqa_loss_history.append(train_loss)
    val_vqa_acc_history.append(val_acc)
    val_vqa_loss_history.append(val_loss)
    
     # Check if the validation loss has improved
    if val_loss < best_val_loss:
        print(f"val_loss improved from {best_val_loss:.5f} to {val_loss:.5f}")
        best_val_loss = val_loss
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            print(f"val_loss hasn't improved for {patience} epochs. Early stopping.")
            break
    
    print(f"{int(np.round(epoch_time))}s {avg_step_time*1e3:.4f}ms/step - loss: {train_loss:.4f} - accuracy: {train_acc*100:.4f}% - val_loss: {val_loss:.4f} - val_accuracy: {val_acc*100:.4f}% - lr: {optimizer.param_groups[0]['lr']}")
    
    lr_scheduler.step(val_loss)
    print()

Epoch [1/10]:
torch.float32 torch.int64 torch.Size([32, 79]) torch.Size([32])
torch.float32 torch.int64 torch.Size([32, 79]) torch.Size([32])
torch.float32 torch.int64 torch.Size([32, 79]) torch.Size([32])
torch.float32 torch.int64 torch.Size([4, 79]) torch.Size([4])
val_loss improved from inf to 0.17637
0s 14.8918ms/step - loss: 0.1588 - accuracy: 27.0000% - val_loss: 0.1764 - val_accuracy: 34.6667% - lr: 0.0001

Epoch [2/10]:
torch.float32 torch.int64 torch.Size([32, 79]) torch.Size([32])
torch.float32 torch.int64 torch.Size([32, 79]) torch.Size([32])
torch.float32 torch.int64 torch.Size([32, 79]) torch.Size([32])
torch.float32 torch.int64 torch.Size([4, 79]) torch.Size([4])
0s 7.0025ms/step - loss: 0.1520 - accuracy: 25.0000% - val_loss: 0.1766 - val_accuracy: 33.3333% - lr: 0.0001

Epoch [3/10]:
torch.float32 torch.int64 torch.Size([32, 79]) torch.Size([32])
torch.float32 torch.int64 torch.Size([32, 79]) torch.Size([32])
torch.float32 torch.int64 torch.Size([32, 79]) torch.Size([32

In [49]:
logits = torch.sigmoid(outputs).max(1)[1]

In [56]:
correct = 0
for i in range(len(logits)):
    if logits[i] == batch_[2][i]:
        print(i)
        correct+=1

12


In [85]:
train_dataset.id2label[int(logits[10])]

'grey'

In [84]:
idx = int(batch_[0][10])
row = train_dataset.df.iloc[idx]
answer_set = [ans['answer'] for ans in json.loads(row['answers'].replace("\'", "\""))]
answer_set

['dog', 'dog', 'dog', 'dog', 'dog', 'dog', 'dog', 'dog', 'black dog', 'dog']

In [93]:
answer_set = np.array(answer_set)
min(1, len(np.where(answer_set == 'grey')[0]) / 3)

0.0