In [1]:
import os
import random
import functools
from functools import partial
import PIL

import numpy as np 
import pandas as pd

from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader

import timm

from transformers import BertTokenizer
from transformers import BertModel


Creating images of size (250, 250)

In [2]:
# Directory that holds the downsized copies of the training images.
small_images_dir = 'data/small_train_images'

# Set to True by hand to force every thumbnail to be regenerated.
create_small_train_imgs = False

os.makedirs(small_images_dir, exist_ok = True)

big_img_path = 'data/train_images'
sz = (250, 250)
# De-duplicate so every file name is processed at most once.
img_paths = pd.read_csv('data/train.csv')['image'].unique().tolist()

if create_small_train_imgs:
    imgs_to_create = img_paths
else:
    # Only (re)create thumbnails that are missing, so that an interrupted
    # earlier run is completed instead of being silently skipped just
    # because the output directory already exists.
    imgs_to_create = [p for p in img_paths
                      if not os.path.isfile(os.path.join(small_images_dir, p))]

# True when this run actually has thumbnails to write.
create_small_train_imgs = len(imgs_to_create) > 0

if create_small_train_imgs:
    for im_path in tqdm(imgs_to_create):
        # The context manager closes the source file handle deterministically.
        with PIL.Image.open(os.path.join(big_img_path, im_path)) as im:
            # Force 3-channel RGB so every thumbnail has the same layout.
            small_im = im.convert('RGB').resize(sz)
        small_im.save(os.path.join(small_images_dir, im_path), quality = 100)

In [3]:
class TripletDS(Dataset):
    """Dataset yielding (image, text) samples or (anchor, positive, negative) triplets.

    Parameters
    ----------
    data : DataFrame with the columns 'image', 'label_group' and 'title'.
    tokenizer : callable applied once to the list of titles; its result must
        expose 'input_ids' and 'attention_mask' entries indexable by row.
    images_path : directory that image file names are resolved against.
    return_triplet : when True, ``__getitem__`` returns six values
        (anchor image/text, positive image/text, negative image/text);
        when False it returns only the anchor image and text.
    """

    def __init__(self, data, tokenizer, images_path, return_triplet = True):
        super().__init__()
        self.imgs = data['image'].tolist()
        self.unique_labels = data['label_group'].unique().tolist()
        self.labels = data['label_group'].tolist()
        # label -> list of row positions carrying that label. The index is reset
        # first so the positions line up with self.imgs / self.labels.
        groups = data.reset_index(drop = True).groupby('label_group').indices
        self.label_to_index_dict = {label: positions.tolist()
                                    for label, positions in groups.items()}
        # All titles are tokenized once, up front, and padded to a common length.
        self.texts = tokenizer(data['title'].values.tolist(), return_tensors = 'pt',
                               padding=True, truncation=True, max_length = 40)
        self.images_path = images_path
        self.return_triplet = return_triplet

    def __getitem__(self, idx):
        """Return the sample at ``idx``, optionally with a positive and a negative.

        Raises
        ------
        ValueError
            If a triplet is requested but the data holds fewer than two
            distinct labels, so no negative sample can exist.
        """
        # anchor data
        anchor_label = self.labels[idx]
        anchor_img, anchor_txt = self._get_item(idx)

        if not self.return_triplet:
            return anchor_img, anchor_txt

        # Without a second label the rejection loop below could never terminate.
        if len(self.unique_labels) < 2:
            raise ValueError('TripletDS needs at least two distinct labels '
                             'to sample a negative example')

        # neg data: rejection-sample a label different from the anchor's,
        # then pick any row of that label.
        neg_label = np.random.choice(self.unique_labels)
        while neg_label == anchor_label:
            neg_label = np.random.choice(self.unique_labels)
        neg_idx = np.random.choice(self.label_to_index_dict[neg_label])
        neg_img, neg_txt = self._get_item(neg_idx)

        # pos data: another row with the anchor's label, excluding the anchor itself.
        pos_idxs = [o for o in self.label_to_index_dict[anchor_label] if o != idx]
        if len(pos_idxs) == 0:
            # edge case, only 1 sample per label: fall back to the anchor itself
            pos_idxs = [idx]
        pos_idx = np.random.choice(pos_idxs)
        pos_img, pos_txt = self._get_item(pos_idx)

        return anchor_img, anchor_txt, pos_img, pos_txt, neg_img, neg_txt

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.imgs)

    def _get_item(self, idx):
        """Load the image and tokenized title stored at row ``idx``.

        Returns a float tensor with channels first and values scaled to
        [0, 1], plus a dict with the row's 'input_ids' and 'attention_mask'.
        """
        # Close the file handle deterministically and force 3-channel RGB so
        # that grayscale / RGBA / palette images do not break permute(2, 0, 1).
        with PIL.Image.open(os.path.join(self.images_path, self.imgs[idx])) as im:
            pixels = np.array(im.convert('RGB'))
        im = torch.tensor(pixels / 255.0, dtype = torch.float).permute(2,0,1)
        txt = {'input_ids' : self.texts['input_ids'][idx],
               'attention_mask' : self.texts['attention_mask'][idx]}
        return im, txt

In [4]:
# Read the full training table.
df = pd.read_csv('data/train.csv')

# Seed NumPy's global random generator.
np.random.seed(1337)

# Train / validation split: the first 70% of the rows (in file order) go to
# training, the remaining rows to validation.
# NOTE(review): no shuffling happens here, and rows sharing a label_group may
# end up on both sides of the split - confirm this is intended.
train_perc = 0.7
n_train_examples = int(len(df) * train_perc)

train_df, val_df = df.iloc[:n_train_examples], df.iloc[n_train_examples:]

In [6]:
# creating dataloaders

vision_model = 'resnet50'
language_model = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(language_model)

# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Pinned host memory is only requested when batches are copied to a CUDA device.
pin_memory = device.type == 'cuda'

bs = 32
tr_ds = TripletDS(train_df, tokenizer, small_images_dir)
tr_dl = DataLoader(tr_ds, batch_size = bs, shuffle = True, pin_memory = pin_memory)

# Validation batches keep a fixed order (shuffle = False).
val_ds = TripletDS(val_df, tokenizer, small_images_dir)
val_dl = DataLoader(val_ds, batch_size = bs, shuffle = False, pin_memory = pin_memory)



In [7]:
def text_to_device(text, device):
    """Return a new dict holding ``text``'s 'input_ids' and 'attention_mask' moved to ``device``.

    Any other entries of ``text`` are not copied into the result.
    """
    moved = {}
    for key in ('input_ids', 'attention_mask'):
        moved[key] = text[key].to(device)
    return moved

In [8]:

class EmbedorNN(nn.Module) :
    """Joint image + text embedder.

    Image features come from a timm model (``forward_features`` followed by
    global average pooling), text features from the first token of a BERT
    model's last hidden state. Both are concatenated, projected by a linear
    head to ``output_dim`` and L2-normalised.

    Parameters
    ----------
    pretrained_image_embedor : model name passed to ``timm.create_model``.
    pretrained_text_embedor : model name passed to ``BertModel.from_pretrained``.
    output_dim : size of the returned embedding.
    """

    def __init__(self, pretrained_image_embedor='resnet50', pretrained_text_embedor='bert-base-uncased',
                output_dim=512) :
        super(EmbedorNN, self).__init__()
        self.image_embedor = timm.create_model(pretrained_image_embedor, pretrained=True)
        self.image_pool = nn.AdaptiveAvgPool2d((1,1))
        self.text_embedor = BertModel.from_pretrained(pretrained_text_embedor)

        # Derive the head's input width from the chosen backbones instead of
        # hard-coding 2048 + 768, so other model names work as well. The
        # fallbacks are the values the original code assumed.
        image_dim = getattr(self.image_embedor, 'num_features', 2048)
        text_dim = getattr(self.text_embedor.config, 'hidden_size', 768)

        # Kept as nn.Sequential so state-dict keys ('head.0.*') stay unchanged.
        self.head = nn.Sequential(nn.Linear(image_dim + text_dim, output_dim))
        self._init_head()

    def _init_head(self):
        """Initialise the head: N(0, 1/fan_in) linear weights, zero biases, identity norms."""
        for m in self.head.modules():
            if isinstance(m, nn.Linear):
                fan_in = m.weight.data.size(-1)
                m.weight.data.normal_(mean=0.0, std=1/np.sqrt(fan_in))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, (nn.LayerNorm, nn.BatchNorm1d)):
                m.weight.data.fill_(1.0)
                m.bias.data.zero_()

    def freeze_lm(self):
        """Disable gradients for every parameter of the text model."""
        for parameter in self.text_embedor.parameters():
            parameter.requires_grad = False

    def unfreeze_lm(self):
        """Enable gradients for every parameter of the text model."""
        for parameter in self.text_embedor.parameters():
            parameter.requires_grad = True

    def freeze_cnn(self):
        """Disable gradients for every parameter of the image model."""
        for parameter in self.image_embedor.parameters():
            parameter.requires_grad = False

    def unfreeze_cnn(self):
        """Enable gradients for every parameter of the image model."""
        for parameter in self.image_embedor.parameters():
            parameter.requires_grad = True

    def forward(self, x) :
        """Embed a batch.

        ``x`` is a pair ``(images, texts)`` where ``texts`` is a dict with
        'input_ids' and 'attention_mask'. Returns L2-normalised embeddings,
        one row per sample.
        """
        images, texts = x
        out_images = self.image_embedor.forward_features(images)
        # flatten(1) keeps the batch axis; a bare squeeze() would also drop it
        # for a batch of one and make the concatenation below fail.
        out_images = self.image_pool(out_images).flatten(1)
        # First-token vector of the last hidden state.
        out_text = self.text_embedor(texts['input_ids'],
                                     attention_mask=texts['attention_mask'])[0][:,0,:]
        out = torch.cat([out_images, out_text], dim=-1)
        return F.normalize(self.head(out), dim=-1)

In [9]:
# Build the joint image/text embedder and move it to the target device.
model = EmbedorNN(pretrained_image_embedor = vision_model,
                  pretrained_text_embedor = language_model)
model = model.to(device)

# Turn off gradients for the text model first, then for the image model.
for freeze_backbone in (model.freeze_lm, model.freeze_cnn):
    freeze_backbone()

In [10]:
# training params

# Channel-wise normalisation applied as the last step of both pipelines.
normalize = transforms.Normalize(mean=(0.485, 0.456, 0.406),
                                 std=(0.229, 0.224, 0.225))

# Training augmentation: colour jitter, small rotation, random 224 crop, flip.
train_transforms = transforms.Compose([transforms.ColorJitter(.3,.3,.3),
                                       transforms.RandomRotation(5),
                                       transforms.RandomCrop(224),
                                       transforms.RandomHorizontalFlip(),
                                       normalize
                                       ])

# Validation: deterministic resize to 224 x 224, then normalise.
val_transforms = transforms.Compose([transforms.Resize((224,224)),
                                     normalize
                                     ])

n_epochs = 2

lf = nn.TripletMarginLoss()

lr = 1e-4
wd = 0
# Name fragments that mark a parameter as exempt from weight decay.
no_decay = ["bias", "BatchNorm2d.weight", "BatchNorm2d.bias", "LayerNorm.weight", 'LayerNorm.bias',
            "BatchNorm1d.weight", "BatchNorm1d.bias"]

# Parameter names are built from attribute names (e.g. 'bn1.weight'), not from
# class names, so the 'BatchNorm*' fragments above only match when an attribute
# happens to be called that way. Normalisation layers are therefore also
# identified by module type.
norm_layer_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.LayerNorm)
norm_param_ids = {id(p)
                  for m in model.modules() if isinstance(m, norm_layer_types)
                  for p in m.parameters(recurse=False)}

decay_params = [p for n, p in model.named_parameters()
                if not (id(p) in norm_param_ids or any(nd in n for nd in no_decay))]
no_decay_params = [p for n, p in model.named_parameters()
                   if id(p) in norm_param_ids or any(nd in n for nd in no_decay)]

optimizer_grouped_parameters = [
    {
        "params": decay_params,
        "weight_decay": wd,
    },
    {
        "params": no_decay_params,
        "weight_decay": 0.0,
    },
]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=lr)

# learning rate scheduler: one cycle over all optimisation steps
sched = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr =lr, pct_start = 0.3, #anneal_strategy = 'linear',
                                            total_steps = int(n_epochs * len(tr_dl)))

# Gradient scaler used with autocast in the training loop.
scaler = torch.cuda.amp.GradScaler()

In [13]:
# Per-epoch mean losses (rounded to 3 decimals).
tr_losses = []
val_losses = []
for ep in tqdm(range(n_epochs)):
    # ---- training pass ----
    model.train()
    tr_loss = []
    pbar = tqdm(tr_dl)
    for anchor_image, anchor_text, pos_image, pos_text, neg_image, neg_text in pbar:

        # Augment on the target device; each triplet member is transformed independently.
        anchor = train_transforms(anchor_image.to(device)), text_to_device(anchor_text, device)
        pos = train_transforms(pos_image.to(device)), text_to_device(pos_text, device)
        neg = train_transforms(neg_image.to(device)), text_to_device(neg_text, device)

        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            anchor_emb = model(anchor)
            pos_emb = model(pos)
            neg_emb = model(neg)
            loss = lf(anchor_emb, pos_emb, neg_emb)

        # Scaled backward pass / optimiser step, then advance the LR schedule once per batch.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        sched.step()

        tr_loss.append(loss.item())
        pbar.set_description(f"Train loss: {round(np.mean(tr_loss),3)}")

    # ---- validation pass ----
    model.eval()
    val_loss = []
    with torch.no_grad():
        pbar = tqdm(val_dl)
        for anchor_image, anchor_text, pos_image, pos_text, neg_image, neg_text in pbar:

            anchor = val_transforms(anchor_image.to(device)), text_to_device(anchor_text, device)
            pos = val_transforms(pos_image.to(device)), text_to_device(pos_text, device)
            neg = val_transforms(neg_image.to(device)), text_to_device(neg_text, device)

            with torch.cuda.amp.autocast():
                anchor_emb = model(anchor)
                pos_emb = model(pos)
                neg_emb = model(neg)
                loss = lf(anchor_emb, pos_emb, neg_emb)

            val_loss.append(loss.item())
            pbar.set_description(f"Val loss: {round(np.mean(val_loss),3)}")

    tr_loss = round(np.mean(tr_loss),3)
    val_loss = round(np.mean(val_loss),3)
    # From epoch 5 on, checkpoint only when this epoch beats the best validation
    # loss seen so far. Comparing against the previous epoch alone would also
    # save models that are worse than an earlier one.
    if ep >= 5 and val_loss < min(val_losses):
        torch.save(model.state_dict(), f'loss_{ep}_{val_loss}.pt')
    tr_losses.append(tr_loss)
    val_losses.append(val_loss)
    summary = f"Ep {ep}: Train loss {tr_loss} - Val loss {val_loss}"
    print(summary)
    
    

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/750 [00:00<?, ?it/s]

  0%|          | 0/322 [00:00<?, ?it/s]

Ep 0: Train loss 0.434 - Val loss 0.177


  0%|          | 0/750 [00:00<?, ?it/s]

  0%|          | 0/322 [00:00<?, ?it/s]

Ep 1: Train loss 0.278 - Val loss 0.169


In [11]:
# Re-enable gradients for the text model first, then for the image model.
for unfreeze_backbone in (model.unfreeze_lm, model.unfreeze_cnn):
    unfreeze_backbone()

In [17]:
# Write the model's current weights to disk.
checkpoint_state = model.state_dict()
torch.save(checkpoint_state, '2ep_frozen.pth')

In [12]:
# map_location remaps the stored tensors onto the current device, so the
# checkpoint also loads on a machine that lacks the device it was saved from.
model.load_state_dict(torch.load('2ep_frozen.pth', map_location = device))

<All keys matched successfully>

In [None]:
n_epochs = 20
# NOTE(review): swa_start is not referenced anywhere in the visible notebook.
swa_start = int(0.75*n_epochs)

lr = 2e-5
wd = 1e-4
# Name fragments that mark a parameter as exempt from weight decay.
no_decay = ["bias", "BatchNorm2d.weight", "BatchNorm2d.bias", "LayerNorm.weight", 'LayerNorm.bias',
            "BatchNorm1d.weight", "BatchNorm1d.bias"]

# Parameter names are built from attribute names (e.g. 'bn1.weight'), not from
# class names, so the 'BatchNorm*' fragments above only match when an attribute
# happens to be called that way. Normalisation layers are therefore also
# identified by module type, which keeps their weights out of the decayed group.
norm_layer_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.LayerNorm)
norm_param_ids = {id(p)
                  for m in model.modules() if isinstance(m, norm_layer_types)
                  for p in m.parameters(recurse=False)}

decay_params = [p for n, p in model.named_parameters()
                if not (id(p) in norm_param_ids or any(nd in n for nd in no_decay))]
no_decay_params = [p for n, p in model.named_parameters()
                   if id(p) in norm_param_ids or any(nd in n for nd in no_decay)]

optimizer_grouped_parameters = [
    {
        "params": decay_params,
        "weight_decay": wd,
    },
    {
        "params": no_decay_params,
        "weight_decay": 0.0,
    },
]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=lr)

# learning rate scheduler: one cycle over all optimisation steps
sched = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr =lr, pct_start = 0.3, #anneal_strategy = 'linear',
                                            total_steps = int(n_epochs * len(tr_dl)))

# `scaler`, `lf`, `train_transforms` and `val_transforms` are not defined in
# this cell; they must already exist in the kernel from earlier cells.

# Per-epoch mean losses (rounded to 3 decimals).
tr_losses = []
val_losses = []
for ep in tqdm(range(n_epochs)):
    # ---- training pass ----
    model.train()
    tr_loss = []
    pbar = tqdm(tr_dl)
    for anchor_image, anchor_text, pos_image, pos_text, neg_image, neg_text in pbar:

        # Augment on the target device; each triplet member is transformed independently.
        anchor = train_transforms(anchor_image.to(device)), text_to_device(anchor_text, device)
        pos = train_transforms(pos_image.to(device)), text_to_device(pos_text, device)
        neg = train_transforms(neg_image.to(device)), text_to_device(neg_text, device)

        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            anchor_emb = model(anchor)
            pos_emb = model(pos)
            neg_emb = model(neg)
            loss = lf(anchor_emb, pos_emb, neg_emb)

        # Scaled backward pass / optimiser step, then advance the LR schedule once per batch.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        sched.step()

        tr_loss.append(loss.item())
        pbar.set_description(f"Train loss: {round(np.mean(tr_loss),3)}")

    # ---- validation pass ----
    model.eval()
    val_loss = []
    with torch.no_grad():
        pbar = tqdm(val_dl)
        for anchor_image, anchor_text, pos_image, pos_text, neg_image, neg_text in pbar:

            anchor = val_transforms(anchor_image.to(device)), text_to_device(anchor_text, device)
            pos = val_transforms(pos_image.to(device)), text_to_device(pos_text, device)
            neg = val_transforms(neg_image.to(device)), text_to_device(neg_text, device)

            with torch.cuda.amp.autocast():
                anchor_emb = model(anchor)
                pos_emb = model(pos)
                neg_emb = model(neg)
                loss = lf(anchor_emb, pos_emb, neg_emb)

            val_loss.append(loss.item())
            pbar.set_description(f"Val loss: {round(np.mean(val_loss),3)}")

    tr_loss = round(np.mean(tr_loss),3)
    val_loss = round(np.mean(val_loss),3)
    # From epoch 5 on, checkpoint only when this epoch beats the best validation
    # loss seen so far. Comparing against the previous epoch alone would also
    # save models that are worse than an earlier one.
    if ep >= 5 and val_loss < min(val_losses):
        torch.save(model.state_dict(), f'loss_{ep}_{val_loss}.pt')
    tr_losses.append(tr_loss)
    val_losses.append(val_loss)
    summary = f"Ep {ep}: Train loss {tr_loss} - Val loss {val_loss}"
    print(summary)
    
    

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/750 [00:00<?, ?it/s]

In [None]:
# Dataset over the full table for embedding extraction. return_triplet = False
# makes each item a plain (image, text) pair, which is what the extraction
# loop unpacks; the default would yield six values per item.
testing_ds = TripletDS(df, tokenizer, small_images_dir, return_triplet = False)
testing_dl = DataLoader(testing_ds, batch_size = bs, shuffle = False, pin_memory = True)

In [26]:
# Embed every sample of testing_dl, collecting the per-batch results on the CPU.
embs = []
model.eval()
with torch.no_grad():
    pbar = tqdm(testing_dl)
    for image, text in pbar:
        # Apply the same resize + normalisation as the validation loop, so the
        # model sees inputs preprocessed the way they were during training.
        x = val_transforms(image.to(device)), text_to_device(text, device)
        y = model(x)
        embs.append(y.cpu())

  0%|          | 0/1071 [00:00<?, ?it/s]

In [27]:
# Stack the per-batch embeddings into one tensor. The guard makes the cell
# safe to re-run: once `embs` is already a tensor it is left untouched.
if not torch.is_tensor(embs):
    embs = torch.cat(embs, 0)

In [28]:
# Name the embedding dimensions emb_0, emb_1, ... and write the table to CSV.
emb_cols = [f'emb_{i}' for i in range(embs.shape[1])]
embs_df = pd.DataFrame(embs.numpy(), columns = emb_cols)
embs_df.to_csv('train_embs.csv')


In [22]:
# Writes embs_df to 'train_embs.csv'.
# NOTE(review): the previous cell already ends with this exact call, so this
# cell looks redundant - confirm before removing.
embs_df.to_csv('train_embs.csv')