In [76]:
import os
from itertools import chain

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as nnF
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

import torchvision.models as models
from torchvision.transforms import functional as F

In [2]:
def data_text_prep(data_dir="../data/text_clean"):
    """Load the cleaned train/test CSVs, tokenize the titles and build the vocabulary.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing ``train.csv`` and ``test.csv``. Defaults to the
        location the notebook has always read from.

    Returns
    -------
    d_train, d_test : pandas.DataFrame
        The loaded frames with two added columns, ``title_1_token`` and
        ``title_2_token``, holding the ``word_tokenize`` output of
        ``title_1_pre`` / ``title_2_pre``.
    word2idx : dict
        Token -> integer id. ``'<PAD>'`` is 0, ``'<UNK>'`` is 1, real tokens
        start at 2.
    idx2word : dict
        The inverse mapping of ``word2idx``.
    """
    d_train = pd.read_csv(os.path.join(data_dir, "train.csv"))
    d_test = pd.read_csv(os.path.join(data_dir, "test.csv"))

    # tokenize both title columns of both frames
    for frame in (d_train, d_test):
        for side in ("title_1", "title_2"):
            frame[f"{side}_token"] = frame[f"{side}_pre"].apply(word_tokenize)

    # The vocabulary is built from the training titles only; test-only tokens
    # therefore have no id of their own.
    title_token = chain.from_iterable(
        d_train.title_1_token.tolist() + d_train.title_2_token.tolist()
    )
    # Sort so that token ids are identical across kernel restarts: iteration
    # order of a set of strings changes from run to run under hash
    # randomization, which would silently invalidate saved embeddings.
    vocab_token = sorted(set(title_token))

    word2idx = {w: k for k, w in enumerate(vocab_token, 2)}
    idx2word = {k: w for k, w in enumerate(vocab_token, 2)}

    word2idx['<UNK>'] = 1
    idx2word[1] = '<UNK>'
    word2idx['<PAD>'] = 0
    idx2word[0] = '<PAD>'

    return d_train, d_test, word2idx, idx2word

In [15]:
class ShopeeDataset():
    """Map-style dataset of (title, title, image, image, label) pairs.

    Holds three splits ('train', 'val', 'test'); exactly one is active at a
    time and is selected with :meth:`set_split`. ``__len__`` and
    ``__getitem__`` always refer to the active split.

    Parameters
    ----------
    data : pandas.DataFrame
        Labelled pairs; split into train/val with ``train_test_split``.
    test : pandas.DataFrame
        Rows for the 'test' split.
    word2idx, idx2word : dict
        Token <-> id lookups used by :meth:`encode` / :meth:`decode`.
    img_dir : str, optional
        Directory the ``image_1`` / ``image_2`` file names are joined to.
        The same directory is used for every split.
        NOTE(review): confirm that test images really live here too.
    random_state : int or None, optional
        Forwarded to ``train_test_split``. ``None`` (default) keeps the
        original unseeded behavior; pass an int for a reproducible split.
    """

    PAD_IDX = 0   # id used to right-pad short titles
    UNK_IDX = 1   # id used for tokens missing from word2idx

    def __init__(self, data, test, word2idx, idx2word,
                 img_dir="../data/raw/training_img/training_img",
                 random_state=None):
        train, val = train_test_split(data, random_state=random_state)
        # __getitem__ looks rows up by label 0..n-1, so every split needs a
        # fresh positional index.
        train = train.reset_index(drop=True)
        val = val.reset_index(drop=True)
        test = test.reset_index(drop=True)
        self.word2idx = word2idx
        self.idx2word = idx2word
        self.img_dir = img_dir
        self.dataset = {
            'train': (train, train.shape[0]),
            'val': (val, val.shape[0]),
            'test': (test, test.shape[0])
        }
        self.set_split('train')

    def set_split(self, split='train'):
        """Make ``split`` ('train', 'val' or 'test') the active split."""
        self.data, self.length = self.dataset[split]

    def encode(self, text):
        """Map an iterable of tokens to a 1-D LongTensor of ids.

        Tokens not present in ``word2idx`` are mapped to ``UNK_IDX``.
        """
        token_ids = [self.word2idx.get(word, self.UNK_IDX) for word in text]
        return torch.tensor(token_ids, dtype=torch.long)

    def decode(self, ids):
        """Map an iterable of ids (ints or a 1-D tensor) back to tokens.

        Unknown ids become ``'<UNK>'``.
        """
        # int() is required: iterating a tensor yields 0-dim tensors, which
        # hash by identity and would never match the integer dict keys.
        return [self.idx2word.get(int(id_), '<UNK>') for id_ in ids]

    def set_fix_length(self, ids, max_len=25):
        """Right-pad with ``PAD_IDX`` or truncate ``ids`` to exactly ``max_len``.

        Always returns a new LongTensor of shape ``(max_len,)``.
        """
        fixed = torch.full((max_len,), self.PAD_IDX, dtype=torch.long)
        keep = min(ids.shape[0], max_len)
        fixed[:keep] = ids[:keep]
        return fixed

    def read_image(self, path, size=(224, 224)):
        """Load an image file, force 3-channel RGB, resize and convert to a tensor."""
        # The context manager releases the file handle deterministically.
        with Image.open(path) as raw:
            # Without convert('RGB'), grayscale / RGBA / palette files would
            # produce tensors with a different channel count and break batching.
            img = raw.convert('RGB').resize(size)
        return F.to_tensor(img)

    def __getitem__(self, idx):
        """Return ``(t1_ids, t2_ids, image_1, image_2, label)`` for row ``idx``.

        Raises
        ------
        IndexError
            If ``idx`` is outside ``[0, len(self))``; this also lets plain
            ``for item in dataset`` iteration terminate.
        """
        if not 0 <= idx < self.length:
            raise IndexError(f"index {idx} out of range for split of size {self.length}")

        t1 = self.data.loc[idx, 'title_1_token']
        t2 = self.data.loc[idx, 'title_2_token']
        i1 = self.data.loc[idx, 'image_1']
        i2 = self.data.loc[idx, 'image_2']
        # NOTE(review): assumes the active split has a 'Label' column —
        # confirm this holds for the 'test' frame.
        label = self.data.loc[idx, 'Label']

        t1_encode = self.set_fix_length(self.encode(t1))
        t2_encode = self.set_fix_length(self.encode(t2))

        i1_scaled = self.read_image(os.path.join(self.img_dir, i1))
        i2_scaled = self.read_image(os.path.join(self.img_dir, i2))

        return t1_encode, t2_encode, i1_scaled, i2_scaled, label

    def __len__(self):
        """Number of rows in the active split."""
        return self.length

In [100]:
class TextEncoder(nn.Module):
    """Embed token ids and run them through an LSTM.

    Parameters
    ----------
    num_vocab : int
        Number of rows in the embedding table.
    emb_size : int, optional
        Embedding width (LSTM input size).
    hid_size : int, optional
        LSTM hidden size (width of each output step).
    num_layers : int, optional
        Number of stacked LSTM layers.
    """

    def __init__(self, num_vocab, emb_size=512, hid_size=256, num_layers=1):
        super().__init__()
        embedding = nn.Embedding(num_vocab, emb_size)
        lstm = nn.LSTM(emb_size, hid_size, num_layers=num_layers, batch_first=True)
        # Kept as a Sequential named `network` so parameter names stay
        # `network.0.*` (embedding) and `network.1.*` (LSTM).
        self.network = nn.Sequential(embedding, lstm)

    def forward(self, input_):
        """Return the per-step LSTM outputs with a singleton dim inserted at position 1.

        For ``input_`` of shape (batch, seq) the result is
        (batch, 1, seq, hid_size); the final hidden/cell state is discarded.
        """
        # The LSTM is the last stage, so the Sequential returns its
        # (outputs, state) tuple unchanged.
        step_outputs, _final_state = self.network(input_)
        return step_outputs.unsqueeze(1)

In [101]:
class ImageEncoder(nn.Module):
    """MobileNetV2 feature extractor followed by a projection convolution.

    Parameters
    ----------
    out_channels : int, optional
        Channels produced by the projection convolution.
    kernel_size : tuple, optional
        Kernel size of the projection convolution.

    Notes
    -----
    ``models.mobilenet_v2()`` is called without arguments, so no weights are
    explicitly requested here. The full MobileNetV2 module is kept on
    ``self.mobilenet`` (its parameters are therefore part of this module),
    but only its ``features`` stack is used in ``forward``.
    """

    def __init__(self, out_channels=256, kernel_size=(3,3)):
        super(ImageEncoder, self).__init__()

        self.mobilenet = models.mobilenet_v2()
        self.backbone = self.mobilenet.features
        self.model = nn.Sequential(
            self.backbone,
            # 1280 is the input width this notebook has always used for the
            # projection on top of the MobileNetV2 feature stack.
            nn.Conv2d(in_channels=1280, out_channels=out_channels, kernel_size=kernel_size)
        )

    def forward(self, input_):
        """Encode a batch of images.

        Returns a tensor of shape (batch, 1, H*W, channels), where H and W
        are the spatial dims after the projection convolution: the spatial
        grid is flattened and then swapped with the channel axis.
        """
        out = self.model(input_)
        # Take batch size and channel count from the tensor itself; the
        # previous hard-coded (2, 1, 256, -1) only worked for batch_size=2
        # and out_channels=256.
        batch_size, channels = out.shape[0], out.shape[1]
        out = out.reshape(batch_size, 1, channels, -1)
        out = out.permute(0, 1, 3, 2)

        return out

In [102]:
class BaseNetwork(nn.Module):
    """Three Conv2d -> MaxPool2d stages, each convolution emitting one channel.

    Parameters
    ----------
    in_channel : int
        Channels of the input feature map (only the first convolution sees it).
    kernel_size_cnn : tuple, optional
        Kernel size shared by all three convolutions.
    kernel_size_max_pool : int, optional
        Kernel size shared by all three pooling layers.
    """

    def __init__(self, in_channel, kernel_size_cnn=(3,11), kernel_size_max_pool=2):
        super().__init__()

        stages = []
        channels = in_channel
        for _ in range(3):
            stages.append(nn.Conv2d(in_channels=channels, out_channels=1, kernel_size=kernel_size_cnn))
            stages.append(nn.MaxPool2d(kernel_size=kernel_size_max_pool))
            channels = 1  # every stage after the first consumes a single channel
        self.base_network = nn.Sequential(*stages)

    def forward(self, input_):
        """Run the stack and squeeze dim 1 twice.

        The first squeeze removes the single output channel; the second
        removes the following dim only when it has size 1.
        """
        return self.base_network(input_).squeeze(1).squeeze(1)

In [136]:
class WrapperModel(nn.Module):
    """Siamese wrapper: encodes two (title, image) pairs with shared weights.

    Parameters
    ----------
    num_vocab : int or None, optional
        Vocabulary size for the text encoder. When ``None`` (default) the
        size of the notebook-level ``word2idx`` dict is used, which is what
        this class always did; passing it explicitly removes the dependency
        on that global.
    """

    def __init__(self, num_vocab=None):
        super(WrapperModel, self).__init__()
        if num_vocab is None:
            # Backward-compatible fallback to the notebook global.
            num_vocab = len(word2idx)
        self.model_text = TextEncoder(num_vocab=num_vocab)
        self.model_image = ImageEncoder()
        self.model_base = BaseNetwork(in_channel=1)

    def forward(self, t1_encode, t2_encode, i1_scaled, i2_scaled):
        """Return one embedding per side of the pair, ``(vec_1, vec_2)``.

        The same text encoder, image encoder and base network are applied to
        both sides.
        """
        feat_t1 = self.model_text(t1_encode)
        feat_t2 = self.model_text(t2_encode)

        feat_i1 = self.model_image(i1_scaled)
        feat_i2 = self.model_image(i2_scaled)

        # Join text and image features along the last (feature) axis; all
        # other dims of the two tensors must already match.
        concat_1 = torch.cat((feat_t1, feat_i1), dim=3)
        concat_2 = torch.cat((feat_t2, feat_i2), dim=3)

        vec_1 = self.model_base(concat_1)
        vec_2 = self.model_base(concat_2)

        return vec_1, vec_2

In [142]:
def euclidean_distance(vec_1, vec_2, eps=1e-12):
    """Row-wise Euclidean distance between two (batch, features) tensors.

    Parameters
    ----------
    vec_1, vec_2 : torch.Tensor
        Tensors of identical shape; the reduction runs over ``dim=1``.
    eps : float, optional
        Lower bound applied to the squared distance before the square root.

    Returns
    -------
    torch.Tensor
        Distances with ``dim=1`` reduced away.
    """
    squared = torch.sum(torch.pow(vec_1 - vec_2, 2), dim=1)
    # d/dx sqrt(x) is infinite at x == 0, which turns into NaN gradients when
    # the two vectors are identical. Clamping keeps the backward pass finite
    # and leaves every distance above sqrt(eps) unchanged.
    ed = torch.sqrt(torch.clamp(squared, min=eps))

    return ed

In [151]:
def cont_loss(label, distance, margin=0.5):
    """Contrastive loss averaged over the batch.

    Entries with ``label == 1`` contribute ``distance ** 2``; entries with
    ``label == 0`` contribute ``max(margin - distance, 0) ** 2``.
    """
    pull_term = label * distance.pow(2)
    shortfall = torch.clamp(margin - distance, min=0)
    push_term = (1 - label) * shortfall.pow(2)

    return torch.mean(pull_term + push_term)

In [116]:
# Load both CSVs, tokenize the titles and build the vocabulary lookups.
# `word2idx` defined here is also the global that WrapperModel reads when it
# is constructed.
train, test, word2idx, idx2word = data_text_prep()

In [117]:
# Build the dataset (the active split defaults to 'train') and standalone
# copies of the three sub-networks so each can be inspected separately.
dataset = ShopeeDataset(train, test, word2idx, idx2word)
model_text = TextEncoder(num_vocab=len(word2idx))
model_image = ImageEncoder()
# in_channel=1 matches the singleton dim both encoders place at position 1.
model_base = BaseNetwork(in_channel=1)

In [118]:
# Combined Siamese model; it creates its own sub-networks and does not share
# weights with model_text / model_image / model_base defined earlier.
model_wrap = WrapperModel()

In [119]:
# Count only parameters that receive gradients, so the number matches the
# "Trainable" label even if part of the model is frozen later.
num_params = sum(p.numel() for p in model_text.parameters() if p.requires_grad)
print(f"Trainable params: {num_params:,}")

Trainable params: 5,128,192


In [120]:
# Count only parameters that receive gradients, so the number matches the
# "Trainable" label even if part of the model is frozen later.
num_params = sum(p.numel() for p in model_image.parameters() if p.requires_grad)
print(f"Trainable params: {num_params:,}")

Trainable params: 6,454,248


In [121]:
# Count only parameters that receive gradients, so the number matches the
# "Trainable" label even if part of the model is frozen later.
num_params = sum(p.numel() for p in model_base.parameters() if p.requires_grad)
print(f"Trainable params: {num_params:,}")

Trainable params: 102


In [122]:
# Count only parameters that receive gradients, so the number matches the
# "Trainable" label even if part of the model is frozen later.
num_params = sum(p.numel() for p in model_wrap.parameters() if p.requires_grad)
print(f"Trainable params: {num_params:,}")

Trainable params: 11,582,542


In [123]:
# Tiny batches for a shape smoke test; the dataset is on its default 'train'
# split and no shuffle argument is passed.
data_gen = DataLoader(dataset, batch_size=2)

In [124]:
# Fetch exactly one batch. next(iter(...)) raises StopIteration on an empty
# loader instead of silently leaving these names undefined (or stale from an
# earlier run), which the for/break form would do.
t1_encode, t2_encode, i1_scaled, i2_scaled, label = next(iter(data_gen))

In [125]:
# Step-by-step check of the forward pass using the standalone sub-networks.
# Text features: the same encoder is applied to both titles.
feat_t1 = model_text(t1_encode)
feat_t2 = model_text(t2_encode)

In [126]:
# Image features: the same encoder is applied to both images.
feat_i1 = model_image(i1_scaled)
feat_i2 = model_image(i2_scaled)

In [127]:
# Concatenate text and image features along the last axis (axis=3); the
# other dims of the two tensors must already agree for this to succeed.
concat_1 = torch.cat((feat_t1, feat_i1), axis=3)
concat_2 = torch.cat((feat_t2, feat_i2), axis=3)

In [130]:
# Reduce each fused feature map to one vector per sample with the shared base network.
vec_1 = model_base(concat_1)
vec_2 = model_base(concat_2)

In [131]:
# Sanity check: one embedding vector per sample in the batch.
vec_1.shape

torch.Size([2, 55])

In [132]:
# Must match vec_1's shape for the row-wise distance to be defined.
vec_2.shape

torch.Size([2, 55])

In [133]:
# Same forward pass through the wrapper; this overwrites vec_1 / vec_2 from
# the manual step-by-step run with the wrapper's own (separately initialised) outputs.
vec_1, vec_2 = model_wrap(t1_encode, t2_encode, i1_scaled, i2_scaled)

In [134]:
# The wrapper output should have the same shape as the manual pipeline's.
vec_1.shape

torch.Size([2, 55])

In [135]:
# Second side of the pair; must match vec_1's shape.
vec_2.shape

torch.Size([2, 55])

In [143]:
# One distance per pair in the batch (reduction over dim=1).
ed = euclidean_distance(vec_1, vec_2)

In [149]:
# Labels of the fetched batch: in cont_loss, 1 selects the squared-distance
# term and 0 selects the margin term.
label

tensor([1, 0])

In [152]:
# Contrastive loss for the batch with the default margin of 0.5.
loss = cont_loss(label, ed)

In [154]:
# Backpropagate to check that gradients flow end to end; no optimizer step
# is taken in the cells shown here.
loss.backward()