# Joint Modeling of Zalando & Polyvore 

 - Create an image embedding that understands both the types of data
 - Use the product catgeory information that is assumed to be available as the target
 - use this embedding for subsequent outfit generation/compatibility learning

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import glob
import json
from collections import Counter
from PIL import Image
import numpy as np
import pandas as pd
import torch
from torchvision.models import resnet50

# import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from numpy.random import default_rng

from tqdm import tqdm
import pickle
from prettytable import PrettyTable

import sys
sys.path.insert(0, "/recsys_data/RecSys/fashion/automl/efficientnetv2")
import effnetv2_model

%pylab inline
import matplotlib.pyplot as plt

print("Pytorch:", torch.__version__)
print("NP:", np.__version__)

Populating the interactive namespace from numpy and matplotlib
Pytorch: 1.10.0+cu102
NP: 1.19.5


In [3]:
zalando_dir = "/recsys_data/RecSys/Zalando_Outfit/female/Outfit_Data"
zalando_image_dir = "/recsys_data/RecSys/Zalando_Outfit/resized_packshot_images_female"

pv_base_dir = "/recsys_data/RecSys/fashion/polyvore-dataset/polyvore_outfits"
pv_image_dir = os.path.join(pv_base_dir, "images")
pv_item_file = "polyvore_item_metadata.json"

pv2zalando = {'accessories': 'accessory',
              'all-body': 'all-body',
              'bags': 'accessory',
              'bottoms': 'bottomwear',
              'hats': 'accessory',
              'jewellery': 'jewellery',
              'outerwear': 'outerwear',
              'scarves': 'accessory',
              'shoes': 'footwear',
              'sunglasses': 'accessory',
              'tops': 'topwear'}

zalando2pv = {'all-body': 'all-body', 
              'footwear': 'shoes', 
              'accessory': 'accessories', 
              'outerwear': 'outerwear', 
              'jewellery': 'jewellery', 
              'topwear': 'tops', 
              'bottomwear': 'bottoms', 
#               'bodywear_nightwear_innerwear': None, 
#               'beachwear_swimwear': None
             }

with open(os.path.join(pv_base_dir, pv_item_file), 'r') as fr:
    pv_items = json.load(fr)

pv_item2cat = dict()
for item_id in pv_items:
    pv_item2cat[item_id] = pv_items[item_id]["semantic_category"]

zalando_item2cat = dict()
with open(os.path.join(zalando_dir, "item2cat.txt"), 'r') as fr:
    for line in fr:
        item, cat = line.strip().split()
        zalando_item2cat[item] = cat

In [4]:
all_paths = dict()
count = 0
for image_path in glob.glob(pv_image_dir + "/*.jpg"):
    image = image_path.split("/")[-1]
    item_id = image.split(".")[0]
    if item_id in pv_item2cat:
        all_paths[image_path] = pv_item2cat[item_id]
        count += 1
print(f"Read {count} polyvore images")

count = 0
for image_path in glob.glob(zalando_image_dir + "/*.jpg"):
    image = image_path.split("/")[-1]
    if image in zalando_item2cat:
        zcat = zalando_item2cat[image]
        if zcat in zalando2pv:
            all_paths[image_path] = zalando2pv[zcat]
        count += 1
print(f"Read {count} Zalando images")

Read 251008 polyvore images
Read 35576 Zalando images


In [5]:
data = list(all_paths.items())
rng = default_rng(seed=100)
num_valid = 35576
num_test = 51008

test_indices = rng.choice(len(data), size=num_test, replace=False)
test_paths = [data[ii] for ii in test_indices]
rem_paths = [data[ii] for ii in range(len(data)) if ii not in test_indices]

val_indices = rng.choice(len(rem_paths), size=num_valid, replace=False)
val_paths = [rem_paths[ii] for ii in val_indices]
train_paths = [rem_paths[ii] for ii in range(len(rem_paths)) if ii not in val_indices]
print(len(train_paths), len(val_paths), len(test_paths))

196279 35576 51008


In [6]:
for item_id, cat in train_paths[0:5]:
    print(item_id, cat)

/recsys_data/RecSys/fashion/polyvore-dataset/polyvore_outfits/images/200162527.jpg outerwear
/recsys_data/RecSys/fashion/polyvore-dataset/polyvore_outfits/images/129089366.jpg bags
/recsys_data/RecSys/fashion/polyvore-dataset/polyvore_outfits/images/191335829.jpg outerwear
/recsys_data/RecSys/fashion/polyvore-dataset/polyvore_outfits/images/128377889.jpg hats
/recsys_data/RecSys/fashion/polyvore-dataset/polyvore_outfits/images/212676450.jpg bags


In [7]:
from utils_torch import ImageDataGen
label_dict = {'accessories': 0, 
              'outerwear': 1, 
              'bags': 2, 
              'shoes': 3, 
              'all-body': 4, 
              'hats': 5, 
              'scarves': 6, 
              'jewellery': 7, 
              'sunglasses': 8, 
              'bottoms': 9, 
              'tops': 10}

train_set = ImageDataGen(train_paths,
                         input_size=(3, 224, 224),
                         label_dict=label_dict,
                         # max_example=1000,
                        )

valid_set = ImageDataGen(val_paths,
                         input_size=(3, 224, 224),
                         label_dict=label_dict,
                        )

test_set = ImageDataGen(test_paths,
                        input_size=(3, 224, 224),
                        label_dict=label_dict,
                        )


Total 1000 images with 11 classes
Total 1000 images with 11 classes
Total 1000 images with 11 classes


In [8]:
for ii in range(4):
    inps, targs = train_set[ii]
    print(inps.shape, targs.shape)

torch.Size([3, 224, 224]) torch.Size([])
torch.Size([3, 224, 224]) torch.Size([])
torch.Size([3, 224, 224]) torch.Size([])
torch.Size([3, 224, 224]) torch.Size([])


In [9]:
class ImageClassifier(torch.nn.Module):
    def __init__(self, d_model, num_classes):
        super(ImageClassifier, self).__init__()
        # Get a resnet50 backbone
        m = resnet50()
        # Extract 4 main layers (note: MaskRCNN needs this particular name
        # mapping for return nodes)
        # self.body = create_feature_extractor(
        #    m, return_nodes={f'layer{k}': str(v)
        #                     for v, k in enumerate([1, 2, 3, 4])})
        self.body = m
        self.image_projector = torch.nn.Sequential(
            torch.nn.Linear(1000, d_model),
            torch.nn.Tanh())
        
        # for NLL loss
#         self.final = torch.nn.Sequential(
#             torch.nn.Linear(d_model, num_classes),
#             torch.nn.LogSoftmax(dim=1))

        # for categorical-crossentropy loss
        self.final = torch.nn.Sequential(
            torch.nn.Linear(d_model, num_classes),
            torch.nn.ReLU())
        
    def forward(self, x):
        x = self.body(x)
        y = self.image_projector(x)
        y = self.final(y)
        return y


In [10]:
model = ImageClassifier(256, 11)
model.to('cuda')
total_params = 0
for name, parameter in model.named_parameters():
    if not parameter.requires_grad:
        continue
    param = parameter.numel()
    total_params += param
print(f"Total Trainable Params: {total_params}")

Total Trainable Params: 25816115


In [11]:
from utils_torch import train

batch_size = 64
model_path = "/recsys_data/RecSys/fashion/finetuned_resnet50.pt"
val_perf, test_perf = train(model, 
                          train_set, 
                          valid_set, 
                          device='cuda', 
                          epochs=10, 
                          batch_size=batch_size,
                          learning_rate=1e-04,
                          loss_name="xent",
                          observe="acc", 
                          test_set=test_set,
                          model_path=model_path,
                         )

                                                             

| Epoch   0 | time: 37.13s | train-loss 2.3265 | val-loss 6.9083 | val-ACC 0.1680 ( 0)


                                                             

| Epoch   1 | time: 37.80s | train-loss 2.2627 | val-loss 6.8811 | val-ACC 0.2020 ( 0)


                                                             

| Epoch   2 | time: 37.38s | train-loss 2.1750 | val-loss 6.8823 | val-ACC 0.1490 ( 1)


                                                             

| Epoch   3 | time: 37.56s | train-loss 2.1215 | val-loss 6.8622 | val-ACC 0.1760 ( 2)


                                                             

| Epoch   4 | time: 38.00s | train-loss 2.0630 | val-loss 6.9173 | val-ACC 0.1730 ( 3)


                                                             

| Epoch   5 | time: 37.48s | train-loss 2.0757 | val-loss 6.8841 | val-ACC 0.1580 ( 4)


                                                             

| Epoch   6 | time: 37.21s | train-loss 1.9532 | val-loss 6.9166 | val-ACC 0.1530 ( 5)


                                                             

| Epoch   7 | time: 37.52s | train-loss 1.9227 | val-loss 6.9580 | val-ACC 0.1570 ( 6)


                                                             

| Epoch   8 | time: 37.11s | train-loss 1.8768 | val-loss 6.9649 | val-ACC 0.1560 ( 7)


                                                             

| Epoch   9 | time: 37.75s | train-loss 1.8178 | val-loss 6.9916 | val-ACC 0.1500 ( 8)
Best valid Accuracy: 0.2020


                                               

Test results: {'acc': 0.123, 'loss': 6.93646}




In [12]:
# model.load_state_dict(torch.load("/recsys_data/RecSys/fashion/finetuned_resnet.pt"))
# model.eval()

In [13]:
# model = ImageClassifier(256, 11).eval()
# y = model(torch.unsqueeze(inps, 0))
# y.shape

In [14]:
# list(all_paths.keys())[0:5]

In [15]:
# list(all_paths.keys())[-5:]

In [16]:
# im = Image.open('/recsys_data/RecSys/fashion/polyvore-dataset/polyvore_outfits/images/114082981.jpg').convert('RGB')
# np.array(im).shape

In [17]:
# im = Image.open('/recsys_data/RecSys/Zalando_Outfit/resized_packshot_images_female/M0Q21A0BP-G11@10.jpg').convert('RGB')
# np.array(im).shape