In [4]:
import json
import os

import Database_connector
import pandas as pd
%load_ext autoreload
%autoreload 2

In [5]:
import pandas as pd
import torch
import numpy as np
from Database_Settings import DB_SETTINGS
import Database_connector
from mysql.connector import (connection)
from PIL import Image

# Folder and file names of the pickled input DataFrames (combined into the
# default arguments of prepare_data).
PICKLED_DATA_FOLDER = "Pickled_Data/"
PICKLED_PICTURES_FILE = "pictures_embeddings_resnet152_df.pkl"
PICKLED_OUTFIRS_FILE = "active_outfits_df.pkl"

# Picture columns that can be dropped; use an empty list to keep every column.
PICTURES_DROP_COLUMNS = [
    "contentType",
    "status",
    "displayOrder",
    "sourceURL",
    "embeddings",
]
# Outfit columns removed by get_outfits_df.
OUTFITS_DROP_COLUMNS = [
    "owner",
    "name",
    "brand",
    "isPublic",
    "isDeleted",
    "meta.validFrom",
    "meta.validTo",
    "Outfit_size",
]

def get_pictures_df(path):
    """Load the pickled pictures DataFrame at ``path``, keeping only its first two columns.

    Columns are selected by position: every column label from position 2
    onwards is dropped, whatever it is called.
    """
    loaded_pictures = pd.read_pickle(path)
    surplus_columns = loaded_pictures.columns[2:]
    return loaded_pictures.drop(columns=surplus_columns)

def get_outfits_df(path):
    """Load the pickled outfits DataFrame at ``path`` without the columns in OUTFITS_DROP_COLUMNS."""
    loaded_outfits = pd.read_pickle(path)
    return loaded_outfits.drop(columns=OUTFITS_DROP_COLUMNS)

def prepare_data(pictures_df_path = PICKLED_DATA_FOLDER + PICKLED_PICTURES_FILE, outfits_df_path = PICKLED_DATA_FOLDER + PICKLED_OUTFIRS_FILE):
    """Load both input DataFrames.

    Args:
        pictures_df_path: Pickle file passed to ``get_pictures_df``.
        outfits_df_path: Pickle file passed to ``get_outfits_df``.

    Returns:
        Tuple ``(pictures_df, outfits_df)``.
    """
    # The tuple is evaluated left to right, so pictures are loaded first.
    return get_pictures_df(pictures_df_path), get_outfits_df(outfits_df_path)

def picture_exists(picture_id, pictures_dir_path):
    """Return True if ``<pictures_dir_path>/<picture_id>.jpg`` is an existing regular file.

    Args:
        picture_id: Identifier used as the file name stem.
        pictures_dir_path: Directory to look in, with or without a trailing
            separator. An empty string means the current working directory.

    Returns:
        bool: Result of ``os.path.isfile`` on the joined path.
    """
    # os.path.join copes with trailing separators and with an empty directory;
    # plain concatenation with os.sep turned "" into the filesystem root.
    return os.path.isfile(os.path.join(pictures_dir_path, f"{picture_id}.jpg"))

def find_missing_pictures(pictures_df, pictures_dir_path):
    """Flag which pictures have a file on disk and count the outcomes.

    Adds (or overwrites) a boolean ``file_exists`` column on ``pictures_df``
    in place, computed by ``picture_exists`` for every value in ``id``.

    Returns:
        The ``value_counts()`` of the new ``file_exists`` column.
    """
    def _has_file(picture_id):
        return picture_exists(picture_id, pictures_dir_path)

    pictures_df["file_exists"] = pictures_df["id"].apply(_has_file)
    return pictures_df["file_exists"].value_counts()

In [6]:
# Disabled alternative: load the pickles from the dataset-processing folder.
# pictures_df, outfits_df = prepare_data(pictures_df_path="../FREja_dataset_processing/pictures_embeddings_resnet152_df.pkl"
#                                        , outfits_df_path="../FREja_dataset_processing/active_outfits_df.pkl")

# Load the active-outfit pictures and outfits from the local pickle folder.
pictures_df, outfits_df = prepare_data(
    pictures_df_path="Pickled_Data/active_outfit_pictures_df.pkl",
    outfits_df_path="Pickled_Data/active_outfits_df.pkl",
)
pictures_df.head()

Unnamed: 0,id,owner
1,picture.00058abb53434872ae9bb4270ae21f8e,outfit.98f32aaf08bc4ff09c44e6e11e9199bc
2,picture.00063f52c36d43ada95da45f819b30b4,outfit.9fd1c42c3db543c5b6e53b0db1ee8c0f
3,picture.0008443461814f5c988f123718bbd20e,outfit.a7539783b6e94591bdf4e10339afc1d7
4,picture.000a5db3362049aebcc1eb2bf7bde95f,outfit.745fa2bc8156478bac6c0f7d46dadbda
6,picture.000ddec26b7a4aa495e2d7db9e9585e9,outfit.40d4e8f739a74e488769b16f8f87ca83


In [7]:
# Presumably the folder with the filtered source pictures; not referenced by
# the cells visible here -- TODO confirm it is still needed.
PICTURES_DIR_PATH = "../FREja_dataset_processing/Filtered_pictures/"
# Number of chunks the pictures DataFrame is split into when embedding.
DF_SPLITS = 200

# Device that models and image batches are moved to when a config sets use_cuda.
DEVICE = "cuda"
# Root folder for everything produced by the embedding cells.
EMBEDDINGS_SAVE_DIR = "Pickled_Data/Embeddings_CLIP/"
# Folder the local image files are read from (see load_local_image).
EMBEDDINGS_PICTURES_DIR = f"{EMBEDDINGS_SAVE_DIR}pictures/"

# makedirs also creates the missing parent ("Pickled_Data/"), which os.mkdir
# does not, and exist_ok makes the cell safe to re-run.
os.makedirs(EMBEDDINGS_SAVE_DIR, exist_ok=True)

In [8]:
from torchvision import transforms
import copy
from IPython.display import display
import ipywidgets
from tqdm.notebook import tqdm
from torchvision.io import read_image, ImageReadMode

import Retrieve_Image_Bucket_Data

def load_local_image(image_id):
    """Read the local image file named ``image_id`` from EMBEDDINGS_PICTURES_DIR.

    The file is decoded with torchvision's ``read_image`` in RGB mode and the
    resulting tensor is returned. The file name is ``image_id`` exactly as
    given; no extension is appended.
    """
    return read_image(EMBEDDINGS_PICTURES_DIR + image_id, mode=ImageReadMode.RGB)

def image_to_embedding(model_e, image_id, d_widget, index_num, bucket=None):
    """Embed a single image, fetching it from the bucket when it is not available locally.

    Args:
        model_e: Callable applied to the loaded image tensor.
        image_id: Identifier of the image to embed.
        d_widget: Object whose ``value`` attribute is set to a progress message.
        index_num: Number shown in the progress message.
        bucket: Optional bucket handle for the download fallback. When omitted,
            it is obtained from ``Retrieve_Image_Bucket_Data.get_bucket()``
            only if the fallback is actually needed.

    Returns:
        ``model_e(image)[0].squeeze().cpu()``.
    """
    d_widget.value = f"Converting file number: {index_num}"

    try:
        input_tensor = load_local_image(image_id)
    except Exception:
        # Any load failure triggers the download fallback; KeyboardInterrupt and
        # SystemExit are no longer swallowed as they were by a bare ``except``.
        print(f"Could not load image: {image_id}, downloading from bucket...")
        if bucket is None:
            # The original code read an undefined global ``bucket`` here.
            bucket = Retrieve_Image_Bucket_Data.get_bucket()
        input_tensor = Retrieve_Image_Bucket_Data.download_picture(bucket, image_id, "Temp.jpg", image_format="torch")
    embedding = model_e(input_tensor)
    return embedding[0].squeeze().cpu()

def get_df_embeddings(pictures_df, embedding_model, preprocess):
    """Embed every picture in ``pictures_df`` row by row.

    Note: a later cell of this notebook defines another ``get_df_embeddings``
    with a different signature, which replaces this one once it has run.

    Args:
        pictures_df: DataFrame with an ``id`` column of image identifiers.
        embedding_model: Model applied to each preprocessed image.
        preprocess: Callable applied to each loaded image before the model.

    Returns:
        The concatenation of all DataFrame chunks, each with an added
        ``embeddings`` column.
    """
    display_out = ipywidgets.HTML()
    display(display_out)

    if torch.cuda.is_available():
        embedding_model.to('cuda')

    def preprocess_then_embed(image_tensor):
        # NOTE(review): the tensor is neither batched nor moved to the model's
        # device here -- confirm what ``preprocess`` is expected to return.
        return embedding_model(preprocess(image_tensor))

    split_dfs = np.array_split(pictures_df, DF_SPLITS)
    with torch.no_grad():
        for df_split in tqdm(split_dfs):
            # image_to_embedding takes (model_e, image_id, d_widget, index_num);
            # the previous call passed ``preprocess`` as an extra leading
            # argument and failed with a TypeError.
            df_split["embeddings"] = df_split.apply(
                lambda row: image_to_embedding(preprocess_then_embed, row["id"], display_out, row.name),
                axis=1,
            )
    embedding_df = pd.concat(split_dfs)
    return embedding_df


In [9]:
from tqdm import tqdm, tqdm_notebook
from torchvision.models import quantization
from torchvision import models
import torchvision
import torch

import Retrieve_Image_Bucket_Data

class Embedding_Config():
    """Base class bundling a pretrained model, its preprocessing transforms and image loading.

    Subclasses are expected to set ``use_cuda``, ``weights``, ``transforms``,
    ``model`` and ``model_save_name`` and to implement ``load_model``.
    """

    class exception_hack(Exception):
        """Carrier exception; ``args[0]`` holds the tensor captured by ``Parasite_Module``."""
        pass

    class Parasite_Module(torch.nn.Module):
        """Layer replacement that aborts the forward pass and hands back an intermediate tensor.

        ``forward`` always raises ``Embedding_Config.exception_hack`` carrying
        either the output of ``host_layer`` (``include_host=True``) or the
        untouched input of the layer (``include_host=False``).
        """

        def __init__(self, host_layer, include_host=True):
            super().__init__()
            self.host_layer = host_layer
            self.include_host = include_host

        def forward(self, x):
            if self.include_host:
                x = self.host_layer(x)
            raise Embedding_Config.exception_hack(x)

    def __init__(self):
        # All of these are placeholders that subclasses fill in.
        self.use_cuda = None

        self.weights = None
        self.transforms = None
        self.model = None
        self.model_save_name = None
        # Bucket handle used as download fallback when an image is not available locally.
        self.bucket = Retrieve_Image_Bucket_Data.get_bucket()

    def load_model(self):
        """Build and return the model; must be implemented by subclasses."""
        raise NotImplementedError

    def prepare_model(self):
        """Load the model, move it to DEVICE when ``use_cuda`` is set and switch it to eval mode."""
        model = self.load_model()
        if self.use_cuda:
            model.to(DEVICE)
        model.eval()
        return model

    def print_config_summary(self):
        """Print the save name, the model type and the weights in use."""
        print(f"PKL name: {self.model_save_name} | Model: {type(self.model).__name__} | Weights: {type(self.weights).__name__} | {self.weights.name}")

    def load_images(self, image_series):
        """Load and preprocess the images for ``image_series`` into one batch tensor.

        Each image goes through ``self.transforms`` and gets a leading batch
        dimension; the results are stacked and moved to DEVICE when
        ``use_cuda`` is set.
        """
        image_tensors = self.load_image_series_torchvision(image_series)
        formatted_images = [self.transforms(test_img).unsqueeze(0) for test_img in image_tensors]
        formatted_images = torch.vstack(formatted_images)
        if self.use_cuda:
            formatted_images = formatted_images.to(DEVICE)
        return formatted_images

    def load_image_series_torchvision(self, image_series):
        """Return one image tensor per id, downloading from the bucket when the local load fails."""
        images = []
        for image_id in image_series:
            try:
                input_tensor = load_local_image(image_id)
            except Exception:
                # Any load failure triggers the download fallback, but
                # KeyboardInterrupt/SystemExit now propagate instead of being
                # swallowed by a bare ``except``.
                print(f"Could not load image: {image_id}, downloading from bucket...")
                input_tensor = Retrieve_Image_Bucket_Data.download_picture(self.bucket, image_id, "Temp.jpg", image_format="torch")
            images.append(input_tensor)
        return images

    def get_embeddings(self, image_tensors):
        """Run the model on ``image_tensors`` and return the embeddings as a half-precision CPU tensor.

        When the model contains a ``Parasite_Module`` the forward pass ends with
        ``exception_hack``; the tensor it carries is used as the embedding.
        Otherwise the regular model output is used.
        """
        with torch.no_grad():
            try:
                embeddings = self.model(image_tensors)
            except self.exception_hack as e:
                embeddings = e.args[0]
        # Reduce to half precision to save space and load time
        forward_embeddings = embeddings.cpu()
        return forward_embeddings.half()


In [10]:
import torch
import torch.nn as nn
from torchvision.models import resnet50

# Create a custom layer that adds print functionality
class PrintLayer(nn.Module):
    """Wrapper that prints the wrapped layer's class name and the input shape, then delegates to it."""

    def __init__(self, layer):
        super().__init__()
        self.layer = layer

    def forward(self, x):
        wrapped = self.layer
        print(f"Layer: {wrapped.__class__.__name__}, Input shape: {x.shape}")
        return wrapped(x)

# Subclass the ResNet50 model to create a custom model
class CustomResNet50(nn.Module):
    """Re-packages a model so that each direct child is wrapped in ``PrintLayer``.

    The forward pass feeds the input through the wrapped children one after
    another, in registration order; nothing else from the original model's
    own ``forward`` is applied.
    """

    def __init__(self, original_model):
        super().__init__()
        for child_name, child in original_model.named_children():
            setattr(self, child_name, PrintLayer(child))

    def forward(self, x):
        out = x
        for wrapped_child in self.children():
            out = wrapped_child(out)
        return out

In [None]:
class ViT_H_14_E2E_Config(Embedding_Config):
    """ViT-H/14 with IMAGENET1K_SWAG_E2E_V1 weights; the embedding is the output of ``heads.head``."""

    def __init__(self):
        super().__init__()
        self.use_cuda = True
        self.model_save_name = f"{type(self).__name__}.pkl"

        self.weights = models.ViT_H_14_Weights.IMAGENET1K_SWAG_E2E_V1
        self.transforms = self.weights.transforms(antialias=True)
        # prepare_model() reads use_cuda and weights, so it runs after both are set.
        self.model = self.prepare_model()
        self.print_config_summary()

    def load_model(self):
        """Build the ViT and swap its final head for a tap that runs the head and returns its output."""
        vit = models.vit_h_14(weights=self.weights)
        vit.heads.head = self.Parasite_Module(vit.heads.head, include_host=True)
        return vit

class ViT_H_14_E2E_final(Embedding_Config):
    """ViT-H/14 with IMAGENET1K_SWAG_E2E_V1 weights; the embedding is the input of ``heads.head``."""

    def __init__(self):
        super().__init__()
        self.use_cuda = True
        self.model_save_name = f"{type(self).__name__}.pkl"

        self.weights = models.ViT_H_14_Weights.IMAGENET1K_SWAG_E2E_V1
        self.transforms = self.weights.transforms(antialias=True)
        # prepare_model() reads use_cuda and weights, so it runs after both are set.
        self.model = self.prepare_model()
        self.print_config_summary()

    def load_model(self):
        """Build the ViT and swap its final head for a tap that returns the tensor arriving at the head."""
        vit = models.vit_h_14(weights=self.weights)
        vit.heads.head = self.Parasite_Module(vit.heads.head, include_host=False)
        return vit

class resnet50_V1_Config(Embedding_Config):
    """ResNet-50 with IMAGENET1K_V1 weights, tapped at the input of ``fc`` and wrapped for shape printing."""

    def __init__(self):
        super().__init__()
        self.use_cuda = True
        self.model_save_name = "resnet50_v1.pkl"

        self.weights = models.ResNet50_Weights.IMAGENET1K_V1
        self.transforms = self.weights.transforms(antialias=True)
        # prepare_model() reads use_cuda and weights, so it runs after both are set.
        self.model = self.prepare_model()
        self.print_config_summary()

    def model_forward_append(self, original_forward):
        """Return a wrapper around ``original_forward`` that also prints the shape of its result."""
        def new_forward(x):
            result = original_forward(x)
            print(result.shape)
            return result
        return new_forward

    def load_model(self):
        """Build the ResNet, replace ``fc`` by a tap on its input and wrap every child in ``PrintLayer``."""
        resnet = models.resnet50(weights=self.weights)
        resnet.fc = self.Parasite_Module(resnet.fc, include_host=False)
        return CustomResNet50(resnet)

class EfficientNet_V2_L_final(Embedding_Config):
    """EfficientNetV2-L with IMAGENET1K_V1 weights; the embedding is the input of ``classifier[1]``."""

    def __init__(self):
        super().__init__()
        self.use_cuda = True
        self.model_save_name = f"{type(self).__name__}.pkl"

        self.weights = models.EfficientNet_V2_L_Weights.IMAGENET1K_V1
        self.transforms = self.weights.transforms()
        # prepare_model() reads use_cuda and weights, so it runs after both are set.
        self.model = self.prepare_model()
        self.print_config_summary()

    def load_model(self):
        """Build the network and swap ``classifier[1]`` for a tap that returns the tensor arriving there."""
        effnet = models.efficientnet_v2_l(weights=self.weights)
        effnet.classifier[1] = self.Parasite_Module(effnet.classifier[1], include_host=False)
        return effnet

class ConvNext_Large_v1_final(Embedding_Config):
    """ConvNeXt-Large with IMAGENET1K_V1 weights; the embedding is the input of ``classifier[2]``."""

    def __init__(self):
        super().__init__()
        self.use_cuda = True
        self.model_save_name = f"{type(self).__name__}.pkl"

        self.weights = models.ConvNeXt_Large_Weights.IMAGENET1K_V1
        self.transforms = self.weights.transforms()
        # prepare_model() reads use_cuda and weights, so it runs after both are set.
        self.model = self.prepare_model()
        self.print_config_summary()

    def load_model(self):
        """Build the network and swap ``classifier[2]`` for a tap that returns the tensor arriving there."""
        convnext = models.convnext_large(weights=self.weights)
        convnext.classifier[2] = self.Parasite_Module(convnext.classifier[2], include_host=False)
        return convnext

# Smoke test: embed the first ten pictures with the ResNet-50 configuration
# and show the shapes of the image batch and of the resulting embeddings.
test_config = resnet50_V1_Config()
test_loaded_images = test_config.load_images(
    pictures_df["id"][:10]
)
test_embeddings = test_config.get_embeddings(test_loaded_images)
(test_loaded_images.shape, test_embeddings.shape)

In [56]:
import pandas as pd
import numpy as np

from mysql.connector import (connection)
from Database_Settings import DB_SETTINGS
import Database_connector

# Open the database connections (connection settings come from Database_Settings).
cnx = connection.MySQLConnection(**DB_SETTINGS)
db_connection = Database_connector.Db_Connection()

# Pictures are taken from the pickled id list rather than queried from the database.
pictures_df = pd.read_pickle(f"{EMBEDDINGS_SAVE_DIR}/used_picture_ids.pkl")
# pictures_query = "SELECT * FROM Pictures WHERE (`Pictures`.`meta.validTo` >= '9999-01-01 00:00:00')"
# cursor.execute(pictures_query)
# pcitures_results = cursor.fetchall()

# pictures_df = pd.DataFrame([list(order_dict.values()) for order_dict in pcitures_results], columns=list(pcitures_results[0].keys()))

# A single dictionary cursor is enough; the earlier version created one, never
# used it and then replaced it with a second one.
cursor = cnx.cursor(dictionary=True)
#tag_query = "SELECT * FROM Outfits WHERE (`Outfits`.`isPublic` = TRUE AND `Outfits`.`isDeleted` = FALSE AND `Outfits`.`meta.validTo` >= '9999-01-01 00:00:00')"
tag_query = "SELECT `Outfits`.`id` AS `id`, `Outfits`.`owner` AS `owner`, `Outfits`.`name` AS `name`, `Outfits`.`brand` AS `brand`, `Outfits`.`description` AS `description`, `Outfits`.`isPublic` AS `isPublic`, `Outfits`.`isDeleted` AS `isDeleted`, `Outfits`.`type` AS `type`, `Outfits`.`keywords` AS `keywords`, `Outfits`.`retailPrice` AS `retailPrice`, `Outfits`.`meta.validFrom` AS `meta.validFrom`, `Outfits`.`meta.validTo` AS `meta.validTo` FROM `Outfits` WHERE (`Outfits`.`isPublic` = TRUE AND `Outfits`.`isDeleted` = FALSE AND `Outfits`.`meta.validTo` >= '9999-01-01 00:00:00')"
cursor.execute(tag_query)
outfit_results = cursor.fetchall()

# Take the column names from the cursor instead of from the first row, so an
# empty result set yields an empty, correctly labelled frame rather than an
# IndexError on outfit_results[0].
outfit_columns = list(cursor.column_names)
outfits_df_db = pd.DataFrame(outfit_results, columns=outfit_columns)
outfits_df_db

Unnamed: 0,id,owner,name,brand,description,isPublic,isDeleted,type,keywords,retailPrice,meta.validFrom,meta.validTo
0,outfit.00004b4d01ca4ab0a70cf073ba74fefa,user.66d3a17f5dd149f1845bbaf223c67cc3,Yugen Black Cardigan,FWSS,The FWSS Yugen Cardigan is a form-fitted cardi...,1,0,,outfit.00004b4d01ca4ab0a70cf073ba74fefa - Yuge...,1900.0000,2023-04-04 04:10:22.062,9999-01-01 00:00:00
1,outfit.0013691ff35b440e9dcfe1748ec184c7,user.8a140d0e20704284a656f2400b64b885,Oldina Parka Cotta,Kari Traa,The Oldina Parka from Kari Traa is a women's p...,1,0,,outfit.0013691ff35b440e9dcfe1748ec184c7 - Oldi...,3500.0000,2023-02-23 12:20:27.042,9999-01-01 00:00:00
2,outfit.00234201fa2d4ee49a572d650c775213,user.adaf918fc0364873a48255897a2b13d8,Phantom Polaris Down Coat,Fleischer Couture,"Polaris has a classic design, with a curved sh...",1,0,,outfit.00234201fa2d4ee49a572d650c775213 - Phan...,8000.0000,2022-05-31 16:43:55.205,9999-01-01 00:00:00
3,outfit.003356af26964c5084d0cc1e9f95978c,user.87234eeeecf54b99b5694a4ded75e420,Cala Long Sleeve Black,Skappel,The Cala Knitted Jumper from Skappel is a nice...,1,0,,outfit.003356af26964c5084d0cc1e9f95978c - Cala...,1800.0000,2022-10-26 06:22:05.720,9999-01-01 00:00:00
4,outfit.003e042f5e9e4043bf10a0284d88ce75,user.8786114165114ea0b6fbc447deae9114,2 Rose Dust Tiana Top,Maud,"The Tiana Top is cut in a classic fit, and fea...",1,0,,outfit.003e042f5e9e4043bf10a0284d88ce75 - 2 Ro...,1500.0000,2022-07-19 12:16:00.406,9999-01-01 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...
9021,outfit.ffe4d235a016429dae41bb6399cf16d1,user.ed1650e1d15d407b925b669fb31227bd,Undercover Ash Sunglasses,Kaibosh,The Undercover Sunglasses from Kaibosh are 100...,1,0,,outfit.ffe4d235a016429dae41bb6399cf16d1 - Unde...,1000.0000,2022-08-15 17:06:41.201,9999-01-01 00:00:00
9022,outfit.ffe617f43d244ebc81228915cfcc6c8e,user.0e2df601218248bd9d97b882664dd978,Platinum Blazer,Riccovero,Platinum Blazer is a classic and narrow suit j...,1,0,,outfit.ffe617f43d244ebc81228915cfcc6c8e - Plat...,2800.0000,2022-11-18 13:10:41.352,9999-01-01 00:00:00
9023,outfit.ffebad2c479045a78adecdcd8f07427d,user.e6aed916b476487b85adaf0a6295578d,Bumble Dress Black Burnout Maxi,Høst & Vår,The Bumble Maxi Dress from Høst & Vår is a ful...,1,0,,outfit.ffebad2c479045a78adecdcd8f07427d - Bumb...,1800.0000,2023-01-05 09:12:43.847,9999-01-01 00:00:00
9024,outfit.ffeef842238f4dbdabc6c730a75aa2bd,user.5ab3daf28ecc4c5c9c5d2b6e37b97712,Black Amber Pants,Kupong Knit.wear,"Feel slack and nice dressed with this pant, ma...",1,0,,outfit.ffeef842238f4dbdabc6c730a75aa2bd - Blac...,1200.0000,2022-02-28 11:02:29.494,9999-01-01 00:00:00


In [60]:
from tqdm import tqdm


# Build the catalog (one entry per picture that has both a local image file and
# a matching outfit) and drop every other picture from pictures_df.
catalog = []
local_image_ids = os.listdir(EMBEDDINGS_PICTURES_DIR)
# Set copy for constant-time membership tests inside the loop.
local_image_id_set = set(local_image_ids)
found_locally = []
for index, picture_row in tqdm(pictures_df.iterrows(), total=len(pictures_df)):

    filtered_picture_id = picture_row["id"]
    picture_outfit_id = picture_row["owner"]

    # ``picture_row`` is a Series indexed by column name, so ``picture_row.index[0]``
    # was the first column label (identical for every row), not the row's
    # index. The row index is what iterrows() yields alongside the row.
    picture_index = index
    outfit_row = outfits_df_db[outfits_df_db["id"] == picture_outfit_id]

    found = filtered_picture_id in local_image_id_set
    found_outfit = len(outfit_row) != 0
    found_locally.append(found and found_outfit)
    if not found:
        print(f"Picture id: {index}, {filtered_picture_id} not found in local images, skipping...")
        continue

    if not found_outfit:
        continue

    # Caption is "<outfit name> - <outfit description>" of the first matching outfit row.
    picture_caption = f"{outfit_row['name'].values[0]} - {outfit_row['description'].values[0]}"
    catalog.append({"id": picture_index, "image": filtered_picture_id, "caption": picture_caption})

# Keep only the pictures that made it into the catalog.
pictures_df["found_locally"] = found_locally
pictures_df = pictures_df[pictures_df["found_locally"]].drop(columns=["found_locally"])

import pickle
with open(f"{EMBEDDINGS_SAVE_DIR}/catalog.pkl", "wb") as f:
    pickle.dump(catalog, f)

699it [00:01, 732.04it/s]

Picture id: 930, picture.05ab2834ce8c40b89eb4fdbf12b3cba1 not found in local images, skipping...


9335it [00:12, 757.29it/s]

Picture id: 14162, picture.580a4eaee0614e1bab1b34a61557e4bd not found in local images, skipping...


9868it [00:13, 750.03it/s]

Picture id: 15018, picture.5d64c136ee0445eb9d62525b785f6a70 not found in local images, skipping...


14408it [00:19, 744.89it/s]

Picture id: 22729, picture.87ffd8300c6b4ec1931d7c699de9e676 not found in local images, skipping...


18371it [00:24, 758.02it/s]

Picture id: 32058, picture.ac3c991965ea4bec86a066a154b91613 not found in local images, skipping...


21856it [00:29, 751.92it/s]

Picture id: 39150, picture.cbebaa3a8af14d32b14626ad3fd38cb1 not found in local images, skipping...


22384it [00:30, 743.19it/s]

Picture id: 40004, picture.d158acf6479f45cd8fad556522df6b1f not found in local images, skipping...
Picture id: 40038, picture.d1907a29b851429aba22bf97f3677131 not found in local images, skipping...


23890it [00:32, 743.89it/s]

Picture id: 42361, picture.df94ca2ef38e4f8fa2a830d883a54fbb not found in local images, skipping...


27053it [00:36, 740.67it/s]


In [20]:
import pickle

# Reload the catalog written by the catalog-building cell. pickle.load runs
# arbitrary code from the file, so only use it on files produced by this notebook.
with open(f"{EMBEDDINGS_SAVE_DIR}/catalog.pkl", mode="rb") as catalog_file:
    catalog = pickle.load(catalog_file)

In [66]:
import os
import sys

# Location of the local fashion-clip checkout. Set the FASHION_CLIP_PATH
# environment variable to override it instead of editing the notebook.
FASHION_CLIP_PATH = os.environ.get("FASHION_CLIP_PATH", "/home/kaborg15/fashion-clip")
if FASHION_CLIP_PATH not in sys.path:
    # Guarded so that re-running the cell does not pile up duplicate entries.
    sys.path.append(FASHION_CLIP_PATH)

from fashion_clip.fashion_clip import FashionCLIP, FCLIPDataset
from fashion_clip.utils import get_cache_directory, display_images
print("Cache is at {}".format(get_cache_directory()))

Cache is at /home/kaborg15/.cache/fashion_clip


In [67]:
# Wrap the local picture folder and the catalog in a FashionCLIP dataset, then
# construct the FashionCLIP model on top of it.
dataset = FCLIPDataset(
    "farfetch_local",
    image_source_path=EMBEDDINGS_PICTURES_DIR,
    image_source_type="local",
    catalog=catalog,
)
fclip = FashionCLIP("fashion-clip", dataset)

9d8b31fd16d4edb8c7333a9e6048c5f3623ef9d7_13d3b48a2e31d7bf49dda47c427c8777e7af575e8bf1b9d4d0ebc15c5b0e23f7


 68%|██████▊   | 571/838 [26:23<12:39,  2.84s/it]

In [24]:
import numpy as np
MODEL_SAVE_DIR = f"{EMBEDDINGS_SAVE_DIR}/clip_model/"

# Resolve the vectors first: if ``fclip`` has not been built in this kernel the
# cell fails here, before anything is written, instead of leaving a
# pictures_df.pkl without matching image vectors.
image_vectors = fclip.image_vectors

# makedirs creates missing parents and is a no-op when the folder exists.
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

pictures_df.to_pickle(f"{MODEL_SAVE_DIR}/pictures_df.pkl")
np.save(f"{MODEL_SAVE_DIR}/image_vectors.npy", image_vectors)

NameError: name 'fclip' is not defined

In [74]:
import time
import Image_Embedding_File_Handler
import random

# Text file that log_inference_time appends to.
INFERENCE_TIME_LOG_FILE = "Pickled_Data/inference_time_log.txt"


def log_inference_time(config_class, embedding_shape, inference_time_array):
    """Append one timing record to INFERENCE_TIME_LOG_FILE.

    Args:
        config_class: Object whose ``model_save_name`` identifies the model.
        embedding_shape: Shape written next to the model name.
        inference_time_array: Array of timings; its mean and its full contents
            are written.
    """
    with open(INFERENCE_TIME_LOG_FILE, "a") as log_file:
        header = f"\nModel: {config_class.model_save_name}, shape: {embedding_shape}"
        timings = f"\nmean: {inference_time_array.mean()}, values: {inference_time_array}"
        log_file.write(header + timings)



def get_df_embeddings(base_df, config_class : Embedding_Config):
    """Embed every picture in ``base_df`` with one model configuration and save the result.

    The frame is processed in DF_SPLITS contiguous chunks. For each chunk the
    images are loaded, embedded, and the embeddings stored in a new
    ``embeddings`` column. The combined frame is saved through
    ``Image_Embedding_File_Handler.save_embeddings`` and the per-chunk
    inference times are appended to the inference time log.

    Args:
        base_df: DataFrame with an ``id`` column of image identifiers. It is
            not modified; the chunks are copies.
        config_class: ``Embedding_Config`` subclass (the class, not an instance).

    Returns:
        The combined DataFrame including the ``embeddings`` column.
    """
    inference_time_log = []
    embedding_shape = None  # stays None if no chunk contained any rows
    model_config = config_class()
    print(f"Generating embeddings from {type(model_config).__name__} to {model_config.model_save_name}...")

    # Same contiguous partition as np.array_split(base_df, DF_SPLITS), but taken
    # positionally and copied: this avoids the deprecated DataFrame code path of
    # array_split and avoids assigning a column to a slice of base_df.
    split_positions = np.array_split(np.arange(len(base_df)), DF_SPLITS)
    split_dfs = [base_df.iloc[positions].copy() for positions in split_positions]
    for df_split in tqdm(split_dfs):
        if df_split.empty:
            # Happens when base_df has fewer rows than DF_SPLITS; an empty
            # batch cannot be stacked by load_images.
            continue
        df_images = model_config.load_images(df_split["id"])

        start_time = time.time()
        df_embeddings = model_config.get_embeddings(df_images)
        embedding_shape = df_embeddings.shape
        df_embeddings = list(df_embeddings)
        inference_time_log.append(time.time() - start_time)

        df_split["embeddings"] = df_embeddings
    embedding_df = pd.concat(split_dfs)
    embeddings_save_folder = os.path.join(EMBEDDINGS_SAVE_DIR, model_config.model_save_name)
    try:
        Image_Embedding_File_Handler.save_embeddings(embedding_df, save_dir=embeddings_save_folder)
    except Exception as save_error:
        # Best-effort retry as before (drop incomplete rows, save under a
        # randomly suffixed folder), but the original failure is now reported
        # and KeyboardInterrupt is no longer swallowed.
        print(f"Saving embeddings to {embeddings_save_folder} failed ({save_error!r}); retrying without NaN rows...")
        embedding_df = embedding_df.dropna()
        embeddings_save_folder += str(random.randint(0, 1000))
        Image_Embedding_File_Handler.save_embeddings(embedding_df, save_dir=embeddings_save_folder)

    log_inference_time(model_config, embedding_shape, np.array(inference_time_log))
    return embedding_df

# Configurations to run; each one is instantiated, used to embed pictures_df
# and saved by get_df_embeddings.
class_configs = [
    EfficientNet_V2_L_final,
    ConvNext_Large_v1_final,
]
for config_class in class_configs:
    get_df_embeddings(pictures_df, config_class)



PKL name: ViT_H_14_E2E_final.pkl | Model: VisionTransformer | Weights: ViT_H_14_Weights | IMAGENET1K_SWAG_E2E_V1
Generating embeddings from ViT_H_14_E2E_final to ViT_H_14_E2E_final.pkl...


100%|██████████| 200/200 [2:27:46<00:00, 44.33s/it]  


Embeddings saved to ViT_H_tensors.pt and ViT_H_ids.pkl
