In [None]:
import clip
import numpy as np
import pandas as pd
import requests
import time
import torch
import warnings

from io import BytesIO
from IPython.display import Image, display
from PIL import Image as PILImage
from tqdm import tqdm

warnings.filterwarnings("ignore")

In [None]:
# Run CLIP on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# clip.load returns the model together with its matching image transform.
model, preprocess = clip.load("ViT-B/32", device=device)

100%|████████████████████████████████████████| 338M/338M [00:02<00:00, 124MiB/s]


In [None]:
# Load the sampled product catalogue into a DataFrame.
# NOTE(review): absolute path, presumably a Colab runtime location — confirm
# and consider moving it into a configurable constant.
df = pd.read_csv("/content/sample_data/random_samples.csv")
# Preview the first rows to check the load.
df.head()

Unnamed: 0.1,Unnamed: 0,availableSizes,brand.id,brand.name,gender,hasSimilarProducts,id,images.cutOut,images.model,isCustomizable,...,priceInfo.currencyCode,priceInfo.discountLabel,priceInfo.finalPrice,priceInfo.formattedFinalPrice,priceInfo.formattedInitialPrice,priceInfo.initialPrice,priceInfo.installmentsLabel,priceInfo.isOnSale,shortDescription,stockTotal
0,0,"[{'scaleId': 0, 'size': 'XS'}]",6,Paul Smith,women,True,13884060,https://cdn-images.farfetch-contents.com/13/88...,https://cdn-images.farfetch-contents.com/13/88...,False,...,SGD,50% Off,478,$478,$955,955,,True,Paul's Photo print midi dress,1
1,133,"[{'scaleId': 0, 'size': 'M'}, {'scaleId': 0, '...",38206,Mara Hoffman,women,True,13630929,https://cdn-images.farfetch-contents.com/13/63...,https://cdn-images.farfetch-contents.com/13/63...,False,...,SGD,,629,$629,$629,629,,False,striped maxi dress,3
2,26,"[{'scaleId': 0, 'size': 'M'}, {'scaleId': 0, '...",5502,Stella McCartney,women,True,13592474,https://cdn-images.farfetch-contents.com/13/59...,https://cdn-images.farfetch-contents.com/13/59...,False,...,SGD,50% Off,948,$948,"$1,895",1895,,True,monogram zipped track jacket,3
3,144,,120739,Jimmy Choo,women,True,14214212,https://cdn-images.farfetch-contents.com/14/21...,https://cdn-images.farfetch-contents.com/14/21...,False,...,SGD,,895,$895,$895,895,,False,Love 85 pumps,140
4,47,"[{'scaleId': 0, 'size': 'One Size'}]",825159,Andrea Fohrman,women,False,13472532,https://cdn-images.farfetch-contents.com/13/47...,https://cdn-images.farfetch-contents.com/13/47...,False,...,SGD,,9828,"$9,828","$9,828",9828,,False,"18kt yellow gold, rock crystal and grey sapphi...",1


In [None]:
# List the available column names before choosing which ones to drop.
df.columns

Index(['Unnamed: 0', 'availableSizes', 'brand.id', 'brand.name', 'gender',
       'hasSimilarProducts', 'id', 'images.cutOut', 'images.model',
       'isCustomizable', 'merchandiseLabel', 'merchandiseLabelField',
       'merchantId', 'priceInfo.currencyCode', 'priceInfo.discountLabel',
       'priceInfo.finalPrice', 'priceInfo.formattedFinalPrice',
       'priceInfo.formattedInitialPrice', 'priceInfo.initialPrice',
       'priceInfo.installmentsLabel', 'priceInfo.isOnSale', 'shortDescription',
       'stockTotal'],
      dtype='object')

In [None]:
# Drop every column that is not needed downstream (only the brand, gender,
# id, cut-out image URL and short description remain).
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell idempotent: re-running it after the columns are already
# gone no longer raises KeyError.
df = df.drop(columns = ['Unnamed: 0', 'availableSizes',
       'hasSimilarProducts', 'images.model',
       'isCustomizable', 'merchandiseLabel', 'merchandiseLabelField',
       'merchantId', 'priceInfo.currencyCode', 'priceInfo.discountLabel',
       'priceInfo.finalPrice', 'priceInfo.formattedFinalPrice',
       'priceInfo.formattedInitialPrice', 'priceInfo.initialPrice',
       'priceInfo.installmentsLabel', 'priceInfo.isOnSale',
       'stockTotal'], errors = "ignore")

In [None]:
# Preview the frame after the column drop.
df.head()

Unnamed: 0,brand.id,brand.name,gender,id,images.cutOut,shortDescription
0,6,Paul Smith,women,13884060,https://cdn-images.farfetch-contents.com/13/88...,Paul's Photo print midi dress
1,38206,Mara Hoffman,women,13630929,https://cdn-images.farfetch-contents.com/13/63...,striped maxi dress
2,5502,Stella McCartney,women,13592474,https://cdn-images.farfetch-contents.com/13/59...,monogram zipped track jacket
3,120739,Jimmy Choo,women,14214212,https://cdn-images.farfetch-contents.com/14/21...,Love 85 pumps
4,825159,Andrea Fohrman,women,13472532,https://cdn-images.farfetch-contents.com/13/47...,"18kt yellow gold, rock crystal and grey sapphi..."


In [None]:
# Give the two columns used for embedding short names: the cut-out image URL
# becomes "image" and the short description becomes "title".
df = df.rename(columns={'images.cutOut': 'image', "shortDescription": "title"})
df.head()

Unnamed: 0,brand.id,brand.name,gender,id,image,title
0,6,Paul Smith,women,13884060,https://cdn-images.farfetch-contents.com/13/88...,Paul's Photo print midi dress
1,38206,Mara Hoffman,women,13630929,https://cdn-images.farfetch-contents.com/13/63...,striped maxi dress
2,5502,Stella McCartney,women,13592474,https://cdn-images.farfetch-contents.com/13/59...,monogram zipped track jacket
3,120739,Jimmy Choo,women,14214212,https://cdn-images.farfetch-contents.com/14/21...,Love 85 pumps
4,825159,Andrea Fohrman,women,13472532,https://cdn-images.farfetch-contents.com/13/47...,"18kt yellow gold, rock crystal and grey sapphi..."


In [None]:
def preprocess_image(image_url, timeout = 30, retries = 3, backoff = 2):
    """Download an image and turn it into a CLIP-ready batch tensor.

    Args:
        image_url: URL of the image to fetch.
        timeout: Timeout in seconds passed to ``requests.get``.
        retries: Maximum number of download attempts.
        backoff: Seconds to wait before the second attempt; the wait is
            doubled after each further failure.

    Returns:
        The output of ``preprocess`` with a leading batch dimension added and
        moved to ``device``, or ``np.nan`` when the URL is missing or every
        attempt failed.
    """
    # A missing URL (None / NaN coming from pandas) can never succeed, so do
    # not burn the whole retry/backoff budget on it.
    if not isinstance(image_url, str) or not image_url.strip():
        print(f"Failed to download image from {image_url}: missing or invalid URL.")
        return np.nan

    for attempt in range(1, retries + 1):
        try:
            response = requests.get(image_url, timeout = timeout)
            response.raise_for_status()

            image_tensor = preprocess(
                PILImage.open(
                    BytesIO(response.content)
                )
            ).unsqueeze(0).to(device)

            return image_tensor
        except Exception as e:
            # Deliberately broad: network errors and undecodable images are
            # both treated as a failed attempt (best-effort download).
            if attempt >= retries:
                # Last attempt: nothing left to retry, so neither sleep nor
                # announce a retry that will not happen.
                print(f"Attempt {attempt} failed: {e}.")
                break
            print(f"Attempt {attempt} failed: {e}. Retrying in {backoff} seconds...")
            time.sleep(backoff)
            backoff *= 2
    print(f"Failed to download image from {image_url} after {retries} attempts.")
    return np.nan

In [None]:
def preprocess_text(text, max_words = 70):
    """Tokenize a product title for CLIP's text encoder.

    Args:
        text: Title to encode. ``None``/NaN is treated as an empty string and
            any other non-string value is converted with ``str``.
        max_words: Keep at most this many whitespace-separated words.

    Returns:
        The result of ``clip.tokenize`` moved to ``device``.
    """
    if text is None or (isinstance(text, float) and text != text):
        # Missing titles arrive from pandas as None/NaN; embed them as "".
        text = ""
    elif not isinstance(text, str):
        text = str(text)

    words = text.split()
    if len(words) > max_words:
        text = " ".join(words[:max_words])

    # The word cap alone does not bound the number of BPE tokens (one word can
    # expand into several), so let the tokenizer truncate instead of raising
    # when the context length is exceeded.
    text_tensor = clip.tokenize(text, truncate = True).to(device)
    return text_tensor

In [None]:
def generate_embeddings(row):
    """Compute CLIP text, image and combined embeddings for one product row.

    Args:
        row: Mapping / Series providing ``"image"`` (image URL) and
            ``"title"`` (text to embed).

    Returns:
        A ``(text_vector, image_vector, combined_vector)`` tuple of flattened
        NumPy arrays. The text and image vectors are L2-normalised and the
        combined vector is their element-wise mean. When the image could not
        be loaded the tuple is ``(np.nan, np.nan, np.nan)``.
    """
    # preprocess_image / preprocess_text already move their result to
    # `device`; forcing 'cuda' here crashed on CPU-only machines.
    image_tensor = preprocess_image(row["image"])

    # preprocess_image signals failure with a bare np.nan (a float), which has
    # no tensor methods, so test the type before touching it as a tensor.
    if not isinstance(image_tensor, torch.Tensor) or torch.isnan(image_tensor).any():
        return np.nan, np.nan, np.nan
    text_tensor = preprocess_text(row["title"])

    with torch.no_grad():
        image_features = model.encode_image(image_tensor)
        text_features = model.encode_text(text_tensor)

        # Scale each embedding to unit length.
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    text_vector = text_features.cpu().numpy().flatten()
    image_vector = image_features.cpu().numpy().flatten()
    # Plain mean of the two unit vectors (the result is not re-normalised).
    combined_vector = (text_vector + image_vector) / 2

    return text_vector, image_vector, combined_vector


In [None]:
# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# Embed every row; each image is downloaded inside generate_embeddings, so
# this cell is dominated by network time. Wrapping the returned 3-tuple in a
# pd.Series makes apply produce three columns, which are assigned to the
# three new vector columns in order.
df[["text_vector", "image_vector", "combined_vector"]] = df.progress_apply(
    lambda row: pd.Series(generate_embeddings(row)),
    axis = 1
)

 14%|█▍        | 3/21 [00:20<02:10,  7.23s/it]

Attempt 1 failed: HTTPSConnectionPool(host='cdn-images.farfetch-contents.com', port=443): Read timed out. (read timeout=30). Retrying in 2 seconds...


 19%|█▉        | 4/21 [01:02<05:39, 19.97s/it]

Attempt 1 failed: HTTPSConnectionPool(host='cdn-images.farfetch-contents.com', port=443): Read timed out. (read timeout=30). Retrying in 2 seconds...


 24%|██▍       | 5/21 [01:44<07:24, 27.76s/it]

Attempt 1 failed: HTTPSConnectionPool(host='cdn-images.farfetch-contents.com', port=443): Read timed out. (read timeout=30). Retrying in 2 seconds...


100%|██████████| 21/21 [05:02<00:00, 14.41s/it]


In [None]:
# Show a bounded preview rather than dumping the whole frame: the vector
# columns hold long arrays and make a full dump noisy.
df.head(10)

Unnamed: 0,brand.id,brand.name,gender,id,image,title,text_vector,image_vector,combined_vector
0,6,Paul Smith,women,13884060,https://cdn-images.farfetch-contents.com/13/88...,Paul's Photo print midi dress,"[0.01471, -0.04083, -0.00892, 0.03815, -0.0433...","[0.01084, 0.01301, -0.002697, -0.02037, 0.0164...","[0.01277, -0.013916, -0.005806, 0.00889, -0.01..."
1,38206,Mara Hoffman,women,13630929,https://cdn-images.farfetch-contents.com/13/63...,striped maxi dress,"[-0.01917, 0.00584, -0.0394, 0.007027, -0.0200...","[0.02278, 0.02046, -0.0002176, 0.03137, -0.004...","[0.001808, 0.01315, -0.0198, 0.0192, -0.01208,..."
2,5502,Stella McCartney,women,13592474,https://cdn-images.farfetch-contents.com/13/59...,monogram zipped track jacket,"[-0.029, 0.0536, 0.0422, 0.01233, -0.004227, 0...","[-0.004604, 0.01595, 0.018, -0.00953, 0.0133, ...","[-0.0168, 0.03476, 0.0301, 0.0014, 0.004536, 0..."
3,120739,Jimmy Choo,women,14214212,https://cdn-images.farfetch-contents.com/14/21...,Love 85 pumps,"[-0.01857, -0.0182, 0.0088, -0.00549, -0.04932...","[-0.02551, -0.03333, 0.03232, 0.01557, -0.0270...","[-0.02203, -0.02576, 0.02055, 0.005043, -0.038..."
4,825159,Andrea Fohrman,women,13472532,https://cdn-images.farfetch-contents.com/13/47...,"18kt yellow gold, rock crystal and grey sapphi...","[0.02115, -0.0365, -0.02979, 0.013374, 0.01816...","[-0.01415, -0.05432, 0.02159, -0.02385, -0.019...","[0.003498, -0.0454, -0.004097, -0.005238, -0.0..."
5,842573,Talbot Runhof,women,13254961,https://cdn-images.farfetch-contents.com/13/25...,floral embellished long dress,"[0.04004, -0.010025, -0.01449, 0.10443, -0.033...","[0.00949, -0.010155, 0.01008, 0.04245, -0.0341...","[0.02477, -0.010086, -0.002205, 0.0734, -0.033..."
6,77463,Dusan,women,13819532,https://cdn-images.farfetch-contents.com/13/81...,front button coat,"[-0.0355, 0.03207, -0.02606, 0.004368, 0.00369...","[-0.006107, 0.0134, 0.00292, 0.0311, 0.0185, 0...","[-0.0208, 0.02274, -0.01157, 0.01773, 0.01109,..."
7,2638322,Mira Mikati,women,13919087,https://cdn-images.farfetch-contents.com/13/91...,Just Chilling T-shirt,"[0.02293, 0.02837, -0.03293, 0.02602, -0.0392,...","[0.02617, 0.00284, 0.01397, 0.015564, 0.000779...","[0.02455, 0.0156, -0.009476, 0.02078, -0.01921..."
8,3440,Dolce & Gabbana,women,12543581,https://cdn-images.farfetch-contents.com/12/54...,printed iPhone 6/7 Plus case,"[-0.00844, 0.006577, -0.04694, 0.05237, 0.0218...","[0.02592, -0.01515, -0.0296, -0.02725, 0.00012...","[0.00874, -0.004288, -0.03827, 0.01256, 0.0109..."
9,25354,Gucci,women,12952115,https://cdn-images.farfetch-contents.com/12/95...,sequin stripes track trousers,"[-0.00987, 0.01214, -0.06793, -2.915e-05, -0.0...","[0.02394, 0.001843, -0.0201, 0.02277, -0.04642...","[0.007034, 0.006992, -0.044, 0.01137, -0.04587..."


In [None]:
import json

# Convert each embedding to a plain list of Python floats so it serialises
# cleanly to JSON. Rows whose embedding is missing (the NaN placeholder
# produced when an image could not be loaded) are stored as None instead of
# raising "'float' object is not iterable".
for vector_column in ('text_vector', 'image_vector', 'combined_vector'):
    df[vector_column] = df[vector_column].apply(
        lambda vec: [float(i) for i in vec] if hasattr(vec, '__iter__') else None
    )

df.to_json('output.json', orient='records')

In [None]:
import sqlite3
import json

# Store every exported record as one JSON text blob per row in SQLite.
conn = sqlite3.connect('mydatabase.db')
try:
    # `with conn` commits on success and rolls back if anything inside raises,
    # so a failure half-way through never leaves a partial load behind.
    with conn:
        cursor = conn.cursor()

        cursor.execute('''
            CREATE TABLE IF NOT EXISTS image_data (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                data TEXT
            )
        ''')

        with open('output.json', 'r', encoding = 'utf-8') as file:
            data = json.load(file)

        # One batched, parameterised insert instead of a Python-level loop.
        # NOTE(review): re-running this cell appends the same records again;
        # clear the table first if that is not intended.
        cursor.executemany(
            "INSERT INTO image_data (data) VALUES (?)",
            ((json.dumps(record),) for record in data)
        )
finally:
    # Always release the database handle, even when loading fails.
    conn.close()