In [1]:
import torch
import clip
import numpy as np
from PIL import Image
from io import BytesIO
import requests
import json
import os
import requests
import time

In [2]:
# Run on the GPU when PyTorch reports one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load the ViT-B/16 CLIP checkpoint onto the chosen device; `preprocess` is the
# image transform that clip.load returns alongside the model.
model, preprocess = clip.load("ViT-B/16", device=device)

100%|███████████████████████████████████████| 335M/335M [00:42<00:00, 8.29MiB/s]


In [45]:
# Root directory that receives one sub-directory of embeddings per product title.
DATASET_DIRECTORY = 'All_Beauty'

# JSON-lines metadata file consumed by the embedding loop.
# (Point this at 'test.jsonl' instead for a quick trial run.)
sample_data_path = "meta_All_Beauty.jsonl"

In [46]:
def image_url_to_img(image_url, retries=5, delay=1):
    """Download an image and return it as a fully decoded PIL image.

    Args:
        image_url: URL to fetch with ``requests.get`` (10 second timeout).
        retries: Maximum number of download attempts.
        delay: Seconds to sleep between failed download attempts.

    Returns:
        The decoded ``PIL.Image.Image``, or ``None`` when every download
        attempt failed or the downloaded bytes are not a readable image.
    """
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(image_url, timeout=10)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
            # Image.open only parses the header; force the full decode here so
            # truncated/corrupt data is reported by this function instead of
            # blowing up later inside the preprocessing pipeline.
            img.load()
            return img
        except requests.exceptions.RequestException as e:
            # Must come before the OSError handler: requests' exceptions are
            # themselves OSError subclasses.
            print(f"Error fetching image from {image_url}: {e}")
        except OSError as e:
            # The payload was downloaded but cannot be decoded; fetching the
            # same URL again would not help, so give up without retrying.
            print(f"Could not decode image from {image_url}: {e}")
            return None
        if attempt < retries:
            time.sleep(delay)  # Wait before retrying
    print(f"Failed to fetch image from {image_url} after {retries} retries.")
    return None

In [47]:
def generate_text_embeddings(text):
    """Encode a single string with the CLIP text encoder.

    Args:
        text: The string to embed.

    Returns:
        A float32 numpy array holding the L2-normalised embedding of ``text``
        (the first and only row of the encoder output).
    """
    # truncate=True: clip.tokenize raises for inputs longer than the model's
    # context length, and a word-count cap on the caller's side does not bound
    # the token count, so let the tokenizer cut over-long input instead.
    text_tokens = clip.tokenize([text], truncate=True).to(device)
    with torch.no_grad():
        # Cast to float32 (as generate_image_embeddings does) so the
        # normalisation is not done in half precision when the model is fp16.
        text_embeddings = model.encode_text(text_tokens).float()
        text_embeddings /= text_embeddings.norm(dim=-1, keepdim=True)
    return text_embeddings[0].cpu().numpy()

def generate_image_embeddings(img):
    """Encode one image with the CLIP image encoder.

    Args:
        img: Image object accepted by the module-level ``preprocess`` callable.

    Returns:
        A numpy array holding the L2-normalised float32 embedding of ``img``
        (the first row of the single-item batch).
    """
    # Add a leading batch axis so the encoder receives a batch of one.
    pixel_batch = torch.unsqueeze(preprocess(img), 0).to(device)
    with torch.no_grad():
        raw_features = model.encode_image(pixel_batch).float()
        scale = raw_features.norm(dim=-1, keepdim=True)
        unit_features = raw_features / scale
    first_row = unit_features[0]
    return first_row.cpu().numpy()


In [48]:
# Smoke test: wall-clock time for one image download, one image embedding and
# one text embedding, measured end to end.
sample_image_url = 'https://m.media-amazon.com/images/I/612JNfob9nL._AC_UY218_.jpg'
sample_text = 'Hello I am Awais and I am trying to test the time it takes to convert text into embeddings using my cpu..!!'

start = time.time()
img = image_url_to_img(sample_image_url)
embed = generate_image_embeddings(img)
t_embeddings = generate_text_embeddings(sample_text)
end = time.time()

time_taken = end - start
print(time_taken)

0.28485941886901855


In [49]:
def extract_img_urls(image_array):
    """Pick one URL per image record, preferring 'hi_res' over 'large'.

    Args:
        image_array: Iterable of dict-like image records.

    Returns:
        List with the non-empty 'hi_res' URL of each record, or its non-empty
        'large' URL when 'hi_res' is missing or empty. Records that offer
        neither are reported on stdout and skipped.
    """
    collected = []
    for entry in image_array:
        best_url = entry.get('hi_res') or entry.get('large')
        if not best_url:
            print(f"Key 'hi_res' and 'large' not found in item: {entry}")
            continue
        collected.append(best_url)
    return collected

In [50]:
def contains_invalid_chars(title):
    """Return True when ``title`` contains any of the characters / \\ : * ? < > | "."""
    forbidden = frozenset('/\\:*?<>|"')
    # The title is clean exactly when it shares no character with the set.
    return not forbidden.isdisjoint(title)

In [89]:
def save_embeddings(title, title_embedding, image_embeddings, output_dir):
    """Persist one product's embeddings under ``output_dir/title``.

    Files written inside the title directory:
      * ``title_embedding.npy``     - the title embedding
      * ``image_embedding_<i>.npy`` - one file per image embedding
      * ``image_url_mapping.json``  - maps each index ``i`` to its source URL

    Args:
        title: Product title, used verbatim as the directory name.
        title_embedding: Array saved with ``np.save``.
        image_embeddings: Iterable of ``(embedding, url)`` pairs.
        output_dir: Parent directory for the title directory.

    Returns:
        None. When the directory cannot be created because its name is too
        long for the filesystem, the product is skipped and nothing is written.

    Raises:
        OSError: Any other failure to create the directory.
    """
    # Local import so this cell does not depend on an edit to the import cell.
    import errno

    title_dir = os.path.join(output_dir, title)
    try:
        os.makedirs(title_dir, exist_ok=True)
    except OSError as exc:
        # Compare against the symbolic constant: the numeric value of
        # ENAMETOOLONG is platform specific (36 on Linux only).
        if exc.errno == errno.ENAMETOOLONG:
            print(f"Skipping '{title}': directory name is too long for the filesystem.")
            return
        raise

    # Save title embedding
    np.save(os.path.join(title_dir, "title_embedding.npy"), title_embedding)

    # Save image embeddings and create the JSON mapping
    image_mapping = {}
    for i, (image_embedding, url) in enumerate(image_embeddings):
        np.save(os.path.join(title_dir, f"image_embedding_{i}.npy"), image_embedding)
        image_mapping[i] = url

    # Save the JSON mapping (integer keys are written as strings by json.dump)
    json_mapping_path = os.path.join(title_dir, "image_url_mapping.json")
    with open(json_mapping_path, 'w', encoding='utf-8') as json_file:
        json.dump(image_mapping, json_file)

In [90]:
# Build and persist embeddings for every product in the metadata file.

# Make sure the output directory exists so a fresh run does not fail on listdir.
os.makedirs(DATASET_DIRECTORY, exist_ok=True)
# Snapshot the already-written titles once; listing the directory for every
# record makes the loop quadratic in the number of products.
processed_titles = set(os.listdir(DATASET_DIRECTORY))

with open(sample_data_path, encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines in the JSON-lines file
        data = json.loads(line)

        # Collapse whitespace runs and keep at most 55 words. The cap is
        # presumably there to keep titles short enough for the text encoder
        # and for use as a directory name (TODO confirm).
        title_words = (data.get('title') or '').split()[:55]
        title = ' '.join(title_words)

        # An empty title would make save_embeddings write into the dataset
        # root itself; titles with path-unsafe characters cannot be directories.
        if not title or contains_invalid_chars(title):
            continue
        # Compare the normalised title: that is the name the directory is
        # saved under, so the raw title would miss already-processed products.
        if title in processed_titles:
            continue

        # Generate title embedding
        title_embedding = generate_text_embeddings(title)

        image_urls = extract_img_urls(data.get('images') or [])
        image_embeddings = []
        for url in image_urls:
            img = image_url_to_img(url)
            if img is not None:
                image_embedding = generate_image_embeddings(img)
                image_embeddings.append((image_embedding, url))

        # Save embeddings and image URL mapping
        save_embeddings(title, title_embedding, image_embeddings, DATASET_DIRECTORY)
        processed_titles.add(title)

NameError: name 'handle_filename_too_long' is not defined

In [25]:
cd home

/home


In [26]:
cd Semantic-Search-using-Vector-Database

/home/Semantic-Search-using-Vector-Database


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [27]:
ls

README.md                        meta_All_Beauty.jsonl
[0m[01;34membeddings[0m/                      milvus-standalone-docker-compose.yml
gpu_embeddings_generation.ipynb  milvus_standalone.py
main.ipynb


In [28]:
mkdir embeddings2

In [85]:
# Scratch check: cap a very long product title at 55 words and confirm that
# clip.tokenize accepts the shortened string.
text = 'Input Heidi & Oak Natural Labs - Vitamin C Anti-Aging Serum 20% - 1 Fl.Oz - 60 Day Supply - With Hyaluronic Acid + Vitamin E + Amino Acid Complex - Provides Nourishment, Restoration and Protection for Your Skin - Reduces Unsightly Wrinkles, Fades Age Spots, Calms Inflammation and Helps Combat Acne By Neutralizing Free Radicals for Beautiful Glowing Skin - Convienient Pump Action Bottle - Your Skin Will Love Our Vitamin C Serum - Made in USA'

# Split on whitespace, keep the first 55 words, re-join with single spaces.
title_words = text.split()[:55]
title = ' '.join(title_words)

text_features = clip.tokenize([title]).to(device)

print(title)

Input Heidi & Oak Natural Labs - Vitamin C Anti-Aging Serum 20% - 1 Fl.Oz - 60 Day Supply - With Hyaluronic Acid + Vitamin E + Amino Acid Complex - Provides Nourishment, Restoration and Protection for Your Skin - Reduces Unsightly Wrinkles, Fades Age Spots, Calms Inflammation and Helps Combat Acne By Neutralizing Free


In [80]:
# Scratch check: has a directory for this product title already been written?
probe_title = 'Eye Patch Black Adult with Tie Band (6 Per Pack)'
probe_title in os.listdir('All_Beauty')

True

In [None]:
# Scratch check: show every entry currently in the dataset directory.
dataset_entries = os.listdir('All_Beauty')
dataset_entries

In [88]:
# Scratch check: character length of the output path built from a 55-word title.
text = 'All_Beauty/Heidi & Oak Natural Labs - Vitamin C Anti-Aging Serum 20% - 1 Fl.Oz - 60 Day Supply - With Hyaluronic Acid + Vitamin E + Amino Acid Complex - Provides Nourishment, Restoration and Protection for Your Skin - Reduces Unsightly Wrinkles, Fades Age Spots, Calms Inflammation and Helps Combat Acne By Neutralizing Free Radicals'
path_length = len(text)
print(path_length)

333
