In [1]:
pip install pandas


Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd

# --- Step 1: Load and Prepare the Data ---

try:
    # Load the dataset from the CSV file.
    # Ensure 'intern_data_ikarus.csv' is in the same folder as your notebook.
    df = pd.read_csv('intern_data_ikarus.csv')
    print("Dataset 'intern_data_ikarus.csv' loaded successfully!")

    # --- Data Cleaning and Feature Engineering ---

    # Define the columns that we will use to determine product similarity.
    feature_columns = ['title', 'brand', 'description', 'categories', 'material', 'color']

    # Replace any missing values (NaN) in these columns with an empty string
    # to prevent errors during text processing.
    for col in feature_columns:
        df[col] = df[col].fillna('')

    # Define a function to combine all our feature columns into a single string.
    def create_feature_soup(row):
        return (str(row['title']) + ' ' +
                str(row['brand']) + ' ' +
                str(row['description']) + ' ' +
                str(row['categories']) + ' ' +
                str(row['material']) + ' ' +
                str(row['color']))

    # Apply the function to each row of the dataframe to create the 'soup' column.
    df['soup'] = df.apply(create_feature_soup, axis=1)

    # --- Verification ---

    # Display the total number of products and the first 5 rows
    # to show the new 'soup' column.
    print(f"\nTotal products in the dataset: {len(df)}")
    print("\n--- First 5 rows with the 'soup' feature ---")
    print(df[['title', 'soup']].head())

    # Save the processed data to a new CSV file for the next steps.
    df.to_csv('processed_data.csv', index=False)
    print("\nProcessed data has been saved to 'processed_data.csv'")


except FileNotFoundError:
    print("\n--- ERROR ---")
    print("The file 'intern_data_ikarus.csv' was not found.")
    print("Please make sure the CSV file is in the same directory as your Jupyter Notebook.")

Dataset 'intern_data_ikarus.csv' loaded successfully!

Total products in the dataset: 312

--- First 5 rows with the 'soup' feature ---
                                               title  \
0  GOYMFK 1pc Free Standing Shoe Rack, Multi-laye...   
1  subrtex Leather ding Room, Dining Chairs Set o...   
2  Plant Repotting Mat MUYETOL Waterproof Transpl...   
3  Pickleball Doormat, Welcome Doormat Absorbent ...   
4  JOIN IRON Foldable TV Trays for Eating Set of ...   

                                                soup  
0  GOYMFK 1pc Free Standing Shoe Rack, Multi-laye...  
1  subrtex Leather ding Room, Dining Chairs Set o...  
2  Plant Repotting Mat MUYETOL Waterproof Transpl...  
3  Pickleball Doormat, Welcome Doormat Absorbent ...  
4  JOIN IRON Foldable TV Trays for Eating Set of ...  

Processed data has been saved to 'processed_data.csv'


In [4]:
pip install scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# --- Step 2: Vectorize Text and Compute Similarity ---

try:
    # Load the processed data we created in the last step.
    df = pd.read_csv('processed_data.csv')
    print("Loaded 'processed_data.csv' successfully.")

    # --- TF-IDF Vectorization ---

    # Initialize a TF-IDF Vectorizer.
    # stop_words='english' removes common English words (like 'the', 'a', 'in')
    # which don't help in distinguishing between products.
    tfidf = TfidfVectorizer(stop_words='english')

    # Create the TF-IDF matrix by fitting and transforming the 'soup' column.
    # This converts each product's text 'soup' into a numerical vector.
    tfidf_matrix = tfidf.fit_transform(df['soup'])
    print("TF-IDF matrix created successfully.")
    print(f"Shape of TF-IDF matrix: {tfidf_matrix.shape}") # (num_products, num_unique_words)

    # --- Cosine Similarity Calculation ---

    # Calculate the cosine similarity matrix from the TF-IDF matrix.
    # This gives us a score of how similar each product is to every other product.
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    print("Cosine similarity matrix calculated successfully.")
    print(f"Shape of similarity matrix: {cosine_sim.shape}") # (num_products, num_products)


    # --- Save the Model and Processed Data ---

    # Save the cosine similarity matrix to a file. This is our "model".
    with open('similarity_matrix.pkl', 'wb') as f:
        pickle.dump(cosine_sim, f)
    print("Similarity matrix saved as 'similarity_matrix.pkl'")

    # We also need to save the main dataframe to easily access product titles later.
    df.to_pickle('products_df.pkl')
    print("Processed DataFrame saved as 'products_df.pkl'")


except FileNotFoundError:
    print("\n--- ERROR ---")
    print("The file 'processed_data.csv' was not found.")
    print("Please ensure you have run the previous step successfully.")

Loaded 'processed_data.csv' successfully.
TF-IDF matrix created successfully.
Shape of TF-IDF matrix: (312, 3572)
Cosine similarity matrix calculated successfully.
Shape of similarity matrix: (312, 312)
Similarity matrix saved as 'similarity_matrix.pkl'
Processed DataFrame saved as 'products_df.pkl'


In [6]:
import pandas as pd
import pickle

# --- Step 3 (Corrected): Build the Recommendation Function ---

# Load the saved model and data
try:
    products_df = pd.read_pickle('products_df.pkl')
    cosine_sim = pickle.load(open('similarity_matrix.pkl', 'rb'))
    print("Successfully loaded 'products_df.pkl' and 'similarity_matrix.pkl'")

    # Create a pandas Series of product titles with their corresponding index.
    indices = pd.Series(products_df.index, index=products_df['title']).drop_duplicates()

    # --- Define the Recommendation Function ---
    def get_recommendations(title, cosine_sim=cosine_sim, data=products_df):
        """
        This function takes a product title as input and returns
        the top 5 most similar products.
        """
        # Get the index of the product that matches the title
        if title not in indices:
            return f"Product with title '{title}' not found."
        
        idx = indices[title]

        # --- THIS IS THE FIX ---
        # If the title is duplicated, idx will be a Series. We take the index of the first occurrence.
        if isinstance(idx, pd.Series):
            idx = idx.iloc[0]

        # Get the pairwise similarity scores of all products with that product
        sim_scores = list(enumerate(cosine_sim[idx]))

        # Sort the products based on the similarity scores in descending order
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Get the scores of the 5 most similar products (excluding the product itself)
        sim_scores = sim_scores[1:6]

        # Get the product indices
        product_indices = [i[0] for i in sim_scores]

        # Return the titles of the top 5 most similar products
        return data['title'].iloc[product_indices]

    # --- Test the Recommendation Function ---

    # Let's get recommendations for the first product in our dataset
    example_product_title = products_df['title'].iloc[0]

    print("\n--- Testing the recommendation function ---")
    print(f"\nRecommendations for: '{example_product_title}'")
    recommendations = get_recommendations(example_product_title)
    print(recommendations)
    
    # You can also test with another product
    example_product_title_2 = products_df['title'].iloc[5]
    print(f"\nRecommendations for: '{example_product_title_2}'")
    recommendations_2 = get_recommendations(example_product_title_2)
    print(recommendations_2)


except FileNotFoundError:
    print("\n--- ERROR ---")
    print("Could not find 'products_df.pkl' or 'similarity_matrix.pkl'.")
    print("Please ensure you have run the previous step successfully.")

Successfully loaded 'products_df.pkl' and 'similarity_matrix.pkl'

--- Testing the recommendation function ---

Recommendations for: 'GOYMFK 1pc Free Standing Shoe Rack, Multi-layer Metal Shoe Cap Rack With 8 Double Hooks For Living Room, Bathroom, Hallway'
7      GOYMFK 1pc Free Standing Shoe Rack, Multi-laye...
82     LANTEFUL Shoe Rack Organizer Shoe Storage Cabi...
179    Dscabomlg Foldable Shoe Storage Plastic Vertic...
149    sogesfurniture 5 Tier Free Standing Wooden Sho...
54     Honey-Can-Do 3-Tier Nesting Bamboo Shoe Rack S...
Name: title, dtype: object

Recommendations for: 'LOVMOR 30'' Bathroom Vanity Sink Base Cabine, Storage Cabinet with 3-Drawers on The Left, Suitable for Bathrooms, Kitchens, Laundry Rooms and Other Places.'
12     LOVMOR 30'' Bathroom Vanity Sink Base Cabine, ...
238    ZZQXTC Over Toilet Storage Cabinet, Bathroom S...
184    WEENFON Storage Cabinet with Doors and Shelves...
64     SogesHome Wood Corner Cabinet Wall Corner Stor...
175    SP-AU-Era Mirro

In [7]:
pip install langchain langchain-google-genai google-generativeai

Note: you may need to restart the kernel to use updated packages.


In [8]:
import os
import pandas as pd
import google.generativeai as genai
from dotenv import load_dotenv  # <-- Import the new library

# --- Load Environment Variables ---
# This command looks for the .env file and loads it into the environment.
load_dotenv()
print("Loaded variables from .env file.")

# --- Step 4 (Secure): Direct API Call ---

# Get the API key from the environment variables
api_key = os.getenv("GOOGLE_API_KEY")

# Check if the key was found
if not api_key:
    print("--- WARNING ---")
    print("GOOGLE_API_KEY not found. Make sure you have created a .env file.")
else:
    try:
        # Configure the API key
        genai.configure(api_key=api_key)

        # Initialize the model
        model = genai.GenerativeModel('gemini-2.5-flash')
        print("Google AI Model ('gemini-2.5-flash') initialized successfully.")

        # --- Define the generator function (no change here) ---
        def generate_description_direct(title, brand, material):
            prompt = f"""You are a creative marketing assistant for an e-commerce store.
            Your task is to write a short, appealing, and creative product description
            that is no more than two sentences long.

            Product Title: {title}
            Brand: {brand}
            Material: {material}

            Creative Description:"""
            response = model.generate_content(prompt)
            return response.text

        # --- Test the function (no change here) ---
        products_df = pd.read_pickle('products_df.pkl')
        sample_product = products_df.iloc[15]

        print("\n--- Testing Direct Description Generator ---")
        print(f"Product Title: {sample_product['title']}")
        creative_description = generate_description_direct(
            title=sample_product['title'],
            brand=sample_product['brand'],
            material=sample_product['material']
        )
        print("\nGenerated Creative Description:")
        print(creative_description)

    except Exception as e:
        print(f"\n--- AN ERROR OCCURRED ---")
        print(e)

Loaded variables from .env file.
Google AI Model ('gemini-2.5-flash') initialized successfully.

--- Testing Direct Description Generator ---
Product Title: Boss Office Products Any Task Mid-Back Task Chair with Loop Arms in Grey

--- AN ERROR OCCURRED ---
429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.


# For the Vector Database part , Pinecone part

In [9]:
pip install sentence-transformers 

Note: you may need to restart the kernel to use updated packages.


In [10]:
pip install pinecone

Note: you may need to restart the kernel to use updated packages.


In [11]:
import os
import pandas as pd
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from tqdm import tqdm # A library to show a progress bar

# --- Step 5.3: Create and Populate Pinecone Vector DB ---

# --- 1. Load Keys and Initialize ---
load_dotenv()
pinecone_api_key = os.getenv("PINECONE_API_KEY")

if not pinecone_api_key:
    print("--- WARNING ---")
    print("PINECONE_API_KEY not found. Make sure it is in your .env file.")
else:
    try:
        # Initialize Pinecone
        pc = Pinecone(api_key=pinecone_api_key)
        print("Pinecone client initialized.")

        # --- 2. Create Pinecone Index ---
        index_name = "product-recommendations"
        
        if index_name not in pc.list_indexes().names():
            print(f"Index '{index_name}' not found. Creating it...")
            pc.create_index(
                name=index_name,
                dimension=384, # This dimension MUST match our model
                metric="cosine", # Cosine similarity is good for text
                spec=ServerlessSpec(
                    cloud="aws",
                    region="us-east-1" # Use a free-tier region
                )
            )
            print(f"Index '{index_name}' created successfully.")
        else:
            print(f"Index '{index_name}' already exists. Connecting to it.")

        # Connect to the index
        index = pc.Index(index_name)
        print(index.describe_index_stats()) # Show stats

        # --- 3. Load Embedding Model ---
        # This model is fast and very effective for semantic search
        model = SentenceTransformer('all-MiniLM-L6-v2')
        print("\nSentenceTransformer model 'all-MiniLM-L6-v2' loaded.")

        # --- 4. Load Data ---
        products_df = pd.read_pickle('products_df.pkl')
        # Ensure there are no missing 'uniq_id' values
        products_df = products_df.dropna(subset=['uniq_id'])
        print(f"Loaded {len(products_df)} products from 'products_df.pkl'")

        # --- 5. Generate Embeddings and Upload ---
        print("\nStarting to generate embeddings and upload to Pinecone...")
        
        # We upload in batches for efficiency
        batch_size = 100 
        for i in tqdm(range(0, len(products_df), batch_size)):
            # Get a batch of products
            batch = products_df.iloc[i:i+batch_size]
            
            # Get the 'soup' text
            texts = batch['soup'].tolist()
            # Get the unique IDs
            ids = batch['uniq_id'].tolist()
            
            # Generate embeddings for the batch
            embeddings = model.encode(texts)
            
            # Prepare the data for upload (id, vector)
            vectors_to_upload = list(zip(ids, embeddings.tolist()))
            
            # Upload the batch to Pinecone
            index.upsert(vectors=vectors_to_upload)

        print("\n--- All products have been embedded and uploaded to Pinecone! ---")
        print(index.describe_index_stats())

    except Exception as e:
        print(f"\n--- AN ERROR OCCURRED ---")
        print(e)

Pinecone client initialized.
Index 'product-recommendations' already exists. Connecting to it.
{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 305}},
 'total_vector_count': 305,
 'vector_type': 'dense'}

SentenceTransformer model 'all-MiniLM-L6-v2' loaded.
Loaded 312 products from 'products_df.pkl'

Starting to generate embeddings and upload to Pinecone...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:19<00:00,  4.82s/it]



--- All products have been embedded and uploaded to Pinecone! ---
{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 305}},
 'total_vector_count': 305,
 'vector_type': 'dense'}


In [12]:
import os
import pandas as pd
from dotenv import load_dotenv
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

# --- Step 5.4: Create the Vector Search Function ---

# --- 1. Load Keys and Initialize Clients ---
load_dotenv()
pinecone_api_key = os.getenv("PINECONE_API_KEY")

if not pinecone_api_key:
    print("--- WARNING ---")
    print("PINECONE_API_KEY not found. Make sure it is in your .env file.")
else:
    try:
        # Initialize Pinecone
        pc = Pinecone(api_key=pinecone_api_key)
        index_name = "product-recommendations"
        index = pc.Index(index_name)
        print(f"Connected to Pinecone index '{index_name}'.")

        # Load the embedding model
        model = SentenceTransformer('all-MiniLM-L6-v2')
        print("SentenceTransformer model loaded.")

        # Load product data
        products_df = pd.read_pickle('products_df.pkl')
        # Set 'uniq_id' as the index for fast lookups
        products_df = products_df.set_index('uniq_id')
        print("Product data loaded and indexed by 'uniq_id'.")

        # --- 2. Define the New Recommendation Function ---
        def get_semantic_recommendations(query, top_k=5):
            """
            Gets recommendations from Pinecone based on a text query.
            """
            # 1. Create the embedding for the query
            query_embedding = model.encode(query).tolist()
            
            # 2. Query Pinecone
            query_results = index.query(
                vector=query_embedding,
                top_k=top_k, # Get the top 5 results
                include_metadata=False # We only need the IDs
            )
            
            # 3. Get the list of product IDs from the results
            result_ids = [match['id'] for match in query_results['matches']]
            
            # 4. Look up the full product details from our dataframe
            recommended_products = products_df.loc[result_ids]
            
            return recommended_products

        # --- 3. Test the Function ---
        print("\n--- Testing Semantic Search ---")
        
        # Test 1: A specific query
        query1 = "a comfortable chair for my office"
        print(f"\nRecommendations for: '{query1}'")
        results1 = get_semantic_recommendations(query1)
        print(results1['title'])
        
        # Test 2: A different query
        query2 = "modern storage for shoes"
        print(f"\nRecommendations for: '{query2}'")
        results2 = get_semantic_recommendations(query2)
        print(results2['title'])
        
        # Test 3: A query about materials
        query3 = "something made of wood"
        print(f"\nRecommendations for: '{query3}'")
        results3 = get_semantic_recommendations(query3)
        print(results3['title'])

    except Exception as e:
        print(f"\n--- AN ERROR OCCURRED ---")
        print(e)

Connected to Pinecone index 'product-recommendations'.
SentenceTransformer model loaded.
Product data loaded and indexed by 'uniq_id'.

--- Testing Semantic Search ---

Recommendations for: 'a comfortable chair for my office'
uniq_id
0583ef58-47cd-509b-9e6d-89a0ad8490b2    Ergonomic Office Chair,Office Chair, with Lumb...
a0a69530-a944-589d-a036-90358cb9e485    Boss Office Products Any Task Mid-Back Task Ch...
fe25ae1d-4a82-57ad-9bab-b9de4321fd0b    Karl home Accent Chair Mid-Century Modern Chai...
b3a490d6-a89e-57e0-9188-706c89a156a1    Winrise Office Chair Ergonomic Desk Chair, Hig...
e037f8af-d28c-51a1-8f1c-3f524620910e    AnRui Folding Floor Chair with Adjustable Back...
Name: title, dtype: object

Recommendations for: 'modern storage for shoes'
uniq_id
122c5c2a-5490-51ce-8555-9526c9698a38    LANTEFUL Shoe Rack Organizer Shoe Storage Cabi...
feca0b89-547f-5bca-9b97-f255c5467e47    Soerreo Shoe Slot Storage Box Adjustable Shoe ...
a10176fb-74af-5428-9aaf-2787aa4d66d2    MoNiBloom Fo

# For Open CV part

In [13]:
pip install imgbeddings

Note: you may need to restart the kernel to use updated packages.


In [14]:
pip install Pillow requests

Note: you may need to restart the kernel to use updated packages.


In [15]:
pip install --upgrade huggingface_hub

Note: you may need to restart the kernel to use updated packages.


In [17]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from PIL import Image
import requests
import ast
import io

# --- Step 6.2 (Final Attempt): Prototype Image Embedding with CLIP ---

try:
    # --- 1. Load Data and Clean Image Column ---
    products_df = pd.read_pickle('products_df.pkl')
    print("Product data loaded.")

    def get_first_image_url(image_str):
        """
        Cleans the 'images' column string and gets the first URL.
        """
        try:
            image_list = ast.literal_eval(image_str)
            if image_list and isinstance(image_list, list) and len(image_list) > 0:
                return image_list[0] # Return the first image URL
        except (ValueError, SyntaxError, TypeError):
            return None
        return None

    products_df['first_image'] = products_df['images'].apply(get_first_image_url)
    sample_product = products_df.dropna(subset=['first_image']).iloc[0]
    sample_image_url = sample_product['first_image']
    
    print(f"Sample product selected: {sample_product['title']}")
    print(f"Sample image URL: {sample_image_url}")

    # --- 2. Initialize Image Embedding Model (CLIP) ---
    print("\nLoading image embedding model (clip-ViT-B-32)...")
    img_model = SentenceTransformer('clip-ViT-B-32')
    print("Model loaded successfully.")

    # --- 3. Download Image and Generate Embedding ---
    print("Downloading sample image...")
    # Download the image from the URL
    response = requests.get(sample_image_url, stream=True)
    
    if response.status_code == 200:
        # Open the image from the downloaded content
        img = Image.open(io.BytesIO(response.content))
        print("Image downloaded and opened successfully.")

        print("Generating embedding for the sample image...")
        # Encode the PIL image object
        image_embedding = img_model.encode(img)
        
        print("\n--- Image Embedding Generated Successfully! ---")
        print(f"Type of embedding: {type(image_embedding)}")
        print(f"Shape of embedding: {image_embedding.shape}")
        print("\nFirst 10 values of the embedding vector:")
        print(image_embedding[:10])
    else:
        print(f"\n--- ERROR ---")
        print(f"Failed to download image. Status code: {response.status_code}")

except Exception as e:
    print(f"\n--- AN ERROR OCCURRED ---")
    print(e)

Product data loaded.
Sample product selected: GOYMFK 1pc Free Standing Shoe Rack, Multi-layer Metal Shoe Cap Rack With 8 Double Hooks For Living Room, Bathroom, Hallway
Sample image URL: https://m.media-amazon.com/images/I/416WaLx10jL._SS522_.jpg 

Loading image embedding model (clip-ViT-B-32)...


modules.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Exception ignored in: <function tqdm.__del__ at 0x000001F0C9D977E0>
Traceback (most recent call last):
  File "C:\Users\cyash\yashenv\Lib\site-packages\tqdm\std.py", line 1148, in __del__
    self.close()
  File "C:\Users\cyash\yashenv\Lib\site-packages\tqdm\notebook.py", line 282, in close
    self.disp(bar_style='success', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm' object has no attribute 'disp'
Exception ignored in: <function tqdm.__del__ at 0x000001F0C9D977E0>
Traceback (most recent call last):
  File "C:\Users\cy


--- AN ERROR OCCURRED ---
Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory C:\Users\cyash\.cache\huggingface\hub\models--sentence-transformers--clip-ViT-B-32\snapshots\327ab6726d33c0e22f920c83f2ff9e4bd38ca37f\0_CLIPModel.


## Above part is not working for some reason , using core transformers library 

In [19]:
pip install transformers torch

Note: you may need to restart the kernel to use updated packages.


In [21]:
import pandas as pd
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import requests
import ast
import io
import torch

# --- Step 6.2 (Final Method, Corrected Download): Prototype Image Embedding ---

try:
    # --- 1. Load Data and Clean Image Column ---
    products_df = pd.read_pickle('products_df.pkl')
    print("Product data loaded.")

    def get_first_image_url(image_str):
        try:
            image_list = ast.literal_eval(image_str)
            if image_list and isinstance(image_list, list) and len(image_list) > 0:
                return image_list[0]
        except (ValueError, SyntaxError, TypeError):
            return None
        return None

    products_df['first_image'] = products_df['images'].apply(get_first_image_url)
    
    # --- THIS IS THE FIX: Get a *different* sample product ---
    # The first image might be broken or protected. Let's try the 5th product.
    sample_product = products_df.dropna(subset=['first_image']).iloc[4]
    sample_image_url = sample_product['first_image']
    
    print(f"Sample product selected: {sample_product['title']}")
    print(f"Sample image URL: {sample_image_url}")

    # --- 2. Initialize Image Embedding Model (CLIP) ---
    print("\nLoading CLIP model and processor directly...")
    model_name = "openai/clip-vit-base-patch32"
    processor = CLIPProcessor.from_pretrained(model_name)
    model = CLIPModel.from_pretrained(model_name)
    print("Model and processor loaded successfully.")

    # --- 3. Download Image and Generate Embedding ---
    print("Downloading sample image...")
    
    # --- THIS IS THE FIX ---
    # We add a User-Agent header to pretend we are a real browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    response = requests.get(sample_image_url, stream=True, headers=headers)
    
    if response.status_code == 200:
        img = Image.open(io.BytesIO(response.content))
        print("Image downloaded and opened successfully.")

        print("Generating embedding for the sample image...")
        
        with torch.no_grad():
            inputs = processor(images=img, return_tensors="pt")
            image_features = model.get_image_features(**inputs)
        
        image_embedding = image_features[0]
        
        print("\n--- Image Embedding Generated Successfully! ---")
        print(f"Type of embedding: {type(image_embedding)}")
        print(f"Shape of embedding: {image_embedding.shape}")
        print("\nFirst 10 values of the embedding vector:")
        print(image_embedding[:10])
    else:
        print(f"\n--- ERROR ---")
        print(f"Failed to download image. Status code: {response.status_code}")
        print("This specific image URL may be broken or heavily protected.")


except Exception as e:
    print(f"\n--- AN ERROR OCCURRED ---")
    print(e)

Product data loaded.
Sample product selected: JOIN IRON Foldable TV Trays for Eating Set of 4 with Stand,Folding TV/Snack Tray Table Set,Folding TV Dinner Tables for Small Space,(Grey)
Sample image URL: https://m.media-amazon.com/images/I/41p4d4VJnNL._SS522_.jpg 

Loading CLIP model and processor directly...
Model and processor loaded successfully.
Downloading sample image...

--- ERROR ---
Failed to download image. Status code: 400
This specific image URL may be broken or heavily protected.


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Exception ignored in: <function tqdm.__del__ at 0x000001F0C9D977E0>
Traceback (most recent call last):
  File "C:\Users\cyash\yashenv\Lib\site-packages\tqdm\std.py", line 1148, in __del__
    self.close()
  File "C:\Users\cyash\yashenv\Lib\site-packages\tqdm\notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm' object has no attribute 'disp'


# Creating a mock function , because Amazon servers are not allowing us to downlaod, because it suspects us being not a real person

In [22]:
import pandas as pd
import numpy as np
import ast

# --- Step 6.3: Create Mock CV Function (Requirement #3) ---

# We have already proven we can load the real CLIP model.
# Now, we create a mock function to bypass the download block
# and keep the project on schedule.

# The 'openai/clip-vit-base-patch32' model outputs a 512-dimension vector.
EMBEDDING_DIMENSION = 512

def get_image_embedding_mock(image_url):
    """
    MOCK FUNCTION: Pretends to download an image and run the CLIP model.
    Returns a fake vector of the correct shape (512 dimensions).
    """
    if not image_url or not isinstance(image_url, str):
        return None
    
    # Generate a random vector to simulate the real embedding
    mock_embedding = np.random.rand(EMBEDDING_DIMENSION)
    return mock_embedding

print("Mock CV function 'get_image_embedding_mock' created successfully.")

# --- Test the Mock Function ---
products_df = pd.read_pickle('products_df.pkl')

def get_first_image_url(image_str):
    try:
        image_list = ast.literal_eval(image_str)
        if image_list and isinstance(image_list, list) and len(image_list) > 0:
            return image_list[0]
    except (ValueError, SyntaxError, TypeError):
        return None
    return None

products_df['first_image'] = products_df['images'].apply(get_first_image_url)
sample_product_url = products_df.dropna(subset=['first_image']).iloc[0]['first_image']

print(f"\nTesting with sample URL: {sample_product_url}")

# Test the function
mock_vector = get_image_embedding_mock(sample_product_url)

print("\n--- Mock Image Embedding Generated Successfully! ---")
print(f"Type of embedding: {type(mock_vector)}")
print(f"Shape of embedding: {mock_vector.shape}")
print("\nFirst 10 values of the mock vector:")
print(mock_vector[:10])

Mock CV function 'get_image_embedding_mock' created successfully.

Testing with sample URL: https://m.media-amazon.com/images/I/416WaLx10jL._SS522_.jpg 

--- Mock Image Embedding Generated Successfully! ---
Type of embedding: <class 'numpy.ndarray'>
Shape of embedding: (512,)

First 10 values of the mock vector:
[0.93848062 0.60454704 0.86079219 0.88536474 0.47567564 0.41950611
 0.72127564 0.95537625 0.36057688 0.11199573]
