In [None]:
# Colab cell 1: Setup
# Installs the third-party packages imported by this notebook. No versions are
# pinned, so each fresh runtime resolves whatever pip currently selects; `-q`
# only shortens the install log.
!pip -q install pandas numpy scikit-learn sentence-transformers transformers pinecone datasets pillow langchain langchain-community

import os, io, json, re, ast, requests, math, random
import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from transformers import CLIPProcessor, CLIPModel
from PIL import Image

# Pinecone (v3)
# NOTE(review): `pinecone` is already part of the install list at the top of
# this cell, so this second install looks redundant — confirm before removing.
!pip -q install pinecone
from pinecone import Pinecone, ServerlessSpec

# LangChain for GenAI
from langchain.prompts import PromptTemplate
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import pipeline

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/587.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.6/587.6 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.3/259.3 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency

In [None]:
# Colab cell 2: Load dataset (upload or from Drive)
# Option A: manual upload
from google.colab import files
uploaded = files.upload()  # upload intern_data_ikarus.csv

# Read the bytes that were just uploaded rather than a fixed path: Colab renames
# a re-uploaded file (e.g. "intern_data_ikarus (1).csv"), so the hard-coded path
# can silently point at an older copy. The path is kept as the fallback for the
# case where nothing was uploaded in this run.
csv_path = '/content/intern_data_ikarus.csv'
if uploaded:
    df = pd.read_csv(io.BytesIO(next(iter(uploaded.values()))))
else:
    df = pd.read_csv(csv_path)

# Basic cleanups
for text_col in ('description', 'brand', 'title'):
    df[text_col] = df[text_col].fillna('')

# Strip currency symbols/text, then coerce: anything that still is not a valid
# number (empty string, "1.2.3", a lone ".") becomes NaN instead of raising.
df['price'] = pd.to_numeric(
    df['price'].astype(str).str.replace(r'[^0-9\.]', '', regex=True),
    errors='coerce',
).astype(float)

# Parse categories and images columns if they are stringified lists
def parse_list(x):
    """Return ``x`` as a Python list, or ``[]`` when it cannot be read as one.

    Parameters
    ----------
    x : Any
        A list (returned unchanged), a string holding a Python list/tuple
        literal such as ``"['a', 'b']"``, or anything else (NaN, None, ...).

    Returns
    -------
    list
        The parsed list. Strings that fail to parse, strings whose literal is
        not a list/tuple (e.g. ``"5"`` or ``"{'a': 1}"``), and non-string,
        non-list inputs all yield ``[]`` so callers always receive a list.
    """
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        try:
            parsed = ast.literal_eval(x)
        except Exception:
            # Best-effort: any unparsable cell is treated as "no values".
            return []
        # literal_eval happily returns ints, dicts, etc.; only sequences count.
        if isinstance(parsed, (list, tuple)):
            return list(parsed)
    return []

# Both list-like columns go through the same parser.
for list_col in ('categories', 'images'):
    df[list_col] = df[list_col].apply(parse_list)

# Create a text field for embedding
def product_text(row):
    """Build the multi-line text that is embedded for one product.

    Parameters
    ----------
    row : mapping
        Must provide ``title``, ``brand``, ``description`` and ``categories``
        via ``row[...]``; ``material`` and ``color`` are read with
        ``row.get`` and may be absent.

    Returns
    -------
    str
        Six ``Label: value`` lines joined with newlines. At most the first
        four categories are included, joined with " > ". Missing material or
        color (absent, None or NaN) is rendered as an empty value rather than
        the literal text "nan".
    """
    def _blank_if_missing(value):
        # Un-filled CSV cells arrive as NaN; keep them out of the embedding text.
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ''
        return value

    categories = row['categories']
    # str() guards against non-string entries that would make join() raise.
    cats = ' > '.join(str(c) for c in categories[:4]) if isinstance(categories, list) else ''
    material = _blank_if_missing(row.get('material', ''))
    color = _blank_if_missing(row.get('color', ''))
    return f"Title: {row['title']}\nBrand: {row['brand']}\nDescription: {row['description']}\nCategories: {cats}\nMaterial: {material}\nColor: {color}"
# product_text is applied row-wise (axis=1); its result becomes the new
# `text_for_embed` column.
df['text_for_embed'] = df.apply(product_text, axis=1)

# Last expression of the cell: display the first two rows as the cell output.
df.head(2)


Saving intern_data_ikarus.csv to intern_data_ikarus (1).csv


Unnamed: 0,title,brand,description,price,categories,images,manufacturer,package_dimensions,country_of_origin,material,color,uniq_id,text_for_embed
0,"GOYMFK 1pc Free Standing Shoe Rack, Multi-laye...",GOYMFK,"multiple shoes, coats, hats, and other items E...",24.99,"[Home & Kitchen, Storage & Organization, Cloth...",[https://m.media-amazon.com/images/I/416WaLx10...,GOYMFK,"2.36""D x 7.87""W x 21.6""H",China,Metal,White,02593e81-5c09-5069-8516-b0b29f439ded,"Title: GOYMFK 1pc Free Standing Shoe Rack, Mul..."
1,"subrtex Leather ding Room, Dining Chairs Set o...",subrtex,subrtex Dining chairs Set of 2,,"[Home & Kitchen, Furniture, Dining Room Furnit...",[https://m.media-amazon.com/images/I/31SejUEWY...,Subrtex Houseware INC,"18.5""D x 16""W x 35""H",,Sponge,Black,5938d217-b8c5-5d3e-b1cf-e28e340f292e,"Title: subrtex Leather ding Room, Dining Chair..."


In [None]:
# Colab cell 3: Text embeddings
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = embed_model.encode(df['text_for_embed'].tolist(), batch_size=64, show_progress_bar=True)
embeddings = np.array(embeddings, dtype=np.float32)

# KMeans clustering to group similar items
# Heuristic: one cluster per 200 products, clamped to the range [5, 50].
num_clusters = max(5, min(50, len(df)//200))  # heuristic
# KMeans rejects n_clusters > n_samples, which the floor of 5 would trigger on
# a dataset with fewer than five rows, so cap it at the row count.
num_clusters = max(1, min(num_clusters, len(df)))
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init='auto')
df['cluster'] = kmeans.fit_predict(embeddings)

# Save clustering labels
df[['uniq_id','cluster']].to_csv('clusters.csv', index=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
# Colab cell 4: Pinecone init and upsert
import time
from google.colab import userdata
PINECONE_API_KEY = userdata.get("PINECONE_API_KEY")  # set in Colab > Secrets
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the serverless index on first run; its dimension must match the
# embedding width produced by the text-embedding cell.
index_name = "ikarus-products"
if index_name not in [idx['name'] for idx in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=embeddings.shape[1],
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Index creation is asynchronous: block until Pinecone reports it ready,
    # otherwise the upserts that follow can fail against a half-built index.
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)
index = pc.Index(index_name)

# Prepare vectors
def meta_row(i):
    """Build the Pinecone metadata dict for the product at positional row ``i``.

    Reads the row from the notebook-level ``df``. Every value is coerced to a
    type Pinecone metadata accepts (string, number, or list of strings):
    text fields become ``str`` with missing values mapped to ``''``, and both
    list fields (``categories`` and ``images``) are stringified element-wise.

    Parameters
    ----------
    i : int
        Positional index into ``df`` (used with ``df.iloc``).

    Returns
    -------
    dict
        Metadata with keys ``uniq_id``, ``title``, ``brand``, ``description``,
        ``categories``, ``images``, ``material``, ``color``, ``cluster`` and,
        only when the price is not NaN, ``price``.
    """
    r = df.iloc[i]

    def _text(value):
        # Null metadata values are rejected, so missing text becomes ''.
        return str(value) if pd.notna(value) else ''

    def _str_list(value):
        # Lists must contain strings only; anything that is not a list is dropped.
        return [str(item) for item in value] if isinstance(value, list) else []

    # Ensure metadata values are Pinecone-compatible
    metadata = {
        "uniq_id": str(r['uniq_id']),
        "title": _text(r['title']),
        "brand": _text(r['brand']),
        "description": _text(r['description']),
        "categories": _str_list(r['categories']),
        "images": _str_list(r['images']),
        "material": _text(r['material']),
        "color": _text(r['color']),
        "cluster": int(r['cluster'])
    }
    # Only include price if it's not NaN
    if pd.notna(r['price']):
        metadata['price'] = float(r['price'])

    return metadata

# Send the vectors to Pinecone in consecutive chunks of 100 rows per upsert call.
batch = []
for chunk_start in range(0, len(df), 100):
    chunk_stop = min(chunk_start + 100, len(df))
    batch = [
        {"id": str(df.iloc[pos]['uniq_id']), "values": embeddings[pos].tolist(), "metadata": meta_row(pos)}
        for pos in range(chunk_start, chunk_stop)
    ]
    index.upsert(vectors=batch)

In [None]:
import os
from google.colab import userdata

# Diagnostic cell: report whether the Pinecone secret is readable from this notebook.
# userdata.get raises (instead of returning None) when the secret does not exist
# or notebook access is switched off, so both cases are caught explicitly.
try:
    api_key = userdata.get('PINECONE_API_KEY')
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    api_key = None

if not api_key:
    print("PINECONE_API_KEY secret is not set or not enabled for notebook access.")
else:
    print("PINECONE_API_KEY secret is set.")
    # You can optionally print a part of the key to confirm it's the correct one, but be cautious with exposing sensitive information.
    # print(f"First few characters of the key: {api_key[:5]}...")

PINECONE_API_KEY secret is set.


In [None]:
# Colab cell 5: Extract top-level category label
import torch
import joblib # Import joblib here as it's used in this cell

def top_category(cats):
    """Return the first entry of a non-empty category list, else "Unknown"."""
    has_entries = isinstance(cats, list) and len(cats) > 0
    return cats[0] if has_entries else "Unknown"

# The classification target is the top-level category of each product.
df['label'] = df['categories'].apply(top_category)

# Download a subset of images for feasibility
has_images = df['images'].apply(lambda urls: isinstance(urls, list) and len(urls)>0)
subset = df[has_images].copy()
sample_size = min(1200, len(subset))
subset = subset.sample(sample_size, random_state=42)

# CLIP feature extraction
clip_checkpoint = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(clip_checkpoint)
clip_proc = CLIPProcessor.from_pretrained(clip_checkpoint)

def fetch_image(urls, timeout=10):
    """Return the first image that can be downloaded and decoded, else ``None``.

    Parameters
    ----------
    urls : iterable of str, str, or None
        Candidate image URLs, tried in order. A single string is treated as a
        one-element list. Entries that are not strings or are blank are skipped
        without a network request.
    timeout : float, optional
        Per-request timeout in seconds passed to ``requests.get`` (default 10).

    Returns
    -------
    PIL.Image.Image or None
        The first successfully fetched image converted to RGB, or ``None``
        when every candidate fails.
    """
    if urls is None:
        return None
    if isinstance(urls, str):
        urls = [urls]
    for u in urls:
        # Skip junk entries up front; previously a non-string URL raised here
        # because the strip() happened outside the try block.
        if not isinstance(u, str):
            continue
        u = u.strip()
        if not u:
            continue
        try:
            response = requests.get(u, timeout=timeout)
            # Treat HTTP errors as a failed candidate instead of trying to
            # decode an error page as an image.
            response.raise_for_status()
            return Image.open(io.BytesIO(response.content)).convert("RGB")
        except Exception:
            # Deliberate best-effort: fall through to the next candidate URL.
            continue
    return None

# Collect one CLIP image-feature vector and its label per product whose image
# could be fetched; products without a usable image are left out.
X_feats, y_labels = [], []
for image_urls, label in zip(subset['images'], subset['label']):
    pil_image = fetch_image(image_urls)
    if pil_image is not None:
        clip_inputs = clip_proc(images=pil_image, return_tensors="pt")
        with torch.no_grad():
            feats = clip_model.get_image_features(**clip_inputs)
        X_feats.append(feats[0].cpu().numpy())
        y_labels.append(label)

X = np.vstack(X_feats)
y = np.array(y_labels)

# Train/test split and linear classifier
# Stratified splitting needs at least two samples in every class; fall back to
# a plain random split when some category appears only once, instead of raising.
_, class_counts = np.unique(y, return_counts=True)
stratify_labels = y if class_counts.min() >= 2 else None
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42, stratify=stratify_labels)
clf = LogisticRegression(max_iter=2000)
clf.fit(Xtr, ytr)
# zero_division=0 reports 0.0 for classes that are never predicted without
# emitting one warning per metric.
print(classification_report(yte, clf.predict(Xte), zero_division=0))

# Save model artifacts
# Use the classifier's own class order: list(set(y)) has no stable order and
# would not line up with the columns of clf.predict_proba / clf.classes_.
label_space = clf.classes_.tolist()
joblib.dump(clf, 'clip_linear_classifier.joblib')
joblib.dump(label_space, 'label_space.joblib') # Dump the variable label_space
# No need to save CLIP weights (loaded on backend); alternatively cache HF models in backend

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

                          precision    recall  f1-score   support

           Baby Products       0.00      0.00      0.00         1
  Beauty & Personal Care       0.00      0.00      0.00         1
             Electronics       0.00      0.00      0.00         1
          Home & Kitchen       0.96      1.00      0.98        51
    Patio, Lawn & Garden       1.00      1.00      1.00         5
Tools & Home Improvement       0.80      1.00      0.89         4

                accuracy                           0.95        63
               macro avg       0.46      0.50      0.48        63
            weighted avg       0.91      0.95      0.93        63



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


['label_space.joblib']

In [None]:
# Colab cell 6: LangChain GenAI pipeline
gen_pipeline = pipeline("text2text-generation", model="google/flan-t5-small", max_new_tokens=128)
llm = HuggingFacePipeline(pipeline=gen_pipeline)

prompt = PromptTemplate.from_template(
    "Write a creative, vivid but concise product blurb (70-100 words) for a furniture item.\n"
    "Title: {title}\nBrand: {brand}\nMaterial: {material}\nColor: {color}\nCategories: {categories}\n"
    "Make it friendly and helpful for shoppers."
)

# Test on a sample
sample = df.iloc[0]
# Material/color are not null-filled during loading, so blank out NaN here to
# keep the literal text "nan" out of the prompt.
sample_material = sample.get('material', '')
sample_color = sample.get('color', '')
# .invoke() is the supported entry point; calling the LLM object directly is
# deprecated and triggers a warning.
print(llm.invoke(prompt.format(
    title=sample['title'],
    brand=sample['brand'],
    material='' if pd.isna(sample_material) else sample_material,
    color='' if pd.isna(sample_color) else sample_color,
    categories=' > '.join(sample['categories'][:4]) if isinstance(sample['categories'], list) else ''
)))


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu
  llm = HuggingFacePipeline(pipeline=gen_pipeline)
  print(llm(prompt.format(


GOYMFK 1pc Free Standing Shoe Rack, Multi-layer Metal Shoe Cap Rack With 8 Double Hooks For Living Room, Bathroom, Hallway Brand: GOYMFK Material: Metal Color: White Categories: Home & Kitchen > Storage & Organization > Clothing & Closet Storage > Shoe Organizers


In [None]:
import os, joblib, pandas as pd
os.makedirs('backend/models', exist_ok=True)
os.makedirs('backend/data', exist_ok=True)

# Replace with your actual variables from training
joblib.dump(clf, 'backend/models/clip_linear_classifier.joblib')
joblib.dump(label_space, 'backend/models/label_space.joblib')
df[['uniq_id','cluster']].to_csv('backend/data/clusters.csv', index=False)

# Ensure the provided dataset is copied for backend analytics
!cp intern_data_ikarus.csv backend/data/intern_data_ikarus.csv

In [None]:
!git init
!git add .
!git commit -m "Initial commit: Initial commit: backend, frontend, artifacts, data"
!git branch -M main
!git remote add origin https://<PAT>@github.com/<username>/AI_Recommendation_App.git
!git push -u origin main

Reinitialized existing Git repository in /content/.git/
On branch main
nothing to commit, working tree clean
/bin/bash: line 1: PAT: No such file or directory
fatal: 'YOUR_GITHUB_REPO_URL' does not appear to be a git repository
fatal: Could not read from remote repository.

Please make sure you have the correct access rights
and the repository exists.


In [None]:
# Set the global git author identity (email and user name) for this runtime.
# NOTE(review): this cell comes after the cell that runs `git commit`; the
# identity normally has to be configured before committing — confirm the order.
!git config --global user.email "asharma4_be22@thapar.edu"
!git config --global user.name "asharmaaryamani"

In [None]:
# In Colab, with your repo cloned as the working directory
import os, shutil
os.makedirs('backend/data', exist_ok=True)
shutil.copy('intern_data_ikarus.csv', 'backend/data/intern_data_ikarus.csv')

# then commit and push
!git add backend/data/intern_data_ikarus.csv
!git commit -m "Add dataset CSV for API analytics and indexing"
!git push


On branch main
nothing to commit, working tree clean
fatal: The current branch main has no upstream branch.
To push the current branch and set the remote as upstream, use

    git push --set-upstream origin main

