In [22]:
from sentence_transformers import SentenceTransformer
import numpy as np
import json
import pandas as pd
from tqdm.notebook import tqdm
import torch
import pandas as pd
import sys
import pickle
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance
from qdrant_client.http import models


In [19]:
# Sentence encoder used to embed every product "story".
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs (more slowly) instead of failing at model load time.
model = SentenceTransformer(
    "all-MiniLM-L6-v2",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [12]:
# Load the raw product catalogue.
df = pd.read_csv("./bigBasketProducts.csv")

# Prices and ratings are only ever interpolated into text, so store them as
# strings. Casting a missing rating to str yields the literal 'nan', which is
# then replaced by a readable placeholder.
df['market_price'] = df['market_price'].astype(str)
df['rating'] = df['rating'].astype(str).replace('nan', 'not found')

# Assign the result back instead of calling `.fillna(..., inplace=True)` on the
# column selection: with pandas Copy-on-Write the chained in-place form updates
# a temporary object and leaves `df` untouched, which would make the `dropna`
# below discard every product that has no description.
df['description'] = df['description'].fillna('not found')

print(f'Before dropping: {df.shape=}')
# Drop the rows that still contain a missing value in any remaining column.
df = df.dropna()

print(f'After dropping: {df.shape=}')
df.head()

Before dropping: df.shape=(27555, 10)
After dropping: df.shape=(27553, 10)


Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."
2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m..."
3,4,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.0,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...
4,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.0,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...


In [13]:

class make_embedding_ds(torch.utils.data.IterableDataset):
    """Iterable dataset that describes each product row in a sentence and embeds it.

    Every row of the product DataFrame is rendered into a short natural-language
    "story" and passed through a sentence encoder. Iteration yields
    ``(row_no, product_name, story, emb)`` tuples, where ``row_no`` is the
    positional row number inside the DataFrame.

    Note: the dataset does not shard its rows between DataLoader workers, so
    using more than one worker would yield every row once per worker.
    """

    # Columns interpolated into the story. They are addressed by name because
    # the CSV's first column is a numeric ``index``; addressing by position
    # would shift every field by one (e.g. the category would be reported as
    # the product name).
    _STORY_COLUMNS = (
        "product",
        "category",
        "sub_category",
        "brand",
        "sale_price",
        "market_price",
        "type",
        "rating",
        "description",
    )

    def __init__(self, csv=None, encoder=None):
        """
        Args:
            csv: DataFrame containing at least the columns listed in
                ``_STORY_COLUMNS``. When omitted, the notebook-level ``df`` is
                used (looked up at construction time).
            encoder: object exposing ``encode(text)`` that returns the
                embedding of a string. When omitted, the notebook-level
                ``model`` is looked up when iteration starts.

        Raises:
            KeyError: if ``csv`` lacks one of the required columns.
        """
        # Zero-argument super() initialises the IterableDataset base class; the
        # one-argument form `super(cls)` only builds an unbound proxy object.
        super().__init__()
        if csv is None:
            csv = df
        missing = [name for name in self._STORY_COLUMNS if name not in csv.columns]
        if missing:
            raise KeyError(f"missing required column(s): {missing}")
        self.csv = csv
        self.total_row = csv.shape[0]
        self.col = csv.columns.to_list()
        self.encoder = encoder

    def __iter__(self):
        # Resolve the encoder lazily so the global `model` only has to exist
        # once iteration actually begins.
        encoder = self.encoder if self.encoder is not None else model
        for row_no in range(self.total_row):
            row = self.csv.iloc[row_no].to_dict()
            product_name = row["product"]
            story = (
                f"{product_name} is of category {row['category']} and sub category is {row['sub_category']}. "
                f"{product_name} is type {row['type']}. "
                f"brand of {product_name} is {row['brand']}, with rating {row['rating']}. "
                f"sale price of {product_name} is {row['sale_price']} with market price {row['market_price']}, "
                f"description of {product_name} is {row['description']}"
            )
            emb = encoder.encode(story)

            yield row_no, product_name, story, emb
    

# Wrap the cleaned product frame in the embedding dataset; nothing is encoded
# until the dataset is iterated.
ds = make_embedding_ds(df)

In [14]:
# Show the dataset object's repr as the cell output.
ds

<__main__.make_embedding_ds at 0x7f4334948cd0>

In [None]:

# Collects one payload dict and one embedding (a plain list of floats) per
# product; the two lists stay index-aligned.
json_emb = {
    "payload":[],
    "emb": []
}
batch_no = 1
# num_workers=0 keeps encoding in the main process. The dataset calls the
# sentence encoder inside __iter__, and a CUDA-resident model cannot be used
# from a forked DataLoader worker ("Cannot re-initialize CUDA in forked
# subprocess").
dataloader = torch.utils.data.DataLoader(ds, num_workers=0, batch_size=256)
for row, product_name, story, emb in dataloader:

    print(f"batch no-{batch_no} competed")
    batch_no += 1

    # Convert the batched tensors to plain Python values once per batch
    # instead of once per element.
    row_ids = row.tolist()
    vectors = emb.tolist()

    for index in range(len(row_ids)):
        json_emb["payload"].append(
            {
                "id": row_ids[index],
                "product": product_name[index],
                "story": story[index],
            }
        )
        json_emb["emb"].append(vectors[index])

In [None]:
# Persist the collected payloads and embeddings so the vector store can be
# rebuilt without re-encoding the catalogue.
with open('vector_payload.pkl', 'wb') as payload_file:
    pickle.dump(json_emb, payload_file)

In [20]:
# Location of the pickled {"payload": [...], "emb": [...]} dictionary.
VECTOR_SPACE_PATH = "vector_payload.pkl"

# NOTE(review): pickle.load executes arbitrary code from the file; only load
# payload files that this notebook produced itself.
with open(VECTOR_SPACE_PATH, 'rb') as payload_file:
    vs = pickle.load(payload_file)

# Qdrant instance that lives entirely in this process's memory.
client = QdrantClient(":memory:")

In [23]:
collection_name = "qdrant-space"

# Create the collection only when it is missing. `create_collection` is used
# rather than `recreate_collection`: inside this guard there is never an
# existing collection to drop, and `recreate_collection` is deprecated.
collections = client.get_collections()
existing_names = [c.name for c in collections.collections]
if collection_name not in existing_names:
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            # Must equal the encoder's embedding width (384 for all-MiniLM-L6-v2).
            size=384,
            distance=models.Distance.COSINE,
        ),
    )
print(client.get_collections())

collections=[CollectionDescription(name='qdrant-space')]


In [25]:
total_records = len(vs["payload"]) # total records data
_payload = vs["payload"]
_emb = vs["emb"]
# Point ids are the positions of the records in the payload/embedding lists.
ids = list(range(0,total_records))

batch_size = 64

# Upload in slices of `batch_size` so no single request has to carry the whole
# catalogue (the original declared `batch_size` but never used it).
for start in range(0, total_records, batch_size):
    stop = start + batch_size
    client.upsert(
        collection_name=collection_name,
        points=models.Batch(
            ids=ids[start:stop],
            vectors=_emb[start:stop],
            payloads=_payload[start:stop],
        ),
    )


# Ask for an exact count instead of reading `vectors_count` from the collection
# info, which is deprecated and not populated by every qdrant-client version.
collection_vector_count = client.count(collection_name=collection_name, exact=True).count
print(f"Vector count in collection: {collection_vector_count}")

Vector count in collection: 27553


In [26]:
# Snapshot the populated in-memory client to disk.
# NOTE(review): this relies on the client object being picklable, which may
# depend on the installed qdrant-client version; confirm before relying on it.
with open('quadrant_vectordb_client.pkl', 'wb') as client_file:
    pickle.dump(client, client_file)

In [29]:
# Query-side encoder; it must be the same checkpoint that embedded the
# catalogue so that query and product vectors share one space.
# Fall back to the CPU when no CUDA device is available instead of failing.
retriever = SentenceTransformer(
    "all-MiniLM-L6-v2",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [30]:
# Serialise the retriever encoder so it can be reloaded without a download.
with open('encodermodel.pkl', 'wb') as encoder_file:
    pickle.dump(retriever, encoder_file)

In [32]:

from transformers import pipeline

# Checkpoint used for both the reader model and its tokenizer.
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"

# Wrap the checkpoint in a question-answering pipeline (the "reader").
reader = pipeline(
    "question-answering",
    model=model_name,
    tokenizer=model_name,
)

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [33]:
# Serialise the question-answering pipeline to disk.
with open('bert-question-answering.pkl', 'wb') as reader_file:
    pickle.dump(reader, reader_file)