In [32]:
import pandas as pd


In [33]:
# The CSV variant is kept for reference only:
# df = pd.read_csv("dataset/preprocessed_data.csv")
# Load the DataFrame from a pickle file instead.
# NOTE(review): presumably pickle is used because the object-valued `tokens`
# and `embeddings` columns would not round-trip through CSV - confirm.
# SECURITY: unpickling can execute arbitrary code, so only load files that were
# produced by a trusted step of this project.

df = pd.read_pickle('dataset/preprocessed_data.pkl')

In [34]:
# Summarize the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Product URL        210 non-null    object
 1   Product Name       210 non-null    object
 2   Product Price      209 non-null    object
 3   Rating             208 non-null    object
 4   Number of reviews  208 non-null    object
 5   Manufacturer       154 non-null    object
 6   ASIN               156 non-null    object
 7   product_name       210 non-null    object
 8   tokens             210 non-null    object
 9   embeddings         210 non-null    object
dtypes: object(10)
memory usage: 16.5+ KB


In [35]:
from transformers import BertTokenizer, BertModel
import torch

# Load the pre-trained BERT tokenizer and model. Both are created from the same
# checkpoint name so the token ids produced by the tokenizer match the model's
# vocabulary.
# NOTE(review): the stored `embeddings` column was presumably computed with this
# same checkpoint - confirm, otherwise query and document vectors are not
# comparable.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [36]:
def get_word_embeddings(text, tokenizer, model):
    """Return the word-piece tokens of ``text`` and one embedding per token.

    Args:
        text: Input string to embed (a single sequence is assumed, since the
            batch axis is squeezed away).
        tokenizer: Callable tokenizer that returns model inputs including
            ``"input_ids"`` and offers ``convert_ids_to_tokens``.
        model: Model whose output exposes ``last_hidden_state``.

    Returns:
        A ``(tokens, token_embeddings)`` pair with the first and last
        positions (the special tokens such as [CLS] and [SEP]) removed from
        both, so they stay aligned element by element.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )

    # Pure inference: skip building an autograd graph.
    with torch.no_grad():
        model_output = model(**encoded)

    # last_hidden_state is [batch_size, sequence_length, hidden_size]; drop the
    # batch axis, then trim the leading and trailing special-token positions.
    piece_vectors = model_output.last_hidden_state.squeeze(0)[1:-1]

    # Apply the very same trimming to the token strings.
    piece_ids = encoded["input_ids"].squeeze(0)
    pieces = tokenizer.convert_ids_to_tokens(piece_ids)[1:-1]

    return pieces, piece_vectors


In [37]:
def process_and_sort_results(results, top_n=5):
    """Rank documents by the mean of their per-token similarity scores.

    Args:
        results: Iterable of dicts, each holding a ``'document_index'`` and a
            ``'similarity'`` entry (one dict per query-token match).
        top_n: Maximum number of documents to return.

    Returns:
        A list of ``{"document_index", "avg_similarity"}`` dicts sorted by
        ``avg_similarity`` in descending order and cut to ``top_n`` items.
        Documents with equal averages keep their first-seen order.
    """
    # Two parallel accumulators keyed by document index: running score total
    # and number of contributing matches.
    score_totals = {}
    match_counts = {}
    for entry in results:
        key = entry['document_index']
        score = entry['similarity']
        score_totals[key] = score_totals.get(key, 0) + score
        match_counts[key] = match_counts.get(key, 0) + 1

    # One summary record per document, in first-seen order.
    ranked = [
        {"document_index": key, "avg_similarity": score_totals[key] / match_counts[key]}
        for key in score_totals
    ]

    # Best average first; the sort is stable, so ties keep their order.
    ranked.sort(key=lambda item: item['avg_similarity'], reverse=True)
    return ranked[:top_n]


In [38]:
from sklearn.metrics.pairwise import cosine_similarity

def word_level_search(query, document_tokens, document_embeddings, tokenizer, model,
                      top_n=5, verbose=True):
    """Rank documents by how well their tokens match each query token.

    For every document, each query token is paired with the document token
    whose embedding has the highest cosine similarity to it. Those best-match
    scores are then averaged per document by ``process_and_sort_results``.

    Args:
        query: Query text; it is embedded with ``get_word_embeddings``.
        document_tokens: Sequence (list or pandas Series) holding, per
            document, the tokens aligned with that document's embeddings.
        document_embeddings: Sequence holding, per document, a 2-D array of
            token embeddings. Documents with no embeddings are skipped.
        tokenizer: Tokenizer forwarded to ``get_word_embeddings``.
        model: Model forwarded to ``get_word_embeddings``.
        top_n: Number of documents to return (default 5, the previous
            hard-wired value).
        verbose: When true (the default, matching earlier behaviour) print the
            number of token-level matches and the full match list. Pass
            ``False`` to keep notebook output short.

    Returns:
        A list of ``{"document_index", "avg_similarity"}`` dicts, best first.
        ``document_index`` is the document's *position* in the input
        sequences. An empty list is returned when the query yields no tokens.
    """
    # Get word embeddings for the query
    query_tokens, query_embeddings = get_word_embeddings(query, tokenizer, model)

    # A query that is empty after preprocessing has zero embedding rows, and
    # cosine_similarity raises ValueError on that; there is nothing to rank.
    if len(query_tokens) == 0:
        return []

    # Token-level matches: document index, query token, document token, score
    results = []

    # Walk tokens and embeddings together *by position*. Indexing a pandas
    # Series with ``series[doc_idx]`` is a label lookup, which returns the
    # wrong row (or raises KeyError) once the frame no longer has a default
    # 0..n-1 index.
    for doc_idx, (doc_tokens, doc_embedding) in enumerate(
        zip(document_tokens, document_embeddings)
    ):
        if len(doc_embedding) == 0:  # Prevent errors if document has no embeddings
            continue

        # Rows are query tokens, columns are document tokens
        similarity_scores = cosine_similarity(query_embeddings, doc_embedding)

        # For each query token, keep only its most similar document token
        for i, query_token in enumerate(query_tokens):
            max_sim_idx = similarity_scores[i].argmax()
            max_sim_score = similarity_scores[i][max_sim_idx]

            results.append({
                "document_index": doc_idx,  # Position of the document in the inputs
                "document_token": doc_tokens[max_sim_idx],
                "query_token": query_token,
                "similarity": max_sim_score
            })

    if verbose:
        print(len(results))
        print(results)

    return process_and_sort_results(results, top_n=top_n)


In [41]:
import spacy

# Load the spaCy English pipeline once at module level; preprocess_query uses
# it for stop-word/punctuation detection and lemmatization.
nlp = spacy.load("en_core_web_sm")

def preprocess_query(text):
    """Reduce a raw query to its distinct content-word lemmas.

    Stop words and punctuation are dropped, the remaining tokens are replaced
    by their lemmas, and repeated lemmas are removed while keeping the order
    of first appearance.

    Args:
        text: Raw query string.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when
        nothing survives).
    """
    kept_lemmas = []
    already_seen = set()

    for tok in nlp(text):
        # Skip anything that carries no search signal.
        if tok.is_stop or tok.is_punct:
            continue

        lemma = tok.lemma_
        if lemma in already_seen:
            continue

        already_seen.add(lemma)
        kept_lemmas.append(lemma)

    return ' '.join(kept_lemmas)

# Example query; normalize it first (stop-word and punctuation removal,
# lemmatization, de-duplication) before it is embedded.
query = "green bag"
preprocessed_query = preprocess_query(query)

# Rank all products against the query. The result is a list of
# {'document_index', 'avg_similarity'} dicts, best match first; the bare
# `output` on the last line displays it as the cell result.
output = word_level_search(preprocessed_query, df['tokens'], df['embeddings'], tokenizer, model)
output

420
[{'document_index': 0, 'document_token': 'sky', 'query_token': 'green', 'similarity': np.float32(0.4944315)}, {'document_index': 0, 'document_token': 'strap', 'query_token': 'bag', 'similarity': np.float32(0.62106425)}, {'document_index': 1, 'document_token': '##n', 'query_token': 'green', 'similarity': np.float32(0.46116313)}, {'document_index': 1, 'document_token': 'girl', 'query_token': 'bag', 'similarity': np.float32(0.58340687)}, {'document_index': 2, 'document_token': 'hike', 'query_token': 'green', 'similarity': np.float32(0.47003338)}, {'document_index': 2, 'document_token': 'college', 'query_token': 'bag', 'similarity': np.float32(0.56346166)}, {'document_index': 3, 'document_token': 'backpack', 'query_token': 'green', 'similarity': np.float32(0.46920735)}, {'document_index': 3, 'document_token': 'black', 'query_token': 'bag', 'similarity': np.float32(0.6247646)}, {'document_index': 4, 'document_token': 'safari', 'query_token': 'green', 'similarity': np.float32(0.48388395)

[{'document_index': 68, 'avg_similarity': np.float32(0.6935778)},
 {'document_index': 19, 'avg_similarity': np.float32(0.67617536)},
 {'document_index': 164, 'avg_similarity': np.float32(0.66523826)},
 {'document_index': 150, 'avg_similarity': np.float32(0.6552919)},
 {'document_index': 60, 'avg_similarity': np.float32(0.65361214)}]

In [42]:
# Show the product name behind each of the top-ranked documents returned by
# word_level_search / process_and_sort_results.
for result in output:
    doc_index = result['document_index']  # Positional row index of the document
    avg_similarity = result['avg_similarity']  # Mean best-match similarity for that document

    # Fetch only the product name. The column is addressed by label instead of
    # the magic position 1 so the lookup survives column reordering; the row is
    # still selected by position because document_index is positional.
    document_info = df['Product Name'].iloc[doc_index]

    # Print the score line followed by the product name
    print(f"Document Index: {doc_index}, Average Similarity: {avg_similarity}")
    print(document_info)


Document Index: 68, Average Similarity: 0.6935778260231018
Storite Nylon 50 cms Imported Travel Duffle Bag Multi- Pocket Sports Shoulder Bag for Women with Wet Pocket & Shoe Compartment Weekender Overnight Travel Luggage Bag (Green - 50 x 18 x 29 cm)
Document Index: 19, Average Similarity: 0.6761753559112549
Aristocrat Nova Laptop Backpack - Black/School bag
Document Index: 164, Average Similarity: 0.6652382612228394
Zureni Garbage Dustbin Bags (Large, 60 x 81 cm or 24" x 32") Drawstring Anti-drip Trash Waste Basket Bag for Kitchen Office Warehouse Pantry (Black, Green, Blue, 3 x 15 Pcs/Roll)
Document Index: 150, Average Similarity: 0.6552919149398804
Shalimar Compostable Garbage Bag Size 19 x 21 Inches (Medium) 45 Piece (3 Rolls) Dustbin Bag/Trash Bag - Green Color
Document Index: 60, Average Similarity: 0.6536121368408203
FEDRA Epoch Nylon 55 litres Waterproof Duffle Bag Strolley Duffle Bag- 2 Wheels - Luggage Bag - (Green White)
