In [20]:
from sentence_transformers import SentenceTransformer
import json

In [21]:
import json
from elasticsearch import Elasticsearch

# Load the product catalogue. The indexing loop further down iterates over
# `documents` and reads per-product keys, so this is expected to be a list of
# product dicts.
# JSON text is UTF-8; pass the encoding explicitly so the result does not
# depend on the platform's default locale encoding.
with open('../data/products_data.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [22]:
# Load the multi-qa MiniLM checkpoint; the following cells use it only to embed
# a sample question and inspect the embedding size.
# NOTE(review): `model` is rebound to 'all-MiniLM-L6-v2' in the indexing and
# search cells below, so this checkpoint is not the one used for the indexed
# vectors or the queries — confirm which checkpoint is intended.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

In [23]:
# Embed a sample shopper question so the shape of the encoder output can be inspected.
v = model.encode('I have 1000 rands what jean can i buy in the store')

In [24]:
# Length of the sample embedding (the recorded output is 384).
len(v)

384

In [25]:
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer

# Initialize the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')  # 384 dimensions

# Create the Elasticsearch client before it is needed: the indexing loop and
# the refresh call in this cell both use `es_client`, which the notebook
# otherwise assigns only in a later cell. Without this, running the notebook
# top to bottom on a fresh kernel stops here with a NameError.
es_client = Elasticsearch('http://localhost:9200')

def process_price_for_encoding(price):
    """Render a price as the text snippet that gets embedded as ``price_vector``.

    Args:
        price: The product price as an int or float, or a string holding a
            number (for example ``"499.5"``).

    Returns:
        str: ``"price <amount> Rands"`` with the amount shown to two decimals.

    Raises:
        TypeError: If ``price`` is not a number or string (for example ``None``).
        ValueError: If ``price`` is a string that does not hold a number.
    """
    # The ``.2f`` format spec rejects strings outright; going through float()
    # accepts numeric strings as well and yields the same text for int/float.
    amount = float(price)
    return f"price {amount:.2f} Rands"

# Embed the text fields of every product and index the enriched document.
# Indexing is best-effort: a product that fails is reported and skipped so one
# bad record does not abort the whole run; failures are summarised at the end.
failed_ids = []

for doc in tqdm(documents):
    try:
        # Encode product name
        doc['productName_vector'] = model.encode(doc['productName']).tolist()

        # Encode price (convert to text format first)
        price_text = process_price_for_encoding(doc['price'])
        doc['price_vector'] = model.encode(price_text).tolist()

        # Encode category
        doc['category_vector'] = model.encode(doc['category']).tolist()

        # Encode product description
        doc['productDescription_vector'] = model.encode(doc['productDescription']).tolist()

        # Every vector above is already a plain list (``.tolist()``) and the
        # remaining values come straight from ``json.load``, so the document is
        # JSON-serializable as it stands and needs no further conversion.
        es_client.index(
            index="shop_products",
            id=doc['id'],
            document=doc
        )

    except Exception as e:
        failed_ids.append(doc.get('id', 'unknown'))
        print(f"Error processing document {doc.get('id', 'unknown')}: {str(e)}")

# Make skipped products visible in one place instead of leaving them buried in
# the per-document messages above.
if failed_ids:
    print(f"{len(failed_ids)} of {len(documents)} documents were not indexed: {failed_ids}")

# Refresh the index to make the documents searchable immediately.
# The response of this call is the last expression of the cell, so it is what
# the cell displays as its output.
es_client.indices.refresh(index="shop_products")

100%|█████████████████████████████████████████████████████| 23/23 [00:01<00:00, 11.72it/s]


ObjectApiResponse({'_shards': {'total': 1, 'successful': 1, 'failed': 0}})

In [26]:
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
import pandas as pd

# Initialize the model and client
# The query encoder is the same checkpoint ('all-MiniLM-L6-v2') that the
# indexing cell uses to embed the documents; query and document vectors must
# come from the same model to be comparable.
model = SentenceTransformer('all-MiniLM-L6-v2')
# NOTE(review): assumes a local Elasticsearch node reachable without
# authentication — confirm before running elsewhere.
es_client = Elasticsearch('http://localhost:9200')
# Index queried by the search helpers below; same name the indexing loop writes to.
index_name = "shop_products"

def elastic_search_knn(field, vector, category=None, k=5, num_candidates=10000):
    """Run a kNN search against a single vector field of the product index.

    Args:
        field: Name of the vector field to search (e.g. ``'productName_vector'``).
        vector: Query embedding. Either a list of floats or any object that
            exposes ``tolist()``, such as the array returned by ``model.encode``.
        category: Optional category. When truthy, only documents whose
            ``category.keyword`` equals this value are considered.
        k: Number of nearest neighbours to return. Defaults to 5.
        num_candidates: Value sent as the kNN ``num_candidates`` option.
            Defaults to 10000.

    Returns:
        list[dict]: The ``_source`` of each hit, in the order Elasticsearch
        returned them.
    """
    # Send a plain list, as the indexing code does, rather than relying on the
    # client's serializer to understand array types.
    query_vector = vector.tolist() if hasattr(vector, 'tolist') else vector

    knn = {
        "field": field,
        "query_vector": query_vector,
        "k": k,
        "num_candidates": num_candidates
    }

    if category:
        knn["filter"] = {
            "term": {
                "category.keyword": category
            }
        }

    search_query = {
        "knn": knn,
        # Ask for exactly k hits: without an explicit size the search API
        # returns at most its default page of 10, which would truncate k > 10.
        "size": k,
        "_source": ["id", "productName", "category", "price", "productDescription", "availableColours", "sizes"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    return [hit['_source'] for hit in es_results['hits']['hits']]

def elastic_search_knn_combined(vector, category=None, size=5):
    """Rank products by a weighted blend of cosine similarities over three vector fields.

    Every document matched by the query is scored with a script that adds the
    cosine similarity between the query vector and ``productName_vector``
    (weight 0.4), ``productDescription_vector`` (weight 0.4) and
    ``category_vector`` (weight 0.2), plus a constant 1.0.

    Args:
        vector: Query embedding. Either a list of floats or any object that
            exposes ``tolist()``, such as the array returned by ``model.encode``.
        category: Optional category. When truthy, only documents whose
            ``category.keyword`` equals this value are returned.
        size: Number of hits to return. Defaults to 5.

    Returns:
        list[dict]: The ``_source`` of each hit, in the order Elasticsearch
        returned them.
    """
    # Script params travel in the JSON request body; send a plain list, as the
    # indexing code does, rather than an array object.
    query_vector = vector.tolist() if hasattr(vector, 'tolist') else vector

    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": [
                    {
                        "script_score": {
                            "query": {"match_all": {}},
                            "script": {
                                "source": """
                                    cosineSimilarity(params.query_vector, 'productName_vector') * 0.4 + 
                                    cosineSimilarity(params.query_vector, 'productDescription_vector') * 0.4 + 
                                    cosineSimilarity(params.query_vector, 'category_vector') * 0.2 + 
                                    1.0
                                """,
                                "params": {
                                    "query_vector": query_vector
                                }
                            }
                        }
                    }
                ]
            }
        },
        "_source": ["id", "productName", "category", "price", "productDescription", "availableColours", "sizes"]
    }

    if category:
        # Added as a bool filter next to the scored clause, so it restricts the
        # result set without taking part in the script score.
        search_query["query"]["bool"]["filter"] = {
            "term": {
                "category.keyword": category
            }
        }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    return [hit['_source'] for hit in es_results['hits']['hits']]

# Search functions for different vector fields
def _prepare_query(q):
    """Turn a ground-truth record into the ``(vector, category)`` pair used for searching.

    Args:
        q: Mapping with a ``'question'`` (text to embed) and a ``'category'`` key.

    Returns:
        tuple: The question embedding as a plain list, and the category, or
        ``None`` when the record carries no usable category.
    """
    question = q['question']
    category = q['category']

    # Records loaded through pandas carry float NaN for an empty CSV cell. NaN
    # is truthy, so it would otherwise be sent to Elasticsearch as a filter
    # value. NaN is the only value that is not equal to itself.
    if category != category:
        category = None

    embedding = model.encode(question)
    # Plain list, matching how the document vectors are prepared for indexing.
    vector = embedding.tolist() if hasattr(embedding, 'tolist') else embedding
    return vector, category


def productName_vector_search(q):
    """kNN search of the question embedding against ``productName_vector``."""
    vector, category = _prepare_query(q)
    return elastic_search_knn('productName_vector', vector, category)


def description_vector_search(q):
    """kNN search of the question embedding against ``productDescription_vector``."""
    vector, category = _prepare_query(q)
    return elastic_search_knn('productDescription_vector', vector, category)


def combined_vector_search(q):
    """Search with the weighted multi-field similarity score."""
    vector, category = _prepare_query(q)
    return elastic_search_knn_combined(vector, category)

# Evaluation functions
def hit_rate(relevance_total):
    """Fraction of queries that have at least one relevant result.

    Args:
        relevance_total: One sequence of relevance flags per query, ordered by
            rank (as built by ``evaluate``).

    Returns:
        float: Share of queries whose flags contain a truthy entry; ``0.0``
        when there are no queries at all (instead of dividing by zero).
    """
    if len(relevance_total) == 0:
        return 0.0

    # Truthiness is used here, as in ``mrr``, so both metrics agree on what
    # counts as a relevant result.
    queries_with_hit = sum(1 for flags in relevance_total if any(flags))
    return queries_with_hit / len(relevance_total)

def mrr(relevance_total):
    """Mean Reciprocal Rank over all queries.

    Each query contributes ``1 / rank`` of its first relevant result (ranks
    start at 1), or 0 when none of its results is relevant.

    Args:
        relevance_total: One sequence of relevance flags per query, ordered by
            rank (as built by ``evaluate``).

    Returns:
        float: Average reciprocal rank; ``0.0`` when there are no queries at
        all (instead of dividing by zero).
    """
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0
    for flags in relevance_total:
        # Only the first relevant position counts; later ones are ignored.
        total_score += next(
            (1 / rank for rank, relevant in enumerate(flags, start=1) if relevant),
            0.0,
        )
    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Score a search function against a set of labelled questions.

    Every ground-truth record is handed to ``search_function``; a returned hit
    counts as relevant when its ``id`` equals the record's ``product_id``.

    Returns:
        dict: ``hit_rate`` and ``mrr`` computed over all questions, plus
        ``detailed_results`` with one entry per question (the question, the
        expected product and each returned hit flagged as correct or not).
    """
    flags_per_query = []
    report = []

    for record in tqdm(ground_truth):
        target_id = record['product_id']
        hits = search_function(record)

        # One flag per returned hit, in ranking order.
        flags = [hit['id'] == target_id for hit in hits]
        flags_per_query.append(flags)

        # Keep the per-hit outcome alongside the question for later inspection.
        report.append({
            'question': record['question'],
            'expected_id': target_id,
            'expected_name': record['product_name'],
            'top_results': [
                {'id': hit['id'], 'name': hit['productName'], 'correct': is_match}
                for hit, is_match in zip(hits, flags)
            ],
        })

    summary = {
        'hit_rate': hit_rate(flags_per_query),
        'mrr': mrr(flags_per_query),
        'detailed_results': report,
    }
    return summary

# Load and prepare ground truth data
def load_ground_truth(file_path):
    """Read the ground-truth CSV and return it as a list of record dicts.

    The ``colors`` and ``sizes`` columns hold comma-separated text; each cell
    is split into a list of stripped items, and any cell that is not a string
    becomes an empty list. All other columns are returned as read.
    """
    def _split_cell(cell):
        # Non-string cells are mapped to an empty list.
        if not isinstance(cell, str):
            return []
        return [item.strip() for item in cell.split(',')]

    frame = pd.read_csv(file_path)
    for column in ('colors', 'sizes'):
        frame[column] = frame[column].apply(_split_cell)

    return frame.to_dict(orient='records')


In [27]:
# Read the labelled questions used to score each search strategy.
ground_truth = load_ground_truth('../data/product_qa_groundtruth.csv')

# Score every strategy on the same questions before reporting anything.
results_name = evaluate(ground_truth, productName_vector_search)
results_desc = evaluate(ground_truth, description_vector_search)
results_combined = evaluate(ground_truth, combined_vector_search)

# Report the two aggregate metrics per strategy, in the order they were run.
for heading, metrics in (
    ("Product Name Vector Search", results_name),
    ("Description Vector Search", results_desc),
    ("Combined Vector Search", results_combined),
):
    print(f"\n{heading}:")
    print(f"Hit Rate: {metrics['hit_rate']:.4f}")
    print(f"MRR: {metrics['mrr']:.4f}")

# Analyze failed cases
def print_failed_cases(results):
    """Print a short report of the queries whose expected product was not retrieved.

    A query counts as failed when none of its ``top_results`` is marked
    ``correct``. The number of failures is printed first, followed by the
    question, the expected product and the returned hits for at most the first
    five failures.
    """
    misses = []
    for entry in results['detailed_results']:
        found = False
        for candidate in entry['top_results']:
            if candidate['correct']:
                found = True
                break
        if not found:
            misses.append(entry)

    print(f"\nNumber of failed queries: {len(misses)}")

    # Only the first five failures are shown to keep the output short.
    for entry in misses[:5]:
        print(f"\nQuestion: {entry['question']}")
        print(f"Expected: {entry['expected_name']} ({entry['expected_id']})")
        print("Top results:")
        for candidate in entry['top_results']:
            print(f"- {candidate['name']} ({candidate['id']})")

# Inspect the combined strategy: lists the queries (first five at most) whose
# expected product was missing from the returned results.
print_failed_cases(results_combined)

100%|███████████████████████████████████████████████████| 115/115 [00:02<00:00, 56.63it/s]
100%|███████████████████████████████████████████████████| 115/115 [00:01<00:00, 58.66it/s]
100%|███████████████████████████████████████████████████| 115/115 [00:01<00:00, 60.62it/s]


Product Name Vector Search:
Hit Rate: 1.0000
MRR: 1.0000

Description Vector Search:
Hit Rate: 1.0000
MRR: 0.9957

Combined Vector Search:
Hit Rate: 1.0000
MRR: 1.0000

Number of failed queries: 0





### Observations and Conclusions

**Hit Rate (1.0000 for all approaches)**
- A hit rate of 1.0 (100%) means that every single query found the correct product within its top 5 results

**Mean Reciprocal Rank (MRR)**
- A score of 1.0000 (Product Name and Combined) means the correct product was consistently ranked #1 in the results
- Description Search's 0.9957 is still excellent: it means the correct product was occasionally ranked below first place. With 115 queries, this value is what a single query with the correct product in second place produces ((114 + 0.5) / 115 ≈ 0.9957), which is not a concern

**Failed Queries: 0**
- No queries failed to find their target product

#### It is clear that vector search is better than text search
  

