In [4]:
import json
from elasticsearch import Elasticsearch

with open('../data/products_data.json', 'rt') as f_in:
    documents = json.load(f_in)

In [7]:
es_client = Elasticsearch('http://localhost:9200')

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "productName": {"type": "text"},
            "price": {
                "type": "double",  # For decimal numbers
                "fields": {
                    "keyword": {  # Additional keyword field for exact matches
                        "type": "keyword"
                    }
                }
            },
            "image": {"type": "text"},
            "category": {"type": "keyword"},
            "id": {"type": "keyword"},
            "productDescription": {
                "type": "text",
                "analyzer": "standard"
            },
            "availableColours": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword"
                    }
                }
            },
            "sizes": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword"
                    }
                }
            },
            "discount": {
                "type": "integer"  # For whole numbers
            }
        }
    }
}

# Delete existing index if it exists
index_name = "shop_products"
es_client.indices.delete(index=index_name, ignore_unavailable=True)

# Create new index with updated mappings
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'shop_products'})

In [8]:
from tqdm.auto import tqdm

for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████████████████████████████████████| 23/23 [00:00<00:00, 34.00it/s]


In [21]:
def elastic_search(query, category, max_price=None):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["productName^3", "productDescription", "availableColours", "sizes"],
                        "type": "best_fields"
                    }
                },
                "filter": [
                    {
                        "term": {
                            "category": category
                        }
                    }
                ]
            }
        }
    }
    
    # Add price range filter if max_price is provided
    if max_price is not None:
        search_query["query"]["bool"]["filter"].append({
            "range": {
                "price": {
                    "lte": max_price
                }
            }
        })

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs



In [22]:
# Example usage:
results = elastic_search(
    query="trousers",
    category="pants",
    max_price=9000
)

In [23]:
results

[{'id': 'CLT015',
  'productName': 'Tailored Wool Trousers',
  'price': 1499.99,
  'category': 'pants',
  'image': 'https://images.unsplash.com/photo-1594633313593-89bbf62c5c51?w=300',
  'productDescription': 'Classic tailored wool trousers\n\nMaterial Composition:\n- 100% Australian wool\n- Italian fabric\n- Satin pocket lining\n\nFit Details:\n- Mid-rise waist\n- Straight leg cut\n- 32-inch inseam\n- Tailored fit\n\nCare Instructions:\n- Dry clean only\n- Press with damp cloth\n- Store on trouser hanger\n- Brush after wearing\n\nDesign Features:\n- Front pleats\n- Side slant pockets\n- Back welt pockets\n- Extended tab closure\n\nStyling Tips:\n- Perfect for formal occasions\n- Pair with silk blouse\n- Great with loafers',
  'availableColours': ['Charcoal', 'Navy', 'Black', 'Grey'],
  'sizes': ['34', '36', '38', '40', '42', '44'],
  'discount': 10},
 {'id': 'CLT011',
  'productName': 'Wide-Leg Linen Trousers',
  'price': 999.99,
  'category': 'pants',
  'image': 'https://images.unspl

In [25]:
import pandas as pd
from tqdm.auto import tqdm

def evaluate_search_performance(ground_truth_data):
    relevance_total = []
    queries_with_no_results = []
    
    for query_data in tqdm(ground_truth_data):
        try:
            # Get the expected answer from ground truth
            expected_answer = str(query_data['answer']).strip()
            
            # Use the elastic_search function with parameters from your data
            results = elastic_search(
                query=query_data['question'],
                category=query_data['category']
            )
            
            # Check if we got any results
            if not results:
                queries_with_no_results.append(query_data['question'])
                relevance = [False] * 5
            else:
                # Create relevance list based on matching the answer
                # This handles both product ID and price answers
                relevance = []
                for result in results:
                    # Convert result values to strings for comparison
                    result_id = str(result['id']).strip()
                    result_price = str(result['price']).strip()
                    
                    # Check if either ID or price matches the expected answer
                    is_relevant = (result_id == expected_answer or 
                                 result_price == expected_answer)
                    relevance.append(is_relevant)
                
                # Pad with False if we have fewer than 5 results
                relevance.extend([False] * (5 - len(relevance)))
            
            relevance_total.append(relevance)
            
        except Exception as e:
            print(f"Error processing query: {query_data['question']}")
            print(f"Error details: {str(e)}")
            relevance_total.append([False] * 5)
    
    return relevance_total, queries_with_no_results

def hit_rate(relevance_total):
    """
    Calculate hit rate (recall@k) - fraction of queries where correct document 
    appears in the results list
    """
    if not relevance_total:
        return 0.0
    
    hits = sum(1 for relevance in relevance_total if True in relevance)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """
    Calculate Mean Reciprocal Rank - average of reciprocal ranks of correct documents
    """
    if not relevance_total:
        return 0.0
    
    total_score = 0.0
    for relevance in relevance_total:
        for rank, is_relevant in enumerate(relevance):
            if is_relevant:
                total_score += 1.0 / (rank + 1)
                break
    
    return total_score / len(relevance_total)

# Load and prepare ground truth data
df_ground_truth = pd.read_csv('../data/product_qa_groundtruth.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

# Evaluate search performance
relevance_total, failed_queries = evaluate_search_performance(ground_truth)

# Calculate metrics
hit_rate_score = hit_rate(relevance_total)
mrr_score = mrr(relevance_total)

# Print results with detailed information
print("\nSearch Performance Metrics:")
print(f"Number of queries evaluated: {len(ground_truth)}")
print(f"Hit Rate (Recall@k): {hit_rate_score:.4f}")
print(f"Mean Reciprocal Rank (MRR): {mrr_score:.4f}")

if failed_queries:
    print(f"\nQueries with no results ({len(failed_queries)}):")
    for query in failed_queries[:5]:
        print(f"- {query}")
    if len(failed_queries) > 5:
        print(f"... and {len(failed_queries) - 5} more")

# Save detailed results to CSV
results_df = pd.DataFrame({
    'Query': [q['question'] for q in ground_truth],
    'Expected_Answer': [q['answer'] for q in ground_truth],
    'Category': [q['category'] for q in ground_truth],
    'Found_In_Results': [True in rel for rel in relevance_total],
    'Position_If_Found': [
        next((i+1 for i, r in enumerate(rel) if r), None) 
        for rel in relevance_total
    ],
    'Product_ID': [q['product_id'] for q in ground_truth],
    'Product_Name': [q['product_name'] for q in ground_truth]
})

# Add analysis of different question types
results_df['Question_Type'] = results_df.apply(
    lambda x: 'ID Question' if 'ID' in x['Query'] 
    else 'Price Question' if 'cost' in x['Query'].lower() or 'price' in x['Query'].lower()
    else 'Other',
    axis=1
)

# Calculate performance by question type
print("\nPerformance by Question Type:")
for question_type in results_df['Question_Type'].unique():
    type_data = results_df[results_df['Question_Type'] == question_type]
    hit_rate = type_data['Found_In_Results'].mean()
    print(f"{question_type}: Hit Rate = {hit_rate:.4f}")

results_df.to_csv('search_evaluation_results.csv', index=False)
print("\nDetailed results saved to 'search_evaluation_results.csv'")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 115/115 [00:00<00:00, 258.37it/s]


Search Performance Metrics:
Number of queries evaluated: 115
Hit Rate (Recall@k): 0.4000
Mean Reciprocal Rank (MRR): 0.4000

Performance by Question Type:
ID Question: Hit Rate = 1.0000
Price Question: Hit Rate = 1.0000
Other: Hit Rate = 0.0000

Detailed results saved to 'search_evaluation_results.csv'





### Conclusions 

**Hit Rate (Recall@k): 0.4000** 
- 40% of all queries found the correct answer in the top 5 results

**Mean Reciprocal Rank (MRR)**
- 0.4000 Average position of the correct answer
- This means the correct answer, is  typically appearing around the 2nd or 3rd position, hence the  average of all these is 0.4

**ID Question: Hit Rate = 1.0000**
- The system is excellent at finding exact matches (IDs and prices)

**Other: Hit Rate = 0.0000**
- The system completely fails on "Other" type questions (0% hit rate) e.g Product features Colors,Sizes,Descriptions,General product information