In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm

class SearchEvaluator:
    def __init__(self, es_client, index_name, model_name='all-MiniLM-L6-v2'):
        self.es_client = es_client
        self.index_name = index_name
        self.model = SentenceTransformer(model_name)
        
    def vector_search(self, query, category=None, max_price=None):
        """Combined vector search using multiple fields"""
        vector = self.model.encode(query)
        
        search_query = {
            "size": 5,
            "query": {
                "bool": {
                    "must": [
                        {
                            "script_score": {
                                "query": {"match_all": {}},
                                "script": {
                                    "source": """
                                        cosineSimilarity(params.query_vector, 'productName_vector') * 0.4 + 
                                        cosineSimilarity(params.query_vector, 'productDescription_vector') * 0.4 + 
                                        cosineSimilarity(params.query_vector, 'category_vector') * 0.2 + 
                                        1.0
                                    """,
                                    "params": {
                                        "query_vector": vector
                                    }
                                }
                            }
                        }
                    ]
                }
            }
        }
        
        # Add filters
        if category or max_price:
            filters = []
            if category:
                filters.append({"term": {"category.keyword": category}})
            if max_price:
                filters.append({"range": {"price": {"lte": max_price}}})
            search_query["query"]["bool"]["filter"] = filters
            
        results = self.es_client.search(index=self.index_name, body=search_query)
        return [hit['_source'] for hit in results['hits']['hits']]
    
    def text_search(self, query, category=None, max_price=None):
        """Text-based search using multi_match"""
        search_query = {
            "size": 5,
            "query": {
                "bool": {
                    "must": {
                        "multi_match": {
                            "query": query,
                            "fields": ["productName^3", "productDescription", "availableColours", "sizes"],
                            "type": "best_fields"
                        }
                    }
                }
            }
        }
        
        # Add filters
        if category or max_price:
            filters = []
            if category:
                filters.append({"term": {"category.keyword": category}})
            if max_price:
                filters.append({"range": {"price": {"lte": max_price}}})
            search_query["query"]["bool"]["filter"] = filters
            
        results = self.es_client.search(index=self.index_name, body=search_query)
        return [hit['_source'] for hit in results['hits']['hits']]
    
    def evaluate_search_method(self, ground_truth, search_method):
        """Evaluate a search method using multiple metrics"""
        relevance_total = []
        detailed_results = []
        
        for query_data in tqdm(ground_truth):
            try:
                # Get search results
                results = search_method(
                    query=query_data['question'],
                    category=query_data['category'],
                    max_price=query_data.get('max_price')
                )
                
                # Check relevance against ground truth
                expected_id = str(query_data['product_id'])
                relevance = [str(r['id']) == expected_id for r in results]
                relevance.extend([False] * (5 - len(relevance)))  # Pad to 5 results
                relevance_total.append(relevance)
                
                # Store detailed results
                detailed_results.append({
                    'query': query_data['question'],
                    'category': query_data['category'],
                    'expected_id': expected_id,
                    'expected_name': query_data['product_name'],
                    'results': [
                        {
                            'id': r['id'],
                            'name': r['productName'],
                            'correct': str(r['id']) == expected_id
                        } for r in results
                    ]
                })
                
            except Exception as e:
                print(f"Error processing query: {query_data['question']}")
                print(f"Error details: {str(e)}")
                relevance_total.append([False] * 5)
        
        # Calculate metrics
        metrics = {
            'hit_rate': self.calculate_hit_rate(relevance_total),
            'mrr': self.calculate_mrr(relevance_total),
            'precision_at_k': self.calculate_precision_at_k(relevance_total),
            'ndcg': self.calculate_ndcg(relevance_total),
            'detailed_results': detailed_results
        }
        
        return metrics
    
    @staticmethod
    def calculate_hit_rate(relevance_total):
        """Calculate hit rate (recall@k)"""
        return sum(1 for rel in relevance_total if True in rel) / len(relevance_total)
    
    @staticmethod
    def calculate_mrr(relevance_total):
        """Calculate Mean Reciprocal Rank"""
        total_score = 0.0
        for relevance in relevance_total:
            for rank, is_relevant in enumerate(relevance):
                if is_relevant:
                    total_score += 1.0 / (rank + 1)
                    break
        return total_score / len(relevance_total)
    
    @staticmethod
    def calculate_precision_at_k(relevance_total, k=5):
        """Calculate Precision@k"""
        return sum(sum(rel[:k]) / k for rel in relevance_total) / len(relevance_total)
    
    @staticmethod
    def calculate_ndcg(relevance_total):
        """Calculate Normalized Discounted Cumulative Gain"""
        def dcg(rel):
            return sum((2**r - 1) / (log2(i + 2)) for i, r in enumerate(rel))
        
        from math import log2
        ndcg_scores = []
        for rel in relevance_total:
            dcg_score = dcg(rel)
            idcg_score = dcg(sorted(rel, reverse=True))
            ndcg_scores.append(dcg_score / idcg_score if idcg_score > 0 else 0)
        return sum(ndcg_scores) / len(ndcg_scores)

def run_comparison():
    # Initialize components
    es_client = Elasticsearch('http://localhost:9200')
    evaluator = SearchEvaluator(es_client, "shop_products")
    
    # Load ground truth data
    ground_truth = pd.read_csv('../data/product_qa_groundtruth.csv').to_dict(orient='records')
    
    # Evaluate both methods
    vector_results = evaluator.evaluate_search_method(ground_truth, evaluator.vector_search)
    text_results = evaluator.evaluate_search_method(ground_truth, evaluator.text_search)
    
    # Print comparison results
    print("\nVector Search Results:")
    print(f"Hit Rate: {vector_results['hit_rate']:.4f}")
    print(f"MRR: {vector_results['mrr']:.4f}")
    print(f"Precision@5: {vector_results['precision_at_k']:.4f}")
    print(f"NDCG: {vector_results['ndcg']:.4f}")
    
    print("\nText Search Results:")
    print(f"Hit Rate: {text_results['hit_rate']:.4f}")
    print(f"MRR: {text_results['mrr']:.4f}")
    print(f"Precision@5: {text_results['precision_at_k']:.4f}")
    print(f"NDCG: {text_results['ndcg']:.4f}")
    
    # Analyze results by query type
    analyze_results_by_query_type(vector_results['detailed_results'], "Vector Search")
    analyze_results_by_query_type(text_results['detailed_results'], "Text Search")
    
    return vector_results, text_results

def analyze_results_by_query_type(detailed_results, method_name):
    """Analyze performance by query type"""
    df = pd.DataFrame(detailed_results)
    df['query_type'] = df['query'].apply(lambda x: 
        'price' if any(word in x.lower() for word in ['price', 'cost', 'expensive', 'cheap']) 
        else 'color' if any(word in x.lower() for word in ['color', 'colour']) 
        else 'size' if any(word in x.lower() for word in ['size', 'fit']) 
        else 'general'
    )
    
    print(f"\n{method_name} Performance by Query Type:")
    for query_type in df['query_type'].unique():
        type_data = df[df['query_type'] == query_type]
        success_rate = sum(1 for results in type_data['results'] 
                          if any(r['correct'] for r in results)) / len(type_data)
        print(f"{query_type}: Success Rate = {success_rate:.4f}")

if __name__ == "__main__":
    vector_results, text_results = run_comparison()

  from .autonotebook import tqdm as notebook_tqdm
100%|███████████████████████████████████████████████████| 115/115 [00:01<00:00, 59.13it/s]
100%|██████████████████████████████████████████████████| 115/115 [00:00<00:00, 300.47it/s]


Vector Search Results:
Hit Rate: 1.0000
MRR: 1.0000
Precision@5: 0.2000
NDCG: 1.0000

Text Search Results:
Hit Rate: 1.0000
MRR: 1.0000
Precision@5: 0.2000
NDCG: 1.0000

Vector Search Performance by Query Type:
general: Success Rate = 1.0000
price: Success Rate = 1.0000
color: Success Rate = 1.0000
size: Success Rate = 1.0000

Text Search Performance by Query Type:
general: Success Rate = 1.0000
price: Success Rate = 1.0000
color: Success Rate = 1.0000
size: Success Rate = 1.0000





In [2]:

import pandas as pd
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm
import time
from typing import List, Dict, Any
import numpy as np

class AdvancedSearchEvaluator:
    def __init__(self, es_client, index_name, model_name='all-MiniLM-L6-v2'):
        self.es_client = es_client
        self.index_name = index_name
        self.model = SentenceTransformer(model_name)
    
    def vector_search(self, query: str, category: str = None, max_price: float = None) -> tuple[List[Dict], float]:
        """Vector search with timing measurement"""
        start_time = time.time()
        vector = self.model.encode(query)
        
        search_query = {
            "size": 5,
            "query": {
                "bool": {
                    "must": [
                        {
                            "script_score": {
                                "query": {"match_all": {}},
                                "script": {
                                    "source": """
                                        cosineSimilarity(params.query_vector, 'productName_vector') * 0.4 + 
                                        cosineSimilarity(params.query_vector, 'productDescription_vector') * 0.4 + 
                                        cosineSimilarity(params.query_vector, 'category_vector') * 0.2 + 
                                        1.0
                                    """,
                                    "params": {
                                        "query_vector": vector
                                    }
                                }
                            }
                        }
                    ]
                }
            }
        }
        
        if category or max_price:
            filters = []
            if category:
                filters.append({"term": {"category.keyword": category}})
            if max_price:
                filters.append({"range": {"price": {"lte": max_price}}})
            search_query["query"]["bool"]["filter"] = filters
        
        results = self.es_client.search(index=self.index_name, body=search_query)
        search_time = time.time() - start_time
        
        return [hit['_source'] for hit in results['hits']['hits']], search_time
    
    def text_search(self, query: str, category: str = None, max_price: float = None) -> tuple[List[Dict], float]:
        """Text search with timing measurement"""
        start_time = time.time()
        
        search_query = {
            "size": 5,
            "query": {
                "bool": {
                    "must": {
                        "multi_match": {
                            "query": query,
                            "fields": ["productName^3", "productDescription", "availableColours", "sizes"],
                            "type": "best_fields",
                            "fuzziness": "AUTO"
                        }
                    }
                }
            }
        }
        
        if category or max_price:
            filters = []
            if category:
                filters.append({"term": {"category.keyword": category}})
            if max_price:
                filters.append({"range": {"price": {"lte": max_price}}})
            search_query["query"]["bool"]["filter"] = filters
        
        results = self.es_client.search(index=self.index_name, body=search_query)
        search_time = time.time() - start_time
        
        return [hit['_source'] for hit in results['hits']['hits']], search_time

    def evaluate_search_method(self, ground_truth: List[Dict], search_method, method_name: str) -> Dict[str, Any]:
        """Comprehensive evaluation of a search method"""
        metrics = {
            'hit_rate': [], 
            'mrr': [], 
            'precision': [],
            'search_times': [],
            'query_coverage': [],
            'results_by_type': {}
        }
        
        query_types = ['price', 'color', 'size', 'general']
        for qt in query_types:
            metrics['results_by_type'][qt] = {'success': 0, 'total': 0}
        
        detailed_results = []
        
        for query_data in tqdm(ground_truth):
            try:
                # Get search results and timing
                results, search_time = search_method(
                    query=query_data['question'],
                    category=query_data['category'],
                    max_price=query_data.get('max_price')
                )
                
                # Record search time
                metrics['search_times'].append(search_time)
                
                # Calculate relevance
                expected_id = str(query_data['product_id'])
                relevance = [str(r['id']) == expected_id for r in results]
                
                # Record metrics
                metrics['hit_rate'].append(1 if True in relevance else 0)
                
                # MRR calculation
                mrr_score = 0
                for rank, is_relevant in enumerate(relevance):
                    if is_relevant:
                        mrr_score = 1.0 / (rank + 1)
                        break
                metrics['mrr'].append(mrr_score)
                
                # Precision calculation
                metrics['precision'].append(sum(relevance) / len(results) if results else 0)
                
                # Query coverage (percentage of results returned)
                metrics['query_coverage'].append(len(results) / 5)
                
                # Query type analysis
                query_type = self._determine_query_type(query_data['question'])
                metrics['results_by_type'][query_type]['total'] += 1
                if any(relevance):
                    metrics['results_by_type'][query_type]['success'] += 1
                
                # Store detailed results
                detailed_results.append({
                    'query': query_data['question'],
                    'query_type': query_type,
                    'expected_id': expected_id,
                    'found_correct': any(relevance),
                    'rank_if_found': next((i+1 for i, r in enumerate(relevance) if r), None),
                    'search_time': search_time,
                    'num_results': len(results)
                })
                
            except Exception as e:
                print(f"Error processing query: {query_data['question']}")
                print(f"Error details: {str(e)}")
        
        # Calculate final metrics
        final_metrics = {
            'method_name': method_name,
            'hit_rate': np.mean(metrics['hit_rate']),
            'mrr': np.mean(metrics['mrr']),
            'precision': np.mean(metrics['precision']),
            'avg_search_time': np.mean(metrics['search_times']),
            'query_coverage': np.mean(metrics['query_coverage']),
            'performance_by_type': {
                qt: {
                    'success_rate': metrics['results_by_type'][qt]['success'] / metrics['results_by_type'][qt]['total']
                    if metrics['results_by_type'][qt]['total'] > 0 else 0,
                    'sample_size': metrics['results_by_type'][qt]['total']
                }
                for qt in query_types
            },
            'detailed_results': detailed_results
        }
        
        return final_metrics

    @staticmethod
    def _determine_query_type(query: str) -> str:
        """Determine the type of query"""
        query = query.lower()
        if any(word in query for word in ['price', 'cost', 'expensive', 'cheap', 'rand', '$']):
            return 'price'
        elif any(word in query for word in ['color', 'colour', 'red', 'blue', 'black', 'white']):
            return 'color'
        elif any(word in query for word in ['size', 'fit', 'small', 'large', 'medium']):
            return 'size'
        return 'general'

def print_comparison_results(vector_metrics: Dict, text_metrics: Dict):
    """Print detailed comparison of search methods"""
    print("\n=== Search Methods Comparison ===\n")
    
    # Create comparison table
    metrics_table = pd.DataFrame({
        'Metric': ['Hit Rate', 'MRR', 'Precision', 'Avg Search Time (s)', 'Query Coverage'],
        'Vector Search': [
            vector_metrics['hit_rate'],
            vector_metrics['mrr'],
            vector_metrics['precision'],
            vector_metrics['avg_search_time'],
            vector_metrics['query_coverage']
        ],
        'Text Search': [
            text_metrics['hit_rate'],
            text_metrics['mrr'],
            text_metrics['precision'],
            text_metrics['avg_search_time'],
            text_metrics['query_coverage']
        ]
    })
    
    print("Overall Performance Metrics:")
    print(metrics_table.round(4))
    
    print("\nPerformance by Query Type:")
    for query_type in ['price', 'color', 'size', 'general']:
        print(f"\n{query_type.title()} Queries:")
        vector_perf = vector_metrics['performance_by_type'][query_type]
        text_perf = text_metrics['performance_by_type'][query_type]
        
        type_table = pd.DataFrame({
            'Metric': ['Success Rate', 'Sample Size'],
            'Vector Search': [
                vector_perf['success_rate'],
                vector_perf['sample_size']
            ],
            'Text Search': [
                text_perf['success_rate'],
                text_perf['sample_size']
            ]
        })
        print(type_table.round(4))
    
    # Provide recommendations
    print("\n=== Recommendations ===")
    
    # Compare overall performance
    if vector_metrics['hit_rate'] > text_metrics['hit_rate']:
        better_method = "Vector Search"
        margin = vector_metrics['hit_rate'] - text_metrics['hit_rate']
    else:
        better_method = "Text Search"
        margin = text_metrics['hit_rate'] - vector_metrics['hit_rate']
    
    print(f"\n1. Overall Performance: {better_method} performs better by {margin:.2%} in hit rate.")
    
    # Compare search times
    time_diff = vector_metrics['avg_search_time'] - text_metrics['avg_search_time']
    faster_method = "Vector Search" if time_diff < 0 else "Text Search"
    print(f"2. Speed: {faster_method} is faster by {abs(time_diff):.4f} seconds on average.")
    
    # Analyze query type strengths
    print("\n3. Query Type Analysis:")
    for query_type in ['price', 'color', 'size', 'general']:
        vector_rate = vector_metrics['performance_by_type'][query_type]['success_rate']
        text_rate = text_metrics['performance_by_type'][query_type]['success_rate']
        better = "Vector Search" if vector_rate > text_rate else "Text Search"
        diff = abs(vector_rate - text_rate)
        if diff > 0.05:  # Only mention significant differences
            print(f"   - For {query_type} queries: {better} performs better by {diff:.2%}")
    
    # Final recommendation
    print("\n4. Final Recommendation:")
    if vector_metrics['hit_rate'] > text_metrics['hit_rate'] and vector_metrics['avg_search_time'] <= text_metrics['avg_search_time']:
        print("   Use Vector Search - Better accuracy and comparable or better speed")
    elif text_metrics['hit_rate'] > vector_metrics['hit_rate'] and text_metrics['avg_search_time'] <= vector_metrics['avg_search_time']:
        print("   Use Text Search - Better accuracy and comparable or better speed")
    else:
        print(f"   Consider trade-off:")
        print(f"   - Vector Search: {vector_metrics['hit_rate']:.2%} accuracy, {vector_metrics['avg_search_time']:.4f}s average time")
        print(f"   - Text Search: {text_metrics['hit_rate']:.2%} accuracy, {text_metrics['avg_search_time']:.4f}s average time")
        if abs(vector_metrics['hit_rate'] - text_metrics['hit_rate']) < 0.05:
            print("   Recommendation: Choose based on your priority (speed vs. accuracy)")

def run_advanced_comparison():
    """Run the advanced comparison"""
    # Initialize components
    es_client = Elasticsearch('http://localhost:9200')
    evaluator = AdvancedSearchEvaluator(es_client, "shop_products")
    
    # Load ground truth data
    ground_truth = pd.read_csv('../data/product_qa_groundtruth.csv').to_dict(orient='records')
    
    # Evaluate both methods
    vector_metrics = evaluator.evaluate_search_method(
        ground_truth, evaluator.vector_search, "Vector Search"
    )
    text_metrics = evaluator.evaluate_search_method(
        ground_truth, evaluator.text_search, "Text Search"
    )
    
    # Print comparison results
    print_comparison_results(vector_metrics, text_metrics)
    
    return vector_metrics, text_metrics

if __name__ == "__main__":
    vector_metrics, text_metrics = run_advanced_comparison()

100%|███████████████████████████████████████████████████| 115/115 [00:01<00:00, 60.57it/s]
100%|███████████████████████████████████████████████████| 115/115 [00:02<00:00, 38.51it/s]


=== Search Methods Comparison ===

Overall Performance Metrics:
                Metric  Vector Search  Text Search
0             Hit Rate         1.0000       1.0000
1                  MRR         1.0000       1.0000
2            Precision         0.4783       0.5232
3  Avg Search Time (s)         0.0163       0.0257
4       Query Coverage         0.5304       0.4904

Performance by Query Type:

Price Queries:
         Metric  Vector Search  Text Search
0  Success Rate            1.0          1.0
1   Sample Size           23.0         23.0

Color Queries:
         Metric  Vector Search  Text Search
0  Success Rate            1.0          1.0
1   Sample Size           30.0         30.0

Size Queries:
         Metric  Vector Search  Text Search
0  Success Rate            1.0          1.0
1   Sample Size           15.0         15.0

General Queries:
         Metric  Vector Search  Text Search
0  Success Rate            1.0          1.0
1   Sample Size           47.0         47.0

=== Rec




In [3]:
import os
import time
import uuid
from typing import Dict, Any, Tuple, List
import logging
import json
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
from groq import Groq
from dotenv import load_dotenv

logging.basicConfig(level=logging.INFO)
load_dotenv()

class EcommerceRAG:
    def __init__(self):
        self.model_name = os.getenv('MODEL_NAME', 'all-MiniLM-L6-v2')
        self.es_url = os.getenv('ELASTICSEARCH_URL', 'http://localhost:9200')
        self.groq_api_key = os.getenv('GROQ_API_KEY')
        self.index_name = "shop_products"
        
        self.embedding_model = SentenceTransformer(self.model_name)
        self.es_client = Elasticsearch(self.es_url)
        self.llm_client = Groq()
        
    def vector_search(self, query: str, category: str = None, max_price: float = None) -> List[Dict]:
        """Enhanced vector search for product information"""
        vector = self.embedding_model.encode(query)
        
        search_query = {
            "size": 5,
            "query": {
                "bool": {
                    "must": [
                        {
                            "script_score": {
                                "query": {"match_all": {}},
                                "script": {
                                    "source": """
                                        cosineSimilarity(params.query_vector, 'productName_vector') * 0.4 + 
                                        cosineSimilarity(params.query_vector, 'productDescription_vector') * 0.4 + 
                                        cosineSimilarity(params.query_vector, 'category_vector') * 0.2 + 
                                        1.0
                                    """,
                                    "params": {
                                        "query_vector": vector
                                    }
                                }
                            }
                        }
                    ]
                }
            }
        }
        
        if category or max_price:
            filters = []
            if category:
                filters.append({"term": {"category.keyword": category}})
            if max_price:
                filters.append({"range": {"price": {"lte": max_price}}})
            search_query["query"]["bool"]["filter"] = filters
        
        results = self.es_client.search(index=self.index_name, body=search_query)
        return [hit['_source'] for hit in results['hits']['hits']]

    def build_product_context(self, products: List[Dict]) -> str:
        """Builds a context string from retrieved products"""
        context_parts = []
        for product in products:
            context = f"""
Product: {product['productName']}
ID: {product['id']}
Price: ${product['price']}
Category: {product['category']}
Description: {product['productDescription']}
Available Colors: {', '.join(product['availableColours'])}
Available Sizes: {', '.join(product['sizes'])}
"""
            if product.get('discount'):
                context += f"Discount: {product['discount']}%\n"
            context_parts.append(context.strip())
        
        return "\n\n".join(context_parts)

    def build_prompt(self, query: str, search_results: List[Dict]) -> str:
        """Builds a prompt for the e-commerce assistant"""
        prompt_template = """
You are a knowledgeable and helpful e-commerce shopping assistant. Your role is to help customers find products and answer questions about them based on the CONTEXT provided.

Guidelines:
- Be concise and friendly in your responses
- Include specific product details (name, price, colors, sizes) when relevant
- If asked about price, always mention the exact price and any discounts
- For sizing questions, mention available sizes and any fit details from the description
- If the customer asks about something not in the CONTEXT, politely explain that you can only provide information about available products
- If multiple relevant products are found, briefly mention alternatives

Customer Question: {question}

Available Product Information:
{context}

Please provide a helpful response based solely on the provided product information.
""".strip()
        
        context = self.build_product_context(search_results)
        return prompt_template.format(question=query, context=context)

    def generate_response(self, prompt: str) -> Tuple[str, Dict[str, Any], float]:
        """Generates a response using the LLM"""
        start_time = time.time()
        response = self.llm_client.chat.completions.create(
            model='llama-3.1-70b-versatile',
            messages=[{"role": "user", "content": prompt}]
        )
        response_time = time.time() - start_time
        
        return (
            response.choices[0].message.content,
            response.usage.to_dict(),
            response_time
        )

    def handle_query(self, query: str, category: str = None, max_price: float = None) -> Dict[str, Any]:
        """Main method to handle a customer query"""
        conversation_id = str(uuid.uuid4())
        
        # Get relevant products
        search_results = self.vector_search(query, category, max_price)
        
        # Build and send prompt to LLM
        prompt = self.build_prompt(query, search_results)
        answer, tokens, response_time = self.generate_response(prompt)
        
        # Prepare response data
        response_data = {
            "id": conversation_id,
            "question": query,
            "answer": answer,
            "relevant_products": [
                {
                    "id": product["id"],
                    "name": product["productName"],
                    "price": product["price"]
                }
                for product in search_results
            ],
            "response_time": response_time,
            "tokens": tokens,
            "total_products_found": len(search_results)
        }
        
        return response_data

    def save_conversation(self, conversation_data: Dict[str, Any]) -> None:
        """Save conversation to database"""
        # Implement your database saving logic here
        pass

    def save_feedback(self, conversation_id: str, feedback: int) -> Dict[str, Any]:
        """Save user feedback"""
        try:
            # Implement your feedback saving logic here
            return {"status": "success", "message": "Feedback saved successfully"}
        except Exception as e:
            logging.error(f"Error saving feedback: {str(e)}")
            return {"status": "error", "message": str(e)}

def main():
    # Initialize the RAG system
    rag_system = EcommerceRAG()
    
    # Example usage
    while True:
        query = input("\nWhat would you like to know about our products? (type 'exit' to quit): ")
        if query.lower() == 'exit':
            break
            
        # Optional filters
        category = input("Specify category (press Enter to skip): ").strip() or None
        max_price = input("Maximum price (press Enter to skip): ").strip()
        max_price = float(max_price) if max_price else None
        
        # Process query
        result = rag_system.handle_query(query, category, max_price)
        
        # Display results
        print("\nAnswer:", result["answer"])
        print("\nRelevant Products:")
        for product in result["relevant_products"]:
            print(f"- {product['name']} (${product['price']}) [ID: {product['id']}]")
        
        print(f"\nResponse Time: {result['response_time']:.2f} seconds")
        print(f"Total Tokens Used: {result['tokens']['total_tokens']}")
        
        # Get feedback
        feedback = input("\nWas this response helpful? (1: Yes, -1: No): ")
        if feedback in ['1', '-1']:
            rag_system.save_feedback(result['id'], int(feedback))

if __name__ == "__main__":
    main()

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: multi-qa-MiniLM-L6-cos-v1
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu



What would you like to know about our products? (type 'exit' to quit):  hey i have 900 what shoes can i buy 
Specify category (press Enter to skip):  shoes
Maximum price (press Enter to skip):  1000


Batches: 100%|██████████████████████████████████████████████| 1/1 [00:00<00:00, 67.31it/s]
INFO:elastic_transport.transport:POST http://localhost:9200/shop_products/_search [status:200 duration:0.004s]
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"



Answer: With a budget of $900, you have plenty of options. One great choice is our Platform Canvas Sneakers (CLT012) which are currently discounted to $559.99 (20% off the original price of $699.99).

These trendy sneakers come in four stylish colors: White, Black, Navy, and Red. They're available in sizes UK4 to UK8, with a regular fit and a comfortable 4cm platform height. 

If you're looking for something similar, I don't have other options to suggest at the moment, but I highly recommend checking these out. Would you like to know more about this product or would you like to proceed with the purchase?

Relevant Products:
- Platform Canvas Sneakers ($699.99) [ID: CLT012]

Response Time: 1.44 seconds
Total Tokens Used: 522



Was this response helpful? (1: Yes, -1: No):  1


KeyboardInterrupt: Interrupted by user

In [8]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from groq import Groq
import time
from concurrent.futures import ThreadPoolExecutor
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from typing import List, Dict, Any, Tuple
import logging
import os
from dotenv import load_dotenv

logging.basicConfig(level=logging.INFO)
load_dotenv()

class GroqModelEvaluator:
    def __init__(self, embedding_model_name: str = 'all-MiniLM-L6-v2'):
        self.groq_client = Groq(api_key=os.getenv('GROQ_API_KEY'))
        self.embedding_model = SentenceTransformer(embedding_model_name)
        self.available_models = [
            'gemma-7b-it',
            'llama-3.1-70b-versatile',
            'llama-3.2-3b-preview',
            'llama3-70b-8192',
            'mixtral-8x7b-32768'
        ]
        
    def build_prompt(self, query: str, product_info: Dict) -> str:
        """Builds a consistent prompt for all models"""
        prompt_template = """
Given the following product information, please answer the customer's question.
Use only the information provided in the product details.

Product Information:
{product_info}

Customer Question: {question}

Please provide a clear and concise answer based solely on the above product information.
""".strip()
        
        product_details = f"""
Name: {product_info['productName']}
Price: ${product_info['price']}
Category: {product_info['category']}
Description: {product_info['productDescription']}
Available Colors: {', '.join(product_info['availableColours'])}
Available Sizes: {', '.join(product_info['sizes'])}
""".strip()
        
        return prompt_template.format(
            product_info=product_details,
            question=query
        )

    def get_model_response(self, prompt: str, model_name: str) -> Tuple[str, float]:
        """Get response from specific Groq model with timing"""
        start_time = time.time()
        try:
            response = self.groq_client.chat.completions.create(
                model=model_name,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=512
            )
            response_time = time.time() - start_time
            return response.choices[0].message.content, response_time
        except Exception as e:
            logging.error(f"Error with model {model_name}: {str(e)}")
            return "", time.time() - start_time

    def compute_similarity(self, text1: str, text2: str) -> float:
        """Compute cosine similarity between two texts"""
        # Encode texts to vectors
        vec1 = self.embedding_model.encode([text1])[0]
        vec2 = self.embedding_model.encode([text2])[0]
        
        # Reshape vectors for cosine_similarity
        vec1 = vec1.reshape(1, -1)
        vec2 = vec2.reshape(1, -1)
        
        return cosine_similarity(vec1, vec2)[0][0]

    def evaluate_model(self, model_name: str, ground_truth: List[Dict]) -> Dict[str, List]:
        """Evaluate a single model on ground truth data"""
        results = {
            'questions': [],
            'original_answers': [],
            'model_answers': [],
            'similarities': [],
            'response_times': []
        }
        
        for item in tqdm(ground_truth, desc=f"Evaluating {model_name}"):
            prompt = self.build_prompt(item['question'], item['product'])
            model_answer, response_time = self.get_model_response(prompt, model_name)
            
            if model_answer:  # Only process if we got a valid response
                similarity = self.compute_similarity(item['answer'], model_answer)
                
                results['questions'].append(item['question'])
                results['original_answers'].append(item['answer'])
                results['model_answers'].append(model_answer)
                results['similarities'].append(similarity)
                results['response_times'].append(response_time)
        
        return results

    def evaluate_all_models(self, ground_truth: List[Dict]) -> Dict[str, Dict]:
        """Evaluate all models and return combined results"""
        all_results = {}
        
        for model_name in self.available_models:
            logging.info(f"Starting evaluation of {model_name}")
            results = self.evaluate_model(model_name, ground_truth)
            all_results[model_name] = results
            
            # Save intermediate results
            self.save_results(model_name, results)
            
        return all_results

    def save_results(self, model_name: str, results: Dict[str, List]) -> None:
        """Save evaluation results to CSV"""
        df = pd.DataFrame({
            'question': results['questions'],
            'original_answer': results['original_answers'],
            'model_answer': results['model_answers'],
            'similarity': results['similarities'],
            'response_time': results['response_times']
        })
        
        df.to_csv(f"evaluation_results_{model_name}.csv", index=False)

    def visualize_results(self, all_results: Dict[str, Dict]) -> None:
        """Create visualization of model comparisons"""
        plt.figure(figsize=(15, 10))
        
        # Plot similarity distributions
        plt.subplot(2, 1, 1)
        for model_name, results in all_results.items():
            sns.kdeplot(results['similarities'], label=model_name)
        plt.title('Distribution of Cosine Similarities Across Models')
        plt.xlabel('Cosine Similarity')
        plt.ylabel('Density')
        plt.legend()
        
        # Plot average response times
        plt.subplot(2, 1, 2)
        model_names = []
        avg_times = []
        avg_similarities = []
        
        for model_name, results in all_results.items():
            model_names.append(model_name)
            avg_times.append(np.mean(results['response_times']))
            avg_similarities.append(np.mean(results['similarities']))
        
        x = np.arange(len(model_names))
        width = 0.35
        
        plt.bar(x - width/2, avg_times, width, label='Avg Response Time (s)')
        plt.bar(x + width/2, avg_similarities, width, label='Avg Similarity')
        plt.xticks(x, model_names, rotation=45)
        plt.title('Average Response Time and Similarity by Model')
        plt.legend()
        
        plt.tight_layout()
        plt.savefig('model_comparison_results.png')
        
        # Print summary statistics
        print("\nModel Performance Summary:")
        for model_name in model_names:
            results = all_results[model_name]
            print(f"\n{model_name}:")
            print(f"Average Similarity: {np.mean(results['similarities']):.3f}")
            print(f"Average Response Time: {np.mean(results['response_times']):.3f}s")
            print(f"90th Percentile Similarity: {np.percentile(results['similarities'], 90):.3f}")

def main():
    # Load your ground truth data
    ground_truth_data = pd.read_csv('../data/product_qa_groundtruth.csv').to_dict('records')
    
    # Initialize evaluator
    evaluator = GroqModelEvaluator()
    
    # Run evaluation
    results = evaluator.evaluate_all_models(ground_truth_data)
    
    # Visualize results
    evaluator.visualize_results(results)

if __name__ == "__main__":
    main()

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:root:Starting evaluation of gemma-7b-it
Evaluating gemma-7b-it:   0%|                                     | 0/115 [00:00<?, ?it/s]


KeyError: 'product'

In [7]:
!pip install   matplotlib  seaborn pandas 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting seaborn
  Obtaining dependency information for seaborn from https://files.pythonhosted.org/packages/83/11/00d3c3dfc25ad54e731d91449895a79e4bf2384dc3ac01809010ba88f6d5/seaborn-0.13.2-py3-none-any.whl.metadata
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m[31m64.4 MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: seaborn
Successfully installed seaborn-0.13.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from groq import Groq
import time
from concurrent.futures import ThreadPoolExecutor
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from typing import List, Dict, Any, Tuple
import logging
import os
from dotenv import load_dotenv

logging.basicConfig(level=logging.INFO)
load_dotenv()

class GroqModelEvaluator:
    def __init__(self, embedding_model_name: str = 'all-MiniLM-L6-v2'):
        self.groq_client = Groq(api_key=os.getenv('GROQ_API_KEY'))
        self.embedding_model = SentenceTransformer(embedding_model_name)
        self.available_models = [
            'gemma-7b-it',
            'llama-3.1-70b-versatile',
            'llama-3.2-3b-preview',
            'llama3-70b-8192',
            'mixtral-8x7b-32768'
        ]
    
    def inspect_ground_truth(self, ground_truth: List[Dict]) -> None:
        """Debug helper to inspect ground truth data structure"""
        print("\nGround Truth Data Inspection:")
        print(f"Number of items: {len(ground_truth)}")
        if len(ground_truth) > 0:
            print("\nKeys in first item:", ground_truth[0].keys())
            print("\nFirst item content:")
            for key, value in ground_truth[0].items():
                print(f"{key}: {value}")
    
    def build_prompt(self, item: Dict) -> str:
        """Builds a consistent prompt for all models using flat data structure"""
        # Convert comma-separated strings to lists
        colors = [c.strip() for c in item['colors'].split(',')] if isinstance(item['colors'], str) else item['colors']
        sizes = [s.strip() for s in item['sizes'].split(',')] if isinstance(item['sizes'], str) else item['sizes']
        
        product_details = f"""
Product ID: {item['product_id']}
Name: {item['product_name']}
Price: ${item['price']}
Category: {item['category']}
Available Colors: {', '.join(colors)}
Available Sizes: {', '.join(sizes)}
""".strip()
        
        prompt_template = """
Given the following product information, please answer the customer's question.
Use only the information provided in the product details.

Product Information:
{product_info}

Customer Question: {question}

Please provide a clear and concise answer based solely on the above product information.
""".strip()
        
        return prompt_template.format(
            product_info=product_details,
            question=item['question']
        )

    def get_model_response(self, prompt: str, model_name: str) -> Tuple[str, float]:
        """Get response from specific Groq model with timing"""
        start_time = time.time()
        try:
            response = self.groq_client.chat.completions.create(
                model=model_name,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=512
            )
            response_time = time.time() - start_time
            return response.choices[0].message.content, response_time
        except Exception as e:
            logging.error(f"Error with model {model_name}: {str(e)}")
            return "", time.time() - start_time

    def compute_similarity(self, text1: str, text2: str) -> float:
        """Compute cosine similarity between two texts"""
        if not text1 or not text2:
            return 0.0
        
        # Encode texts to vectors
        vec1 = self.embedding_model.encode([text1])[0]
        vec2 = self.embedding_model.encode([text2])[0]
        
        # Reshape vectors for cosine_similarity
        vec1 = vec1.reshape(1, -1)
        vec2 = vec2.reshape(1, -1)
        
        return cosine_similarity(vec1, vec2)[0][0]

    def evaluate_model(self, model_name: str, ground_truth: List[Dict]) -> Dict[str, List]:
        """Evaluate a single model on ground truth data"""
        results = {
            'questions': [],
            'original_answers': [],
            'model_answers': [],
            'similarities': [],
            'response_times': []
        }
        
        for item in tqdm(ground_truth, desc=f"Evaluating {model_name}"):
            try:
                prompt = self.build_prompt(item)
                model_answer, response_time = self.get_model_response(prompt, model_name)
                
                if model_answer:  # Only process if we got a valid response
                    similarity = self.compute_similarity(item['answer'], model_answer)
                    
                    results['questions'].append(item['question'])
                    results['original_answers'].append(item['answer'])
                    results['model_answers'].append(model_answer)
                    results['similarities'].append(similarity)
                    results['response_times'].append(response_time)
            except Exception as e:
                logging.error(f"Error processing item: {str(e)}")
                logging.error(f"Item content: {item}")
                continue
        
        return results

    def evaluate_all_models(self, ground_truth: List[Dict]) -> Dict[str, Dict]:
        """Evaluate all models and return combined results"""
        all_results = {}
        
        for model_name in self.available_models:
            logging.info(f"Starting evaluation of {model_name}")
            results = self.evaluate_model(model_name, ground_truth)
            all_results[model_name] = results
            
            # Save intermediate results
            self.save_results(model_name, results)
            
        return all_results

    def save_results(self, model_name: str, results: Dict[str, List]) -> None:
        """Save evaluation results to CSV"""
        df = pd.DataFrame({
            'question': results['questions'],
            'original_answer': results['original_answers'],
            'model_answer': results['model_answers'],
            'similarity': results['similarities'],
            'response_time': results['response_times']
        })
        
        df.to_csv(f"evaluation_results_{model_name}.csv", index=False)

    def visualize_results(self, all_results: Dict[str, Dict]) -> None:
        """Create visualization of model comparisons"""
        if not all_results:
            logging.warning("No results to visualize")
            return
            
        plt.figure(figsize=(15, 10))
        
        # Plot similarity distributions
        plt.subplot(2, 1, 1)
        for model_name, results in all_results.items():
            if results['similarities']:  # Only plot if we have similarities
                sns.kdeplot(results['similarities'], label=model_name)
        plt.title('Distribution of Cosine Similarities Across Models')
        plt.xlabel('Cosine Similarity')
        plt.ylabel('Density')
        plt.legend()
        
        # Plot average response times
        plt.subplot(2, 1, 2)
        model_names = []
        avg_times = []
        avg_similarities = []
        
        for model_name, results in all_results.items():
            if results['response_times'] and results['similarities']:
                model_names.append(model_name)
                avg_times.append(np.mean(results['response_times']))
                avg_similarities.append(np.mean(results['similarities']))
        
        if model_names:
            x = np.arange(len(model_names))
            width = 0.35
            
            plt.bar(x - width/2, avg_times, width, label='Avg Response Time (s)')
            plt.bar(x + width/2, avg_similarities, width, label='Avg Similarity')
            plt.xticks(x, model_names, rotation=45)
            plt.title('Average Response Time and Similarity by Model')
            plt.legend()
        
        plt.tight_layout()
        plt.savefig('model_comparison_results.png')
        
        # Print summary statistics
        print("\nModel Performance Summary:")
        for model_name in model_names:
            results = all_results[model_name]
            print(f"\n{model_name}:")
            print(f"Average Similarity: {np.mean(results['similarities']):.3f}")
            print(f"Average Response Time: {np.mean(results['response_times']):.3f}s")
            print(f"90th Percentile Similarity: {np.percentile(results['similarities'], 90):.3f}")

def main():
    # Load your ground truth data
    ground_truth_data = pd.read_csv('../data/product_qa_groundtruth.csv').to_dict('records')
    
    # Initialize evaluator
    evaluator = GroqModelEvaluator()
    
    # Debug: Inspect ground truth data
    evaluator.inspect_ground_truth(ground_truth_data)
    
    # Run evaluation
    results = evaluator.evaluate_all_models(ground_truth_data)
    
    # Visualize results
    evaluator.visualize_results(results)

if __name__ == "__main__":
    main()

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:root:Starting evaluation of gemma-7b-it



Ground Truth Data Inspection:
Number of items: 115

Keys in first item: dict_keys(['product_id', 'product_name', 'category', 'price', 'colors', 'sizes', 'question', 'answer'])

First item content:
product_id: CLT001
product_name: Premium Egyptian Cotton Oxford Shirt
category: shirts
price: 699.99
colors: White, Light Blue, Pink, Light Grey, Powder Blue
sizes: S, M, L, XL, XXL
question: What is the ID of the Premium Egyptian Cotton Oxford Shirt?
answer: CLT001


Evaluating gemma-7b-it:   0%|                                     | 0/115 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"

Batches: 100%|██████████████████████████████████████████████| 1/1 [00:00<00:00, 74.74it/s][A

Batches: 100%|██████████████████████████████████████████████| 1/1 [00:00<00:00, 66.36it/s][A
Evaluating gemma-7b-it:   1%|▎                            | 1/115 [00:00<01:03,  1.80it/s]INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"

Batches: 100%|██████████████████████████████████████████████| 1/1 [00:00<00:00, 95.08it/s][A

Batches: 100%|██████████████████████████████████████████████| 1/1 [00:00<00:00, 95.64it/s][A
Evaluating gemma-7b-it:   2%|▌                            | 2/115 [00:00<00:49,  2.26it/s]INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"

Batches: 100%|█████████████████████████████████████████████| 