In [4]:
import os
import time
import uuid
from typing import Dict, Any, Tuple, List
import logging
import json
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
from groq import Groq
from dotenv import load_dotenv

# Setup logging and environment variables
logging.basicConfig(level=logging.INFO)
load_dotenv()

# Load environment variables
model_name = os.getenv('MODEL_NAME', 'multi-qa-MiniLM-L6-cos-v1')
es_url = os.getenv('ELASTICSEARCH_URL', 'http://localhost:9200')
groq_api_key = os.getenv('GROQ_API_KEY')



# Initialize models and clients
model = SentenceTransformer(model_name)
es_client = Elasticsearch(es_url)

def elastic_search_knn(field: str, vector: List[float], index_name: str = "shop_products") -> List[Dict]:
    """Performs a K-Nearest Neighbors (KNN) search on Elasticsearch.

    Args:
        field (str): The field containing the vector to search against.
        vector (List[float]): The query vector.
        index_name (str): The Elasticsearch index name.

    Returns:
        List[Dict]: List of search results with questions and answers.
    """
    knn = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000
    }
    search_query = {
        "knn": knn,
        "_source": ["question", "answer", "product_id", "product_name"]
    }
    
    es_results = es_client.search(
        index=index_name,
        body=search_query
    )
    return [hit['_source'] for hit in es_results['hits']['hits']]

def question_answer_vector_knn(question: str) -> List[Dict]:
    """Encodes the question and performs KNN search.

    Args:
        question (str): Input question to encode and search.

    Returns:
        List[Dict]: List of relevant question-answer pairs.
    """
    logging.info(f"Encoding question: {question}")
    question_vector = model.encode(question)
    return elastic_search_knn('question_answer_vector', question_vector)

def build_prompt(query: str, search_results: List[Dict]) -> str:
    """Builds a prompt for the language model.

    Args:
        query (str): Original user query.
        search_results (List[Dict]): Search results with relevant QA pairs.

    Returns:
        str: Formatted prompt for LLM.
    """
    prompt_template = """
    You are a knowledgeable and helpful product support assistant. Use the following relevant 
    question-answer pairs to help answer the user's question. If you cannot find a relevant answer 
    in the provided context, say so honestly.

    Context:
    {context}

    User Question: {question}

    Please provide a clear and accurate answer based on the given context:
    """
    
    context = "\n\n".join([
        f"Question: {doc['question']}\nAnswer: {doc['answer']}\nProduct: {doc['product_name']}"
        for doc in search_results
    ])
    
    return prompt_template.format(question=query, context=context).strip()

def llm(prompt: str, model: str = 'mixtral-8x7b-32768') -> Tuple[str, Dict[str, Any], float]:
    """Generates a response using Groq LLM.

    Args:
        prompt (str): Input prompt.
        model (str): Model identifier.

    Returns:
        Tuple[str, Dict[str, Any], float]: Generated answer, token usage, and response time.
    """
    client = Groq(api_key=groq_api_key)
    start_time = time.time()
    
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7
        )
        response_time = time.time() - start_time
        
        return (
            response.choices[0].message.content,
            response.usage.to_dict(),
            response_time
        )
    except Exception as e:
        logging.error(f"Error in LLM call: {str(e)}")
        raise

def calculate_cost(tokens: int, model: str) -> float:
    """Calculates API usage cost.

    Args:
        tokens (int): Total tokens used.
        model (str): Model name.

    Returns:
        float: Estimated cost in USD.
    """
    # Adjust rate based on actual pricing
    return tokens * 0.00001

def evaluate_relevance(question: str, answer: str) -> Tuple[str, str, Dict[str, int]]:
    """Evaluates answer relevance.

    Args:
        question (str): Original question.
        answer (str): Generated answer.

    Returns:
        Tuple[str, str, Dict[str, int]]: Relevance rating, explanation, and token usage.
    """
    logging.info("Evaluating answer relevance...")
    
    evaluation_template = f'''
    You are an expert evaluator for a RAG system. Analyze the relevance of the answer 
    to the given question and classify it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

    Question: {question}
    Generated Answer: {answer}

    Provide your evaluation in JSON format:
    {{
      "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
      "Explanation": "[Your explanation for the classification]"
    }}
    '''
    
    evaluation, tokens, _ = llm(evaluation_template)
    
    try:
        json_eval = json.loads(evaluation)
        logging.info(f"Evaluation result: {json_eval}")
        return json_eval['Relevance'], json_eval['Explanation'], tokens
    except json.JSONDecodeError:
        logging.error("Failed to parse evaluation JSON")
        return "UNKNOWN", "Failed to parse evaluation", tokens

def rag(query: str, model: str = 'mixtral-8x7b-32768') -> Dict[str, Any]:
    """Executes the complete RAG pipeline.

    Args:
        query (str): User query.
        model (str): LLM model to use.

    Returns:
        Dict[str, Any]: Complete results including answer, relevance, and metrics.
    """
    conversation_id = str(uuid.uuid4())
    
    # Vector search
    search_results = question_answer_vector_knn(query)
    
    # Generate answer
    prompt = build_prompt(query, search_results)
    answer, tokens, response_time = llm(prompt, model=model)
    
    # Evaluate relevance
    relevance, relevance_explanation, eval_tokens = evaluate_relevance(query, answer)
    
    # Calculate costs
    total_cost = calculate_cost(
        tokens['total_tokens'] + eval_tokens['total_tokens'],
        model
    )
    
    # Compile results
    conversation_data = {
        "id": conversation_id,
        "question": query,
        "answer": answer,
        "model_used": model,
        "response_time": response_time,
        "relevance": relevance,
        "relevance_explanation": relevance_explanation,
        "prompt_tokens": tokens['prompt_tokens'],
        "completion_tokens": tokens['completion_tokens'],
        "total_tokens": tokens['total_tokens'],
        "eval_prompt_tokens": eval_tokens['prompt_tokens'],
        "eval_completion_tokens": eval_tokens['completion_tokens'],
        "eval_total_tokens": eval_tokens['total_tokens'],
        "cost": total_cost
    }
    
    return conversation_data

def get_answer_for_question(question: str) -> Dict[str, Any]:
    """Main interface for getting answers.

    Args:
        question (str): User question.

    Returns:
        Dict[str, Any]: Generated answer and metadata.
    """
    return rag(question)

if __name__ == "__main__":
    # Interactive testing
    custom_question = input("Enter your question: ")
    answer_data = get_answer_for_question(custom_question)
    
    print(f"\nQuestion: {custom_question}")
    print(f"Answer: {answer_data['answer']}")
    print(f"\nRelevance: {answer_data['relevance']}")
    print(f"Explanation: {answer_data['relevance_explanation']}")
    print(f"\nResponse Time: {answer_data['response_time']:.2f} seconds")
    print(f"Total Tokens: {answer_data['total_tokens']}")
    print(f"Estimated Cost: ${answer_data['cost']:.6f}")

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: multi-qa-MiniLM-L6-cos-v1
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu


Enter your question:  hey i have 1000 i wnat to buy  a trouse 


INFO:root:Encoding question: hey i have 1000 i wnat to buy  a trouse 
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 78.50it/s]
INFO:elastic_transport.transport:POST http://localhost:9200/shop_products/_search [status:400 duration:0.017s]


BadRequestError: BadRequestError(400, 'search_phase_execution_exception', 'failed to create query: field [question_answer_vector] does not exist in the mapping')

In [None]:
import json
import logging
import os
from typing import List, Dict, Any, Optional
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch, helpers
from datetime import datetime
from tqdm import tqdm

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

class EcommerceSearchEngine:
    def __init__(
        self,
        elastic_url: str = "http://localhost:9200",
        model_name: str = "all-MiniLM-L6-v2",
        index_name: str = "ecommerce_index"
    ):
        self.es_client = Elasticsearch(elastic_url)
        self.model = SentenceTransformer(model_name)
        self.index_name = index_name
        self.vector_dims = 384

    def create_index(self) -> None:
        """Create Elasticsearch index with mappings matching the product structure."""
        index_settings = {
            "settings": {
                "number_of_shards": 1,
                "number_of_replicas": 0,
                "analysis": {
                    "analyzer": {
                        "product_analyzer": {
                            "type": "custom",
                            "tokenizer": "standard",
                            "filter": ["lowercase", "stop"]
                        }
                    }
                }
            },
            "mappings": {
                "properties": {
                    "id": {"type": "keyword"},
                    "productName": {
                        "type": "text",
                        "analyzer": "product_analyzer",
                        "fields": {
                            "keyword": {"type": "keyword"}
                        }
                    },
                    "price": {"type": "float"},
                    "category": {"type": "keyword"},
                    "image": {"type": "keyword"},
                    "productDescription": {
                        "type": "text",
                        "analyzer": "product_analyzer"
                    },
                    "availableColours": {"type": "keyword"},
                    "sizes": {"type": "keyword"},
                    "discount": {"type": "float"},
                    "product_vector": {
                        "type": "dense_vector",
                        "dims": self.vector_dims,
                        "index": True,
                        "similarity": "cosine"
                    }
                }
            }
        }
        
        if self.es_client.indices.exists(index=self.index_name):
            self.es_client.indices.delete(index=self.index_name)
        
        self.es_client.indices.create(index=self.index_name, body=index_settings)
        logging.info(f"Created index: {self.index_name}")

    def prepare_product_document(self, product: Dict[str, Any]) -> Dict[str, Any]:
        """Prepare a product document for indexing with vector embedding."""
        searchable_text = f"""
        {product['productName']} {product['category']}
        {product['productDescription']}
        Colors: {' '.join(product['availableColours'])}
        Sizes: {' '.join(product['sizes'])}
        """
        vector = self.model.encode(searchable_text).tolist()
        
        return {
            "_index": self.index_name,
            "_source": {
                **product,
                "product_vector": vector
            }
        }

    def ingest_products(self, products: List[Dict[str, Any]]) -> None:
        """Ingest products into Elasticsearch."""
        actions = []
        for product in tqdm(products, desc="Preparing products"):
            actions.append(self.prepare_product_document(product))
        
        helpers.bulk(self.es_client, actions)
        logging.info(f"Ingested {len(actions)} products")

    def hybrid_search(self, query: str, size: int = 3) -> List[Dict[str, Any]]:
        """Perform hybrid search combining vector similarity and text matching."""
        query_vector = self.model.encode(query).tolist()
        
        search_query = {
            "size": size,
            "query": {
                "bool": {
                    "should": [
                        {
                            "script_score": {
                                "query": {"match_all": {}},
                                "script": {
                                    "source": "cosineSimilarity(params.query_vector, 'product_vector') + 1.0",
                                    "params": {"query_vector": query_vector}
                                }
                            }
                        },
                        {
                            "multi_match": {
                                "query": query,
                                "fields": [
                                    "productName^3",
                                    "productDescription^2",
                                    "category"
                                ],
                                "fuzziness": "AUTO"
                            }
                        }
                    ]
                }
            }
        }
        
        response = self.es_client.search(index=self.index_name, body=search_query)
        return [hit["_source"] for hit in response["hits"]["hits"]]

class EcommerceChatbot:
    def __init__(self, search_engine: EcommerceSearchEngine):
        self.search_engine = search_engine

    def extract_section_from_description(self, description: str, section_name: str) -> str:
        """Extract a specific section from the product description."""
        sections = description.split('\n\n')
        for section in sections:
            if section.startswith(section_name):
                return section.replace(f"{section_name}:\n", "").strip()
        return ""

    def answer_question(self, question: str) -> str:
        """Answer a customer question using hybrid search and section extraction."""
        results = self.search_engine.hybrid_search(question)
        
        if not results:
            return "I'm sorry, I couldn't find any relevant products matching your query."
        
        product = results[0]
        question_lower = question.lower()
        
        # Price related queries
        if any(word in question_lower for word in ['price', 'cost', 'how much']):
            return f"The {product['productName']} is priced at ${product['price']}."
        
        # Size related queries
        if any(word in question_lower for word in ['size', 'sizes', 'available size']):
            return f"The {product['productName']} is available in these sizes: {', '.join(product['sizes'])}."
        
        # Color related queries
        if any(word in question_lower for word in ['color', 'colours', 'available color']):
            return f"The {product['productName']} is available in these colors: {', '.join(product['availableColours'])}."
        
        # Material composition queries
        if any(word in question_lower for word in ['material', 'fabric', 'made of', 'composition']):
            material_info = self.extract_section_from_description(product['productDescription'], 'Material Composition')
            return f"The {product['productName']} material details:\n{material_info}"
        
        # Care instructions queries
        if any(word in question_lower for word in ['care', 'wash', 'cleaning', 'iron']):
            care_info = self.extract_section_from_description(product['productDescription'], 'Care Instructions')
            return f"Care instructions for {product['productName']}:\n{care_info}"
        
        # Fit details queries
        if any(word in question_lower for word in ['fit', 'length', 'measurements', 'inseam']):
            fit_info = self.extract_section_from_description(product['productDescription'], 'Fit Details')
            return f"Fit details for {product['productName']}:\n{fit_info}"
        
        # Styling queries
        if any(word in question_lower for word in ['style', 'wear', 'outfit', 'styling']):
            style_info = self.extract_section_from_description(product['productDescription'], 'Styling Tips')
            return f"Styling suggestions for {product['productName']}:\n{style_info}"
        
        # Design features queries
        if any(word in question_lower for word in ['features', 'details', 'pockets', 'design']):
            design_info = self.extract_section_from_description(product['productDescription'], 'Design Features')
            return f"Design features of {product['productName']}:\n{design_info}"
        
        # General product information
        return f"Here's what I found about the {product['productName']}:\n{product['productDescription']}"

def load_product_data(file_path: str) -> List[Dict[str, Any]]:
    """Load product data from JSON file."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        logging.info(f"Successfully loaded {len(data)} products from {file_path}")
        return data
    except FileNotFoundError:
        logging.error(f"Data file not found: {file_path}")
        raise
    except json.JSONDecodeError:
        logging.error(f"Invalid JSON format in file: {file_path}")
        raise
    except Exception as e:
        logging.error(f"Error loading data: {str(e)}")
        raise

def main():
    # Initialize components
    search_engine = EcommerceSearchEngine()
    search_engine.create_index()
    
    # Load product data from JSON file
    data_path = "../data/products_data.json"  # Update this path to match your data file location
    products = load_product_data(data_path)
    
    # Ingest products
    search_engine.ingest_products(products)
    
    # Initialize chatbot
    chatbot = EcommerceChatbot(search_engine)
    
    # Interactive chat loop
    print("Chatbot initialized! Type 'quit' to exit.")
    while True:
        question = input("\nWhat would you like to know about our products? ")
        if question.lower() == 'quit':
            break
            
        answer = chatbot.answer_question(question)
        print(f"\nAnswer: {answer}")

if __name__ == "__main__":
    main()

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:elastic_transport.transport:HEAD http://localhost:9200/ecommerce_index [status:200 duration:0.003s]
INFO:elastic_transport.transport:DELETE http://localhost:9200/ecommerce_index [status:200 duration:0.098s]
INFO:elastic_transport.transport:PUT http://localhost:9200/ecommerce_index [status:200 duration:0.436s]
INFO:root:Created index: ecommerce_index
INFO:root:Successfully loaded 23 products from ../data/products_data.json
Preparing products:   0%|                                                                                          | 0/23 [00:00<?, ?it/s]
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 19.87it/s][A

Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 

Chatbot initialized! Type 'quit' to exit.



What would you like to know about our products?  i have 100 i wnat jeans 


Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 93.52it/s]
INFO:elastic_transport.transport:POST http://localhost:9200/ecommerce_index/_search [status:200 duration:0.040s]



Answer: Here's what I found about the Stretch Denim High-Rise Jeans:
Modern high-rise jeans with perfect stretch

Material Composition:
- 92% Cotton, 6% Polyester, 2% Elastane
- Premium stretch denim
- Brushed metal hardware

Fit Details:
- High-rise fit: 11-inch rise
- Slim through hip and thigh
- Ankle length
- Inseam: 28 inches

Care Instructions:
- Machine wash cold with similar colors
- Inside out wash recommended
- Tumble dry medium
- Do not bleach

Design Features:
- Sculpting stretch technology
- Five-pocket styling
- Contoured waistband
- Reinforced belt loops

Styling Tips:
- Perfect with cropped tops for a modern look
- Pair with ankle boots and blazer for office wear
- Great with oversized sweaters for weekend style


In [2]:
import json
import logging
import os
from typing import List, Dict, Any, Optional
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch, helpers
from datetime import datetime
from tqdm import tqdm

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

class EcommerceSearchEngine:
    def __init__(
        self,
        elastic_url: str = "http://localhost:9200",
        model_name: str = "all-MiniLM-L6-v2",
        index_name: str = "ecommerce_index"
    ):
        self.es_client = Elasticsearch(elastic_url)
        self.model = SentenceTransformer(model_name)
        self.index_name = index_name
        self.vector_dims = 384

    def create_index(self) -> None:
        """Create Elasticsearch index with mappings matching the product structure."""
        index_settings = {
            "settings": {
                "number_of_shards": 1,
                "number_of_replicas": 0,
                "analysis": {
                    "analyzer": {
                        "product_analyzer": {
                            "type": "custom",
                            "tokenizer": "standard",
                            "filter": ["lowercase", "stop"]
                        }
                    }
                }
            },
            "mappings": {
                "properties": {
                    "id": {"type": "keyword"},
                    "productName": {
                        "type": "text",
                        "analyzer": "product_analyzer",
                        "fields": {
                            "keyword": {"type": "keyword"}
                        }
                    },
                    "price": {"type": "float"},
                    "category": {"type": "keyword"},
                    "image": {"type": "keyword"},
                    "productDescription": {
                        "type": "text",
                        "analyzer": "product_analyzer"
                    },
                    "availableColours": {"type": "keyword"},
                    "sizes": {"type": "keyword"},
                    "discount": {"type": "float"},
                    "product_vector": {
                        "type": "dense_vector",
                        "dims": self.vector_dims,
                        "index": True,
                        "similarity": "cosine"
                    }
                }
            }
        }
        
        if self.es_client.indices.exists(index=self.index_name):
            self.es_client.indices.delete(index=self.index_name)
        
        self.es_client.indices.create(index=self.index_name, body=index_settings)
        logging.info(f"Created index: {self.index_name}")

    def prepare_product_document(self, product: Dict[str, Any]) -> Dict[str, Any]:
        """Prepare a product document for indexing with vector embedding."""
        searchable_text = f"""
        {product['productName']} {product['category']}
        {product['productDescription']}
        Colors: {' '.join(product['availableColours'])}
        Sizes: {' '.join(product['sizes'])}
        """
        vector = self.model.encode(searchable_text).tolist()
        
        return {
            "_index": self.index_name,
            "_source": {
                **product,
                "product_vector": vector
            }
        }

    def ingest_products(self, products: List[Dict[str, Any]]) -> None:
        """Ingest products into Elasticsearch."""
        actions = []
        for product in tqdm(products, desc="Preparing products"):
            actions.append(self.prepare_product_document(product))
        
        helpers.bulk(self.es_client, actions)
        logging.info(f"Ingested {len(actions)} products")

    def hybrid_search(self, query: str, size: int = 3) -> List[Dict[str, Any]]:
        """Perform hybrid search combining vector similarity and text matching."""
        query_vector = self.model.encode(query).tolist()
        
        search_query = {
            "size": size,
            "query": {
                "bool": {
                    "should": [
                        {
                            "script_score": {
                                "query": {"match_all": {}},
                                "script": {
                                    "source": "cosineSimilarity(params.query_vector, 'product_vector') + 1.0",
                                    "params": {"query_vector": query_vector}
                                }
                            }
                        },
                        {
                            "multi_match": {
                                "query": query,
                                "fields": [
                                    "productName^3",
                                    "productDescription^2",
                                    "category"
                                ],
                                "fuzziness": "AUTO"
                            }
                        }
                    ]
                }
            }
        }
        
        response = self.es_client.search(index=self.index_name, body=search_query)
        return [hit["_source"] for hit in response["hits"]["hits"]]

class EcommerceChatbot:
    def __init__(self, search_engine: EcommerceSearchEngine):
        self.search_engine = search_engine

    def extract_section_from_description(self, description: str, section_name: str) -> str:
        """Extract a specific section from the product description."""
        sections = description.split('\n\n')
        for section in sections:
            if section.startswith(section_name):
                return section.replace(f"{section_name}:\n", "").strip()
        return ""

    def answer_question(self, question: str) -> str:
        """Answer a customer question using hybrid search and section extraction."""
        results = self.search_engine.hybrid_search(question)
        
        if not results:
            return "I'm sorry, I couldn't find any relevant products matching your query."
        
        product = results[0]
        question_lower = question.lower()
        
        # Price related queries
        if any(word in question_lower for word in ['price', 'cost', 'how much']):
            return f"The {product['productName']} is priced at R{product['price']}."
        
        # Size related queries
        if any(word in question_lower for word in ['size', 'sizes', 'available size']):
            return f"The {product['productName']} is available in these sizes: {', '.join(product['sizes'])}."
        
        # Color related queries
        if any(word in question_lower for word in ['color', 'colours', 'available color']):
            return f"The {product['productName']} is available in these colors: {', '.join(product['availableColours'])}."
        
        # Material composition queries
        if any(word in question_lower for word in ['material', 'fabric', 'made of', 'composition']):
            material_info = self.extract_section_from_description(product['productDescription'], 'Material Composition')
            return f"The {product['productName']} material details:\n{material_info}"
        
        # Care instructions queries
        if any(word in question_lower for word in ['care', 'wash', 'cleaning', 'iron']):
            care_info = self.extract_section_from_description(product['productDescription'], 'Care Instructions')
            return f"Care instructions for {product['productName']}:\n{care_info}"
        
        # Fit details queries
        if any(word in question_lower for word in ['fit', 'length', 'measurements', 'inseam']):
            fit_info = self.extract_section_from_description(product['productDescription'], 'Fit Details')
            return f"Fit details for {product['productName']}:\n{fit_info}"
        
        # Styling queries
        if any(word in question_lower for word in ['style', 'wear', 'outfit', 'styling']):
            style_info = self.extract_section_from_description(product['productDescription'], 'Styling Tips')
            return f"Styling suggestions for {product['productName']}:\n{style_info}"
        
        # Design features queries
        if any(word in question_lower for word in ['features', 'details', 'pockets', 'design']):
            design_info = self.extract_section_from_description(product['productDescription'], 'Design Features')
            return f"Design features of {product['productName']}:\n{design_info}"
        
        # General product information
        return f"Here's what I found about the {product['productName']}:\n{product['productDescription']}"

def load_product_data(file_path: str) -> List[Dict[str, Any]]:
    """Load product data from JSON file."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        logging.info(f"Successfully loaded {len(data)} products from {file_path}")
        return data
    except FileNotFoundError:
        logging.error(f"Data file not found: {file_path}")
        raise
    except json.JSONDecodeError:
        logging.error(f"Invalid JSON format in file: {file_path}")
        raise
    except Exception as e:
        logging.error(f"Error loading data: {str(e)}")
        raise

def main():
    # Initialize components
    search_engine = EcommerceSearchEngine()
    search_engine.create_index()
    
    # Load product data from JSON file
    data_path = "../data/products_data.json"  # Update this path to match your data file location
    products = load_product_data(data_path)
    
    # Ingest products
    search_engine.ingest_products(products)
    
    # Initialize chatbot
    chatbot = EcommerceChatbot(search_engine)
    
    # Interactive chat loop
    print("Chatbot initialized! Type 'quit' to exit.")
    while True:
        question = input("\nWhat would you like to know about our products? ")
        if question.lower() == 'quit':
            break
            
        answer = chatbot.answer_question(question)
        print(f"\nAnswer: {answer}")

if __name__ == "__main__":
    main()

2024-10-24 04:39:39,111 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2024-10-24 04:39:39,898 - INFO - Use pytorch device_name: cpu
2024-10-24 04:39:39,905 - INFO - HEAD http://localhost:9200/ecommerce_index [status:200 duration:0.005s]
2024-10-24 04:39:39,987 - INFO - DELETE http://localhost:9200/ecommerce_index [status:200 duration:0.081s]
2024-10-24 04:39:40,404 - INFO - PUT http://localhost:9200/ecommerce_index [status:200 duration:0.417s]
2024-10-24 04:39:40,405 - INFO - Created index: ecommerce_index
2024-10-24 04:39:40,408 - INFO - Successfully loaded 23 products from ../data/products_data.json
Preparing products:   0%|                                                                                          | 0/23 [00:00<?, ?it/s]
Batches:   0%|                                                                                                      | 0/1 [00:00<?, ?it/s][A
Batches: 100%|███████████████████████████████████████████████████████████████████████████████

Chatbot initialized! Type 'quit' to exit.



What would you like to know about our products?  hey i have 1000 what can i buy 


Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 105.49it/s]
2024-10-24 04:40:34,406 - INFO - POST http://localhost:9200/ecommerce_index/_search [status:200 duration:0.169s]



Answer: Here's what I found about the Premium Egyptian Cotton Oxford Shirt:
Crafted from 100% Egyptian cotton (120 thread count)

Material Composition:
- 100% Egyptian cotton
- Mother of pearl buttons
- Reinforced collar stays

Fit Details:
- Regular fit through chest and waist
- Shoulder-to-shoulder measurements: S(17"), M(18"), L(19")
- Center back length: 30 inches

Care Instructions:
- Machine wash cold
- Tumble dry low
- Iron on medium heat
- Do not bleach

Design Features:
- Button-down collar
- Single chest pocket
- Split yoke
- Rounded hem

Styling Tips:
- Perfect for formal occasions when paired with tailored trousers
- Can be dressed down with chinos for a smart-casual look
- Layer under a blazer for business meetings



What would you like to know about our products?  hey i have 200 i wnat to buy a trouse 


Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 96.65it/s]
2024-10-24 04:41:09,904 - INFO - POST http://localhost:9200/ecommerce_index/_search [status:200 duration:0.031s]



Answer: Here's what I found about the Tailored Wool Trousers:
Classic tailored wool trousers

Material Composition:
- 100% Australian wool
- Italian fabric
- Satin pocket lining

Fit Details:
- Mid-rise waist
- Straight leg cut
- 32-inch inseam
- Tailored fit

Care Instructions:
- Dry clean only
- Press with damp cloth
- Store on trouser hanger
- Brush after wearing

Design Features:
- Front pleats
- Side slant pockets
- Back welt pockets
- Extended tab closure

Styling Tips:
- Perfect for formal occasions
- Pair with silk blouse
- Great with loafers



What would you like to know about our products?  ok i want a jean


Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 88.01it/s]
2024-10-24 04:42:02,712 - INFO - POST http://localhost:9200/ecommerce_index/_search [status:200 duration:0.015s]



Answer: Here's what I found about the Stretch Denim High-Rise Jeans:
Modern high-rise jeans with perfect stretch

Material Composition:
- 92% Cotton, 6% Polyester, 2% Elastane
- Premium stretch denim
- Brushed metal hardware

Fit Details:
- High-rise fit: 11-inch rise
- Slim through hip and thigh
- Ankle length
- Inseam: 28 inches

Care Instructions:
- Machine wash cold with similar colors
- Inside out wash recommended
- Tumble dry medium
- Do not bleach

Design Features:
- Sculpting stretch technology
- Five-pocket styling
- Contoured waistband
- Reinforced belt loops

Styling Tips:
- Perfect with cropped tops for a modern look
- Pair with ankle boots and blazer for office wear
- Great with oversized sweaters for weekend style



What would you like to know about our products?  I HAVE 200 LIST  ALL THE THINGS THAT I CAN BUY 


Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 96.15it/s]
2024-10-24 04:43:06,064 - INFO - POST http://localhost:9200/ecommerce_index/_search [status:200 duration:0.038s]



Answer: Here's what I found about the Premium Egyptian Cotton Oxford Shirt:
Crafted from 100% Egyptian cotton (120 thread count)

Material Composition:
- 100% Egyptian cotton
- Mother of pearl buttons
- Reinforced collar stays

Fit Details:
- Regular fit through chest and waist
- Shoulder-to-shoulder measurements: S(17"), M(18"), L(19")
- Center back length: 30 inches

Care Instructions:
- Machine wash cold
- Tumble dry low
- Iron on medium heat
- Do not bleach

Design Features:
- Button-down collar
- Single chest pocket
- Split yoke
- Rounded hem

Styling Tips:
- Perfect for formal occasions when paired with tailored trousers
- Can be dressed down with chinos for a smart-casual look
- Layer under a blazer for business meetings


KeyboardInterrupt: Interrupted by user

In [7]:
import json
import pandas as pd
from tqdm.auto import tqdm
from tqdm import TqdmWarning
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
import warnings

# Suppress tqdm warnings about missing ipywidgets
warnings.filterwarnings('ignore', category=TqdmWarning)

def load_product_data(file_path='../data/products_data.json'):
    """Load product data from JSON file."""
    with open(file_path, 'r') as f:
        return json.load(f)

def initialize_model():
    """Initialize the sentence transformer model."""
    return SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

def create_text_representation(product):
    """Create a combined text representation for encoding."""
    return f"{product['productName']} {product['category']} {product['productDescription']}"

def encode_product_vectors(model, product):
    """Encode product information into vectors."""
    return {
        'productName_vector': model.encode(product['productName']).tolist(),
        'description_vector': model.encode(product['productDescription']).tolist(),
        'combined_vector': model.encode(create_text_representation(product)).tolist()
    }

def get_elasticsearch_settings():
    """Define Elasticsearch index settings and mappings."""
    return {
        'settings': {
            'number_of_shards': 1,
            'number_of_replicas': 0
        },
        'mappings': {
            'properties': {
                'id': {'type': 'keyword'},
                'productName': {'type': 'text'},
                'productDescription': {'type': 'text'},
                'price': {'type': 'float'},
                'category': {'type': 'keyword'},
                'productName_vector': {
                    'type': 'dense_vector',
                    'dims': 384,
                    'index': True,
                    'similarity': 'cosine'
                },
                'description_vector': {
                    'type': 'dense_vector',
                    'dims': 384,
                    'index': True,
                    'similarity': 'cosine'
                },
                'combined_vector': {
                    'type': 'dense_vector',
                    'dims': 384,
                    'index': True,
                    'similarity': 'cosine'
                }
            }
        }
    }

def setup_elasticsearch_index(es_client, index_name='ecommerce-products'):
    """Set up Elasticsearch index with proper settings."""
    if es_client.indices.exists(index=index_name):
        es_client.indices.delete(index=index_name)
    
    es_client.indices.create(
        index=index_name,
        body=get_elasticsearch_settings()
    )

def index_products(es_client, products, model, index_name='ecommerce-products'):
    """Index products with their vector representations."""
    for product in tqdm(products, desc="Indexing products"):
        # Encode vectors for the product
        vectors = encode_product_vectors(model, product)
        
        # Prepare document for indexing
        doc = {
            'id': product['id'],
            'productName': product['productName'],
            'productDescription': product['productDescription'],
            'price': product['price'],
            'category': product['category'],
            'availableColours': product['availableColours'],
            'sizes': product['sizes'],
            'discount': product['discount'],
            **vectors  # Add encoded vectors
        }
        
        # Index the document
        es_client.index(
            index=index_name,
            id=product['id'],
            body=doc
        )

def hybrid_search(es_client, model, query, price_range=None, index_name='ecommerce-products'):
    """
    Perform hybrid search combining dense vector similarity and keyword matching.
    
    Args:
        query (str): Customer query
        price_range (tuple): Optional tuple of (min_price, max_price)
    """
    # Encode the query
    query_vector = model.encode(query).tolist()
    
    # Build the search query
    search_query = {
        'size': 5,
        'query': {
            'bool': {
                'must': [
                    {
                        'script_score': {
                            'query': {'match_all': {}},
                            'script': {
                                'source': "cosineSimilarity(params.query_vector, 'combined_vector') + 1.0",
                                'params': {'query_vector': query_vector}
                            }
                        }
                    },
                    {
                        'multi_match': {
                            'query': query,
                            'fields': ['productName^2', 'productDescription', 'category'],
                            'fuzziness': 'AUTO'
                        }
                    }
                ]
            }
        }
    }
    
    # Add price filter if specified
    if price_range:
        min_price, max_price = price_range
        search_query['query']['bool']['filter'] = {
            'range': {
                'price': {
                    'gte': min_price,
                    'lte': max_price
                }
            }
        }
    
    # Execute search
    results = es_client.search(
        index=index_name,
        body=search_query
    )
    
    return results['hits']['hits']

def main():
    # Load product data
    products = load_product_data()
    
    # Initialize model
    model = initialize_model()
    
    # Connect to Elasticsearch
    es = Elasticsearch(['http://localhost:9200'])
    
    # Setup index
    setup_elasticsearch_index(es)
    
    # Index products
    index_products(es, products, model)
    
    # Example searches
    queries = [
        "What is the price of the Wide-Leg Linen Trousers?",
        "Tell me more about linen pants",
        "Show me discounted summer dresses under $100"
    ]
    
    print("\nExample searches:")
    for query in queries:
        print(f"\nQuery: {query}")
        results = hybrid_search(es, model, query, price_range=(0, 100))
        
        for hit in results:
            source = hit['_source']
            print(f"- {source['productName']} (${source['price']:.2f})")
            print(f"  Category: {source['category']}")
            print(f"  Score: {hit['_score']:.2f}")

if __name__ == "__main__":
    main()

2024-10-24 05:23:00,373 - INFO - Load pretrained SentenceTransformer: multi-qa-MiniLM-L6-cos-v1
2024-10-24 05:23:01,391 - INFO - Use pytorch device_name: cpu
2024-10-24 05:23:01,398 - INFO - HEAD http://localhost:9200/ecommerce-products [status:404 duration:0.005s]
2024-10-24 05:23:01,896 - INFO - PUT http://localhost:9200/ecommerce-products [status:200 duration:0.497s]
Indexing products:   0%|                                                                                           | 0/23 [00:00<?, ?it/s]
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.96it/s][A

Batches:   0%|                                                                                                      | 0/1 [00:00<?, ?it/s][A
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.49it/s][A

Batches: 100%|██████████████████████████████████████████████


Example searches:

Query: What is the price of the Wide-Leg Linen Trousers?


Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 81.22it/s]
2024-10-24 05:23:05,112 - INFO - POST http://localhost:9200/ecommerce-products/_search [status:200 duration:0.041s]



Query: Tell me more about linen pants


Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 101.04it/s]
2024-10-24 05:23:05,146 - INFO - POST http://localhost:9200/ecommerce-products/_search [status:200 duration:0.017s]



Query: Show me discounted summer dresses under $100


Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 77.72it/s]
2024-10-24 05:23:05,184 - INFO - POST http://localhost:9200/ecommerce-products/_search [status:200 duration:0.020s]


In [11]:
import json
import pandas as pd
from tqdm.auto import tqdm
from tqdm import TqdmWarning
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
import warnings
import re

# Suppress tqdm warnings about missing ipywidgets
warnings.filterwarnings('ignore', category=TqdmWarning)

def load_product_data(file_path='products.json'):
    """Load product data from JSON file."""
    with open(file_path, 'r') as f:
        return json.load(f)

def initialize_model():
    """Initialize the sentence transformer model."""
    return SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

def create_text_representation(product):
    """Create a combined text representation for encoding."""
    return f"{product['productName']} {product['category']} {product['productDescription']}"

def encode_product_vectors(model, product):
    """Encode product information into vectors."""
    return {
        'productName_vector': model.encode(product['productName']).tolist(),
        'description_vector': model.encode(product['productDescription']).tolist(),
        'combined_vector': model.encode(create_text_representation(product)).tolist()
    }

def get_elasticsearch_settings():
    """Define Elasticsearch index settings and mappings."""
    return {
        'settings': {
            'number_of_shards': 1,
            'number_of_replicas': 0
        },
        'mappings': {
            'properties': {
                'id': {'type': 'keyword'},
                'productName': {'type': 'text'},
                'productDescription': {'type': 'text'},
                'price': {'type': 'float'},
                'category': {'type': 'keyword'},
                'availableColours': {'type': 'keyword'},
                'sizes': {'type': 'keyword'},
                'discount': {'type': 'float'},
                'productName_vector': {
                    'type': 'dense_vector',
                    'dims': 384,
                    'index': True,
                    'similarity': 'cosine'
                },
                'description_vector': {
                    'type': 'dense_vector',
                    'dims': 384,
                    'index': True,
                    'similarity': 'cosine'
                },
                'combined_vector': {
                    'type': 'dense_vector',
                    'dims': 384,
                    'index': True,
                    'similarity': 'cosine'
                }
            }
        }
    }

def setup_elasticsearch_index(es_client, index_name='ecommerce-products'):
    """Set up Elasticsearch index with proper settings."""
    if es_client.indices.exists(index=index_name):
        es_client.indices.delete(index=index_name)
    
    es_client.indices.create(
        index=index_name,
        body=get_elasticsearch_settings()
    )

def index_products(es_client, products, model, index_name='ecommerce-products'):
    """Index products with their vector representations."""
    for product in tqdm(products, desc="Indexing products"):
        # Encode vectors for the product
        vectors = encode_product_vectors(model, product)
        
        # Prepare document for indexing
        doc = {
            'id': product['id'],
            'productName': product['productName'],
            'productDescription': product['productDescription'],
            'price': product['price'],
            'category': product['category'],
            'availableColours': product['availableColours'],
            'sizes': product['sizes'],
            'discount': product['discount'],
            **vectors  # Add encoded vectors
        }
        
        # Index the document
        es_client.index(
            index=index_name,
            id=product['id'],
            body=doc
        )

def extract_price_range(query):
    """
    Extract price range from natural language query with support for various patterns.
    Returns tuple of (min_price, max_price) or None if no price range found.
    """
    query = query.lower()
    
    # Pattern for "under", "less than", "up to", "no more than"
    max_patterns = [
        r'under\s*\$?(\d+)',
        r'less than\s*\$?(\d+)',
        r'up to\s*\$?(\d+)',
        r'no more than\s*\$?(\d+)',
        r'cheaper than\s*\$?(\d+)',
        r'below\s*\$?(\d+)'
    ]
    
    # Pattern for "over", "more than", "at least"
    min_patterns = [
        r'over\s*\$?(\d+)',
        r'more than\s*\$?(\d+)',
        r'at least\s*\$?(\d+)',
        r'above\s*\$?(\d+)'
    ]
    
    # Pattern for price range "between X and Y"
    range_pattern = r'between\s*\$?(\d+)\s*and\s*\$?(\d+)'
    
    # Pattern for "around", "about", "approximately"
    approx_pattern = r'(?:around|about|approximately)\s*\$?(\d+)'
    
    # Check for range pattern first
    range_match = re.search(range_pattern, query)
    if range_match:
        return (float(range_match.group(1)), float(range_match.group(2)))
    
    # Check for maximum price patterns
    for pattern in max_patterns:
        match = re.search(pattern, query)
        if match:
            return (0, float(match.group(1)))
    
    # Check for minimum price patterns
    for pattern in min_patterns:
        match = re.search(pattern, query)
        if match:
            return (float(match.group(1)), float('inf'))
    
    # Check for approximate price
    approx_match = re.search(approx_pattern, query)
    if approx_match:
        price = float(approx_match.group(1))
        return (price * 0.8, price * 1.2)  # ±20% range
    
    return None

def hybrid_search(es_client, model, query, price_range=None, index_name='ecommerce-products'):
    """
    Perform hybrid search combining dense vector similarity, keyword matching, and filtered constraints.
    """
    # Encode the query
    query_vector = model.encode(query).tolist()
    
    # Build the search query
    search_query = {
        'size': 5,
        'query': {
            'bool': {
                'must': [
                    {
                        'script_score': {
                            'query': {'match_all': {}},
                            'script': {
                                'source': "cosineSimilarity(params.query_vector, 'combined_vector') + 1.0",
                                'params': {'query_vector': query_vector}
                            }
                        }
                    }
                ],
                'should': [
                    {
                        'multi_match': {
                            'query': query,
                            'fields': ['productName^3', 'category^2', 'productDescription'],
                            'type': 'cross_fields',
                            'operator': 'and'
                        }
                    }
                ],
                'filter': []
            }
        }
    }
    
    # Extract category terms from query (common clothing items)
    clothing_terms = ['jeans', 'shirt', 'dress', 'pants', 'shoes', 'jacket', 't-shirt', 'sweater', 'skirt']
    query_terms = query.lower().split()
    category_terms = [term for term in query_terms if term in clothing_terms]
    
    # Add category filter if category terms are found
    if category_terms:
        category_filter = {
            'bool': {
                'should': [
                    {'match_phrase': {'category': term}} for term in category_terms
                ],
                'minimum_should_match': 1
            }
        }
        search_query['query']['bool']['filter'].append(category_filter)
    
    # Add price filter if specified
    if price_range:
        min_price, max_price = price_range
        price_filter = {
            'range': {
                'price': {
                    'gte': min_price,
                    'lte': max_price
                }
            }
        }
        search_query['query']['bool']['filter'].append(price_filter)
    
    # Calculate final price after discount
    search_query['script_fields'] = {
        'final_price': {
            'script': {
                'source': 'doc["price"].value * (1 - doc["discount"].value / 100)'
            }
        }
    }
    
    # Sort results by relevance score and then by how close the price is to target
    if price_range and max_price != float('inf'):
        target_price = (min_price + max_price) / 2
        search_query['sort'] = [
            '_score',
            {
                '_script': {
                    'type': 'number',
                    'script': {
                        'source': f'Math.abs(doc["price"].value * (1 - doc["discount"].value / 100) - {target_price})',
                    },
                    'order': 'asc'
                }
            }
        ]
    
    # Execute search
    results = es_client.search(
        index=index_name,
        body=search_query
    )
    
    return results['hits']['hits']

def print_search_results(results):
    """Print search results in a formatted way."""
    if not results:
        print("\nNo matching products found.")
        return
        
    print("\nHere are the products I found:")
    print("-" * 80)
    for hit in results:
        source = hit['_source']
        final_price = source['price'] * (1 - source['discount']/100)
        
        print(f"📦 {source['productName']}")
        print(f"📝 Category: {source['category']}")
        print(f"💰 Original Price: ${source['price']:.2f}")
        if source['discount']:
            print(f"🏷️ Discount: {source['discount']}%")
            print(f"💵 Final Price: ${final_price:.2f}")
        print(f"🎨 Colors: {', '.join(source['availableColours'])}")
        print(f"📏 Sizes: {', '.join(source['sizes'])}")
        print(f"\nDescription: {source['productDescription']}")
        print(f"Relevance Score: {hit['_score']:.2f}")
        print("-" * 80)

def search_products():
    """Simple product search interface."""
    # Load product data
    try:
        products = load_product_data()
    except FileNotFoundError:
        print("Error: products.json file not found!")
        return
    
    # Initialize model
    print("Initializing search system...")
    model = initialize_model()
    
    # Connect to Elasticsearch
    es = Elasticsearch(['http://localhost:9200'])
    
    # Check if Elasticsearch is running
    if not es.ping():
        print("Error: Could not connect to Elasticsearch. Make sure it's running on localhost:9200")
        return
    
    # Setup index
    setup_elasticsearch_index(es)
    
    # Index products
    print("Setting up product catalog...")
    index_products(es, products, model)
    print("Ready to help you find products! Type 'quit' to exit.")

    while True:
        # Get user query
        query = input("\nWhat would you like to find? ").strip()
        
        if query.lower() in ['quit', 'exit']:
            print("Goodbye!")
            break
        
        if not query:
            continue
            
        # Extract price range from query if present
        price_range = extract_price_range(query)
        
        # Perform search
        results = hybrid_search(es, model, query, price_range=price_range)
        print_search_results(results)

if __name__ == "__main__":
    search_products()

Error: products.json file not found!


In [12]:
import json
import pandas as pd
from tqdm.auto import tqdm
from tqdm import TqdmWarning
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
import warnings
import re

# Suppress tqdm warnings about missing ipywidgets
warnings.filterwarnings('ignore', category=TqdmWarning)

def load_product_data(file_path='../data/products_data.json'):
    """Load product data from JSON file."""
    with open(file_path, 'r') as f:
        return json.load(f)

def initialize_model():
    """Initialize the sentence transformer model."""
    return SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

def create_text_representation(product):
    """Create a combined text representation for encoding."""
    return f"{product['productName']} {product['category']} {product['productDescription']}"

def encode_product_vectors(model, product):
    """Encode product information into vectors."""
    return {
        'productName_vector': model.encode(product['productName']).tolist(),
        'description_vector': model.encode(product['productDescription']).tolist(),
        'combined_vector': model.encode(create_text_representation(product)).tolist()
    }

def get_elasticsearch_settings():
    """Define Elasticsearch index settings and mappings."""
    return {
        'settings': {
            'number_of_shards': 1,
            'number_of_replicas': 0
        },
        'mappings': {
            'properties': {
                'id': {'type': 'keyword'},
                'productName': {'type': 'text'},
                'productDescription': {'type': 'text'},
                'price': {'type': 'float'},
                'category': {'type': 'keyword'},
                'availableColours': {'type': 'keyword'},
                'sizes': {'type': 'keyword'},
                'discount': {'type': 'float'},
                'productName_vector': {
                    'type': 'dense_vector',
                    'dims': 384,
                    'index': True,
                    'similarity': 'cosine'
                },
                'description_vector': {
                    'type': 'dense_vector',
                    'dims': 384,
                    'index': True,
                    'similarity': 'cosine'
                },
                'combined_vector': {
                    'type': 'dense_vector',
                    'dims': 384,
                    'index': True,
                    'similarity': 'cosine'
                }
            }
        }
    }

def setup_elasticsearch_index(es_client, index_name='ecommerce-products'):
    """Set up Elasticsearch index with proper settings."""
    if es_client.indices.exists(index=index_name):
        es_client.indices.delete(index=index_name)
    
    es_client.indices.create(
        index=index_name,
        body=get_elasticsearch_settings()
    )

def index_products(es_client, products, model, index_name='ecommerce-products'):
    """Index products with their vector representations."""
    for product in tqdm(products, desc="Indexing products"):
        # Encode vectors for the product
        vectors = encode_product_vectors(model, product)
        
        # Prepare document for indexing
        doc = {
            'id': product['id'],
            'productName': product['productName'],
            'productDescription': product['productDescription'],
            'price': product['price'],
            'category': product['category'],
            'availableColours': product['availableColours'],
            'sizes': product['sizes'],
            'discount': product['discount'],
            **vectors  # Add encoded vectors
        }
        
        # Index the document
        es_client.index(
            index=index_name,
            id=product['id'],
            body=doc
        )

def extract_price_range(query):
    """
    Extract price range from natural language query with support for various patterns.
    Returns tuple of (min_price, max_price) or None if no price range found.
    """
    query = query.lower()
    
    # Pattern for "under", "less than", "up to", "no more than"
    max_patterns = [
        r'under\s*\$?(\d+)',
        r'less than\s*\$?(\d+)',
        r'up to\s*\$?(\d+)',
        r'no more than\s*\$?(\d+)',
        r'cheaper than\s*\$?(\d+)',
        r'below\s*\$?(\d+)'
    ]
    
    # Pattern for "over", "more than", "at least"
    min_patterns = [
        r'over\s*\$?(\d+)',
        r'more than\s*\$?(\d+)',
        r'at least\s*\$?(\d+)',
        r'above\s*\$?(\d+)'
    ]
    
    # Pattern for price range "between X and Y"
    range_pattern = r'between\s*\$?(\d+)\s*and\s*\$?(\d+)'
    
    # Pattern for "around", "about", "approximately"
    approx_pattern = r'(?:around|about|approximately)\s*\$?(\d+)'
    
    # Check for range pattern first
    range_match = re.search(range_pattern, query)
    if range_match:
        return (float(range_match.group(1)), float(range_match.group(2)))
    
    # Check for maximum price patterns
    for pattern in max_patterns:
        match = re.search(pattern, query)
        if match:
            return (0, float(match.group(1)))
    
    # Check for minimum price patterns
    for pattern in min_patterns:
        match = re.search(pattern, query)
        if match:
            return (float(match.group(1)), float('inf'))
    
    # Check for approximate price
    approx_match = re.search(approx_pattern, query)
    if approx_match:
        price = float(approx_match.group(1))
        return (price * 0.8, price * 1.2)  # ±20% range
    
    return None

def hybrid_search(es_client, model, query, price_range=None, index_name='ecommerce-products'):
    """
    Perform hybrid search combining dense vector similarity, keyword matching, and filtered constraints.
    """
    # Encode the query
    query_vector = model.encode(query).tolist()
    
    # Build the search query
    search_query = {
        'size': 5,
        'query': {
            'bool': {
                'must': [
                    {
                        'script_score': {
                            'query': {'match_all': {}},
                            'script': {
                                'source': "cosineSimilarity(params.query_vector, 'combined_vector') + 1.0",
                                'params': {'query_vector': query_vector}
                            }
                        }
                    }
                ],
                'should': [
                    {
                        'multi_match': {
                            'query': query,
                            'fields': ['productName^3', 'category^2', 'productDescription'],
                            'type': 'cross_fields',
                            'operator': 'and'
                        }
                    }
                ],
                'filter': []
            }
        }
    }
    
    # Extract category terms from query (common clothing items)
    clothing_terms = ['jeans', 'shirt', 'dress', 'pants', 'shoes', 'jacket', 't-shirt', 'sweater', 'skirt']
    query_terms = query.lower().split()
    category_terms = [term for term in query_terms if term in clothing_terms]
    
    # Add category filter if category terms are found
    if category_terms:
        category_filter = {
            'bool': {
                'should': [
                    {'match_phrase': {'category': term}} for term in category_terms
                ],
                'minimum_should_match': 1
            }
        }
        search_query['query']['bool']['filter'].append(category_filter)
    
    # Add price filter if specified
    if price_range:
        min_price, max_price = price_range
        price_filter = {
            'range': {
                'price': {
                    'gte': min_price,
                    'lte': max_price
                }
            }
        }
        search_query['query']['bool']['filter'].append(price_filter)
    
    # Calculate final price after discount
    search_query['script_fields'] = {
        'final_price': {
            'script': {
                'source': 'doc["price"].value * (1 - doc["discount"].value / 100)'
            }
        }
    }
    
    # Sort results by relevance score and then by how close the price is to target
    if price_range and max_price != float('inf'):
        target_price = (min_price + max_price) / 2
        search_query['sort'] = [
            '_score',
            {
                '_script': {
                    'type': 'number',
                    'script': {
                        'source': f'Math.abs(doc["price"].value * (1 - doc["discount"].value / 100) - {target_price})',
                    },
                    'order': 'asc'
                }
            }
        ]
    
    # Execute search
    results = es_client.search(
        index=index_name,
        body=search_query
    )
    
    return results['hits']['hits']

def print_search_results(results):
    """Print search results in a formatted way."""
    if not results:
        print("\nNo matching products found.")
        return
        
    print("\nHere are the products I found:")
    print("-" * 80)
    for hit in results:
        source = hit['_source']
        final_price = source['price'] * (1 - source['discount']/100)
        
        print(f"📦 {source['productName']}")
        print(f"📝 Category: {source['category']}")
        print(f"💰 Original Price: ${source['price']:.2f}")
        if source['discount']:
            print(f"🏷️ Discount: {source['discount']}%")
            print(f"💵 Final Price: ${final_price:.2f}")
        print(f"🎨 Colors: {', '.join(source['availableColours'])}")
        print(f"📏 Sizes: {', '.join(source['sizes'])}")
        print(f"\nDescription: {source['productDescription']}")
        print(f"Relevance Score: {hit['_score']:.2f}")
        print("-" * 80)

def search_products():
    """Simple product search interface."""
    # Load product data
    try:
        products = load_product_data()
    except FileNotFoundError:
        print("Error: products.json file not found!")
        return
    
    # Initialize model
    print("Initializing search system...")
    model = initialize_model()
    
    # Connect to Elasticsearch
    es = Elasticsearch(['http://localhost:9200'])
    
    # Check if Elasticsearch is running
    if not es.ping():
        print("Error: Could not connect to Elasticsearch. Make sure it's running on localhost:9200")
        return
    
    # Setup index
    setup_elasticsearch_index(es)
    
    # Index products
    print("Setting up product catalog...")
    index_products(es, products, model)
    print("Ready to help you find products! Type 'quit' to exit.")

    while True:
        # Get user query
        query = input("\nWhat would you like to find? ").strip()
        
        if query.lower() in ['quit', 'exit']:
            print("Goodbye!")
            break
        
        if not query:
            continue
            
        # Extract price range from query if present
        price_range = extract_price_range(query)
        
        # Perform search
        results = hybrid_search(es, model, query, price_range=price_range)
        print_search_results(results)

if __name__ == "__main__":
    search_products()

2024-10-24 05:41:58,497 - INFO - Load pretrained SentenceTransformer: multi-qa-MiniLM-L6-cos-v1


Initializing search system...


2024-10-24 05:41:59,337 - INFO - Use pytorch device_name: cpu
2024-10-24 05:41:59,343 - INFO - HEAD http://localhost:9200/ [status:200 duration:0.003s]
2024-10-24 05:41:59,346 - INFO - HEAD http://localhost:9200/ecommerce-products [status:200 duration:0.003s]
2024-10-24 05:41:59,422 - INFO - DELETE http://localhost:9200/ecommerce-products [status:200 duration:0.076s]
2024-10-24 05:41:59,852 - INFO - PUT http://localhost:9200/ecommerce-products [status:200 duration:0.428s]


Setting up product catalog...


Indexing products:   0%|                                                                                           | 0/23 [00:00<?, ?it/s]
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 76.55it/s][A

Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 33.06it/s][A

Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 32.39it/s][A
2024-10-24 05:41:59,965 - INFO - PUT http://localhost:9200/ecommerce-products/_doc/CLT001 [status:201 duration:0.020s]
Indexing products:   4%|███▌                                                                               | 1/23 [00:00<00:02,  8.98it/s]
Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 116.73it/s][A

Batches: 100%|██████████████████

Ready to help you find products! Type 'quit' to exit.



What would you like to find?  i want to buy jeans for 300 


Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 94.65it/s]
2024-10-24 05:42:31,881 - INFO - POST http://localhost:9200/ecommerce-products/_search [status:200 duration:0.048s]



Here are the products I found:
--------------------------------------------------------------------------------


KeyError: '_source'

In [14]:
import json
import pandas as pd
from tqdm.auto import tqdm
from tqdm import TqdmWarning
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
import warnings
import re

# Suppress tqdm warnings about missing ipywidgets
warnings.filterwarnings('ignore', category=TqdmWarning)

def load_product_data(file_path='../data/products_data.json'):
    """Load product data from JSON file."""
    try:
        with open(file_path, 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Error: Could not find file at {file_path}")
        return []
    except json.JSONDecodeError:
        print("Error: Invalid JSON file")
        return []

def initialize_model():
    """Initialize the sentence transformer model."""
    try:
        return SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
    except Exception as e:
        print(f"Error initializing model: {str(e)}")
        return None

def create_text_representation(product):
    """Create a combined text representation for encoding."""
    return f"{product['productName']} {product['category']} {product['productDescription']}"

def encode_product_vectors(model, product):
    """Encode product information into vectors."""
    try:
        return {
            'productName_vector': model.encode(product['productName']).tolist(),
            'description_vector': model.encode(product['productDescription']).tolist(),
            'combined_vector': model.encode(create_text_representation(product)).tolist()
        }
    except Exception as e:
        print(f"Error encoding vectors: {str(e)}")
        return None

def get_elasticsearch_settings():
    """Define Elasticsearch index settings and mappings."""
    return {
        'settings': {
            'number_of_shards': 1,
            'number_of_replicas': 0
        },
        'mappings': {
            'properties': {
                'id': {'type': 'keyword'},
                'productName': {'type': 'text'},
                'productDescription': {'type': 'text'},
                'price': {'type': 'float'},
                'category': {'type': 'keyword'},
                'availableColours': {'type': 'keyword'},
                'sizes': {'type': 'keyword'},
                'discount': {'type': 'float'},
                'productName_vector': {
                    'type': 'dense_vector',
                    'dims': 384,
                    'index': True,
                    'similarity': 'cosine'
                },
                'description_vector': {
                    'type': 'dense_vector',
                    'dims': 384,
                    'index': True,
                    'similarity': 'cosine'
                },
                'combined_vector': {
                    'type': 'dense_vector',
                    'dims': 384,
                    'index': True,
                    'similarity': 'cosine'
                }
            }
        }
    }

def setup_elasticsearch_index(es_client, index_name='ecommerce-products'):
    """Set up Elasticsearch index with proper settings."""
    try:
        if es_client.indices.exists(index=index_name):
            es_client.indices.delete(index=index_name)
        
        es_client.indices.create(
            index=index_name,
            body=get_elasticsearch_settings()
        )
        return True
    except Exception as e:
        print(f"Error setting up Elasticsearch index: {str(e)}")
        return False

def index_products(es_client, products, model, index_name='ecommerce-products'):
    """Index products with their vector representations."""
    try:
        successful_indexes = 0
        for product in tqdm(products, desc="Indexing products"):
            # Encode vectors for the product
            vectors = encode_product_vectors(model, product)
            if not vectors:
                continue
            
            # Prepare document for indexing
            doc = {
                'id': product['id'],
                'productName': product['productName'],
                'productDescription': product['productDescription'],
                'price': product['price'],
                'category': product['category'],
                'availableColours': product['availableColours'],
                'sizes': product['sizes'],
                'discount': product['discount'],
                **vectors  # Add encoded vectors
            }
            
            # Index the document
            response = es_client.index(
                index=index_name,
                id=product['id'],
                body=doc
            )
            if response['result'] == 'created':
                successful_indexes += 1
        
        print(f"Successfully indexed {successful_indexes} products")
        return successful_indexes
    except Exception as e:
        print(f"Error during indexing: {str(e)}")
        return 0

def extract_price_range(query):
    """Extract price range from natural language query."""
    query = query.lower()
    
    # Pattern for "under", "less than", "up to", "no more than"
    max_patterns = [
        r'under\s*\$?(\d+)',
        r'less than\s*\$?(\d+)',
        r'up to\s*\$?(\d+)',
        r'no more than\s*\$?(\d+)',
        r'cheaper than\s*\$?(\d+)',
        r'below\s*\$?(\d+)'
    ]
    
    # Pattern for "over", "more than", "at least"
    min_patterns = [
        r'over\s*\$?(\d+)',
        r'more than\s*\$?(\d+)',
        r'at least\s*\$?(\d+)',
        r'above\s*\$?(\d+)'
    ]
    
    # Pattern for price range "between X and Y"
    range_pattern = r'between\s*\$?(\d+)\s*and\s*\$?(\d+)'
    
    # Pattern for "around", "about", "approximately"
    approx_pattern = r'(?:around|about|approximately)\s*\$?(\d+)'
    
    try:
        # Check for range pattern first
        range_match = re.search(range_pattern, query)
        if range_match:
            return (float(range_match.group(1)), float(range_match.group(2)))
        
        # Check for maximum price patterns
        for pattern in max_patterns:
            match = re.search(pattern, query)
            if match:
                return (0, float(match.group(1)))
        
        # Check for minimum price patterns
        for pattern in min_patterns:
            match = re.search(pattern, query)
            if match:
                return (float(match.group(1)), float('inf'))
        
        # Check for approximate price
        approx_match = re.search(approx_pattern, query)
        if approx_match:
            price = float(approx_match.group(1))
            return (price * 0.8, price * 1.2)  # ±20% range
        
        return None
    except Exception as e:
        print(f"Error extracting price range: {str(e)}")
        return None

def hybrid_search(es_client, model, query, price_range=None, index_name='ecommerce-products'):
    """Perform hybrid search combining vector similarity and keyword matching."""
    try:
        # Encode the query
        query_vector = model.encode(query).tolist()
        
        # Build the search query
        search_query = {
            'size': 5,
            'query': {
                'bool': {
                    'must': [
                        {
                            'script_score': {
                                'query': {'match_all': {}},
                                'script': {
                                    'source': "cosineSimilarity(params.query_vector, 'combined_vector') + 1.0",
                                    'params': {'query_vector': query_vector}
                                }
                            }
                        }
                    ],
                    'should': [
                        {
                            'multi_match': {
                                'query': query,
                                'fields': ['productName^3', 'category^2', 'productDescription'],
                                'type': 'cross_fields',
                                'operator': 'and'
                            }
                        }
                    ],
                    'filter': []
                }
            }
        }
        
        # Extract category terms from query
        clothing_terms = ['jeans', 'shirt', 'dress', 'pants', 'shoes', 'jacket', 't-shirt', 'sweater', 'skirt']
        query_terms = query.lower().split()
        category_terms = [term for term in query_terms if term in clothing_terms]
        
        # Add category filter if category terms are found
        if category_terms:
            category_filter = {
                'bool': {
                    'should': [
                        {'match_phrase': {'category': term}} for term in category_terms
                    ],
                    'minimum_should_match': 1
                }
            }
            search_query['query']['bool']['filter'].append(category_filter)
        
        # Add price filter if specified
        if price_range:
            min_price, max_price = price_range
            price_filter = {
                'range': {
                    'price': {
                        'gte': min_price,
                        'lte': max_price if max_price != float('inf') else None
                    }
                }
            }
            search_query['query']['bool']['filter'].append(price_filter)
        
        # Execute search
        response = es_client.search(
            index=index_name,
            body=search_query
        )
        
        if 'hits' not in response or 'hits' not in response['hits']:
            print("No results found in response")
            return []
        
        return response['hits']['hits']
        
    except Exception as e:
        print(f"Search error: {str(e)}")
        return []

def print_search_results(results):
    """Print search results in a formatted way."""
    if not results:
        print("\nNo matching products found.")
        return
        
    print("\nHere are the products I found:")
    print("-" * 80)
    
    try:
        for hit in results:
            if '_source' not in hit:
                continue
                
            source = hit['_source']
            
            # Check if all required fields are present
            required_fields = ['productName', 'category', 'price', 'discount', 
                             'availableColours', 'sizes', 'productDescription']
            if not all(field in source for field in required_fields):
                continue
            
            try:
                final_price = source['price'] * (1 - source['discount']/100)
                
                print(f"📦 {source['productName']}")
                print(f"📝 Category: {source['category']}")
                print(f"💰 Original Price: ${source['price']:.2f}")
                if source['discount']:
                    print(f"🏷️ Discount: {source['discount']}%")
                    print(f"💵 Final Price: ${final_price:.2f}")
                print(f"🎨 Colors: {', '.join(source['availableColours'])}")
                print(f"📏 Sizes: {', '.join(source['sizes'])}")
                print(f"\nDescription: {source['productDescription']}")
                print(f"Relevance Score: {hit.get('_score', 0):.2f}")
                print("-" * 80)
            except (KeyError, TypeError) as e:
                continue
                
    except Exception as e:
        print(f"Error displaying results: {str(e)}")

def search_products():
    """Main search interface."""
    # Load product data
    products = load_product_data()
    if not products:
        return
    
    # Initialize model
    print("Initializing search system...")
    model = initialize_model()
    if not model:
        return
    
    # Connect to Elasticsearch
    try:
        es = Elasticsearch(['http://localhost:9200'])
        if not es.ping():
            print("Error: Could not connect to Elasticsearch. Make sure it's running on localhost:9200")
            return
    except Exception as e:
        print(f"Error connecting to Elasticsearch: {str(e)}")
        return
    
    # Setup index
    if not setup_elasticsearch_index(es):
        return
    
    # Index products
    print("Setting up product catalog...")
    if not index_products(es, products, model):
        return
        
    print("Ready to help you find products! Type 'quit' to exit.")
    
    # Verify index is populated
    try:
        count = es.count(index='ecommerce-products')['count']
        print(f"Number of products indexed: {count}")
    except Exception as e:
        print(f"Error checking index count: {str(e)}")
        return

    while True:
        try:
            # Get user query
            query = input("\nWhat would you like to find? ").strip()
            
            if query.lower() in ['quit', 'exit']:
                print("Goodbye!")
                break
            
            if not query:
                continue
                
            # Extract price range from query if present
            price_range = extract_price_range(query)
            
            # Perform search
            results = hybrid_search(es, model, query, price_range=price_range)
            print_search_results(results)
            
        except Exception as e:
            print(f"Error processing query: {str(e)}")
            continue

if __name__ == "__main__":
    search_products()

2024-10-24 05:47:11,344 - INFO - Load pretrained SentenceTransformer: multi-qa-MiniLM-L6-cos-v1


Initializing search system...


2024-10-24 05:47:12,152 - INFO - Use pytorch device_name: cpu
2024-10-24 05:47:12,158 - INFO - HEAD http://localhost:9200/ [status:200 duration:0.003s]
2024-10-24 05:47:12,160 - INFO - HEAD http://localhost:9200/ecommerce-products [status:200 duration:0.002s]
2024-10-24 05:47:12,246 - INFO - DELETE http://localhost:9200/ecommerce-products [status:200 duration:0.085s]
2024-10-24 05:47:12,674 - INFO - PUT http://localhost:9200/ecommerce-products [status:200 duration:0.427s]


Setting up product catalog...


Indexing products:   0%|                                                                                           | 0/23 [00:00<?, ?it/s]
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 75.77it/s][A

Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 27.47it/s][A

Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 32.41it/s][A
2024-10-24 05:47:12,797 - INFO - PUT http://localhost:9200/ecommerce-products/_doc/CLT001 [status:201 duration:0.023s]
Indexing products:   4%|███▌                                                                               | 1/23 [00:00<00:02,  8.26it/s]
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 89.37it/s][A

Batches: 100%|██████████████████

Successfully indexed 23 products
Ready to help you find products! Type 'quit' to exit.
Number of products indexed: 15



What would you like to find?  i want jeans that costs below 500


Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 110.09it/s]
2024-10-24 05:47:43,220 - INFO - POST http://localhost:9200/ecommerce-products/_search [status:200 duration:0.008s]



No matching products found.



What would you like to find?   i want jeans that costs below 50000


Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 99.27it/s]
2024-10-24 05:48:24,124 - INFO - POST http://localhost:9200/ecommerce-products/_search [status:200 duration:0.007s]



Here are the products I found:
--------------------------------------------------------------------------------
📦 Stretch Denim High-Rise Jeans
📝 Category: jeans
💰 Original Price: $899.99
🎨 Colors: Dark Blue, Mid Blue, Light Wash, Black
📏 Sizes: 24, 26, 28, 30, 32, 34

Description: Modern high-rise jeans with perfect stretch

Material Composition:
- 92% Cotton, 6% Polyester, 2% Elastane
- Premium stretch denim
- Brushed metal hardware

Fit Details:
- High-rise fit: 11-inch rise
- Slim through hip and thigh
- Ankle length
- Inseam: 28 inches

Care Instructions:
- Machine wash cold with similar colors
- Inside out wash recommended
- Tumble dry medium
- Do not bleach

Design Features:
- Sculpting stretch technology
- Five-pocket styling
- Contoured waistband
- Reinforced belt loops

Styling Tips:
- Perfect with cropped tops for a modern look
- Pair with ankle boots and blazer for office wear
- Great with oversized sweaters for weekend style
Relevance Score: 1.49
-------------------------

KeyboardInterrupt: Interrupted by user

In [17]:
import os
import time
import uuid
from typing import Dict, Any, Tuple
import logging
import json
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
from groq import Groq
from dotenv import load_dotenv

logging.basicConfig(level=logging.INFO)
load_dotenv()

model_name = os.getenv('MODEL_NAME', 'multi-qa-MiniLM-L6-cos-v1')
es_url = os.getenv('ELASTICSEARCH_URL', 'http://localhost:9200')
groq_api_key = os.getenv('GROQ_API_KEY')

model = SentenceTransformer(model_name)
es_client = Elasticsearch(es_url)

def elastic_search_hybrid(query: str, index_name: str = "ecommerce-products") -> list:
    """Performs a hybrid search combining vector similarity and keyword matching.

    Args:
        query (str): The search query.
        index_name (str): The name of the Elasticsearch index.

    Returns:
        list: A list of relevant product information.
    """
    query_vector = model.encode(query).tolist()
    
    search_query = {
        'size': 5,
        'query': {
            'bool': {
                'must': [
                    {
                        'script_score': {
                            'query': {'match_all': {}},
                            'script': {
                                'source': "cosineSimilarity(params.query_vector, 'combined_vector') + 1.0",
                                'params': {'query_vector': query_vector}
                            }
                        }
                    }
                ],
                'should': [
                    {
                        'multi_match': {
                            'query': query,
                            'fields': ['productName^3', 'category^2', 'productDescription'],
                            'type': 'cross_fields',
                            'operator': 'and'
                        }
                    }
                ]
            }
        }
    }
    
    try:
        results = es_client.search(
            index=index_name,
            body=search_query
        )
        return [hit['_source'] for hit in results['hits']['hits']]
    except Exception as e:
        logging.error(f"Search error: {str(e)}")
        return []

def build_product_context(search_results: list) -> str:
    """Builds a context string from product search results.

    Args:
        search_results (list): List of product information.

    Returns:
        str: Formatted context string containing product information.
    """
    context_items = []
    for product in search_results:
        final_price = product['price'] * (1 - product.get('discount', 0)/100)
        context_items.append(
            f"Product: {product['productName']}\n"
            f"Category: {product['category']}\n"
            f"Price: ${product['price']:.2f}\n"
            f"Final Price: ${final_price:.2f}\n"
            f"Colors: {', '.join(product['availableColours'])}\n"
            f"Sizes: {', '.join(product['sizes'])}\n"
            f"Description: {product['productDescription']}\n"
        )
    return "\n---\n".join(context_items)

def build_prompt(query: str, search_results: list) -> str:
    """Builds a prompt for the e-commerce assistant.

    Args:
        query (str): The customer's question.
        search_results (list): The relevant product information.

    Returns:
        str: A formatted prompt for the language model.
    """
    prompt_template = """You are a knowledgeable and helpful e-commerce shopping assistant. Using the provided product information, 
    answer the customer's question accurately and professionally. If asked about prices, include both original and discounted prices 
    where applicable. If asked about availability, mention both colors and sizes. If you're unsure about any specific detail, be honest 
    about the limitation of the information available.

    Product Information:
    {context}

    Customer Question: {question}

    Please provide a helpful, accurate, and natural response based on the available product information."""

    context = build_product_context(search_results)
    return prompt_template.format(context=context, question=query)

def llm(prompt: str, model: str = 'llama-3.1-70b-versatile') -> Tuple[str, Dict[str, Any], float]:
    """Generates a response using the Groq LLM.

    Args:
        prompt (str): The prompt for the LLM.
        model (str): The model to use.

    Returns:
        Tuple[str, Dict[str, Any], float]: The response, token usage, and response time.
    """
    client = Groq()
    start_time = time.time()
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    end_time = time.time()

    return response.choices[0].message.content, response.usage.to_dict(), end_time - start_time

def rag(query: str, model: str = 'llama-3.1-70b-versatile') -> Dict[str, Any]:
    """Executes the RAG pipeline for e-commerce product queries.

    Args:
        query (str): The customer's question.
        model (str): The LLM model to use.

    Returns:
        Dict[str, Any]: The response data including answer and metadata.
    """
    conversation_id = str(uuid.uuid4())
    search_results = elastic_search_hybrid(query)
    
    if not search_results:
        return {
            "id": conversation_id,
            "question": query,
            "answer": "I apologize, but I couldn't find any relevant products matching your query. Could you please try rephrasing your question or provide more specific details?",
            "model_used": model,
            "response_time": 0,
            "total_tokens": 0,
            "search_results": []
        }
    
    prompt = build_prompt(query, search_results)
    answer, tokens, response_time = llm(prompt, model=model)

    return {
        "id": conversation_id,
        "question": query,
        "answer": answer,
        "model_used": model,
        "response_time": response_time,
        "total_tokens": tokens['total_tokens'],
        "search_results": search_results
    }

def get_answer_for_question(question: str) -> Dict[str, Any]:
    """Gets an answer for a customer's question about products.

    Args:
        question (str): The customer's question.

    Returns:
        Dict[str, Any]: The answer and related metadata.
    """
    return rag(question)

if __name__ == "__main__":
    print("E-commerce Product Assistant - Ask me anything about our products!")
    print("(Type 'quit' to exit)")
    
    while True:
        question = input("\nYour question: ").strip()
        if question.lower() == 'quit':
            break
            
        answer_data = get_answer_for_question(question)
        print(f"\nAnswer: {answer_data['answer']}")
        print(f"\nResponse Time: {answer_data['response_time']:.2f} seconds")
        print(f"Total Tokens: {answer_data['total_tokens']}")

2024-10-24 06:22:48,956 - INFO - Load pretrained SentenceTransformer: multi-qa-MiniLM-L6-cos-v1
2024-10-24 06:22:49,948 - INFO - Use pytorch device_name: cpu


E-commerce Product Assistant - Ask me anything about our products!
(Type 'quit' to exit)



Your question:  hey i have 700 i want shoes that i can buy that are in my money range


Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 75.51it/s]
2024-10-24 06:22:53,982 - INFO - POST http://localhost:9200/ecommerce-products/_search [status:200 duration:0.009s]
2024-10-24 06:22:55,159 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"



Answer: I can help you find some great shoes that fit your budget. 

I see that you have a budget of $700, and based on our current collection, I would recommend taking a look at our Platform Canvas Sneakers. These trendy shoes are originally priced at $699.99, but we currently have them on sale for $559.99, which is well within your budget.

The Platform Canvas Sneakers come in four different colors - White, Black, Navy, and Red - and are available in sizes UK4 to UK8. They feature a durable canvas upper, rubber platform sole, and memory foam insoles for added comfort. The shoes also have a regular fit and a comfortable 4cm platform height.

If you're interested in purchasing the Platform Canvas Sneakers, I'd be happy to assist you with the order.

Response Time: 1.17 seconds
Total Tokens: 1264


KeyboardInterrupt: Interrupted by user

In [None]:
import os
import time
import uuid
from typing import Dict, Any, Tuple, List
import logging
import json
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
from groq import Groq
from dotenv import load_dotenv
from dataclasses import dataclass
from datetime import datetime

logging.basicConfig(level=logging.INFO)
load_dotenv()

model_name = os.getenv('MODEL_NAME', 'multi-qa-MiniLM-L6-cos-v1')
es_url = os.getenv('ELASTICSEARCH_URL', 'http://localhost:9200')
groq_api_key = os.getenv('GROQ_API_KEY')

model = SentenceTransformer(model_name)
es_client = Elasticsearch(es_url)

@dataclass
class Message:
    """Represents a single message in the conversation."""
    role: str
    content: str
    timestamp: float
    
class ConversationMemory:
    """Manages conversation history and context."""
    def __init__(self, max_messages: int = 10):
        self.messages: List[Message] = []
        self.max_messages = max_messages
        
    def add_message(self, role: str, content: str):
        """Adds a new message to the conversation history."""
        message = Message(role=role, content=content, timestamp=time.time())
        self.messages.append(message)
        
        # Keep only the most recent messages
        if len(self.messages) > self.max_messages:
            self.messages = self.messages[-self.max_messages:]
            
    def get_history(self, max_tokens: int = 1000) -> str:
        """Returns formatted conversation history."""
        history = []
        for msg in self.messages:
            formatted = f"{msg.role}: {msg.content}"
            history.append(formatted)
        return "\n".join(history[-self.max_messages:])
    
    def clear(self):
        """Clears the conversation history."""
        self.messages = []

def elastic_search_hybrid(query: str, index_name: str = "ecommerce-products") -> list:
    """Performs a hybrid search combining vector similarity and keyword matching."""
    query_vector = model.encode(query).tolist()
    
    search_query = {
        'size': 5,
        'query': {
            'bool': {
                'must': [
                    {
                        'script_score': {
                            'query': {'match_all': {}},
                            'script': {
                                'source': "cosineSimilarity(params.query_vector, 'combined_vector') + 1.0",
                                'params': {'query_vector': query_vector}
                            }
                        }
                    }
                ],
                'should': [
                    {
                        'multi_match': {
                            'query': query,
                            'fields': ['productName^3', 'category^2', 'productDescription'],
                            'type': 'cross_fields',
                            'operator': 'and'
                        }
                    }
                ]
            }
        }
    }
    
    try:
        results = es_client.search(
            index=index_name,
            body=search_query
        )
        return [hit['_source'] for hit in results['hits']['hits']]
    except Exception as e:
        logging.error(f"Search error: {str(e)}")
        return []

def build_product_context(search_results: list) -> str:
    """Builds a context string from product search results."""
    context_items = []
    for product in search_results:
        final_price = product['price'] * (1 - product.get('discount', 0)/100)
        context_items.append(
            f"Product: {product['productName']}\n"
            f"Category: {product['category']}\n"
            f"Price: ${product['price']:.2f}\n"
            f"Final Price: ${final_price:.2f}\n"
            f"Colors: {', '.join(product['availableColours'])}\n"
            f"Sizes: {', '.join(product['sizes'])}\n"
            f"Description: {product['productDescription']}\n"
        )
    return "\n---\n".join(context_items)

def build_prompt(query: str, search_results: list, conversation_memory: ConversationMemory) -> str:
    """Builds a prompt including conversation history and product information."""
    prompt_template = """You are a knowledgeable and helpful e-commerce shopping assistant. Using the provided product information 
    and conversation history, answer the customer's question accurately and professionally. Maintain context from the previous 
    conversation when relevant. If referring to previous interactions, be explicit about what was discussed before.

    Previous Conversation:
    {history}

    Product Information:
    {context}

    Current Question: {question}

    Please provide a helpful, accurate, and natural response that takes into account both the conversation history and the 
    available product information. If referring to previously discussed items, make those references clear."""

    context = build_product_context(search_results)
    history = conversation_memory.get_history()
    
    return prompt_template.format(
        history=history,
        context=context,
        question=query
    )

def llm(prompt: str, model: str = 'llama-3.1-70b-versatile') -> Tuple[str, Dict[str, Any], float]:
    """Generates a response using the Groq LLM."""
    client = Groq()
    start_time = time.time()
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    end_time = time.time()

    return response.choices[0].message.content, response.usage.to_dict(), end_time - start_time

class EcommerceAssistant:
    """Manages the e-commerce assistant's conversation and memory."""
    def __init__(self):
        self.conversation_memory = ConversationMemory()
        self.session_id = str(uuid.uuid4())
    
    def rag(self, query: str, model: str = 'llama-3.1-70b-versatile') -> Dict[str, Any]:
        """Executes the RAG pipeline with conversation memory."""
        conversation_id = str(uuid.uuid4())
        search_results = elastic_search_hybrid(query)
        
        if not search_results:
            response = "I apologize, but I couldn't find any relevant products matching your query. Could you please try rephrasing your question or provide more specific details?"
            self.conversation_memory.add_message("user", query)
            self.conversation_memory.add_message("assistant", response)
            return {
                "id": conversation_id,
                "question": query,
                "answer": response,
                "model_used": model,
                "response_time": 0,
                "total_tokens": 0,
                "search_results": []
            }
        
        prompt = build_prompt(query, search_results, self.conversation_memory)
        answer, tokens, response_time = llm(prompt, model=model)
        
        # Add the interaction to conversation memory
        self.conversation_memory.add_message("user", query)
        self.conversation_memory.add_message("assistant", answer)
        
        return {
            "id": conversation_id,
            "session_id": self.session_id,
            "question": query,
            "answer": answer,
            "model_used": model,
            "response_time": response_time,
            "total_tokens": tokens['total_tokens'],
            "search_results": search_results
        }
    
    def reset_conversation(self):
        """Resets the conversation memory and starts a new session."""
        self.conversation_memory.clear()
        self.session_id = str(uuid.uuid4())
        return {"message": "Conversation reset successfully", "new_session_id": self.session_id}

def format_response(response_data: Dict[str, Any]) -> str:
    """Formats the response data for display."""
    return f"""
Answer: {response_data['answer']}

Response Time: {response_data['response_time']:.2f} seconds
Total Tokens: {response_data['total_tokens']}
Session ID: {response_data['session_id']}
"""

if __name__ == "__main__":
    assistant = EcommerceAssistant()
    print("E-commerce Product Assistant - Ask me anything about our products!")
    print("(Type 'quit' to exit, 'reset' to start a new conversation)")
    
    while True:
        question = input("\nYour question: ").strip()
        
        if question.lower() == 'quit':
            break
        elif question.lower() == 'reset':
            result = assistant.reset_conversation()
            print(f"\n{result['message']}")
            print(f"New session started with ID: {result['new_session_id']}")
            continue
            
        answer_data = assistant.rag(question)
        print(format_response(answer_data))

2024-10-24 06:30:43,852 - INFO - Load pretrained SentenceTransformer: multi-qa-MiniLM-L6-cos-v1
2024-10-24 06:30:44,916 - INFO - Use pytorch device_name: cpu


E-commerce Product Assistant - Ask me anything about our products!
(Type 'quit' to exit, 'reset' to start a new conversation)



Your question:  hey i have 600 i want to buy  shoes 


Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 88.58it/s]
2024-10-24 06:31:08,771 - INFO - POST http://localhost:9200/ecommerce-products/_search [status:200 duration:0.009s]
2024-10-24 06:31:09,893 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"



Answer: Based on your budget of $600, I'd like to recommend some shoes from our collection. 

Considering you're looking for shoes, I think the Platform Canvas Sneakers would be an excellent choice. They're trendy, comfortable, and available at a discounted price of $559.99, which fits your budget. You can choose from four colors: White, Black, Navy, or Red. They also come in various sizes, so please let me know your size preference.

If you'd like to explore other options, please let me know what type of shoes you're interested in (e.g. boots, sandals, etc.) or what style you're looking for (e.g. formal, casual, etc.). I'll be happy to provide more tailored recommendations within your budget.

Response Time: 1.12 seconds
Total Tokens: 1305
Session ID: 546f4b17-cb4f-4c7f-814b-127e6e355c55




Your question:  can i get these in blue colour 


Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 109.20it/s]
2024-10-24 06:31:43,531 - INFO - POST http://localhost:9200/ecommerce-products/_search [status:200 duration:0.007s]
2024-10-24 06:31:44,984 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"



Answer: You're interested in a blue color option. Since we previously discussed the Platform Canvas Sneakers, I'd like to mention that they're available in a Navy blue color if you're still interested in those. 

However, I'm assuming you might be asking about a different product, given the context of our conversation. Could you please clarify which type of item or product you're interested in? Are you looking for outerwear, perhaps a shirt, or something else? I'll do my best to find a blue color option within your budget of $600.

Response Time: 1.45 seconds
Total Tokens: 1376
Session ID: 546f4b17-cb4f-4c7f-814b-127e6e355c55

