# Step 1: Analyze the Data (Loading & Cleaning)

Loading the data, exploring its structure, and performin necessary preprocessing to make it suitable for our chatbot application.

In [12]:
import pandas as pd
import spacy
from openai import OpenAI
import requests
import json
import logging
import re
import pandas as pd
import warnings
import pandas as pd
warnings.filterwarnings('ignore')
from bs4 import BeautifulSoup
from PIL import Image
from sentence_transformers import SentenceTransformer, util

# Response Formatting

def format_product_response(product):
    # Format the product details into a string
    return f"{product['Title']}, Price: {product['Price']:.2f}, Description: {product['Cleaned_Description'][:100]}..."


# Corrected file path
file_path = '../products.csv'
products_df = pd.read_csv(file_path, delimiter=';', usecols=['ID', 'Title', 'Description', 'Vendor', 'Type', 'Tags', 'Price'])

def clean_html(raw_html):
    if pd.isnull(raw_html):
        return ""
    clean_text = BeautifulSoup(raw_html, "html.parser").text
    return " ".join(clean_text.split())

products_df['Cleaned_Description'] = products_df['Description'].apply(clean_html)
products_df = products_df.dropna(subset=['Title'])
products_df = products_df[products_df['Cleaned_Description'].str.strip() != '']
products_df['Tags'] = products_df['Tags'].apply(lambda x: x.split(',') if pd.notnull(x) else [])

# Step 2: Data Preprocessing & Feature Engineering
We need to clean the data, especially the `Description` column, as it contains HTML tags and other non-text elements that may not be useful for our text processing tasks.

In [13]:
# Assuming spaCy is installed and the model is downloaded

nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    doc = nlp(text)
    lemmatized = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and not token.is_space]
    return " ".join(lemmatized)

products_df['Processed_Description'] = products_df['Cleaned_Description'].apply(preprocess_text)

2. Extracting Features with Named Entity Recognition (NER)
Extracting entities like materials, benefits from descriptions:

In [14]:
def extract_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

products_df['Entities'] = products_df['Cleaned_Description'].apply(extract_entities)


In [15]:
# Function to clean tags and convert them to lowercase
def clean_tags(tags):
    return [tag.lower().strip() for tag in tags]

# Clean the Tags and Type columns
products_df['Tags'] = products_df['Tags'].apply(lambda x: clean_tags(x) if isinstance(x, list) else [])
products_df['Type'] = products_df['Type'].str.lower().str.strip()

# Convert Price to numeric type for sorting
products_df['Price'] = pd.to_numeric(products_df['Price'], errors='coerce')

# 3. Loading Model

In [16]:
# Load the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')


# 4. Integrating OpenAI's API for Enhanced Query Processing

To integrate OpenAI's API, ensure you have an API key and have installed the `openai` Python package.

In [19]:
# openai.api_key = 'your-api-key-here'
client = OpenAI(api_key='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')

def process_query_with_gpt4(query):
    prompt = f"Parse the following user query to identify product attributes and price constraints: '{query}'. List attributes and any specific price constraints."
    try:
        response = client.chat.completions.create(
            model="gpt-4-turbo-preview",
            messages=[{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        logging.error(f'Error processing query with GPT-4: {e}')
        return ""


In [20]:
# Assuming you've already encoded your product descriptions into embeddings
description_embeddings = model.encode(products_df['Processed_Description'].tolist(), convert_to_tensor=True)

def semantic_search_with_gpt4(query):
    # Process the query with GPT-4 to refine or expand it based on user intent
    processed_query = process_query_with_gpt4(query)
    query_embedding = model.encode(processed_query, convert_to_tensor=True)
    search_results = util.semantic_search(query_embedding, description_embeddings, top_k=5)
    
    matched_products = [products_df.iloc[hit['corpus_id']] for hit in search_results[0]]
    return pd.DataFrame(matched_products)


In [21]:
def semantic_search_with_pgvector(query, top_k=5):
    query_embedding = model.encode(query, convert_to_tensor=True)
    
    # Use pgvector to perform the search
    search_results = pgvector_search(query_embedding, top_k=top_k)
    
    # search_results is assumed to be a list of tuples with (id, title, description)
    # Create a DataFrame from the search results
    matched_products_df = pd.DataFrame(search_results, columns=['ID', 'Title', 'Description'])

    return matched_products_df


# Generating Dynamic Responses with GPT-4

Utilizing GPT-4's advanced text generation capabilities to create responses that are contextually relevant, detailed, and tailored to the user's query and the matched products.

In [22]:
def parse_gpt_response(response):
    attributes = re.findall(r"\battributes?:\s*([\w\s,]+)", response, re.I)
    min_price = re.search(r"\bmin(?:imum)? price:?\s*(\d+)", response, re.I)
    
    # Flatten attribute list and filter empty values
    attributes = [attr.strip() for sublist in attributes for attr in sublist.split(',') if attr.strip()]
    min_price = float(min_price.group(1)) if min_price else None

    return attributes, min_price


In [23]:
def search_products_by_attributes(attributes, min_price=None):
    # Filter products containing all the attributes
    filtered_products = products_df[
        products_df['Tags'].apply(lambda tags: all(attr in tags for attr in attributes)) |
        products_df['Type'].str.contains('|'.join(attributes), case=False, na=False)
    ]
    
    # Further filter by min price if specified
    if min_price is not None:
        filtered_products = filtered_products[filtered_products['Price'] >= min_price]
    
    # Sort by price
    sorted_products = filtered_products.sort_values(by='Price', ascending=True)
    
    return sorted_products

In [24]:
def generate_dynamic_response_with_gpt4(query):
    try:
        # Extract attributes and price constraints from the user's query
        processed_response = process_query_with_gpt4(query)
        
        # Parse the processed_response to get attributes and min_price
        attributes, min_price = parse_gpt_response(processed_response)

        # Search for products based on extracted attributes and price
        product_results = search_products_by_attributes(attributes, min_price)

        # Format the results into a response string
        response_string = "\n".join([f"{row['Title']}, Price: {row['Price']:.2f}" for index, row in product_results.iterrows()])

        return response_string if not product_results.empty else "No products found matching your criteria."
    except requests.exceptions.HTTPError as http_err:
        logging.error(f'HTTP error occurred: {http_err}')  # HTTP error
    except Exception as err:
        logging.error(f'Other error occurred: {err}')  # Other errors
    return "Error generating response. Please try again later."

In [10]:
# Sample query from the user
user_query = "I'm looking for eco-friendly skincare products with a minimum price of 20"

# Get the response from GPT-4
gpt_response = generate_dynamic_response_with_gpt4(user_query)

# Check and print the response from GPT-4
print(f"GPT-4 Response: {gpt_response}")

# Parse the response to extract attributes and min price
attributes, min_price = parse_gpt_response(gpt_response)

# Print the parsed information
print(f"Extracted Attributes: {attributes}")
print(f"Extracted Min Price: {min_price}")

GPT-4 Response: Rosemary Nettle Shampoo Bar, Price: 9.68, Description: This rosemary nettle shampoo bar is all you need for thick gorgeous hair!Rosemary is queen and king ...
Handmade Organic Vapor Rub, Price: 10.94, Description: This Vapor rub is a great alternative to the commercial rubs you can buy in the grocery store for ma...
Coconut Silk Conditioning Shampoo Bar, Price: 11.07, Description: The newest addition to my shampoo soap line - coconut silk conditioning shampoo bar. An incredible b...
Shampoo for sensible hairs (SMOOTH), Price: 15.29, Description: Delicacy, harmony and balance in its purest form - that's Smooth. Great hair is all about balance, b...
Organic Bug Spray Bug Repellant, Price: 16.55, Description: This completely natural bug spray is perfect for those nighttime hikes in mosquito season. Made with...
Electric Razor for Women Removal for Body Nose Hair Trimmer Face, Price: 18.68, Description: Features: 5-in-1 Electric ShaversWomen razor shaver includes 5 intercha

In [None]:
# def process_query_with_gpt4(query):
#     try:
#         response = client.chat.completions.create(
#             model="gpt-4-turbo-preview",  # Specify GPT-4 as the model
#             prompt=f"Given the user query: '{query}', extract and list all mentioned product attributes such as category, color, and any price constraints. Format your response accordingly.",
#             temperature=0.5,
#             max_tokens=100)
#         return response.choices[0].text.strip()

         
#     # Extracting the latest message from the response, assuming it's the assistant's response
#         if response.choices and response.choices[0].message:
#             assistant_message = response.choices[0].message['content']
#         else:
#             assistant_message = "No response from GPT-4."
#         return assistant_message.strip()
#     except Exception as e:
#         logging.error(f'Error processing query with GPT-4: {e}')
#         return "I encountered an error processing your request. Please try again."


# def process_query_with_gpt4(query):
#     try:
#         # Make sure to replace 'your_api_key_here' with your actual OpenAI API key        
#         response = openai.Completion.create(
#             engine="gpt-4-turbo-preview",  # Use the correct engine identifier for GPT-4 once available
#             prompt=f"Given the user query: '{query}', extract and list all mentioned product attributes such as category, color, and any price constraints. Format your response accordingly.",
#             temperature=0.2,
#             max_tokens=150,
#             top_p=1.0,
#             frequency_penalty=0.0,
#             presence_penalty=0.0
#         )
#         return response.choices[0].text.strip()
#     except Exception as e:
#         logging.error(f'Error processing query with GPT-4: {e}')
#         return "I encountered an error processing your request. Please try again."