### Imports

In [48]:
import hashlib
from typing import Dict, List, Optional, Union
import json
import pandas as pd
from tqdm.auto import tqdm
from groq import Groq

In [10]:
!pip install groq
import os
os.environ["GROQ_API_KEY"] = "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"
from groq import Groq


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


###  Loading  the Data

In [62]:
json_file = '../data/products_data.json'
with open(json_file, 'r') as file:
    documents_raw = json.load(file)

In [63]:
documents_raw

[{'id': 'CLT001',
  'productName': 'Premium Egyptian Cotton Oxford Shirt',
  'price': 699.99,
  'category': 'shirts',
  'image': 'https://images.unsplash.com/photo-1602293589930-45aad59ba3ab?w=300',
  'productDescription': 'Crafted from 100% Egyptian cotton (120 thread count)\n\nMaterial Composition:\n- 100% Egyptian cotton\n- Mother of pearl buttons\n- Reinforced collar stays\n\nFit Details:\n- Regular fit through chest and waist\n- Shoulder-to-shoulder measurements: S(17"), M(18"), L(19")\n- Center back length: 30 inches\n\nCare Instructions:\n- Machine wash cold\n- Tumble dry low\n- Iron on medium heat\n- Do not bleach\n\nDesign Features:\n- Button-down collar\n- Single chest pocket\n- Split yoke\n- Rounded hem\n\nStyling Tips:\n- Perfect for formal occasions when paired with tailored trousers\n- Can be dressed down with chinos for a smart-casual look\n- Layer under a blazer for business meetings',
  'availableColours': ['White',
   'Light Blue',
   'Pink',
   'Light Grey',
   'Powd

In [64]:
# Initialize an empty list to store processed product details
product_details = []

# Iterate through each product in the loaded JSON data
for product in documents_raw:
    product_id = product['id']
    
    # Extract product description sections
    description_text = product['productDescription'].split('\n\n')
    
    # Create a dictionary to store specifications
    specifications = {}
    current_section = None
    
    # Process the description text to organize it into sections
    for line in description_text:
        if line.endswith(':'):
            current_section = line[:-1]  # Remove the colon
            specifications[current_section] = []
        elif current_section and line.strip():
            specifications[current_section].append(line.strip('- '))

    # Append the product details to the list with the new structure
    product_details.append({
        'ID': product['id'],
        'Product_Name': product['productName'],
        'Price': product['price'],
        'Category': product['category'],
        'Image': product['image'],
        'Product_Description': {
            'Material_Composition': specifications.get('Material Composition', []),
            'Fit_Details': specifications.get('Fit Details', []),
            'Care_Instructions': specifications.get('Care Instructions', []),
            'Design_Features': specifications.get('Design Features', []),
            'Styling_Tips': specifications.get('Styling Tips', [])
        },
        'Colors': product['availableColours'],
        'Sizes': product['sizes'],
        'Discount': product['discount']
    })

# Print the results
print(f"Loaded product details with {len(product_details)} entries.")

Loaded product details with 23 entries.


In [65]:
product_details[17]

{'ID': 'CLT018',
 'Product_Name': 'Leather Block Heel Sandals',
 'Price': 1299.99,
 'Category': 'shoes',
 'Image': 'https://images.unsplash.com/photo-1543163521-1bf539c55dd2?w=300',
 'Product_Description': {'Material_Composition': [],
  'Fit_Details': [],
  'Care_Instructions': [],
  'Design_Features': [],
  'Styling_Tips': []},
 'Colors': ['Black', 'Tan', 'White', 'Metallic Gold'],
 'Sizes': ['UK4', 'UK5', 'UK6', 'UK7', 'UK8'],
 'Discount': 0}

### Creating Ground Truth Dataset

In [67]:
prompt_template = """
You are a customer interested in buying a product from our online shop.
Formulate 5 questions this customer might ask based on the product's information, and provide their answers.
The questions should be complete and not too short.
If possible, use as few words as possible from the record. Focus on rephrasing the information naturally.

Product Information:
- ID: {ID}
- Product Name: {Product_Name}
- Price: {Price}
- Category: {Category}
- Colors: {Colors}
- Sizes: {Sizes}
- Discount: {Discount}
- Description: {Product_Description}

IMPORTANT: Respond ONLY with a valid JSON object in exactly this format:
{{
    "qa_pairs": [
        {{
            "question": "question1",
            "answer": "answer1"
        }},
        {{
            "question": "question2",
            "answer": "answer2"
        }}
    ]
}}
""".strip()

In [68]:
def clean_json_string(s):
    """Clean and validate JSON string before parsing."""
    # Remove any leading/trailing whitespace or newlines
    s = s.strip()
    
    # If the string doesn't start with '{', try to find the first '{'
    if not s.startswith('{'):
        start_idx = s.find('{')
        if start_idx != -1:
            s = s[start_idx:]
    
    # If the string doesn't end with '}', try to find the last '}'
    if not s.endswith('}'):
        end_idx = s.rfind('}')
        if end_idx != -1:
            s = s[:end_idx+1]
    
    return s


In [69]:
def generate_qa_pairs(product):
    try:
        # Format the prompt with product details
        prompt = prompt_template.format(**product)
        
        # Create Groq client
        client = Groq()
        
        # Get response from the model
        response = client.chat.completions.create(
            model="mixtral-8x7b-32768",   
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,  # Add some randomness but keep responses focused
            max_tokens=1000
        )
        
        # Get the response content
        content = response.choices[0].message.content
        
        # Clean and parse the JSON
        cleaned_content = clean_json_string(content)
        qa_pairs = json.loads(cleaned_content)
        
        # Validate the structure
        if not isinstance(qa_pairs, dict) or 'qa_pairs' not in qa_pairs:
            raise ValueError("Invalid response structure")
        
        return qa_pairs
        
    except json.JSONDecodeError as e:
        print(f"JSON parsing error for product {product['ID']}: {str(e)}")
        print(f"Raw response: {content}")
        return None
    except Exception as e:
        print(f"Error generating QA pairs for product {product['ID']}: {str(e)}")
        return None

In [70]:
def process_products(product_details):
    all_qa_pairs = []
    
    for product in tqdm(product_details):
        try:
            product_id = product.get('ID')
            if not product_id:
                print(f"Warning: no ID found for product: {product}")
                continue
                
            # Generate QA pairs
            qa_response = generate_qa_pairs(product)
            
            if qa_response and 'qa_pairs' in qa_response:
                # Add product information to each QA pair
                for qa_pair in qa_response['qa_pairs']:
                    qa_pair['product_id'] = product_id
                    qa_pair['product_name'] = product.get('Product_Name')
                    qa_pair['category'] = product.get('Category')
                    qa_pair['price'] = product.get('Price')
                    qa_pair['colors'] = ', '.join(product.get('Colors', []))
                    qa_pair['sizes'] = ', '.join(product.get('Sizes', []))
                    all_qa_pairs.append(qa_pair)
                
        except Exception as e:
            print(f"Error processing product {product.get('ID', 'unknown')}: {str(e)}")
            continue
    
    return all_qa_pairs


In [71]:
def save_dataset(qa_pairs, output_file='product_qa_groundtruth.csv'):
    # Convert to DataFrame
    df = pd.DataFrame(qa_pairs)
    
    # Reorder columns to put product info first
    column_order = [
        'product_id', 'product_name', 'category', 'price', 
        'colors', 'sizes', 'question', 'answer'
    ]
    df = df[column_order]
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    print(f"\nSaved {len(df)} QA pairs to {output_file}")
    
    return df

In [73]:
if __name__ == "__main__":
    # Process the products
    qa_pairs = process_products(product_details)
    
    # Save and display results
    df = save_dataset(qa_pairs)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23/23 [01:41<00:00,  4.40s/it]


Saved 115 QA pairs to product_qa_groundtruth.csv

First few rows of the dataset:





In [74]:
print("\nFirst few rows of the dataset:")
df.head()


First few rows of the dataset:


Unnamed: 0,product_id,product_name,category,price,colors,sizes,question,answer
0,CLT001,Premium Egyptian Cotton Oxford Shirt,shirts,699.99,"White, Light Blue, Pink, Light Grey, Powder Blue","S, M, L, XL, XXL",What is the ID of the Premium Egyptian Cotton ...,CLT001
1,CLT001,Premium Egyptian Cotton Oxford Shirt,shirts,699.99,"White, Light Blue, Pink, Light Grey, Powder Blue","S, M, L, XL, XXL",How much does the Oxford shirt cost?,699.99
2,CLT001,Premium Egyptian Cotton Oxford Shirt,shirts,699.99,"White, Light Blue, Pink, Light Grey, Powder Blue","S, M, L, XL, XXL",In which category can I find the Oxford shirt?,shirts
3,CLT001,Premium Egyptian Cotton Oxford Shirt,shirts,699.99,"White, Light Blue, Pink, Light Grey, Powder Blue","S, M, L, XL, XXL",What colors and sizes are available for this s...,"This shirt is available in White, Light Blue, ..."
4,CLT001,Premium Egyptian Cotton Oxford Shirt,shirts,699.99,"White, Light Blue, Pink, Light Grey, Powder Blue","S, M, L, XL, XXL",Are there any discounts or promotions for this...,"No, there are no discounts for this product. T..."
