In [30]:
import hashlib
import json
import os 
from sentence_transformers import SentenceTransformer
import mistralai
from mistralai import Mistral

In [31]:
def generate_document_id(doc):
    """Derive a short, deterministic identifier for one FAQ record.

    The category, the question and the first 10 characters of the answer are
    joined with hyphens and hashed with MD5; the first 8 hex digits of the
    digest form the identifier.

    Args:
        doc: Mapping that provides "category", "question" and "answer".

    Returns:
        An 8-character hexadecimal string.
    """
    parts = (doc['category'], doc['question'], doc['answer'][:10])
    fingerprint = "{}-{}-{}".format(*parts)
    # Only a prefix of the digest is kept, so the ID stays short.
    return hashlib.md5(fingerprint.encode()).hexdigest()[:8]

def add_ids_to_faqs(data):
    """Attach an "id" to every question entry of the FAQ data, in place.

    Walks ``data["faq_data"]`` (a sequence of category entries), and for each
    item of a category's "questions" list stores the value returned by
    ``generate_document_id`` under the "id" key of that item.

    Args:
        data: Mapping with a "faq_data" list; each element provides
            "category" and "questions", and each question provides
            "question" and "answer".

    Returns:
        None. ``data`` is modified in place.
    """
    for section in data["faq_data"]:
        for entry in section["questions"]:
            entry["id"] = generate_document_id({
                "category": section["category"],
                "question": entry["question"],
                "answer": entry["answer"],
            })

def save_data_with_ids(data, file_name):
    """Write ``data`` to ``file_name`` as JSON indented by 4 spaces.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(file_name, mode="w") as out_file:
        json.dump(data, out_file, indent=4)

# Load the existing JSON data.
# The file is decoded as UTF-8 explicitly: without an encoding argument open()
# uses the platform's default encoding, so FAQ text containing non-ASCII
# characters could be mis-decoded (or fail to load) on non-UTF-8 systems.
with open('faq_data.json', 'r', encoding='utf-8') as f:
    faq_data = json.load(f)

# Add document IDs (each question entry in faq_data gets an "id" key, in place)
add_ids_to_faqs(faq_data)

# Save the updated data with IDs
save_data_with_ids(faq_data, 'faq_data_with_ids.json')

print("FAQ data with IDs saved to faq_data_with_ids.json")


FAQ data with IDs saved to faq_data_with_ids.json


In [32]:
# Mistral API client; the key is read from the MISTRAL_API_KEY environment
# variable (os.getenv yields None when the variable is not set).
client = Mistral(
    api_key=os.getenv("MISTRAL_API_KEY"),
)

In [33]:
# Prompt text used to ask the model for user-style questions about one FAQ
# record. The {question} and {answer} placeholders are filled with
# str.format in generate_questions. The prompt asks for the reply as a bare
# JSON array of strings (no code fences). The trailing .strip() drops the
# leading and trailing newlines that come from the triple-quoted literal.
prompt_template = """
You emulate a user who wants to ask question to the NomadFood chatbot about the company.
Formulate 5 questions this user might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:


question: {question}
answer: {answer}

Provide the output in parsable JSON without using code blocks:

["question1", "question2"]
""".strip()

In [34]:
def generate_questions(doc):
    """Ask the model for questions about one FAQ record.

    ``prompt_template`` is formatted with the entries of ``doc`` and sent as a
    single user message to the "mistral-large-latest" model.

    Args:
        doc: Mapping whose keys are passed to ``prompt_template.format`` as
            keyword arguments (the template uses "question" and "answer").

    Returns:
        The text content of the first returned choice, with surrounding
        whitespace stripped. The text is returned as-is; it is not parsed here.
    """
    user_message = {"role": "user", "content": prompt_template.format(**doc)}
    completion = client.chat.complete(
        model="mistral-large-latest",
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

In [44]:
import json
import pandas as pd
import pickle
import re
from tqdm.auto import tqdm

# Function to clean improperly escaped JSON strings
def clean_escaped_json_string(json_string):
    """Return ``json_string`` in a form ``json.loads`` can parse, where possible.

    Three cases are handled:

    * The text is already valid JSON that is not a string (list, dict, ...):
      it is returned unchanged. Blindly replacing ``\\"`` with ``"`` here would
      break valid payloads whose strings contain escaped quotes.
    * The text is a JSON string literal, i.e. the payload was JSON-encoded
      twice: the decoded inner text is returned.
    * The text is not valid JSON: the textual clean-up is applied as a
      fallback - every ``\\"`` becomes ``"`` and one pair of surrounding
      double quotes is removed.

    Args:
        json_string: Raw text to clean.

    Returns:
        The cleaned text (still a string; it is not parsed into objects).
    """
    try:
        decoded = json.loads(json_string)
    except ValueError:
        # Not parsable as-is; try the textual repair below.
        pass
    else:
        # A decoded str means the payload was double-encoded; otherwise the
        # input is already well-formed JSON and must be left untouched.
        return decoded if isinstance(decoded, str) else json_string

    # Remove excessive escaping of quotes (replacing \" with ")
    cleaned_string = re.sub(r'\\"', '"', json_string)

    # Remove extra surrounding quotes if they exist
    if cleaned_string.startswith('"') and cleaned_string.endswith('"'):
        cleaned_string = cleaned_string[1:-1]

    return cleaned_string

# Initialize results dictionary: question ID -> raw response text from generate_questions
results = {}

# Iterate over each category in faq_data
for category_data in tqdm(faq_data['faq_data']):
    for question_data in category_data['questions']:  # Iterate over questions in each category
        # question_data has to be a mapping here: the 'id' lookup raises
        # TypeError for a str, so no string clean-up step is needed (or reachable).
        doc_id = question_data['id']  # Get the ID of each question

        # Generate questions based on the record; the reply is stored unparsed
        raw_response = generate_questions(question_data)
        results[doc_id] = raw_response  # Store the result with the question ID as the key

# Save the results
with open('results.bin', 'wb') as f_out:
    pickle.dump(results, f_out)

# Load the results later if needed
# NOTE: pickle.load can execute arbitrary code - only load a results.bin that
# was written by this notebook, never a file from an untrusted source.
with open('results.bin', 'rb') as f_in:
    results = pickle.load(f_in)


  0%|          | 0/4 [00:00<?, ?it/s]

In [53]:
# Display the raw responses: a dict keyed by question ID whose values are the
# unparsed text returned by generate_questions.
results

{'e063fa64': '[\n"What kinds of food products does NomadFood offer?",\n"Can you tell me about the different categories of food products that NomadFood sells?",\n"Who are the suppliers for NomadFood\'s dairy products?",\n"Does NomadFood carry both fresh and frozen meat products?",\n"What types of packaged foods can I find in NomadFood\'s inventory?"\n]',
 '2bdb70ae': '[\n"Are all of NomadFood\'s products organic?",\n"Does NomadFood offer non-organic products?",\n"What certifies NomadFood\'s organic products?",\n"How are NomadFood\'s organic products grown?",\n"What alternatives does NomadFood provide for budget-conscious consumers?"\n]',
 'e7bf8de2': '[\n"How does NomadFood ensure the freshness of their products during transportation?",\n"What measures has NomadFood taken to control the temperature of their products during storage?",\n"Can you explain how NomadFood\'s supply chain contributes to product freshness?",\n"How does NomadFood\'s logistics team help keep perishable items fresh

In [55]:
import json
import pandas as pd
import re

# Filled by the parsing loop below: document ID -> content parsed from that
# document's raw JSON response (expected to be a list of questions).
parsed_results = {}

# Function to clean improperly escaped JSON strings
def clean_escaped_json_string(json_string):
    """Return ``json_string`` in a form ``json.loads`` can parse, where possible.

    Three cases are handled:

    * The text is already valid JSON that is not a string (list, dict, ...):
      it is returned unchanged. Blindly replacing ``\\"`` with ``"`` here would
      break valid payloads whose strings contain escaped quotes.
    * The text is a JSON string literal, i.e. the payload was JSON-encoded
      twice: the decoded inner text is returned.
    * The text is not valid JSON: the textual clean-up is applied as a
      fallback - every ``\\"`` becomes ``"`` and one pair of surrounding
      double quotes is removed.

    Args:
        json_string: Raw text to clean.

    Returns:
        The cleaned text (still a string; it is not parsed into objects).
    """
    try:
        decoded = json.loads(json_string)
    except ValueError:
        # Not parsable as-is; try the textual repair below.
        pass
    else:
        # A decoded str means the payload was double-encoded; otherwise the
        # input is already well-formed JSON and must be left untouched.
        return decoded if isinstance(decoded, str) else json_string

    # Remove excessive escaping of quotes (replacing \" with ")
    cleaned_string = re.sub(r'\\"', '"', json_string)

    # Remove extra surrounding quotes if they exist
    if cleaned_string.startswith('"') and cleaned_string.endswith('"'):
        cleaned_string = cleaned_string[1:-1]

    return cleaned_string

# Iterate over the results: turn each raw response into a list of questions
for doc_id, json_questions in results.items():
    try:
        # Clean the escaped JSON string
        cleaned_json_questions = clean_escaped_json_string(json_questions)

        # Try to parse the cleaned json_questions
        parsed_content = json.loads(cleaned_json_questions)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON for doc_id {doc_id}: {e}")
        continue

    # If the parsed_content is a dictionary, extract the questions key
    if isinstance(parsed_content, dict) and "questions" in parsed_content:
        parsed_content = parsed_content["questions"]

    # Anything other than a list cannot be a list of questions. Iterating it
    # anyway would add dict keys or single characters of a string as rows,
    # so such responses are reported and skipped.
    if not isinstance(parsed_content, list):
        print(
            f"Unexpected JSON structure for doc_id {doc_id}: "
            f"expected a list of questions, got {type(parsed_content).__name__}"
        )
        continue

    parsed_results[doc_id] = parsed_content

# Collect questions and their associated document IDs as (question, doc_id) rows
final_results = [
    (question, doc_id)
    for doc_id, questions in parsed_results.items()
    for question in questions
]

# Create DataFrame and save to Excel
df = pd.DataFrame(final_results, columns=['question', 'document'])
df.to_excel('ground-truth-data.xlsx', index=False)
