In [None]:
# importing chunked data list from other file
from new_try_latest import  chunked_data

# choosing random 1000 chunks to generate questions from

In [None]:
import random

# Fixed seed so a fresh Restart & Run All draws the same chunks every time.
SAMPLE_SEED = 42
# Requested number of chunks. The actual sample is capped at len(chunked_data)
# because random.sample raises ValueError when asked for more items than exist.
SAMPLE_SIZE = 1000

# A dedicated Random instance keeps the global `random` state untouched.
chunked_sample = random.Random(SAMPLE_SEED).sample(
    chunked_data, min(SAMPLE_SIZE, len(chunked_data))
)

In [None]:
from qdrant_client import QdrantClient

# Module-level Qdrant client pointed at http://localhost:6333.
# retrieve_similar_documents issues its searches through this object.
qdrant_client = QdrantClient("http://localhost:6333")

In [None]:

def retrieve_similar_documents(query_embedding, collection_name="unified_collection", top_k=5):
    """
    Retrieve the top-k most similar documents from Qdrant for the given query embedding.

    Args:
        query_embedding: Non-empty list used as the query vector. Anything else
            (None, empty list, non-list) is rejected with a printed message.
        collection_name: Name of the Qdrant collection to search.
        top_k: Maximum number of hits requested from Qdrant.

    Returns:
        A tuple ``(similar_docs, context_chunks, context)``:
          * similar_docs   - one dict per hit with "content", "metadata"
                             ("type" and "url") and "score".
          * context_chunks - the raw payload of every hit, in result order.
          * context        - the hits' string contents joined with newlines.
        For an invalid embedding the result is ``([], [], "")``.
    """
    if not query_embedding or not isinstance(query_embedding, list):
        print("Invalid query embedding. Ensure it is a list of floats.")
        return [], [], ""

    results = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=top_k,  # Retrieve the top-k matches
    )

    similar_docs = []
    context_chunks = []
    context_parts = []
    for result in results:
        # A hit may carry no payload at all; treat that like an empty payload
        # for field lookups, but still report the raw payload to the caller.
        payload = result.payload or {}
        content = payload.get("content")

        similar_docs.append({
            "content": content,  # The actual text content
            "metadata": {
                "type": payload.get("type"),
                "url": payload.get("url"),
            },
            "score": result.score,  # Similarity score
        })
        context_chunks.append(result.payload)

        # Only string content can be joined; hits without it are skipped here
        # instead of failing the whole lookup with KeyError/TypeError.
        if isinstance(content, str):
            context_parts.append(content)

    return similar_docs, context_chunks, "\n".join(context_parts)



In [None]:
def _extract_qa_pairs(section):
    """Return the (question, answer) pairs found in one '#### Content' section."""
    pairs = []
    question = None
    answer_lines = None  # None until an **Answer:** line is seen for the current question

    def _store(current_question, current_answer_lines):
        # A pair is kept only when both the question and the answer are non-empty.
        answer = "\n".join(current_answer_lines or []).strip()
        if current_question and answer:
            pairs.append((current_question, answer))

    for raw_line in section.strip().split("\n"):
        line = raw_line.strip()
        if "**Question:**" in line:
            # A new question closes whatever pair was being collected.
            _store(question, answer_lines)
            question = line.split("**Question:**")[1].strip()
            answer_lines = None
        elif "**Answer:**" in line:
            answer_lines = [line.split("**Answer:**")[1].strip()]
        elif answer_lines is not None and line and not line.startswith(("#", "---")):
            # Continuation of a multi-line answer (e.g. a bullet list); headings
            # and horizontal rules are not part of the answer.
            answer_lines.append(line)

    _store(question, answer_lines)
    return pairs


def parse_completion(completion, batch):
    """
    Turn the model's markdown reply into question/answer/context records.

    The reply is split on '#### Content'; the n-th section after the first
    split is matched to ``batch[n]``. Sections beyond the end of the batch and
    sections without a complete Q&A pair are ignored.

    Args:
        completion: Text returned by the model.
        batch: List of chunk dicts; ``batch[n].get("embedding")`` is used as
            the query vector for the context lookup.

    Returns:
        A list of dicts with "question", "answer" and "context" keys, where
        "context" is the context_chunks value from retrieve_similar_documents.
        Returns [] if anything raises while parsing.
    """
    results = []
    try:
        # Split by '#### Content' to get individual sections
        sections = completion.split("#### Content")

        for i, section in enumerate(sections[1:]):  # Skip the initial empty or header part
            qa_pairs = _extract_qa_pairs(section)
            if not qa_pairs or i >= len(batch):
                continue

            # The lookup depends only on the chunk, so do it once per section
            # rather than once per Q&A pair.
            _, context_chunks, _ = retrieve_similar_documents(
                query_embedding=batch[i].get("embedding"),  # Assuming batch items have 'embedding'
                collection_name="unified_collection",
                top_k=5
            )
            for question, answer in qa_pairs:
                results.append({
                    "question": question,
                    "answer": answer,
                    "context": context_chunks,
                })

        return results
    except Exception as e:
        print(f"Error parsing completion: {e}")
        return []


In [None]:
import csv
import json
import os
import time

from openai import OpenAI

# Set up the OpenAI client. The key is read from the OPENAI_API_KEY environment
# variable so that no credential is stored in the notebook itself.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Number of chunks sent to the model in a single request
BATCH_SIZE = 10

# List to store generated triplets
triplets = []
def generate_qna_batch(batch, model="gpt-3.5-turbo", temperature=0.7):
    """
    Generate Q&A records for a batch of chunks with one chat-completion request.

    Args:
        batch: List of chunk dicts. Each must have a "content" key; the whole
            batch is also passed on to parse_completion.
        model: Chat model name sent to the API.
        temperature: Sampling temperature sent to the API.

    Returns:
        The value parse_completion returns for the model's reply, or [] when
        the batch is empty or any step raises.
    """
    if not batch:
        # Nothing to ask about, so skip the API request entirely.
        return []

    try:
        # Concatenate content from all chunks in the batch
        concatenated_content = "\n\n".join(
            f"Content {i+1}: {chunk['content']}" for i, chunk in enumerate(batch)
        )
        messages = [
            {"role": "system", "content": "You are an assistant that generates questions and answers for given content."},
            {
                "role": "user",
                "content": (
                    "Based on the following content, generate  question and answer for each section. "
                    "Do not limit the number of questions and answers; generate as many as relevant.\n\n"
                    "### Questions and Answers:\n\n"
                    "#### Content 1:\n"
                    "**Question:** [Insert question here]\n"
                    "**Answer:** [Insert answer here]\n\n"
                ),
            },
            {"role": "user", "content": f"{concatenated_content}"},
        ]
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
        )
        completion = response.choices[0].message.content.strip()
        return parse_completion(completion, batch)
    except Exception as e:
        # Best effort: a failed batch is reported and contributes no records.
        print(f"Error generating Q&A for batch: {e}")
        return []

    

        
      

# Walk the sample in BATCH_SIZE-sized slices and collect the triplets of every slice
for batch_number, i in enumerate(range(0, len(chunked_sample), BATCH_SIZE), start=1):
    print(f"Processing data {i}")

    batch = chunked_sample[i:i + BATCH_SIZE]
    batch_results = generate_qna_batch(batch)
    print("batch result is:", batch_results)

    # In-place extension of the shared triplets list
    triplets += batch_results

    print(f"Batch {batch_number} processed. Generated {len(batch_results)} Q&A pairs.")





In [None]:
def filter_vague_qna(triplets):
    """
    Return a new list without the triplets whose answer ends with a colon.

    An answer ending in ":" is treated as incomplete. All other triplets are
    kept, in their original order and as the same objects.
    """
    return [entry for entry in triplets if not entry["answer"].endswith(":")]

# Apply the filter: triplets_now keeps every generated triplet whose answer
# does not end with ":".
triplets_now = filter_vague_qna(triplets)


In [None]:
# Keep only the three exported fields of every filtered triplet
new_triplets = [
    {"question": each["question"], "answer": each["answer"], "context": each["context"]}
    for each in triplets_now
]

# Explicit UTF-8 so the output does not depend on the platform's default
# encoding (which can fail on non-ASCII text, e.g. cp1252 on Windows).
with open("qna_triplets.json", "w", encoding="utf-8") as json_file:
    json.dump(new_triplets, json_file, indent=4)

# newline="" is what the csv module expects for files it writes to.
# Note: the "context" value is not a string, so csv writes its str() form.
with open("qna_triplets_final.csv", "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.DictWriter(
        csv_file,
        fieldnames=["question", "answer", "context"]
    )
    writer.writeheader()
    writer.writerows(new_triplets)