<a href="https://colab.research.google.com/github/alexk2206/tds_capstone/blob/Alex-DEV/QA_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Generation of the Q and A Dataset**

In [27]:
import pandas as pd
import random
import json

import google.generativeai as genai
from IPython.display import display, Markdown
from google.colab import userdata
import time

import concurrent.futures


In [13]:
url ="https://raw.githubusercontent.com/alexk2206/tds_capstone/refs/heads/main/datasets/answer_combinations.json"

answer_combinations = pd.read_json(url)
print(f"answer_combinations shape: {answer_combinations.shape}")
answer_combinations.sample(15)

answer_combinations shape: (1381, 4)


Unnamed: 0,question,type,options,intended_answer
338,Customer type,SINGLE_SELECT,"[New customer, Existing customer, Partner, App...",[New customer]
1306,Any additional notes?,TEXT,[What additional information would you like to...,[Add additional information here]
454,What is the type of contact?,MULTI_SELECT,"[Existing customer, Supplier, New customer / P...","[New customer / Prospect, Press / media, Compe..."
242,What is the size of your company?,SINGLE_SELECT,"[1-10, 11-50, 51-200, 201-2000, larger than 2000]",[1-10]
1224,What products are you interested in?,MULTI_SELECT,"[Automotive radar target simulation, Noise fig...","[Automotive radar target simulation, High-spee..."
1041,What kind of follow up is planned,MULTI_SELECT,"[Email, Phone, Schedule a Visit, No action]","[Email, Phone, Schedule a Visit]"
333,Customer satisfaction,SINGLE_SELECT,"[Very satisfied, Satisfied, Unsatisfied, Very ...",[Very satisfied]
606,What products are you interested in?,MULTI_SELECT,"[Automotive radar target simulation, Noise fig...","[Automotive radar target simulation, Noise fig..."
314,What kind of follow up is planned,MULTI_SELECT,"[Email, Phone, Schedule a Visit, No action]",[Phone]
1160,What is the size of your company?,SINGLE_SELECT,"[1-10, 11-50, 51-200, 201-2000, larger than 2000]",[51-200]


In [14]:
key = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=key)
model = genai.GenerativeModel("gemini-1.5-flash")

In [15]:
max_output_tokens = 64

def generate_selection_answer_easy(question, intended_answer):
  prompt = f"""
  You are asked a question, and you need to provide a natural, conversational answer in the first person.
  Your response should mention **all** the options provided in the intended answer.
  If multiple options are given, briefly explain why each one is relevant, using the present tense.
  Be concise but clear, and avoid unnecessary elaboration.
  Question: {question}
  Options: {intended_answer}
  Answer as a sentence, mentioning and explaining all the provided options:
  """

  #prompt = f"""
  #You are being asked a question and must answer in a way that sounds like you're speaking in the first person.
  #Your response should be varied and sound natural, like you're being interviewed without talking unnecessarily much. Use actual numbers instead of writing them out, but only if necessary.
  #Answer in present tense like you are having a conversation, use up to {max_output_tokens} tokens but keep it short. When the intended answer is a range, pick a number inbetween.
  #Question: {question}\n
  #Answer: {intended_answer}\n
  #Answer as a sentence:
  #"""

  response = model.generate_content(
      contents = prompt,
      generation_config = genai.GenerationConfig(
          max_output_tokens=max_output_tokens,
          temperature=2)
    )

  answer = response.text.strip()

  time.sleep(4)

  return {"answer": answer, "difficulty": "easy"}

def generate_number_answer_easy(question, intended_answer):
  prompt = f"""
  You are being asked for contact information, and your response should be clear and concise, as if you're giving someone your phone number in a conversation.
  Mention the provided phone number and ensure your response sounds natural and professional.
  Your answer should be in the first person, present tense, and only include the relevant details.
  Question: {question}
  Options: {intended_answer}
  Answer as a sentence, providing the phone number and any relevant details:
  """

  response = model.generate_content(
      contents = prompt,
      generation_config = genai.GenerationConfig(
          max_output_tokens=max_output_tokens,
          temperature=2)
    )

  answer = response.text.strip()
  time.sleep(4)

  return {"answer": answer, "difficulty": "easy"}


def generate_freetext_answer_easy(question, intended_answer):
  prompt = f"""
  You are being asked if you have any additional notes or information to share.
  Your response should sound natural, in the first person, and can be either brief or more detailed, depending on the situation.
  If there's additional information, mention it clearly and politely. If there isn't anything else to add, express that in a conversational manner.
  Question: {question}
  Options: {intended_answer}
  Answer as a sentence, providing any additional information or politely stating that there's nothing else to add:
  """

  response = model.generate_content(
      contents = prompt,
      generation_config = genai.GenerationConfig(
          max_output_tokens=max_output_tokens,
          temperature=2)
    )

  answer = response.text.strip()
  time.sleep(4)

  return {"answer": answer, "difficulty": "easy"}



In [28]:
def generate_answer_for_row(row):
    question = row['question']
    intended_answer = row['options']
    question_type = row['type']

    if question_type == 'SINGLE_SELECT':
        return generate_selection_answer_easy(question, intended_answer)
    elif question_type == 'MULTI_SELECT':
        return generate_number_answer_easy(question, intended_answer)
    elif question_type == 'TEXT':
        return generate_freetext_answer_easy(question, intended_answer)
    else:
        return {"answer": "Unknown question type", "difficulty": "unknown"}

In [29]:
def parallelize_apply(df, func, num_workers=4):
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        results = list(executor.map(func, [row for _, row in df.iterrows()]))

    # Combine results into a new DataFrame with 'context' and 'difficulty' columns
    df[['context', 'difficulty']] = pd.DataFrame(results)
    return df

In [30]:
# Sampling
sample_type = "NUMBER"
sample_size = 10
sample_question = "Size of the trade fair team (on average)"

#answer_combinations_filtered = answer_combinations[answer_combinations['type'] == sample_type]
#answer_combinations_filtered = answer_combinations[answer_combinations['question'] == sample_question]
answer_combinations_filtered = answer_combinations.copy()

sampled_questions = answer_combinations_filtered.sample(n=min(sample_size, len(answer_combinations_filtered))).reset_index(drop=True)

#sampled_questions[['context', 'difficulty']] = sampled_questions.apply(lambda row: pd.Series(generate_answer_for_row(row)), axis=1)

sampled_questions = parallelize_apply(sampled_questions, generate_answer_for_row, num_workers=8)

sampled_questions

KeyboardInterrupt: 

In [None]:
# QA Dataset

qa_dataset = answer_combinations.copy()

qa_dataset

Unnamed: 0,question,type,options
0,CRM-System,SINGLE_SELECT,[CAS]
1,CRM-System,SINGLE_SELECT,[Microsoft Dynamics]
2,CRM-System,SINGLE_SELECT,[HubSpot]
3,CRM-System,SINGLE_SELECT,[Salesforce]
4,CRM-System,SINGLE_SELECT,[Adito]
...,...,...,...
577,Would you like to receive marketing informatio...,SINGLE_SELECT,[Yes]
578,Would you like to receive marketing informatio...,SINGLE_SELECT,[Yes]
579,Would you like to receive marketing informatio...,SINGLE_SELECT,[No]
580,Would you like to receive marketing informatio...,SINGLE_SELECT,[Yes]


In [None]:
qa_dataset['context'] = qa_dataset.apply(generate_answer_for_row, axis=1)

In [None]:
qa_dataset.sample(25)

Unnamed: 0,question,type,options,context
367,What is the type of contact?,MULTI_SELECT,"[Existing customer, Supplier, Competitor]","It's an existing customer, actually. We've be..."
397,What kind of follow up is planned,MULTI_SELECT,"[Phone, Schedule a Visit]","I'm planning a phone call, and then, if needed..."
439,What products are you interested in?,MULTI_SELECT,[Automotive radar target simulation],"Oh, I'm really focused on automotive radar tar..."
560,Would you like to receive marketing informatio...,SINGLE_SELECT,[Yes],"Sure, I'd like that. Yes."
274,Size of the trade fair team (on average),SINGLE_SELECT,[21-30],"It fluctuates, but I'd say we usually have bet..."
357,What is the size of your company?,SINGLE_SELECT,[1-10],"We're a small team, just between 1 and 10 people."
426,What products are you interested in?,MULTI_SELECT,"[Automotive radar target simulation, Double-Pu...","Right now, I'm really focused on automotive ra..."
227,Searches a solution for,MULTI_SELECT,"[Scan business cards, Clean up CRM, Extract da...",I'm working on a few things right now: scannin...
457,What type of company is it?,SINGLE_SELECT,[Construction company],"Oh, it's a construction company. We build thi..."
268,Size of the trade fair team (on average),SINGLE_SELECT,[6-10],We usually have a team of between 6 and 10 peo...


In [None]:
qa_dataset.to_json('qa_dataset_large.json', orient='records', lines=False, indent=4)