<a href="https://colab.research.google.com/github/alexk2206/tds_capstone/blob/Alex-DEV/QA_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

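**Generation of the Q and A Dataset**

This notebook loads the question/answer combinations, asks Gemini to phrase the intended answers of a random sample as natural first-person replies, and saves the results as JSON.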

In [1]:
import pandas as pd
import random
import json

import google.generativeai as genai
from IPython.display import display, Markdown
from google.colab import userdata
import time

In [2]:
# Location of the raw question / answer combinations in the project repository.
url = (
    "https://raw.githubusercontent.com/alexk2206/tds_capstone/"
    "refs/heads/main/datasets/answer_combinations.json"
)

# Read the JSON straight from the URL into a DataFrame.
answer_combinations = pd.read_json(url)
print(f"answer_combinations shape: {answer_combinations.shape}")

# Show a random handful of rows as the cell output.
answer_combinations.sample(n=15)

answer_combinations shape: (1381, 4)


Unnamed: 0,question,type,options,intended_answer
940,Who to copy in follow up,MULTI_SELECT,"[Stephan Maier, Joachim Wagner, Erik Schneider...","[Stephan Maier, Marisa Peng, Johannes Wagner, ..."
211,Customer group,SINGLE_SELECT,"[End User, Wholesaler, Distributor, Consultant...",[Wholesaler]
128,Would you like to receive marketing informatio...,SINGLE_SELECT,"[Yes, No]",[No]
904,Any additional notes?,TEXT,[What additional information would you like to...,[Add additional information here]
473,Searches a solution for,MULTI_SELECT,"[Scan business cards, Clean up CRM, Extract da...","[Scan business cards, Extract data from emails..."
826,Which language is wanted for communication?,SINGLE_SELECT,"[German, Italian, Japanese , English, Spanish]",[Spanish]
905,Which language is wanted for communication?,SINGLE_SELECT,"[German, Italian, Japanese , English, Spanish]",[Italian]
1371,What is the type of contact?,MULTI_SELECT,"[Existing customer, Supplier, New customer / P...","[Existing customer, Supplier]"
1157,What is the type of contact?,MULTI_SELECT,"[Existing customer, Supplier, New customer / P...","[Existing customer, Supplier, Press / media]"
884,Customer type,SINGLE_SELECT,"[New customer, Existing customer, Partner, App...",[Partner]


In [3]:
# The API key is read from Colab's user secrets instead of being hard-coded.
key = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=key)

# Shared model handle used by every generate_* helper in this notebook.
model = genai.GenerativeModel(model_name="gemini-1.5-flash")

In [14]:
# Response length budget: quoted to the model inside each prompt ("Use up to
# ... tokens") and also passed as GenerationConfig(max_output_tokens=...).
max_output_tokens = 64

def generate_selection_answer_easy(question, intended_answer):
  """Have the module-level `model` phrase the selected option(s) as a spoken reply.

  Args:
    question: Question text interpolated into the prompt.
    intended_answer: Selected option(s); interpolated through its string form.

  Returns:
    dict with "answer" (the model's text with surrounding whitespace stripped)
    and "difficulty" fixed to "easy".
  """
  prompt = f"""
  You are asked a question, and you need to provide a natural, conversational answer in the first person.
  Your response should mention **all** the options provided in the intended answer. Act like you do not know which options there are.
  Be concise but clear, and avoid unnecessary elaboration. Use up to {max_output_tokens} tokens.
  Question: {question}\n
  Options: {intended_answer}\n
  Answer as a sentence, mentioning and explaining all the provided options:
  """

  generation_config = genai.GenerationConfig(
      max_output_tokens=max_output_tokens,
      temperature=2,
  )
  reply = model.generate_content(contents=prompt, generation_config=generation_config)
  result = {"answer": reply.text.strip(), "difficulty": "easy"}

  # Fixed pause after every request; presumably a rate-limit guard -- confirm.
  time.sleep(5)

  return result

def generate_number_answer_easy(question, intended_answer):
  """Have the module-level `model` state the given number in a spoken-style reply.

  The prompt frames the exchange as someone giving out their phone number.

  Args:
    question: Question text interpolated into the prompt.
    intended_answer: Value to mention; interpolated through its string form.

  Returns:
    dict with "answer" (the model's text with surrounding whitespace stripped)
    and "difficulty" fixed to "easy".
  """
  prompt = f"""
  You are being asked for contact information, and your response should be clear and concise, as if you're giving someone your phone number and how you can be reached in a conversation.
  Mention the provided phone number and ensure your response sounds natural and professional.
  Your answer should be in the first person, present tense, and only include the relevant details. Use up to {max_output_tokens} tokens.
  Question: {question}\n
  Options: {intended_answer}\n
  Answer as a sentence, providing the phone number and any relevant details:
  """

  settings = genai.GenerationConfig(
      max_output_tokens=max_output_tokens,
      temperature=2,
  )
  reply = model.generate_content(contents=prompt, generation_config=settings)
  spoken = reply.text.strip()

  # Fixed pause after every request; presumably a rate-limit guard -- confirm.
  time.sleep(5)

  return dict(answer=spoken, difficulty="easy")


def generate_freetext_answer_easy(question, intended_answer):
  """Have the module-level `model` answer an "any additional notes?" style question.

  The prompt lets the model either add information or politely say there is
  nothing to add.

  Args:
    question: Question text interpolated into the prompt.
    intended_answer: Interpolated into the prompt through its string form.

  Returns:
    dict with "answer" (the model's text with surrounding whitespace stripped)
    and "difficulty" fixed to "easy".
  """
  prompt = f"""
  You are being asked if you have any additional notes or information to share.
  Your response should sound natural, in the first person, and can be either brief or more detailed, depending on the situation.
  You can provide additional information but you don't have to and mention it clearly and politely.
  If there isn't anything else to add, express that in a conversational manner. Use up to {max_output_tokens} tokens.
  Question: {question}\n
  Options: {intended_answer}
  Answer as a sentence, providing any additional information or politely stating that there's nothing else to add:
  """

  limits = genai.GenerationConfig(
      max_output_tokens=max_output_tokens,
      temperature=2,
  )
  reply = model.generate_content(contents=prompt, generation_config=limits)
  outcome = {"answer": reply.text.strip(), "difficulty": "easy"}

  # Fixed pause after every request; presumably a rate-limit guard -- confirm.
  time.sleep(5)

  return outcome


def generate_date_answer_easy(question, intended_answer):
  """Have the module-level `model` mention the given date in a spoken-style reply.

  The prompt asks for the date to be phrased as if suggesting a meeting.

  Args:
    question: Question text interpolated into the prompt.
    intended_answer: Date to mention; interpolated through its string form.

  Returns:
    dict with "answer" (the model's text with surrounding whitespace stripped)
    and "difficulty" fixed to "easy".
  """
  prompt = f"""
  You are asked a question about a specific date, and you need to provide a natural, conversational answer in the first person.
  Include the date from the intended answer in your response, phrasing it naturally as if you're suggesting a meeting.
  Be concise but clear, and use up to {max_output_tokens} tokens.
  Question: {question}\n
  Intended Answer: {intended_answer}\n
  Context: Provide a conversational response mentioning the date in a natural way:
  """

  config = genai.GenerationConfig(
      max_output_tokens=max_output_tokens,
      temperature=2,
  )
  reply = model.generate_content(contents=prompt, generation_config=config)
  phrased = reply.text.strip()

  # Fixed pause after every request; presumably a rate-limit guard -- confirm.
  time.sleep(5)

  return {"answer": phrased, "difficulty": "easy"}

In [15]:
def generate_freetext_answer_medium(question, intended_answer):
  """Have the module-level `model` answer an "any additional notes?" style question.

  The prompt tells the model that it should provide additional information,
  while still allowing an occasional "nothing to add" reply.

  Args:
    question: Question text interpolated into the prompt.
    intended_answer: Interpolated into the prompt through its string form.

  Returns:
    dict with "answer" (the model's text with surrounding whitespace stripped)
    and "difficulty" fixed to "medium".
  """
  prompt = f"""
  You are being asked if you have any additional notes or information to share.
  Your response should sound natural, in the first person, and can be either brief or more detailed, depending on the situation.
  You should provide additional information and mention it clearly and politely.
  If there isn't anything else to add (which is sometimes the case), express that in a conversational manner. Use up to {max_output_tokens} tokens.
  Question: {question}\n
  Options: {intended_answer}
  Answer as a sentence, providing any additional information or politely stating that there's nothing else to add:
  """

  generation_config = genai.GenerationConfig(
      max_output_tokens=max_output_tokens,
      temperature=2,
  )
  reply = model.generate_content(contents=prompt, generation_config=generation_config)
  outcome = dict(answer=reply.text.strip(), difficulty="medium")

  # Fixed pause after every request; presumably a rate-limit guard -- confirm.
  time.sleep(5)

  return outcome

In [16]:
# Number of rows handed to generate_answer_for_row so far (progress output only).
cycle_count = 0

def generate_answer_for_row(row):
    """Route one dataset row to the generator that matches its question type.

    Increments the module-level `cycle_count` and prints it as a progress line
    before dispatching.

    Args:
        row: Mapping-like row providing 'question', 'intended_answer' and 'type'.

    Returns:
        Whatever the selected generator returns, or a placeholder dict with
        answer "Unknown question type" and difficulty "unknown" when the type
        is not one of SINGLE_SELECT, MULTI_SELECT, NUMBER, TEXT or DATE.
    """
    global cycle_count
    cycle_count = cycle_count + 1
    print(f"Cycle: {cycle_count}")

    question, intended_answer, kind = row['question'], row['intended_answer'], row['type']

    # Both select variants share one prompt.
    if kind in ('SINGLE_SELECT', 'MULTI_SELECT'):
        return generate_selection_answer_easy(question, intended_answer)
    if kind == 'NUMBER':
        return generate_number_answer_easy(question, intended_answer)
    if kind == 'TEXT':
        return generate_freetext_answer_easy(question, intended_answer)
    if kind == 'DATE':
        return generate_date_answer_easy(question, intended_answer)

    return {"answer": "Unknown question type", "difficulty": "unknown"}

In [17]:
# Sampling
# `sample_type` and `sample_question` are only read by the optional filters
# below, which are commented out; by default the full table is the pool.
sample_type = "TEXT"
sample_size = 100
sample_question = "Size of the trade fair team (on average)"

#answer_combinations_filtered = answer_combinations[answer_combinations['type'] == sample_type]
#answer_combinations_filtered = answer_combinations[answer_combinations['question'] == sample_question]
answer_combinations_filtered = answer_combinations.copy()

# Cap the sample at the pool size so a narrow filter cannot make .sample() fail.
sampled_questions = answer_combinations_filtered.sample(n=min(sample_size, len(answer_combinations_filtered))).reset_index(drop=True)

# Restart the progress counter used by generate_answer_for_row; without this a
# re-run of the cell keeps counting from the previous run ("Cycle: 101", ...).
cycle_count = 0

# One model call per sampled row: "answer" is stored as `context`,
# "difficulty" as `difficulty`.
sampled_questions[['context', 'difficulty']] = sampled_questions.apply(lambda row: pd.Series(generate_answer_for_row(row)), axis=1)

sampled_questions

Cycle: 1
Cycle: 2
Cycle: 3
Cycle: 4
Cycle: 5
Cycle: 6
Cycle: 7
Cycle: 8
Cycle: 9
Cycle: 10
Cycle: 11
Cycle: 12
Cycle: 13
Cycle: 14
Cycle: 15
Cycle: 16
Cycle: 17
Cycle: 18
Cycle: 19
Cycle: 20
Cycle: 21
Cycle: 22
Cycle: 23
Cycle: 24
Cycle: 25
Cycle: 26
Cycle: 27
Cycle: 28
Cycle: 29
Cycle: 30
Cycle: 31
Cycle: 32
Cycle: 33
Cycle: 34
Cycle: 35
Cycle: 36
Cycle: 37
Cycle: 38
Cycle: 39
Cycle: 40
Cycle: 41
Cycle: 42
Cycle: 43
Cycle: 44
Cycle: 45
Cycle: 46
Cycle: 47
Cycle: 48
Cycle: 49
Cycle: 50
Cycle: 51
Cycle: 52
Cycle: 53
Cycle: 54
Cycle: 55
Cycle: 56
Cycle: 57
Cycle: 58
Cycle: 59
Cycle: 60
Cycle: 61
Cycle: 62
Cycle: 63
Cycle: 64
Cycle: 65
Cycle: 66
Cycle: 67
Cycle: 68
Cycle: 69
Cycle: 70
Cycle: 71
Cycle: 72
Cycle: 73
Cycle: 74
Cycle: 75
Cycle: 76
Cycle: 77
Cycle: 78
Cycle: 79
Cycle: 80
Cycle: 81
Cycle: 82
Cycle: 83
Cycle: 84
Cycle: 85
Cycle: 86
Cycle: 87
Cycle: 88
Cycle: 89
Cycle: 90
Cycle: 91
Cycle: 92
Cycle: 93
Cycle: 94
Cycle: 95
Cycle: 96
Cycle: 97
Cycle: 98
Cycle: 99
Cycle: 100


Unnamed: 0,question,type,options,intended_answer,context,difficulty
0,What type of company is it?,SINGLE_SELECT,"[Construction company, Craft enterprises, Scaf...",[Construction company],"It's a construction company; that is, they bui...",easy
1,Would you like to receive marketing informatio...,SINGLE_SELECT,"[Yes, No]",[Yes],"Yes, I'd like to receive marketing emails; tha...",easy
2,What is the size of your company?,SINGLE_SELECT,"[1-10, 11-50, 51-200, 201-2000, larger than 2000]",[1-10],We're a small company; I'd say we're in the 1-...,easy
3,Next steps,SINGLE_SELECT,"[Offer, Meeting, Call]",[Meeting],"Next, I need to schedule a meeting to discuss ...",easy
4,What kind of follow up is planned,MULTI_SELECT,"[Email, Phone, Schedule a Visit, No action]","[Email, Phone, Schedule a Visit, No action]","I might email you, call you on the phone, sche...",easy
...,...,...,...,...,...,...
95,What is the size of your company?,SINGLE_SELECT,"[1-10, 11-50, 51-200, 201-2000, larger than 2000]",[larger than 2000],I work for a pretty big company; it's larger t...,easy
96,Data processing consent,SINGLE_SELECT,"[Yes, No]",[Yes],"Yes, I consent to my data being processed.",easy
97,What type of company is it?,SINGLE_SELECT,"[Construction company, Craft enterprises, Scaf...",[Craft enterprises],"It's a craft enterprises company, meaning they...",easy
98,Who to copy in follow up,MULTI_SELECT,"[Stephan Maier, Joachim Wagner, Erik Schneider...","[Joachim Wagner, Jessica Hanke, Domiki Stein]","I should probably copy Joachim Wagner, Jessica...",easy


In [18]:
# Persist the generated sample as a pretty-printed JSON array of records.
sampled_questions.to_json(
    'sampled_qa_dataset_easy.json',
    orient='records',
    indent=4,
)

In [7]:
# QA Dataset
# Work on an independent copy so the loaded `answer_combinations` stays untouched.
qa_dataset = answer_combinations.copy(deep=True)

# Cell output: (rows, columns) of the copy.
qa_dataset.shape

(1381, 4)

In [None]:
# Eyeball 25 random rows of the dataset.
qa_dataset.sample(n=25)

In [None]:
# Write the whole dataset as one pretty-printed JSON array (not JSON Lines).
qa_dataset.to_json(
    'qa_dataset_large.json',
    orient='records',
    lines=False,
    indent=4,
)