<a href="https://colab.research.google.com/github/alexk2206/tds_capstone/blob/main/QA_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Generation of the Q and A Dataset**

In [None]:
import pandas as pd
import random
import json

# Raw JSON file with every question / answer-option combination.
url = "https://raw.githubusercontent.com/alexk2206/tds_capstone/refs/heads/main/answer_combinations.json"

# Read the file straight from the URL into a DataFrame and show it.
answer_combinations = pd.read_json(path_or_buf=url)
answer_combinations

Unnamed: 0,question,type,options
0,Data processing consent,SINGLE_SELECT,[Yes]
1,Data processing consent,SINGLE_SELECT,[No]
2,Customer group,SINGLE_SELECT,[End User]
3,Customer group,SINGLE_SELECT,[Wholesaler]
4,Customer group,SINGLE_SELECT,[Distributor]
...,...,...,...
8530,Searches a solution for,MULTI_SELECT,"[Clean up CRM, Extract data from emails, Impro..."
8531,Searches a solution for,MULTI_SELECT,"[Scan business cards, Clean up CRM, Extract da..."
8532,Next steps,SINGLE_SELECT,[Offer]
8533,Next steps,SINGLE_SELECT,[Meeting]


In [None]:
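# Earlier draft, superseded by adjust_question_amount below: it caps every group at a fixed max_amount.
# Note: it accepts a random_state argument but ignores it and always samples with random_state = 1.
#def limit_question_amount(df, column, max_amount, random_state):
#  return df.groupby(column, group_keys=False).apply(lambda group: group.sample(n=min(len(group), max_amount), random_state = 1))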


In [None]:
def adjust_question_amount(df, column, random_state):
    """Resample every group of ``column`` to a random size between 24 and 32 rows.

    For each distinct value of ``column`` a target size is drawn uniformly
    from 24..32 (inclusive). A group with fewer rows than its target is
    sampled with replacement, so some rows repeat; any other group is sampled
    without replacement. Rows whose ``column`` value is missing are not part
    of any group (pandas ``groupby`` default) and are therefore dropped.

    Args:
        df: Input DataFrame.
        column: Name of the column whose values define the groups.
        random_state: Seed used for both the target-size draws and the row
            sampling, so the result is reproducible.

    Returns:
        A new DataFrame holding the resampled groups, concatenated in sorted
        group-key order, with a fresh 0..n-1 index. An input without any
        groups yields an empty frame with the same columns.
    """
    # A private generator yields the same draws as seeding the ``random``
    # module, but does not reseed the process-wide RNG as a side effect.
    size_rng = random.Random(random_state)

    def adjust_group(group):
        max_amount = size_rng.randint(24, 32)
        # Upsample with replacement only when the group cannot fill the target.
        needs_replacement = len(group) < max_amount
        return group.sample(n=max_amount, replace=needs_replacement, random_state=random_state)

    # Iterating over the groups directly (instead of ``groupby(...).apply``)
    # calls ``adjust_group`` exactly once per group and avoids the pandas
    # deprecation warning about ``apply`` operating on the grouping column.
    resampled = [adjust_group(group) for _, group in df.groupby(column)]

    if not resampled:
        # ``pd.concat`` rejects an empty list; return an empty frame instead.
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(resampled).reset_index(drop=True)

In [None]:
# Resample each question group (seed 1), then report the size and show the result.
answer_combinations_limited = adjust_question_amount(
    df=answer_combinations,
    column='question',
    random_state=1,
)
print(answer_combinations_limited.shape)
answer_combinations_limited

(582, 3)


  return df.groupby(column, group_keys=False).apply(adjust_group).reset_index(drop=True)


Unnamed: 0,question,type,options
0,CRM-System,SINGLE_SELECT,[CAS]
1,CRM-System,SINGLE_SELECT,[Microsoft Dynamics]
2,CRM-System,SINGLE_SELECT,[HubSpot]
3,CRM-System,SINGLE_SELECT,[Salesforce]
4,CRM-System,SINGLE_SELECT,[Adito]
...,...,...,...
577,Would you like to receive marketing informatio...,SINGLE_SELECT,[Yes]
578,Would you like to receive marketing informatio...,SINGLE_SELECT,[Yes]
579,Would you like to receive marketing informatio...,SINGLE_SELECT,[No]
580,Would you like to receive marketing informatio...,SINGLE_SELECT,[Yes]


In [None]:
import google.generativeai as genai
from IPython.display import display, Markdown
from google.colab import userdata
import time

# Read the Gemini API key from the Colab secrets store.
key = userdata.get('GOOGLE_API_KEY')

# Register the key with the GenAI client library.
genai.configure(api_key=key)

# Instantiate the model that generates the answers.
model = genai.GenerativeModel(model_name="gemini-1.5-flash")

In [None]:
# Upper bound on generated tokens; used in the prompt text and in the request config.
max_output_tokens = 64

def generate_answer(question, options, delay_seconds=4, max_attempts=3):
  """Ask the model to phrase the selected options as a spoken first-person answer.

  Args:
    question: The question text inserted into the prompt.
    options: The selected answer option(s) inserted into the prompt.
    delay_seconds: Pause after a successful request to throttle the calls;
      also the base of the growing wait between retries. Defaults to the
      previously hard-coded 4 seconds.
    max_attempts: How many times the request is tried before giving up
      (values below 1 are treated as 1).

  Returns:
    The generated answer text with surrounding whitespace stripped.

  Raises:
    Whatever the last failed attempt raised, once all attempts are used up.
  """
  prompt = f"""
  You are being asked a question and must answer in a way that sounds like you're speaking in the first person.
  Your response should be varied and sound natural, like you're being interviewed without talking unnecessarily much. Use actual numbers instead of writing them out, but only if necessary.
  Answer in present tense like you are having a conversation, use up to {max_output_tokens} tokens but keep it short.
  Question: {question}\n
  Answer: {options}\n
  Answer as a sentence:
  """
  generation_config = genai.GenerationConfig(
      max_output_tokens=max_output_tokens,
      temperature=2)

  attempts = max(1, max_attempts)
  for attempt in range(1, attempts + 1):
    try:
      response = model.generate_content(
          contents = prompt,
          generation_config = generation_config
        )
      # NOTE(review): both the request and the `.text` accessor can raise
      # (presumably on rate limits or an empty/blocked response - confirm
      # against the library docs), so both sit inside the retried section.
      answer = response.text.strip()
      break
    except Exception:
      # Out of attempts: surface the original error to the caller unchanged.
      if attempt == attempts:
        raise
      # Wait a little longer after each failure before trying again.
      time.sleep(delay_seconds * attempt)

  # Throttle so consecutive calls are spaced out.
  time.sleep(delay_seconds)

  return answer

In [None]:
# sampling: try the answer generation on a few rows before running it on the whole dataset

sample_size = 5
sample_seed = 1  # fixed seed so the same rows are drawn on every run

answer_combinations_limited_sample = answer_combinations_limited.sample(n=sample_size, random_state=sample_seed).copy()

# One model call per sampled row; the generated sentence is stored in a new 'context' column.
answer_combinations_limited_sample['context'] = answer_combinations_limited_sample.apply(lambda row: generate_answer(row['question'], row['options']), axis=1)

answer_combinations_limited_sample

Unnamed: 0,question,type,options,context
351,What is the size of your company?,SINGLE_SELECT,[51-200],"We're a medium-sized company, I'd say. Betwee..."
29,Customer group,SINGLE_SELECT,[End User],"Oh, I mostly deal with end users."
174,Productinterests,MULTI_SELECT,"[BusinessCards, VisitReport, Data Cleansing]","I'm really interested in business cards, visi..."
522,Which language is wanted for communication?,SINGLE_SELECT,[Spanish],I prefer Spanish.
65,Customer satisfaction,SINGLE_SELECT,[Satisfied],I'm seeing high customer satisfaction; most of...


In [None]:
# QA Dataset: work on an independent copy so the resampled table stays unchanged.

qa_dataset = answer_combinations_limited.copy(deep=True)

qa_dataset

Unnamed: 0,question,type,options
0,CRM-System,SINGLE_SELECT,[CAS]
1,CRM-System,SINGLE_SELECT,[Microsoft Dynamics]
2,CRM-System,SINGLE_SELECT,[HubSpot]
3,CRM-System,SINGLE_SELECT,[Salesforce]
4,CRM-System,SINGLE_SELECT,[Adito]
...,...,...,...
577,Would you like to receive marketing informatio...,SINGLE_SELECT,[Yes]
578,Would you like to receive marketing informatio...,SINGLE_SELECT,[Yes]
579,Would you like to receive marketing informatio...,SINGLE_SELECT,[No]
580,Would you like to receive marketing informatio...,SINGLE_SELECT,[Yes]


In [None]:
# Generate one answer sentence per row (one model call each) and store it as 'context'.
qa_dataset['context'] = [
    generate_answer(question, options)
    for question, options in zip(qa_dataset['question'], qa_dataset['options'])
]

In [None]:
# Spot-check 25 randomly chosen rows of the generated dataset.
qa_dataset.sample(n=25)

Unnamed: 0,question,type,options,context
367,What is the type of contact?,MULTI_SELECT,"[Existing customer, Supplier, Competitor]","It's an existing customer, actually. We've be..."
397,What kind of follow up is planned,MULTI_SELECT,"[Phone, Schedule a Visit]","I'm planning a phone call, and then, if needed..."
439,What products are you interested in?,MULTI_SELECT,[Automotive radar target simulation],"Oh, I'm really focused on automotive radar tar..."
560,Would you like to receive marketing informatio...,SINGLE_SELECT,[Yes],"Sure, I'd like that. Yes."
274,Size of the trade fair team (on average),SINGLE_SELECT,[21-30],"It fluctuates, but I'd say we usually have bet..."
357,What is the size of your company?,SINGLE_SELECT,[1-10],"We're a small team, just between 1 and 10 people."
426,What products are you interested in?,MULTI_SELECT,"[Automotive radar target simulation, Double-Pu...","Right now, I'm really focused on automotive ra..."
227,Searches a solution for,MULTI_SELECT,"[Scan business cards, Clean up CRM, Extract da...",I'm working on a few things right now: scannin...
457,What type of company is it?,SINGLE_SELECT,[Construction company],"Oh, it's a construction company. We build thi..."
268,Size of the trade fair team (on average),SINGLE_SELECT,[6-10],We usually have a team of between 6 and 10 peo...


In [None]:
# Write the dataset as a single indented JSON array with one object per row.
qa_dataset.to_json(
    path_or_buf='qa_dataset_large.json',
    orient='records',
    lines=False,
    indent=4,
)