<a href="https://colab.research.google.com/github/alexk2206/tds_capstone/blob/Alex-DEV/QA_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Generation of the Q and A Dataset**

In [88]:
import pandas as pd
import random
import json

import google.generativeai as genai
from IPython.display import display, Markdown
from google.colab import userdata
import time

In [89]:
# Raw GitHub location of the question / intended-answer combinations file.
url = (
    "https://raw.githubusercontent.com/alexk2206/tds_capstone/"
    "refs/heads/main/datasets/answer_combinations.json"
)

# Read the JSON file into a DataFrame and report its dimensions.
answer_combinations = pd.read_json(url)
print("answer_combinations shape: {}".format(answer_combinations.shape))

# Show a random handful of rows as the cell output.
answer_combinations.sample(n=15)

answer_combinations shape: (1381, 4)


Unnamed: 0,question,type,options,intended_answer
857,What products are you interested in?,MULTI_SELECT,"[Automotive radar target simulation, Noise fig...","[Automotive radar target simulation, Double-Pu..."
207,CRM-System,SINGLE_SELECT,"[Salesforce, Pipedrive, Close.io, Microsoft Dy...",[Salesforce]
51,CRM-System,SINGLE_SELECT,"[Salesforce, Pipedrive, Close.io, Microsoft Dy...",[Pipedrive]
547,What kind of follow up is planned,MULTI_SELECT,"[Email, Phone, Schedule a Visit, No action]","[Email, Phone]"
396,Notes,TEXT,[Please provide any additional information tha...,[Add additional information here]
770,What phone number can we use for contact?,NUMBER,[phone number],[0177506828836]
1265,Any additional notes?,TEXT,[What additional information would you like to...,[Add additional information here]
401,What type of company is it?,SINGLE_SELECT,"[Construction company, Craft enterprises, Scaf...",[Craft enterprises]
558,Data processing consent,SINGLE_SELECT,"[Yes, No]",[No]
189,Searches a solution for,MULTI_SELECT,"[Scan business cards, Clean up CRM, Extract da...","[Clean up CRM, Extract data from emails, Impro..."


In [90]:
# Fetch the API key stored under 'GOOGLE_API_KEY' through Colab's `userdata`
# helper so that the key itself never appears in the notebook.
key = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=key)

# Shared model handle used by every generate_* function below.
model = genai.GenerativeModel(model_name="gemini-1.5-flash")

In [97]:
# Token budget per generated reply: interpolated into every prompt text and
# passed to GenerationConfig(max_output_tokens=...) by the generate_* functions.
max_output_tokens = 64

def generate_selection_answer_easy(question, intended_answer):
  """
  Generates an easy, conversational first-person reply to a selection question.

  Parameters:
      question: Question text inserted into the prompt.
      intended_answer: Selected option(s); handed to the model as "Options".

  Returns:
      dict: {"answer": <stripped model text>, "difficulty": "easy"}
  """
  instructions = f"""
  You are asked a question, and you need to provide a natural, conversational answer in the first person.
  Your response should mention **all** the options provided in the intended answer. Act like you do not know which options there are.
  Be concise but clear, and avoid unnecessary elaboration. Use up to {max_output_tokens} tokens.
  Question: {question}\n
  Options: {intended_answer}\n
  Answer as a sentence, mentioning and explaining all the provided options:
  """

  sampling = genai.GenerationConfig(
      max_output_tokens=max_output_tokens,
      temperature=2,
  )
  reply = model.generate_content(contents=instructions, generation_config=sampling)
  generated_text = reply.text.strip()

  # Fixed 4-second pause after each request (presumably to throttle API usage).
  time.sleep(4)

  return dict(answer=generated_text, difficulty="easy")

def generate_number_answer_easy(question, intended_answer):
  """
  Generates an easy, conversational reply that states a phone number.

  Parameters:
      question: Question text inserted into the prompt.
      intended_answer: The number to mention; handed to the model as "Options".

  Returns:
      dict: {"answer": <stripped model text>, "difficulty": "easy"}
  """
  instructions = f"""
  You are being asked for contact information, and your response should be clear and concise, as if you're giving someone your phone number and how you can be reached in a conversation.
  Mention the provided phone number and ensure your response sounds natural and professional.
  Your answer should be in the first person, present tense, and only include the relevant details. Use up to {max_output_tokens} tokens.
  Question: {question}\n
  Options: {intended_answer}\n
  Answer as a sentence, providing the phone number and any relevant details:
  """

  sampling = genai.GenerationConfig(
      max_output_tokens=max_output_tokens,
      temperature=2,
  )
  reply = model.generate_content(contents=instructions, generation_config=sampling)
  generated_text = reply.text.strip()

  # Fixed 4-second pause after each request (presumably to throttle API usage).
  time.sleep(4)

  return dict(answer=generated_text, difficulty="easy")


def generate_freetext_answer_easy(question, intended_answer):
  """
  Generates an easy, conversational reply to an "additional notes" question.

  The prompt lets the model either add information or politely decline.

  Parameters:
      question: Question text inserted into the prompt.
      intended_answer: Intended answer value; handed to the model as "Options".

  Returns:
      dict: {"answer": <stripped model text>, "difficulty": "easy"}
  """
  instructions = f"""
  You are being asked if you have any additional notes or information to share.
  Your response should sound natural, in the first person, and can be either brief or more detailed, depending on the situation.
  You can provide additional information but you don't have to and mention it clearly and politely.
  If there isn't anything else to add, express that in a conversational manner. Use up to {max_output_tokens} tokens.
  Question: {question}\n
  Options: {intended_answer}
  Answer as a sentence, providing any additional information or politely stating that there's nothing else to add:
  """

  sampling = genai.GenerationConfig(
      max_output_tokens=max_output_tokens,
      temperature=2,
  )
  reply = model.generate_content(contents=instructions, generation_config=sampling)
  generated_text = reply.text.strip()

  # Fixed 4-second pause after each request (presumably to throttle API usage).
  time.sleep(4)

  return dict(answer=generated_text, difficulty="easy")


def generate_date_answer_easy(question, intended_answer):
  """
  Generates an easy, conversational reply that mentions a given date.

  Parameters:
      question: Question text inserted into the prompt.
      intended_answer: The date to mention; handed to the model as "Intended Answer".

  Returns:
      dict: {"answer": <stripped model text>, "difficulty": "easy"}
  """
  instructions = f"""
  You are asked a question about a specific date, and you need to provide a natural, conversational answer in the first person.
  Include the date from the intended answer in your response, phrasing it naturally as if you're suggesting a meeting.
  Be concise but clear, and use up to {max_output_tokens} tokens.
  Question: {question}\n
  Intended Answer: {intended_answer}\n
  Context: Provide a conversational response mentioning the date in a natural way:
  """

  sampling = genai.GenerationConfig(
      max_output_tokens=max_output_tokens,
      temperature=2,
  )
  reply = model.generate_content(contents=instructions, generation_config=sampling)
  generated_text = reply.text.strip()

  # Fixed 4-second pause after each request (presumably to throttle API usage).
  time.sleep(4)

  return dict(answer=generated_text, difficulty="easy")

In [101]:
def generate_freetext_answer_medium(question, intended_answer):
  """
  Generates a medium-difficulty reply to an "additional notes" question.

  Unlike the easy variant, the prompt tells the model that it should provide
  additional information, with declining being only an occasional outcome.

  Parameters:
      question: Question text inserted into the prompt.
      intended_answer: Intended answer value; handed to the model as "Options".

  Returns:
      dict: {"answer": <stripped model text>, "difficulty": "medium"}
  """
  instructions = f"""
  You are being asked if you have any additional notes or information to share.
  Your response should sound natural, in the first person, and can be either brief or more detailed, depending on the situation.
  You should provide additional information and mention it clearly and politely.
  If there isn't anything else to add (which is sometimes the case), express that in a conversational manner. Use up to {max_output_tokens} tokens.
  Question: {question}\n
  Options: {intended_answer}
  Answer as a sentence, providing any additional information or politely stating that there's nothing else to add:
  """

  sampling = genai.GenerationConfig(
      max_output_tokens=max_output_tokens,
      temperature=2,
  )
  reply = model.generate_content(contents=instructions, generation_config=sampling)
  generated_text = reply.text.strip()

  # Fixed 4-second pause after each request (presumably to throttle API usage).
  time.sleep(4)

  return dict(answer=generated_text, difficulty="medium")

In [106]:
# Running count of processed rows, printed as a progress indicator.
cycle_count = 0

def generate_answer_for_row(row, max_retries=3, retry_delay=15.0):
    """
    Generates an answer for one dataset row by dispatching on its question type.

    A request rejected with HTTP 429 ("Resource has been exhausted") is retried
    with exponential backoff instead of aborting the whole batch run. Any other
    error, or a 429 that persists after the last retry, is re-raised unchanged.

    Parameters:
        row: Mapping-like row (e.g. a pd.Series) with the keys 'question',
            'intended_answer' and 'type'.
        max_retries (int): How many times a rate-limited request is retried.
        retry_delay (float): Seconds to wait before the first retry; the wait
            doubles with every further retry.

    Returns:
        dict: {"answer": ..., "difficulty": ...} from the matching generator, or
        {"answer": "Unknown question type", "difficulty": "unknown"} when the
        type is not handled.
    """
    global cycle_count
    cycle_count += 1
    print(f"Cycle: {cycle_count}")

    question = row['question']
    intended_answer = row['intended_answer']
    question_type = row['type']

    # The generator name is only looked up in the branch that is taken.
    if question_type in ['SINGLE_SELECT', 'MULTI_SELECT']:
        generator = generate_selection_answer_easy
    elif question_type == 'NUMBER':
        generator = generate_number_answer_easy
    elif question_type == 'TEXT':
        generator = generate_freetext_answer_easy
    elif question_type == 'DATE':
        generator = generate_date_answer_easy
    else:
        return {"answer": "Unknown question type", "difficulty": "unknown"}

    attempts = max(1, max_retries + 1)
    for attempt in range(attempts):
        try:
            return generator(question, intended_answer)
        except Exception as exc:
            # google.api_core exceptions expose the HTTP status as `.code`;
            # only rate-limit errors (429) are worth waiting for.
            rate_limited = getattr(exc, "code", None) == 429
            if not rate_limited or attempt >= attempts - 1:
                raise
            wait_seconds = retry_delay * (2 ** attempt)
            print(f"Rate limited (HTTP 429); retry {attempt + 1}/{attempts - 1} in {wait_seconds:.0f}s...")
            time.sleep(wait_seconds)

In [76]:
# Sampling: try the generators on a small random subset before a full run.
sample_type = "TEXT"
sample_size = 15
sample_question = "Size of the trade fair team (on average)"  # only used by the alternative filter below

# Restrict to one question type (alternative selections are kept commented out).
type_mask = answer_combinations['type'].eq(sample_type)
answer_combinations_filtered = answer_combinations.loc[type_mask]
#answer_combinations_filtered = answer_combinations[answer_combinations['question'] == sample_question]
#answer_combinations_filtered = answer_combinations.copy()

# Never ask for more rows than the filtered frame contains.
n_samples = min(sample_size, len(answer_combinations_filtered))
sampled_questions = (
    answer_combinations_filtered
    .sample(n=n_samples)
    .reset_index(drop=True)
)

# One generated answer per row; the two returned fields become two new columns.
generated_columns = sampled_questions.apply(
    lambda row: pd.Series(generate_answer_for_row(row)),
    axis=1,
)
sampled_questions[['context', 'difficulty']] = generated_columns

sampled_questions

Unnamed: 0,question,type,options,intended_answer,context,difficulty
0,Notes,TEXT,[Please provide any additional information tha...,[Add additional information here],"No, I don't think so; I believe I've covered e...",easy
1,Notes,TEXT,[Please provide any additional information tha...,[Add additional information here],"No, I don't think so; that covers everything. ...",easy
2,Any additional notes?,TEXT,[What additional information would you like to...,[Add additional information here],"No, I think that covers everything. I don't h...",easy
3,Any additional notes?,TEXT,[What additional information would you like to...,[Add additional information here],"No, I think that covers everything. I don't h...",easy
4,Any additional notes?,TEXT,[What additional information would you like to...,[Add additional information here],"No, I don't think so; that covers everything. ...",easy
5,Any additional notes?,TEXT,[What additional information would you like to...,[Add additional information here],"No, I think that covers everything. I don't h...",easy
6,Notes,TEXT,[Please provide any additional information tha...,[Add additional information here],"No, I don't think so; that covers everything. ...",easy
7,Notes,TEXT,[Please provide any additional information tha...,[Add additional information here],"No, I think that covers everything. I don't h...",easy
8,Any additional notes?,TEXT,[What additional information would you like to...,[Add additional information here],"No, I don't think so; that covers everything.",easy
9,Notes,TEXT,[Please provide any additional information tha...,[Add additional information here],"No, I think that covers everything. I don't h...",easy


In [107]:
# QA Dataset: start from an independent copy so the loaded
# answer_combinations frame is left untouched.

qa_dataset = answer_combinations.copy(deep=True)

qa_dataset.shape

(1381, 4)

In [104]:
# Define the split and processing function
def split_and_process_dataset(dataset, frac, function, random_state=1):
    """
    Splits a dataset into chunks, processes every chunk row by row, and returns
    the processed rows as one DataFrame.

    The dataset is shuffled once and then cut into consecutive chunks of
    round(frac * len(dataset)) rows (at least one row). Sampling a fraction of
    the shrinking remainder instead would eventually yield a zero-row chunk and
    never finish. Slicing by position also keeps every row when index labels
    repeat.

    Parameters:
        dataset (pd.DataFrame): The original dataset; it is not modified.
        frac (float): Fraction of the dataset to process per chunk; must be > 0.
        function (callable): Called with each row; must return a two-item
            mapping whose values become the 'context' and 'difficulty' columns.
        random_state (int): Seed for the shuffle.

    Returns:
        pd.DataFrame: All rows in shuffled order with a fresh 0..n-1 index and
        the added 'context' and 'difficulty' columns.

    Raises:
        ValueError: If frac is not positive.
    """
    if frac <= 0:
        raise ValueError(f"frac must be positive, got {frac!r}")

    # Nothing to process: return an empty frame that still has the new columns.
    if dataset.empty:
        empty_result = dataset.copy()
        empty_result['context'] = pd.Series(dtype='object')
        empty_result['difficulty'] = pd.Series(dtype='object')
        return empty_result.reset_index(drop=True)

    shuffled = dataset.sample(frac=1, random_state=random_state)
    chunk_size = max(1, round(len(shuffled) * frac))

    chunks = []
    for start in range(0, len(shuffled), chunk_size):
        # Copy so the column assignment below does not touch `shuffled`.
        chunk = shuffled.iloc[start:start + chunk_size].copy()

        print(f"Processing {len(chunk)} rows...")

        # Apply the function to the chunk
        chunk[['context', 'difficulty']] = chunk.apply(
            lambda row: pd.Series(function(row)), axis=1
        )

        # Collect the processed chunk
        chunks.append(chunk)

    # Combine all chunks back into a single DataFrame
    processed_dataset = pd.concat(chunks, axis=0).reset_index(drop=True)

    return processed_dataset

# Example usage


In [108]:
import pandas as pd

def split_process_and_save_as_dfs(dataset, frac, function):
    """
    Splits a dataset into fractions, processes each fraction, and stores the processed fractions as DataFrames.

    The dataset is shuffled once and then cut into consecutive chunks of
    round(frac * len(dataset)) rows (at least one row). Sampling a fraction of
    the shrinking remainder instead would eventually yield a zero-row chunk and
    never finish. Slicing by position also keeps every row when index labels
    repeat.

    Parameters:
        dataset (pd.DataFrame): The original dataset; it is not modified.
        frac (float): Fraction of the dataset to split and process at a time; must be > 0.
        function (callable): The function to apply to each row; must return a
            two-item mapping whose values become the 'context' and
            'difficulty' columns.

    Returns:
        List[pd.DataFrame]: A list of processed DataFrames (empty for an empty dataset).

    Raises:
        ValueError: If frac is not positive.
    """
    if frac <= 0:
        raise ValueError(f"frac must be positive, got {frac!r}")

    processed_chunks = []

    if not dataset.empty:
        # Fixed seed keeps the chunk composition reproducible between runs.
        shuffled = dataset.sample(frac=1, random_state=1)
        chunk_size = max(1, round(len(shuffled) * frac))

        starts = range(0, len(shuffled), chunk_size)
        for chunk_index, start in enumerate(starts, start=1):
            # Copy so the column assignment below does not touch `shuffled`.
            chunk = shuffled.iloc[start:start + chunk_size].copy()

            print(f"Processing {len(chunk)} rows for chunk {chunk_index}...")

            # Apply the function to the chunk
            chunk[['context', 'difficulty']] = chunk.apply(
                lambda row: pd.Series(function(row)), axis=1
            )

            # Save the processed chunk as a DataFrame
            processed_chunks.append(chunk)
            print(f"Chunk {chunk_index} processed and saved as a DataFrame.")

    print("All chunks processed and stored as DataFrames.")
    return processed_chunks

# Example usage
frac = 0.1  # Process 10% of the dataset at a time
processed_dfs = split_process_and_save_as_dfs(qa_dataset, frac, generate_answer_for_row)

# Example: Accessing the first processed DataFrame
print(processed_dfs[0])

# Merge the processed chunks back into qa_dataset. Without this step the
# preview and JSON export further down would still operate on the unprocessed
# copy, which has no 'context' / 'difficulty' columns.
qa_dataset = pd.concat(processed_dfs, ignore_index=True)


Processing 138 rows for chunk 1...
Cycle: 1
Cycle: 2
Cycle: 3
Cycle: 4
Cycle: 5
Cycle: 6
Cycle: 7
Cycle: 8
Cycle: 9
Cycle: 10
Cycle: 11
Cycle: 12
Cycle: 13
Cycle: 14
Cycle: 15
Cycle: 16
Cycle: 17
Cycle: 18
Cycle: 19
Cycle: 20
Cycle: 21
Cycle: 22
Cycle: 23
Cycle: 24
Cycle: 25
Cycle: 26
Cycle: 27
Cycle: 28
Cycle: 29
Cycle: 30
Cycle: 31
Cycle: 32
Cycle: 33
Cycle: 34
Cycle: 35
Cycle: 36
Cycle: 37
Cycle: 38
Cycle: 39
Cycle: 40
Cycle: 41
Cycle: 42
Cycle: 43
Cycle: 44
Cycle: 45
Cycle: 46
Cycle: 47
Cycle: 48
Cycle: 49
Cycle: 50
Cycle: 51
Cycle: 52
Cycle: 53
Cycle: 54
Cycle: 55
Cycle: 56
Cycle: 57
Cycle: 58
Cycle: 59
Cycle: 60
Cycle: 61
Cycle: 62
Cycle: 63
Cycle: 64
Cycle: 65
Cycle: 66
Cycle: 67
Cycle: 68
Cycle: 69
Cycle: 70
Cycle: 71
Cycle: 72
Cycle: 73
Cycle: 74
Cycle: 75
Cycle: 76
Cycle: 77
Cycle: 78
Cycle: 79
Cycle: 80
Cycle: 81
Cycle: 82
Cycle: 83
Cycle: 84
Cycle: 85
Cycle: 86
Cycle: 87
Cycle: 88
Cycle: 89
Cycle: 90
Cycle: 91
Cycle: 92
Cycle: 93
Cycle: 94
Cycle: 95
Cycle: 96
Cycle: 97
Cycl



TooManyRequests: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota).

In [None]:
# Eyeball 25 randomly drawn rows of the dataset.
qa_dataset.sample(n=25)

In [None]:
# Write the dataset to disk as one indented JSON array of row records.
qa_dataset.to_json(
    'qa_dataset_large.json',
    orient='records',
    lines=False,
    indent=4,
)