<a href="https://colab.research.google.com/github/alexk2206/tds_capstone/blob/Alex-DEV/QA_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# QA dataset
Created by: Alexander Keßler

In [None]:
import pandas as pd
import random
import json

import google.generativeai as genai
from IPython.display import display, Markdown
from google.colab import userdata
import time

## Load required data

In [None]:
# Raw GitHub location of the question / answer combinations used as input for this notebook.
url = (
    "https://raw.githubusercontent.com/alexk2206/tds_capstone/"
    "refs/heads/main/datasets/answer_combinations.json"
)

# Read the JSON file directly from the URL into a DataFrame.
answer_combinations = pd.read_json(url)

# Report the dimensions and display 15 randomly drawn rows as a sanity check.
print(f"answer_combinations shape: {answer_combinations.shape}")
answer_combinations.sample(n=15)

answer_combinations shape: (1381, 4)


Unnamed: 0,question,type,options,intended_answer
865,Data processing consent,SINGLE_SELECT,"[Yes, No]",[Yes]
1198,Would you like to receive marketing informatio...,SINGLE_SELECT,"[Yes, No]",[No]
808,Notes,TEXT,[Please provide any additional information tha...,[Add additional information here]
824,Next steps,SINGLE_SELECT,"[Offer, Meeting, Call]",[Offer]
350,Who to copy in follow up,MULTI_SELECT,"[Stephan Maier, Joachim Wagner, Erik Schneider...","[Joachim Wagner, Erik Schneider, Sandro Kalter..."
1046,When do you wish to receive a follow-up?,DATE,[Date],[2025-01-21]
743,Next steps,SINGLE_SELECT,"[Offer, Meeting, Call]",[Offer]
893,What type of company is it?,SINGLE_SELECT,"[Construction company, Craft enterprises, Scaf...",[Trading company]
74,What products are you interested in?,MULTI_SELECT,"[Automotive radar target simulation, Noise fig...",[]
186,What is the size of your company?,SINGLE_SELECT,"[1-10, 11-50, 51-200, 201-2000, larger than 2000]",[1-10]


In [None]:
# Fetch the Gemini API key via Colab's `userdata` and hand it to the SDK.
key = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=key)

# Model object that the generate_* functions below send their prompts to.
model = genai.GenerativeModel(model_name="gemini-1.5-flash")

In [None]:
# Set the maximum output tokens for the responses to limit the length of the generated text.
# This was very useful to track at which iteration Gemini was creating contexts.
max_output_tokens = 48


def _generate_answer(prompt, difficulty, attempts=3):
  """Send `prompt` to Gemini and return the generated text with a difficulty tag.

  All generate_*_answer_* functions in this cell share this request logic; they
  only differ in the prompt they build.

  Args:
    prompt: Complete prompt text that is sent to the model.
    difficulty: Label stored under the "difficulty" key of the result.
    attempts: Maximum number of requests made before giving up.

  Returns:
    A dict with the whitespace-stripped model output under "answer" and the
    label under "difficulty".

  Raises:
    ValueError: If none of the attempts produced a response containing text.
  """
  last_error = None
  for _ in range(attempts):
    response = model.generate_content(
        contents=prompt,
        generation_config=genai.GenerationConfig(
            max_output_tokens=max_output_tokens,
            temperature=2),  # Using a higher temperature to get more varied responses.
    )

    # Introduce a small delay after every API call to avoid exceeding rate limits.
    # We found out that 6 seconds was the sweet spot here.
    time.sleep(6)

    try:
      # `response.text` raises ValueError when the response carries no text part.
      # Without the retry, a single such response aborts the surrounding
      # DataFrame.apply and discards all rows generated so far.
      answer = response.text.strip()
    except ValueError as error:
      last_error = error
      continue

    return {"answer": answer, "difficulty": difficulty}

  raise ValueError(f"Gemini returned no usable text after {attempts} attempts.") from last_error


def generate_selection_answer_easy(question, intended_answer):
  """Generate an easy conversational answer for a selection-type question.

  Args:
    question: Text of the question being answered.
    intended_answer: The answer value(s) the generated sentence should express.

  Returns:
    A dict {"answer": <generated text>, "difficulty": "easy"}.

  Raises:
    ValueError: If Gemini repeatedly returns a response without text.
  """
  # The prompt is designed to make the model respond naturally, as if it doesn't know
  # the answer options, but it still uses the intended answer.
  prompt = f"""
  You are asked a question, and you need to provide a natural, conversational answer in the first person. Do not use special characters other than ',' and '.'.
  Act like you really do not know which options there are and the intended answer is your answer.
  When given a range, use a number between the two values.
  Be concise but clear, and avoid unnecessary elaboration. Use up to {max_output_tokens} tokens.
  Question: {question}\n
  Intended answer: {intended_answer}\n
  Answer as a sentence, mentioning and explaining all the provided options:
  """
  return _generate_answer(prompt, "easy")


def generate_number_answer_easy(question, intended_answer):
  """Generate an easy conversational answer that states a phone number.

  Args:
    question: Text of the question being answered.
    intended_answer: The value to mention in the answer; the prompt treats it
      as a phone number.

  Returns:
    A dict {"answer": <generated text>, "difficulty": "easy"}.

  Raises:
    ValueError: If Gemini repeatedly returns a response without text.
  """
  # The prompt is tailored for a contact information request, instructing the model
  # to give a clear and concise phone number answer.
  # NOTE(review): the value is labelled "Options" in the prompt although the
  # intended answer is passed in - confirm this wording is intended.
  prompt = f"""
  You are being asked for contact information, and your response should be clear and concise, as if you're giving someone your phone number and how you can be reached in a conversation.
  Mention the provided phone number and ensure your response sounds natural and professional.
  Your answer should be in the first person, present tense, and only include the relevant details. Use up to {max_output_tokens} tokens.
  Question: {question}\n
  Options: {intended_answer}\n
  Answer as a sentence, providing the phone number and any relevant details:
  """
  return _generate_answer(prompt, "easy")


def generate_freetext_answer_easy(question, intended_answer):
  """Generate an easy conversational answer for a free-text (notes) question.

  Args:
    question: Text of the question being answered.
    intended_answer: The value inserted into the prompt as guidance for the answer.

  Returns:
    A dict {"answer": <generated text>, "difficulty": "easy"}.

  Raises:
    ValueError: If Gemini repeatedly returns a response without text.
  """
  # The prompt is designed for open-ended answers where the model responds naturally,
  # either providing information or politely declining.
  # NOTE(review): the value is labelled "Options" in the prompt although the
  # intended answer is passed in - confirm this wording is intended.
  prompt = f"""
  You are being asked if you have any additional notes or information to share.
  Your response should sound natural, in the first person, and can be either brief or more detailed, depending on the situation.
  You can provide additional information but you don't have to and mention it clearly and politely.
  If there isn't anything else to add, express that in a conversational manner. Use up to {max_output_tokens} tokens.
  Question: {question}\n
  Options: {intended_answer}
  Answer as a sentence, providing any additional information or politely stating that there's nothing else to add:
  """
  return _generate_answer(prompt, "easy")


def generate_date_answer_easy(question, intended_answer):
  """Generate an easy conversational answer that mentions a date.

  Args:
    question: Text of the question being answered.
    intended_answer: The date value the generated sentence should include.

  Returns:
    A dict {"answer": <generated text>, "difficulty": "easy"}.

  Raises:
    ValueError: If Gemini repeatedly returns a response without text.
  """
  # The prompt is tailored for a date-based question, encouraging a natural,
  # conversational response mentioning the intended date.
  prompt = f"""
  You are asked a question about a specific date, and you need to provide a natural, conversational answer in the first person.
  Include the date from the intended answer in your response, phrasing it naturally as if you're suggesting a meeting.
  Be concise but clear, and use up to {max_output_tokens} tokens.
  Question: {question}\n
  Intended Answer: {intended_answer}\n
  Context: Provide a conversational response mentioning the date in a natural way:
  """
  return _generate_answer(prompt, "easy")

In [None]:
# Define the function for generating a free-text answer in a medium difficulty level. We ended up not using it though.
def generate_freetext_answer_medium(question, intended_answer):
  """Generate a medium-difficulty conversational answer for a free-text question.

  Args:
    question: Text of the question being answered.
    intended_answer: The value inserted into the prompt as guidance for the answer.

  Returns:
    A dict {"answer": <generated text>, "difficulty": "medium"}.

  Raises:
    ValueError: If Gemini repeatedly returns a response without text.
  """
  # The prompt is similar to the easy version, but now the model is expected to provide
  # more detailed additional information where applicable.
  prompt = f"""
  You are being asked if you have any additional notes or information to share.
  Your response should sound natural, in the first person, and can be either brief or more detailed, depending on the situation.
  You should provide additional information and mention it clearly and politely.
  If there isn't anything else to add (which is sometimes the case), express that in a conversational manner. Use up to {max_output_tokens} tokens.
  Question: {question}\n
  Options: {intended_answer}
  Answer as a sentence, providing any additional information or politely stating that there's nothing else to add:
  """

  generation_config = genai.GenerationConfig(
      max_output_tokens=max_output_tokens,
      temperature=2)  # Temperature remains high for more varied responses.

  attempts = 3
  last_error = None
  for _ in range(attempts):
    # Generate response.
    response = model.generate_content(
        contents=prompt,
        generation_config=generation_config,
    )

    # Pause after every API call to avoid exceeding rate limits.
    time.sleep(6)

    try:
      # `response.text` raises ValueError when the response carries no text part;
      # retry instead of aborting the caller on the first such response.
      answer = response.text.strip()
    except ValueError as error:
      last_error = error
      continue

    return {"answer": answer, "difficulty": "medium"}

  raise ValueError(f"Gemini returned no usable text after {attempts} attempts.") from last_error

In [None]:
# Module-level progress counter; cells reset it to 0 before each generation run.
cycle_count = 0


def generate_answer_for_row(row):
    """Generate a context for one row by dispatching on its question type.

    Increments the global `cycle_count` and prints it so that the progress of a
    long run can be followed.

    Args:
        row: Mapping-like object providing 'question', 'intended_answer' and 'type'.

    Returns:
        The dict produced by the matching generate_*_answer_easy function, or
        {"answer": "Unknown question type", "difficulty": "unknown"} when the
        type is not one of the handled ones.
    """
    global cycle_count
    cycle_count += 1
    print(f"Cycle: {cycle_count}")

    question, intended_answer, question_type = (
        row['question'],
        row['intended_answer'],
        row['type'],
    )

    # Selection questions (single and multi) share one generator.
    if question_type in ('SINGLE_SELECT', 'MULTI_SELECT'):
        return generate_selection_answer_easy(question, intended_answer)

    if question_type == 'NUMBER':
        return generate_number_answer_easy(question, intended_answer)

    if question_type == 'TEXT':
        return generate_freetext_answer_easy(question, intended_answer)

    if question_type == 'DATE':
        return generate_date_answer_easy(question, intended_answer)

    # Fallback for every type that has no generator.
    return {"answer": "Unknown question type", "difficulty": "unknown"}


## Generating contexts

In [None]:
# Spot check: generate contexts for a small random sample only.
# This came in handy to quickly inspect generated contexts.

# Restart the progress counter printed by generate_answer_for_row.
cycle_count = 0

# Settings for the spot check.
sample_type = "MULTI_SELECT"
sample_size = 5
sample_question = "Size of the trade fair team (on average)"

# Choose the rows to sample from. The commented lines are alternative filters
# (by question text, or no filter at all).
answer_combinations_filtered = answer_combinations.loc[answer_combinations['type'] == sample_type]
#answer_combinations_filtered = answer_combinations[answer_combinations['question'] == sample_question]
#answer_combinations_filtered = answer_combinations.copy()

# Never ask for more rows than the filter left over.
sampled_questions = (
    answer_combinations_filtered
    .sample(n=min(sample_size, len(answer_combinations_filtered)))
    .reset_index(drop=True)
)

# One API call per row; the returned dict is expanded into the two new columns.
sampled_questions[['context', 'difficulty']] = sampled_questions.apply(
    lambda row: pd.Series(generate_answer_for_row(row)),
    axis=1,
)

sampled_questions

Cycle: 1
Cycle: 2
Cycle: 3
Cycle: 4
Cycle: 5


Unnamed: 0,question,type,options,intended_answer,context,difficulty
0,What is the type of contact?,MULTI_SELECT,"[Existing customer, Supplier, New customer / P...","[Existing customer, New customer / Prospect, P...","Hmm, well it could be an existing customer, or...",easy
1,What is the contact person interested in?,MULTI_SELECT,"[100 Additive Manufacturing, 200 Automation, 3...","[100 Additive Manufacturing, 200 Automation, 2...","Well, I think they might like 100 Additive Man...",easy
2,Products interested in,MULTI_SELECT,"[MY-SYSTEM, Notion, JTS, JS EcoLine, AKW100, A...","[MY-SYSTEM, Notion, JTS, JS EcoLine, AKW100]","Well, I'm interested in MY-SYSTEM, and Notion,...",easy
3,Productinterests,MULTI_SELECT,"[BusinessCards, DataEnrichment, VisitReport, D...","[BusinessCards, VisitReport]","Okay, I think I'm interested in BusinessCards ...",easy
4,What is the type of contact?,MULTI_SELECT,"[Existing customer, Supplier, New customer / P...",[Competitor],"Hmm, I think it must be a Competitor contact.",easy


In [None]:
# Write the spot-check sample, including its generated contexts, as a list of JSON records.
sampled_questions.to_json(
    'sampled_qa_dataset_easy.json',
    orient='records',
    indent=4,
)

In [None]:
# As we ran into Error 429 (resource exhausted) more and more often, we decided to split
# the dataset into 5 parts of (almost) equal size.
# Each part holds around 276 rows, for which the contexts were created separately.
# Afterwards we saved each part individually, uploaded them to GitHub, loaded them back
# into the notebook and appended them to one combined_qa_dataset.

qa_dataset = answer_combinations.copy()
print(qa_dataset.shape)

split_size = len(qa_dataset) // 5
print(f"Split size: {split_size}")

# Parts 1-4 contain exactly split_size rows; the open-ended last slice also takes the remainder.
(qa_dataset_1, qa_dataset_2, qa_dataset_3, qa_dataset_4, qa_dataset_5) = (
    qa_dataset.iloc[part * split_size:((part + 1) * split_size if part < 4 else None)].reset_index(drop=True)
    for part in range(5)
)

print(
    f"qa_dataset_1 has {len(qa_dataset_1)} rows, "
    f"qa_dataset_2 has {len(qa_dataset_2)} rows, "
    f"qa_dataset_3 has {len(qa_dataset_3)} rows, "
    f"qa_dataset_4 has {len(qa_dataset_4)} rows, "
    f"and qa_dataset_5 has {len(qa_dataset_5)} rows."
)


(1381, 4)
Split size: 276
qa_dataset_1 has 276 rows, qa_dataset_2 has 276 rows, qa_dataset_3 has 276 rows, qa_dataset_4 has 276 rows, and qa_dataset_5 has 277 rows.


In [None]:
# Restart the progress counter, then generate a context for every row of part 1.
cycle_count = 0
qa_dataset_1[['context', 'difficulty']] = qa_dataset_1.apply(
    lambda row: pd.Series(generate_answer_for_row(row)),
    axis=1,
)

Cycle: 1
Cycle: 2
Cycle: 3
Cycle: 4
Cycle: 5
Cycle: 6
Cycle: 7
Cycle: 8
Cycle: 9
Cycle: 10
Cycle: 11
Cycle: 12
Cycle: 13
Cycle: 14
Cycle: 15
Cycle: 16
Cycle: 17
Cycle: 18
Cycle: 19
Cycle: 20
Cycle: 21
Cycle: 22
Cycle: 23
Cycle: 24
Cycle: 25
Cycle: 26
Cycle: 27
Cycle: 28
Cycle: 29
Cycle: 30
Cycle: 31
Cycle: 32
Cycle: 33
Cycle: 34
Cycle: 35
Cycle: 36
Cycle: 37
Cycle: 38
Cycle: 39
Cycle: 40
Cycle: 41
Cycle: 42
Cycle: 43
Cycle: 44
Cycle: 45
Cycle: 46
Cycle: 47
Cycle: 48
Cycle: 49
Cycle: 50
Cycle: 51
Cycle: 52
Cycle: 53
Cycle: 54
Cycle: 55
Cycle: 56
Cycle: 57
Cycle: 58
Cycle: 59
Cycle: 60
Cycle: 61
Cycle: 62
Cycle: 63
Cycle: 64
Cycle: 65
Cycle: 66
Cycle: 67
Cycle: 68
Cycle: 69
Cycle: 70
Cycle: 71
Cycle: 72
Cycle: 73
Cycle: 74
Cycle: 75
Cycle: 76
Cycle: 77
Cycle: 78
Cycle: 79
Cycle: 80
Cycle: 81
Cycle: 82
Cycle: 83
Cycle: 84
Cycle: 85
Cycle: 86
Cycle: 87
Cycle: 88
Cycle: 89
Cycle: 90
Cycle: 91
Cycle: 92
Cycle: 93
Cycle: 94
Cycle: 95
Cycle: 96
Cycle: 97
Cycle: 98
Cycle: 99
Cycle: 100
Cycle: 1

In [None]:
# Save part 1 together with its generated contexts as a list of JSON records.
qa_dataset_1.to_json(
    'qa_dataset_1.json',
    orient='records',
    indent=4,
)

In [None]:
# Restart the progress counter, generate a context for every row of part 2 and save the result.
cycle_count = 0
qa_dataset_2[['context', 'difficulty']] = qa_dataset_2.apply(
    lambda row: pd.Series(generate_answer_for_row(row)),
    axis=1,
)
qa_dataset_2.to_json(
    'qa_dataset_2.json',
    orient='records',
    indent=4,
)

Cycle: 1
Cycle: 2
Cycle: 3
Cycle: 4
Cycle: 5
Cycle: 6
Cycle: 7
Cycle: 8
Cycle: 9
Cycle: 10
Cycle: 11
Cycle: 12
Cycle: 13
Cycle: 14
Cycle: 15
Cycle: 16
Cycle: 17
Cycle: 18
Cycle: 19
Cycle: 20
Cycle: 21
Cycle: 22
Cycle: 23
Cycle: 24
Cycle: 25
Cycle: 26
Cycle: 27
Cycle: 28
Cycle: 29
Cycle: 30
Cycle: 31
Cycle: 32
Cycle: 33
Cycle: 34
Cycle: 35
Cycle: 36
Cycle: 37
Cycle: 38
Cycle: 39
Cycle: 40
Cycle: 41
Cycle: 42
Cycle: 43
Cycle: 44
Cycle: 45
Cycle: 46
Cycle: 47
Cycle: 48
Cycle: 49
Cycle: 50
Cycle: 51
Cycle: 52
Cycle: 53
Cycle: 54
Cycle: 55
Cycle: 56
Cycle: 57
Cycle: 58
Cycle: 59
Cycle: 60
Cycle: 61
Cycle: 62
Cycle: 63
Cycle: 64
Cycle: 65
Cycle: 66
Cycle: 67
Cycle: 68
Cycle: 69
Cycle: 70
Cycle: 71


ERROR:tornado.access:503 POST /v1beta/models/gemini-2.0-flash-exp:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 886.15ms


Cycle: 72
Cycle: 73
Cycle: 74
Cycle: 75
Cycle: 76
Cycle: 77
Cycle: 78
Cycle: 79
Cycle: 80
Cycle: 81
Cycle: 82
Cycle: 83
Cycle: 84
Cycle: 85
Cycle: 86
Cycle: 87
Cycle: 88
Cycle: 89
Cycle: 90
Cycle: 91
Cycle: 92
Cycle: 93
Cycle: 94
Cycle: 95
Cycle: 96
Cycle: 97
Cycle: 98
Cycle: 99
Cycle: 100
Cycle: 101
Cycle: 102
Cycle: 103
Cycle: 104
Cycle: 105
Cycle: 106
Cycle: 107
Cycle: 108
Cycle: 109
Cycle: 110
Cycle: 111
Cycle: 112
Cycle: 113
Cycle: 114
Cycle: 115
Cycle: 116
Cycle: 117
Cycle: 118
Cycle: 119
Cycle: 120
Cycle: 121
Cycle: 122
Cycle: 123
Cycle: 124
Cycle: 125
Cycle: 126
Cycle: 127
Cycle: 128
Cycle: 129
Cycle: 130
Cycle: 131
Cycle: 132
Cycle: 133
Cycle: 134
Cycle: 135
Cycle: 136
Cycle: 137
Cycle: 138
Cycle: 139
Cycle: 140
Cycle: 141
Cycle: 142
Cycle: 143
Cycle: 144
Cycle: 145
Cycle: 146
Cycle: 147
Cycle: 148
Cycle: 149
Cycle: 150
Cycle: 151
Cycle: 152
Cycle: 153
Cycle: 154
Cycle: 155


ERROR:tornado.access:503 POST /v1beta/models/gemini-2.0-flash-exp:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 507.19ms


Cycle: 156
Cycle: 157
Cycle: 158
Cycle: 159
Cycle: 160
Cycle: 161
Cycle: 162
Cycle: 163
Cycle: 164
Cycle: 165
Cycle: 166
Cycle: 167
Cycle: 168
Cycle: 169
Cycle: 170
Cycle: 171
Cycle: 172
Cycle: 173
Cycle: 174
Cycle: 175


ERROR:tornado.access:503 POST /v1beta/models/gemini-2.0-flash-exp:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 835.48ms


Cycle: 176
Cycle: 177
Cycle: 178
Cycle: 179
Cycle: 180
Cycle: 181
Cycle: 182
Cycle: 183
Cycle: 184
Cycle: 185
Cycle: 186
Cycle: 187
Cycle: 188
Cycle: 189
Cycle: 190
Cycle: 191
Cycle: 192
Cycle: 193
Cycle: 194
Cycle: 195
Cycle: 196
Cycle: 197
Cycle: 198
Cycle: 199
Cycle: 200
Cycle: 201
Cycle: 202
Cycle: 203


ERROR:tornado.access:503 POST /v1beta/models/gemini-2.0-flash-exp:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 407.14ms


Cycle: 204
Cycle: 205
Cycle: 206
Cycle: 207
Cycle: 208
Cycle: 209
Cycle: 210
Cycle: 211
Cycle: 212
Cycle: 213
Cycle: 214
Cycle: 215
Cycle: 216
Cycle: 217
Cycle: 218
Cycle: 219
Cycle: 220
Cycle: 221
Cycle: 222
Cycle: 223
Cycle: 224
Cycle: 225
Cycle: 226
Cycle: 227
Cycle: 228
Cycle: 229
Cycle: 230
Cycle: 231
Cycle: 232
Cycle: 233
Cycle: 234
Cycle: 235
Cycle: 236
Cycle: 237
Cycle: 238
Cycle: 239
Cycle: 240
Cycle: 241
Cycle: 242
Cycle: 243
Cycle: 244
Cycle: 245
Cycle: 246
Cycle: 247
Cycle: 248
Cycle: 249
Cycle: 250
Cycle: 251
Cycle: 252
Cycle: 253
Cycle: 254
Cycle: 255
Cycle: 256
Cycle: 257
Cycle: 258
Cycle: 259
Cycle: 260
Cycle: 261
Cycle: 262
Cycle: 263
Cycle: 264
Cycle: 265
Cycle: 266
Cycle: 267
Cycle: 268
Cycle: 269
Cycle: 270
Cycle: 271
Cycle: 272
Cycle: 273
Cycle: 274
Cycle: 275
Cycle: 276


In [None]:
# Restart the progress counter, generate a context for every row of part 3 and save the result.
cycle_count = 0
qa_dataset_3[['context', 'difficulty']] = qa_dataset_3.apply(
    lambda row: pd.Series(generate_answer_for_row(row)),
    axis=1,
)
qa_dataset_3.to_json(
    'qa_dataset_3.json',
    orient='records',
    indent=4,
)

Cycle: 1
Cycle: 2
Cycle: 3
Cycle: 4
Cycle: 5
Cycle: 6
Cycle: 7
Cycle: 8
Cycle: 9
Cycle: 10
Cycle: 11
Cycle: 12
Cycle: 13
Cycle: 14
Cycle: 15
Cycle: 16
Cycle: 17
Cycle: 18
Cycle: 19
Cycle: 20
Cycle: 21
Cycle: 22
Cycle: 23
Cycle: 24
Cycle: 25
Cycle: 26
Cycle: 27
Cycle: 28
Cycle: 29
Cycle: 30
Cycle: 31
Cycle: 32
Cycle: 33
Cycle: 34
Cycle: 35
Cycle: 36
Cycle: 37
Cycle: 38
Cycle: 39
Cycle: 40
Cycle: 41
Cycle: 42
Cycle: 43
Cycle: 44
Cycle: 45
Cycle: 46
Cycle: 47
Cycle: 48
Cycle: 49
Cycle: 50
Cycle: 51
Cycle: 52
Cycle: 53
Cycle: 54
Cycle: 55
Cycle: 56
Cycle: 57
Cycle: 58
Cycle: 59
Cycle: 60
Cycle: 61
Cycle: 62
Cycle: 63
Cycle: 64
Cycle: 65
Cycle: 66
Cycle: 67
Cycle: 68
Cycle: 69
Cycle: 70
Cycle: 71
Cycle: 72
Cycle: 73
Cycle: 74
Cycle: 75
Cycle: 76
Cycle: 77
Cycle: 78
Cycle: 79
Cycle: 80
Cycle: 81
Cycle: 82
Cycle: 83
Cycle: 84
Cycle: 85
Cycle: 86
Cycle: 87
Cycle: 88
Cycle: 89
Cycle: 90
Cycle: 91
Cycle: 92
Cycle: 93
Cycle: 94
Cycle: 95
Cycle: 96
Cycle: 97
Cycle: 98
Cycle: 99
Cycle: 100
Cycle: 1

ERROR:tornado.access:503 POST /v1beta/models/gemini-2.0-flash-exp:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 607.95ms


Cycle: 225
Cycle: 226
Cycle: 227
Cycle: 228
Cycle: 229
Cycle: 230
Cycle: 231
Cycle: 232
Cycle: 233
Cycle: 234
Cycle: 235
Cycle: 236
Cycle: 237
Cycle: 238
Cycle: 239
Cycle: 240
Cycle: 241
Cycle: 242
Cycle: 243
Cycle: 244
Cycle: 245
Cycle: 246
Cycle: 247
Cycle: 248
Cycle: 249
Cycle: 250
Cycle: 251
Cycle: 252
Cycle: 253
Cycle: 254
Cycle: 255
Cycle: 256
Cycle: 257
Cycle: 258
Cycle: 259
Cycle: 260
Cycle: 261
Cycle: 262
Cycle: 263
Cycle: 264
Cycle: 265
Cycle: 266
Cycle: 267
Cycle: 268
Cycle: 269
Cycle: 270
Cycle: 271
Cycle: 272
Cycle: 273
Cycle: 274
Cycle: 275
Cycle: 276


In [None]:
# Restart the progress counter, generate a context for every row of part 4 and save the result.
cycle_count = 0
qa_dataset_4[['context', 'difficulty']] = qa_dataset_4.apply(
    lambda row: pd.Series(generate_answer_for_row(row)),
    axis=1,
)
qa_dataset_4.to_json(
    'qa_dataset_4.json',
    orient='records',
    indent=4,
)

Cycle: 1
Cycle: 2
Cycle: 3
Cycle: 4
Cycle: 5
Cycle: 6
Cycle: 7
Cycle: 8
Cycle: 9
Cycle: 10
Cycle: 11
Cycle: 12
Cycle: 13
Cycle: 14
Cycle: 15
Cycle: 16
Cycle: 17
Cycle: 18
Cycle: 19
Cycle: 20
Cycle: 21
Cycle: 22
Cycle: 23
Cycle: 24
Cycle: 25
Cycle: 26
Cycle: 27
Cycle: 28
Cycle: 29
Cycle: 30
Cycle: 31
Cycle: 32
Cycle: 33
Cycle: 34
Cycle: 35
Cycle: 36
Cycle: 37
Cycle: 38
Cycle: 39
Cycle: 40
Cycle: 41
Cycle: 42
Cycle: 43
Cycle: 44
Cycle: 45
Cycle: 46
Cycle: 47
Cycle: 48
Cycle: 49
Cycle: 50
Cycle: 51
Cycle: 52
Cycle: 53
Cycle: 54
Cycle: 55
Cycle: 56
Cycle: 57
Cycle: 58
Cycle: 59
Cycle: 60
Cycle: 61
Cycle: 62
Cycle: 63
Cycle: 64
Cycle: 65
Cycle: 66
Cycle: 67
Cycle: 68
Cycle: 69
Cycle: 70
Cycle: 71
Cycle: 72
Cycle: 73
Cycle: 74
Cycle: 75
Cycle: 76
Cycle: 77
Cycle: 78
Cycle: 79
Cycle: 80
Cycle: 81
Cycle: 82
Cycle: 83
Cycle: 84
Cycle: 85
Cycle: 86
Cycle: 87
Cycle: 88
Cycle: 89
Cycle: 90
Cycle: 91
Cycle: 92
Cycle: 93
Cycle: 94
Cycle: 95
Cycle: 96
Cycle: 97
Cycle: 98
Cycle: 99
Cycle: 100
Cycle: 1

In [None]:
# Restart the progress counter, generate a context for every row of part 5 and save the result.
cycle_count = 0
qa_dataset_5[['context', 'difficulty']] = qa_dataset_5.apply(
    lambda row: pd.Series(generate_answer_for_row(row)),
    axis=1,
)
qa_dataset_5.to_json(
    'qa_dataset_5.json',
    orient='records',
    indent=4,
)

Cycle: 1
Cycle: 2
Cycle: 3
Cycle: 4
Cycle: 5
Cycle: 6
Cycle: 7
Cycle: 8
Cycle: 9
Cycle: 10
Cycle: 11
Cycle: 12
Cycle: 13
Cycle: 14
Cycle: 15
Cycle: 16
Cycle: 17
Cycle: 18
Cycle: 19
Cycle: 20
Cycle: 21
Cycle: 22
Cycle: 23
Cycle: 24
Cycle: 25
Cycle: 26
Cycle: 27
Cycle: 28
Cycle: 29
Cycle: 30
Cycle: 31
Cycle: 32
Cycle: 33
Cycle: 34
Cycle: 35
Cycle: 36
Cycle: 37
Cycle: 38
Cycle: 39
Cycle: 40
Cycle: 41
Cycle: 42
Cycle: 43
Cycle: 44
Cycle: 45
Cycle: 46
Cycle: 47
Cycle: 48
Cycle: 49
Cycle: 50
Cycle: 51
Cycle: 52
Cycle: 53
Cycle: 54
Cycle: 55
Cycle: 56
Cycle: 57
Cycle: 58
Cycle: 59
Cycle: 60
Cycle: 61
Cycle: 62
Cycle: 63
Cycle: 64
Cycle: 65
Cycle: 66
Cycle: 67
Cycle: 68
Cycle: 69
Cycle: 70
Cycle: 71
Cycle: 72
Cycle: 73
Cycle: 74
Cycle: 75
Cycle: 76
Cycle: 77
Cycle: 78
Cycle: 79
Cycle: 80
Cycle: 81
Cycle: 82
Cycle: 83
Cycle: 84
Cycle: 85
Cycle: 86
Cycle: 87
Cycle: 88
Cycle: 89
Cycle: 90
Cycle: 91
Cycle: 92
Cycle: 93
Cycle: 94
Cycle: 95
Cycle: 96
Cycle: 97
Cycle: 98
Cycle: 99
Cycle: 100
Cycle: 1

ERROR:tornado.access:503 POST /v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 809.38ms


Cycle: 172
Cycle: 173
Cycle: 174
Cycle: 175
Cycle: 176
Cycle: 177
Cycle: 178
Cycle: 179
Cycle: 180
Cycle: 181
Cycle: 182
Cycle: 183
Cycle: 184
Cycle: 185
Cycle: 186
Cycle: 187
Cycle: 188
Cycle: 189
Cycle: 190
Cycle: 191
Cycle: 192
Cycle: 193
Cycle: 194
Cycle: 195
Cycle: 196
Cycle: 197
Cycle: 198
Cycle: 199
Cycle: 200
Cycle: 201
Cycle: 202
Cycle: 203
Cycle: 204
Cycle: 205
Cycle: 206
Cycle: 207
Cycle: 208
Cycle: 209
Cycle: 210
Cycle: 211
Cycle: 212
Cycle: 213
Cycle: 214
Cycle: 215
Cycle: 216
Cycle: 217
Cycle: 218
Cycle: 219
Cycle: 220
Cycle: 221
Cycle: 222
Cycle: 223
Cycle: 224
Cycle: 225
Cycle: 226
Cycle: 227
Cycle: 228
Cycle: 229
Cycle: 230
Cycle: 231
Cycle: 232
Cycle: 233
Cycle: 234
Cycle: 235
Cycle: 236
Cycle: 237
Cycle: 238
Cycle: 239
Cycle: 240
Cycle: 241
Cycle: 242
Cycle: 243
Cycle: 244
Cycle: 245
Cycle: 246
Cycle: 247
Cycle: 248
Cycle: 249
Cycle: 250
Cycle: 251
Cycle: 252
Cycle: 253
Cycle: 254
Cycle: 255
Cycle: 256
Cycle: 257
Cycle: 258
Cycle: 259
Cycle: 260
Cycle: 261
Cycle: 262

In [None]:
# Load the five uploaded parts (qa_dataset_1.json ... qa_dataset_5.json) back from GitHub, in order.
dfs = [
    pd.read_json(
        f'https://raw.githubusercontent.com/alexk2206/tds_capstone/refs/heads/main/datasets/qa_dataset_{i}.json'
    )
    for i in range(1, 6)
]

# Stack the parts into one frame with a fresh, continuous index.
combined_qa_dataset = pd.concat(dfs, ignore_index=True)

In [None]:
# Report the dimensions of the merged dataset.
print(f'combined_qa_dataset shape: {combined_qa_dataset.shape}')

# Store the merged dataset as a list of JSON records.
combined_qa_dataset.to_json(
    'combined_qa_dataset.json',
    orient='records',
    indent=4,
)

combined_qa_dataset shape: (1381, 6)


In [None]:
# Display 15 randomly drawn rows of the merged dataset for a visual check.
combined_qa_dataset.sample(n=15)

Unnamed: 0,question,type,options,intended_answer,context,difficulty
2,Which language is wanted for communication?,SINGLE_SELECT,"[German, Italian, Japanese , English, Spanish]",[German],"Okay, so for communication, I see we could use...",easy
77,Searches a solution for,MULTI_SELECT,"[Scan business cards, Clean up CRM, Extract da...","[Extract data from emails, Improve CRM data qu...",I'm looking for a solution that could either e...,easy
477,Next steps,SINGLE_SELECT,"[Offer, Meeting, Call]",[Meeting],"Well, I'd say the next step is a meeting. I am...",easy
791,What industry are you operating in?,SINGLE_SELECT,"[Aerospace, Computers & Networks, Government, ...",[Industrial],"Hmm, I'd say I operate in the Industrial secto...",easy
107,What is the size of your company?,SINGLE_SELECT,"[1-10, 11-50, 51-200, 201-2000, larger than 2000]",[11-50],I'd say we're about a '11-50' person company. ...,easy
36,Customer group,SINGLE_SELECT,"[End User, Wholesaler, Distributor, Consultant...",[Planner],"Hmm, a customer group? I guess that could be a...",easy
884,Customer type,SINGLE_SELECT,"[New customer, Existing customer, Partner, App...",[Partner],"Oh, customer type. I guess that would be Partn...",easy
736,Who to copy in follow up,MULTI_SELECT,"[Stephan Maier, Joachim Wagner, Erik Schneider...","[Oliver Eibel, Angelina Haug, Domiki Stein, Ti...","Okay so I guess I should copy Oliver Eibel, An...",easy
1189,Productinterests,MULTI_SELECT,"[BusinessCards, DataEnrichment, VisitReport, D...","[DataEnrichment, VisitReport]","I'm interested in Data Enrichment, because it ...",easy
323,Customer group,SINGLE_SELECT,"[End User, Wholesaler, Distributor, Consultant...",[Planner],Oh I'd say I'm a Planner. That sounds like me.,easy


In [None]:
# Export the final dataset. The generated 'context' and 'difficulty' columns only exist in
# combined_qa_dataset; qa_dataset is still the plain copy of answer_combinations (the five
# parts were created with iloc + reset_index, so filling them never touched qa_dataset).
combined_qa_dataset.to_json('qa_dataset_large.json', orient='records', lines=False, indent=4)