<a href="https://colab.research.google.com/github/alexk2206/tds_capstone/blob/Alex-DEV/QA_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Generation of the Q and A Dataset**

In [39]:
import pandas as pd
import random
import json

import google.generativeai as genai
from IPython.display import display, Markdown
from google.colab import userdata
import time

In [40]:
url ="https://raw.githubusercontent.com/alexk2206/tds_capstone/refs/heads/main/datasets/answer_combinations.json"

answer_combinations = pd.read_json(url)
print(f"answer_combinations shape: {answer_combinations.shape}")
answer_combinations.sample(15)

answer_combinations shape: (1381, 4)


Unnamed: 0,question,type,options,intended_answer
963,When do you wish to receive a follow-up?,DATE,[Date],[2025-01-19]
342,When do you wish to receive a follow-up?,DATE,[Date],[2025-01-24]
123,Which language is wanted for communication?,SINGLE_SELECT,"[German, Italian, Japanese , English, Spanish]",[English]
1166,Customer satisfaction,SINGLE_SELECT,"[Very satisfied, Satisfied, Unsatisfied, Very ...",[Very unsatisfied]
1062,What phone number can we use for contact?,NUMBER,[phone number],[0162131396937]
1212,When do you wish to receive a follow-up?,DATE,[Date],[2025-01-26]
1001,Customer satisfaction,SINGLE_SELECT,"[Very satisfied, Satisfied, Unsatisfied, Very ...",[Satisfied]
1148,Products interested in,MULTI_SELECT,"[MY-SYSTEM, Notion, JTS, JS EcoLine, AKW100, A...","[MY-SYSTEM, Notion, JTS, JS EcoLine, AX100]"
91,Size of the trade fair team (on average),SINGLE_SELECT,"[1-5, 6-10, 11-15, 16-20, 21-30, 31-40, more t...",[21-30]
781,When does the contact person wish to receive a...,MULTI_SELECT,"[1 week, 2 weeks, 3 weeks]","[1 week, 2 weeks]"


In [66]:
key = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=key)
model = genai.GenerativeModel("gemini-2.0-flash-exp")

In [74]:
max_output_tokens = 48

def generate_selection_answer_easy(question, intended_answer):
  prompt = f"""
  You are asked a question, and you need to provide a natural, conversational answer in the first person. Do not use special characters other than ',' and '.'.
  Act like you really do not know which options there are and the intended answer is your answer.
  When given a range, use a number between the two values.
  Be concise but clear, and avoid unnecessary elaboration. Use up to {max_output_tokens} tokens.
  Question: {question}\n
  Intended answer: {intended_answer}\n
  Answer as a sentence, mentioning and explaining all the provided options:
  """

  #prompt = f"""
  #You are being asked a question and must answer in a way that sounds like you're speaking in the first person.
  #Your response should be varied and sound natural, like you're being interviewed without talking unnecessarily much. Use actual numbers instead of writing them out, but only if necessary.
  #Answer in present tense like you are having a conversation, use up to {max_output_tokens} tokens but keep it short. When the intended answer is a range, pick a number inbetween.
  #Question: {question}\n
  #Answer: {intended_answer}\n
  #Answer as a sentence:
  #"""

  response = model.generate_content(
      contents = prompt,
      generation_config = genai.GenerationConfig(
          max_output_tokens=max_output_tokens,
          temperature=2)
    )

  answer = response.text.strip()

  time.sleep(6)

  return {"answer": answer, "difficulty": "easy"}

def generate_number_answer_easy(question, intended_answer):
  prompt = f"""
  You are being asked for contact information, and your response should be clear and concise, as if you're giving someone your phone number and how you can be reached in a conversation.
  Mention the provided phone number and ensure your response sounds natural and professional.
  Your answer should be in the first person, present tense, and only include the relevant details. Use up to {max_output_tokens} tokens.
  Question: {question}\n
  Options: {intended_answer}\n
  Answer as a sentence, providing the phone number and any relevant details:
  """

  response = model.generate_content(
      contents = prompt,
      generation_config = genai.GenerationConfig(
          max_output_tokens=max_output_tokens,
          temperature=2)
    )

  answer = response.text.strip()
  time.sleep(6)

  return {"answer": answer, "difficulty": "easy"}


def generate_freetext_answer_easy(question, intended_answer):
  prompt = f"""
  You are being asked if you have any additional notes or information to share.
  Your response should sound natural, in the first person, and can be either brief or more detailed, depending on the situation.
  You can provide additional information but you don't have to and mention it clearly and politely.
  If there isn't anything else to add, express that in a conversational manner. Use up to {max_output_tokens} tokens.
  Question: {question}\n
  Options: {intended_answer}
  Answer as a sentence, providing any additional information or politely stating that there's nothing else to add:
  """

  response = model.generate_content(
      contents = prompt,
      generation_config = genai.GenerationConfig(
          max_output_tokens=max_output_tokens,
          temperature=2)
    )

  answer = response.text.strip()
  time.sleep(6)

  return {"answer": answer, "difficulty": "easy"}


def generate_date_answer_easy(question, intended_answer):
  prompt = f"""
  You are asked a question about a specific date, and you need to provide a natural, conversational answer in the first person.
  Include the date from the intended answer in your response, phrasing it naturally as if you're suggesting a meeting.
  Be concise but clear, and use up to {max_output_tokens} tokens.
  Question: {question}\n
  Intended Answer: {intended_answer}\n
  Context: Provide a conversational response mentioning the date in a natural way:
  """

  response = model.generate_content(
      contents = prompt,
      generation_config = genai.GenerationConfig(
          max_output_tokens=max_output_tokens,
          temperature=2)
    )

  answer = response.text.strip()
  time.sleep(6)

  return {"answer": answer, "difficulty": "easy"}

In [75]:
def generate_freetext_answer_medium(question, intended_answer):
  prompt = f"""
  You are being asked if you have any additional notes or information to share.
  Your response should sound natural, in the first person, and can be either brief or more detailed, depending on the situation.
  You should provide additional information and mention it clearly and politely.
  If there isn't anything else to add (which is sometimes the case), express that in a conversational manner. Use up to {max_output_tokens} tokens.
  Question: {question}\n
  Options: {intended_answer}
  Answer as a sentence, providing any additional information or politely stating that there's nothing else to add:
  """

  response = model.generate_content(
      contents = prompt,
      generation_config = genai.GenerationConfig(
          max_output_tokens=max_output_tokens,
          temperature=2)
    )

  answer = response.text.strip()
  time.sleep(6)

  return {"answer": answer, "difficulty": "medium"}

In [76]:
cycle_count = 0

def generate_answer_for_row(row):
    global cycle_count
    cycle_count += 1
    print(f"Cycle: {cycle_count}")

    question = row['question']
    intended_answer = row['intended_answer']
    question_type = row['type']

    if question_type in ['SINGLE_SELECT', 'MULTI_SELECT']:
        return generate_selection_answer_easy(question, intended_answer)
    elif question_type == 'NUMBER':
        return generate_number_answer_easy(question, intended_answer)
    elif question_type == 'TEXT':
        return generate_freetext_answer_easy(question, intended_answer)
    elif question_type == 'DATE':
        return generate_date_answer_easy(question, intended_answer)
    else:
        return {"answer": "Unknown question type", "difficulty": "unknown"}

In [77]:
cycle_count = 0

# Sampling
sample_type = "MULTI_SELECT"
sample_size = 5
sample_question = "Size of the trade fair team (on average)"

answer_combinations_filtered = answer_combinations[answer_combinations['type'] == sample_type]
#answer_combinations_filtered = answer_combinations[answer_combinations['question'] == sample_question]
#answer_combinations_filtered = answer_combinations.copy()

sampled_questions = answer_combinations_filtered.sample(n=min(sample_size, len(answer_combinations_filtered))).reset_index(drop=True)

sampled_questions[['context', 'difficulty']] = sampled_questions.apply(lambda row: pd.Series(generate_answer_for_row(row)), axis=1)

sampled_questions

Cycle: 1
Cycle: 2
Cycle: 3
Cycle: 4
Cycle: 5


Unnamed: 0,question,type,options,intended_answer,context,difficulty
0,What is the type of contact?,MULTI_SELECT,"[Existing customer, Supplier, New customer / P...","[Existing customer, New customer / Prospect, P...","Hmm, well it could be an existing customer, or...",easy
1,What is the contact person interested in?,MULTI_SELECT,"[100 Additive Manufacturing, 200 Automation, 3...","[100 Additive Manufacturing, 200 Automation, 2...","Well, I think they might like 100 Additive Man...",easy
2,Products interested in,MULTI_SELECT,"[MY-SYSTEM, Notion, JTS, JS EcoLine, AKW100, A...","[MY-SYSTEM, Notion, JTS, JS EcoLine, AKW100]","Well, I'm interested in MY-SYSTEM, and Notion,...",easy
3,Productinterests,MULTI_SELECT,"[BusinessCards, DataEnrichment, VisitReport, D...","[BusinessCards, VisitReport]","Okay, I think I'm interested in BusinessCards ...",easy
4,What is the type of contact?,MULTI_SELECT,"[Existing customer, Supplier, New customer / P...",[Competitor],"Hmm, I think it must be a Competitor contact.",easy


In [18]:
sampled_questions.to_json('sampled_qa_dataset_easy.json', orient='records', indent=4)

In [59]:
qa_dataset = answer_combinations.copy()
print(qa_dataset.shape)

split_size = len(qa_dataset) // 5
print(f"Split size: {split_size}")

qa_dataset_1 = qa_dataset.iloc[:split_size].reset_index(drop=True)
qa_dataset_2 = qa_dataset.iloc[split_size:2 * split_size].reset_index(drop=True)
qa_dataset_3 = qa_dataset.iloc[2 * split_size:3 * split_size].reset_index(drop=True)
qa_dataset_4 = qa_dataset.iloc[3 * split_size:4 * split_size].reset_index(drop=True)
qa_dataset_5 = qa_dataset.iloc[4 * split_size:].reset_index(drop=True)

print(f"qa_dataset_1 has {len(qa_dataset_1)} rows, qa_dataset_2 has {len(qa_dataset_2)} rows, qa_dataset_3 has {len(qa_dataset_3)} rows, qa_dataset_4 has {len(qa_dataset_4)} rows, and qa_dataset_5 has {len(qa_dataset_5)} rows.")


(1381, 4)
Split size: 276
qa_dataset_1 has 276 rows, qa_dataset_2 has 276 rows, qa_dataset_3 has 276 rows, qa_dataset_4 has 276 rows, and qa_dataset_5 has 277 rows.


In [60]:
cycle_count = 0
qa_dataset_1[['context', 'difficulty']] = qa_dataset_1.apply(lambda row: pd.Series(generate_answer_for_row(row)), axis=1)

Cycle: 1
Cycle: 2
Cycle: 3
Cycle: 4
Cycle: 5
Cycle: 6
Cycle: 7
Cycle: 8
Cycle: 9
Cycle: 10
Cycle: 11
Cycle: 12
Cycle: 13
Cycle: 14
Cycle: 15
Cycle: 16
Cycle: 17
Cycle: 18
Cycle: 19
Cycle: 20
Cycle: 21
Cycle: 22
Cycle: 23
Cycle: 24
Cycle: 25
Cycle: 26
Cycle: 27
Cycle: 28
Cycle: 29
Cycle: 30
Cycle: 31
Cycle: 32
Cycle: 33
Cycle: 34
Cycle: 35
Cycle: 36
Cycle: 37
Cycle: 38
Cycle: 39
Cycle: 40
Cycle: 41
Cycle: 42
Cycle: 43
Cycle: 44
Cycle: 45
Cycle: 46
Cycle: 47
Cycle: 48
Cycle: 49
Cycle: 50
Cycle: 51
Cycle: 52
Cycle: 53
Cycle: 54
Cycle: 55
Cycle: 56
Cycle: 57
Cycle: 58
Cycle: 59
Cycle: 60
Cycle: 61
Cycle: 62
Cycle: 63
Cycle: 64
Cycle: 65
Cycle: 66
Cycle: 67
Cycle: 68
Cycle: 69
Cycle: 70
Cycle: 71
Cycle: 72
Cycle: 73
Cycle: 74
Cycle: 75
Cycle: 76
Cycle: 77
Cycle: 78
Cycle: 79
Cycle: 80
Cycle: 81
Cycle: 82
Cycle: 83
Cycle: 84
Cycle: 85
Cycle: 86
Cycle: 87
Cycle: 88
Cycle: 89
Cycle: 90
Cycle: 91
Cycle: 92
Cycle: 93
Cycle: 94
Cycle: 95
Cycle: 96
Cycle: 97
Cycle: 98
Cycle: 99
Cycle: 100
Cycle: 1

In [63]:
qa_dataset_1.to_json('qa_dataset_1.json', orient='records', indent=4)

In [78]:
cycle_count = 0
qa_dataset_2[['context', 'difficulty']] = qa_dataset_2.apply(lambda row: pd.Series(generate_answer_for_row(row)), axis=1)
qa_dataset_2.to_json('qa_dataset_2.json', orient='records', indent=4)

Cycle: 1
Cycle: 2
Cycle: 3
Cycle: 4
Cycle: 5
Cycle: 6
Cycle: 7
Cycle: 8
Cycle: 9
Cycle: 10
Cycle: 11
Cycle: 12
Cycle: 13
Cycle: 14
Cycle: 15
Cycle: 16
Cycle: 17
Cycle: 18
Cycle: 19
Cycle: 20
Cycle: 21
Cycle: 22
Cycle: 23
Cycle: 24
Cycle: 25
Cycle: 26
Cycle: 27
Cycle: 28
Cycle: 29
Cycle: 30
Cycle: 31
Cycle: 32
Cycle: 33
Cycle: 34
Cycle: 35
Cycle: 36
Cycle: 37
Cycle: 38
Cycle: 39
Cycle: 40
Cycle: 41
Cycle: 42
Cycle: 43
Cycle: 44
Cycle: 45
Cycle: 46
Cycle: 47
Cycle: 48
Cycle: 49
Cycle: 50
Cycle: 51
Cycle: 52
Cycle: 53
Cycle: 54
Cycle: 55
Cycle: 56
Cycle: 57
Cycle: 58
Cycle: 59
Cycle: 60
Cycle: 61
Cycle: 62
Cycle: 63
Cycle: 64
Cycle: 65
Cycle: 66
Cycle: 67
Cycle: 68
Cycle: 69
Cycle: 70
Cycle: 71


ERROR:tornado.access:503 POST /v1beta/models/gemini-2.0-flash-exp:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 886.15ms


Cycle: 72
Cycle: 73
Cycle: 74
Cycle: 75
Cycle: 76
Cycle: 77
Cycle: 78
Cycle: 79
Cycle: 80
Cycle: 81
Cycle: 82
Cycle: 83
Cycle: 84
Cycle: 85
Cycle: 86
Cycle: 87
Cycle: 88
Cycle: 89
Cycle: 90
Cycle: 91
Cycle: 92
Cycle: 93
Cycle: 94
Cycle: 95
Cycle: 96
Cycle: 97
Cycle: 98
Cycle: 99
Cycle: 100
Cycle: 101
Cycle: 102
Cycle: 103
Cycle: 104
Cycle: 105
Cycle: 106
Cycle: 107
Cycle: 108
Cycle: 109
Cycle: 110
Cycle: 111
Cycle: 112
Cycle: 113
Cycle: 114
Cycle: 115
Cycle: 116
Cycle: 117
Cycle: 118
Cycle: 119
Cycle: 120
Cycle: 121
Cycle: 122
Cycle: 123
Cycle: 124
Cycle: 125
Cycle: 126
Cycle: 127
Cycle: 128
Cycle: 129
Cycle: 130
Cycle: 131
Cycle: 132
Cycle: 133
Cycle: 134
Cycle: 135
Cycle: 136
Cycle: 137
Cycle: 138
Cycle: 139
Cycle: 140
Cycle: 141
Cycle: 142
Cycle: 143
Cycle: 144
Cycle: 145
Cycle: 146
Cycle: 147
Cycle: 148
Cycle: 149
Cycle: 150
Cycle: 151
Cycle: 152
Cycle: 153
Cycle: 154
Cycle: 155


ERROR:tornado.access:503 POST /v1beta/models/gemini-2.0-flash-exp:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 507.19ms


Cycle: 156
Cycle: 157
Cycle: 158
Cycle: 159
Cycle: 160
Cycle: 161
Cycle: 162
Cycle: 163
Cycle: 164
Cycle: 165
Cycle: 166
Cycle: 167
Cycle: 168
Cycle: 169
Cycle: 170
Cycle: 171
Cycle: 172
Cycle: 173
Cycle: 174
Cycle: 175


ERROR:tornado.access:503 POST /v1beta/models/gemini-2.0-flash-exp:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 835.48ms


Cycle: 176
Cycle: 177
Cycle: 178
Cycle: 179
Cycle: 180
Cycle: 181
Cycle: 182
Cycle: 183
Cycle: 184
Cycle: 185
Cycle: 186
Cycle: 187
Cycle: 188
Cycle: 189
Cycle: 190
Cycle: 191
Cycle: 192
Cycle: 193
Cycle: 194
Cycle: 195
Cycle: 196
Cycle: 197
Cycle: 198
Cycle: 199
Cycle: 200
Cycle: 201
Cycle: 202
Cycle: 203


ERROR:tornado.access:503 POST /v1beta/models/gemini-2.0-flash-exp:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 407.14ms


Cycle: 204
Cycle: 205
Cycle: 206
Cycle: 207
Cycle: 208
Cycle: 209
Cycle: 210
Cycle: 211
Cycle: 212
Cycle: 213
Cycle: 214
Cycle: 215
Cycle: 216
Cycle: 217
Cycle: 218
Cycle: 219
Cycle: 220
Cycle: 221
Cycle: 222
Cycle: 223
Cycle: 224
Cycle: 225
Cycle: 226
Cycle: 227
Cycle: 228
Cycle: 229
Cycle: 230
Cycle: 231
Cycle: 232
Cycle: 233
Cycle: 234
Cycle: 235
Cycle: 236
Cycle: 237
Cycle: 238
Cycle: 239
Cycle: 240
Cycle: 241
Cycle: 242
Cycle: 243
Cycle: 244
Cycle: 245
Cycle: 246
Cycle: 247
Cycle: 248
Cycle: 249
Cycle: 250
Cycle: 251
Cycle: 252
Cycle: 253
Cycle: 254
Cycle: 255
Cycle: 256
Cycle: 257
Cycle: 258
Cycle: 259
Cycle: 260
Cycle: 261
Cycle: 262
Cycle: 263
Cycle: 264
Cycle: 265
Cycle: 266
Cycle: 267
Cycle: 268
Cycle: 269
Cycle: 270
Cycle: 271
Cycle: 272
Cycle: 273
Cycle: 274
Cycle: 275
Cycle: 276


In [79]:
cycle_count = 0
qa_dataset_3[['context', 'difficulty']] = qa_dataset_3.apply(lambda row: pd.Series(generate_answer_for_row(row)), axis=1)
qa_dataset_3.to_json('qa_dataset_3.json', orient='records', indent=4)

Cycle: 1
Cycle: 2
Cycle: 3
Cycle: 4
Cycle: 5
Cycle: 6
Cycle: 7
Cycle: 8
Cycle: 9
Cycle: 10
Cycle: 11
Cycle: 12
Cycle: 13
Cycle: 14
Cycle: 15
Cycle: 16
Cycle: 17
Cycle: 18
Cycle: 19
Cycle: 20
Cycle: 21
Cycle: 22
Cycle: 23
Cycle: 24
Cycle: 25
Cycle: 26
Cycle: 27
Cycle: 28
Cycle: 29
Cycle: 30
Cycle: 31
Cycle: 32
Cycle: 33
Cycle: 34
Cycle: 35
Cycle: 36
Cycle: 37
Cycle: 38
Cycle: 39
Cycle: 40
Cycle: 41
Cycle: 42
Cycle: 43
Cycle: 44
Cycle: 45
Cycle: 46
Cycle: 47
Cycle: 48
Cycle: 49
Cycle: 50
Cycle: 51
Cycle: 52
Cycle: 53
Cycle: 54
Cycle: 55
Cycle: 56
Cycle: 57
Cycle: 58
Cycle: 59
Cycle: 60
Cycle: 61
Cycle: 62
Cycle: 63
Cycle: 64
Cycle: 65
Cycle: 66
Cycle: 67
Cycle: 68
Cycle: 69
Cycle: 70
Cycle: 71
Cycle: 72
Cycle: 73
Cycle: 74
Cycle: 75
Cycle: 76
Cycle: 77
Cycle: 78
Cycle: 79
Cycle: 80
Cycle: 81
Cycle: 82
Cycle: 83
Cycle: 84
Cycle: 85
Cycle: 86
Cycle: 87
Cycle: 88
Cycle: 89
Cycle: 90
Cycle: 91
Cycle: 92
Cycle: 93
Cycle: 94
Cycle: 95
Cycle: 96
Cycle: 97
Cycle: 98
Cycle: 99
Cycle: 100
Cycle: 1

ERROR:tornado.access:503 POST /v1beta/models/gemini-2.0-flash-exp:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 607.95ms


Cycle: 225
Cycle: 226
Cycle: 227
Cycle: 228
Cycle: 229
Cycle: 230
Cycle: 231
Cycle: 232
Cycle: 233
Cycle: 234
Cycle: 235
Cycle: 236
Cycle: 237
Cycle: 238
Cycle: 239
Cycle: 240
Cycle: 241
Cycle: 242
Cycle: 243
Cycle: 244
Cycle: 245
Cycle: 246
Cycle: 247
Cycle: 248
Cycle: 249
Cycle: 250
Cycle: 251
Cycle: 252
Cycle: 253
Cycle: 254
Cycle: 255
Cycle: 256
Cycle: 257
Cycle: 258
Cycle: 259
Cycle: 260
Cycle: 261
Cycle: 262
Cycle: 263
Cycle: 264
Cycle: 265
Cycle: 266
Cycle: 267
Cycle: 268
Cycle: 269
Cycle: 270
Cycle: 271
Cycle: 272
Cycle: 273
Cycle: 274
Cycle: 275
Cycle: 276


In [80]:
cycle_count = 0
qa_dataset_4[['context', 'difficulty']] = qa_dataset_4.apply(lambda row: pd.Series(generate_answer_for_row(row)), axis=1)
qa_dataset_4.to_json('qa_dataset_4.json', orient='records', indent=4)

Cycle: 1
Cycle: 2
Cycle: 3
Cycle: 4
Cycle: 5
Cycle: 6
Cycle: 7
Cycle: 8
Cycle: 9
Cycle: 10
Cycle: 11
Cycle: 12
Cycle: 13
Cycle: 14
Cycle: 15
Cycle: 16
Cycle: 17
Cycle: 18
Cycle: 19
Cycle: 20
Cycle: 21
Cycle: 22
Cycle: 23
Cycle: 24
Cycle: 25
Cycle: 26
Cycle: 27
Cycle: 28
Cycle: 29
Cycle: 30
Cycle: 31
Cycle: 32
Cycle: 33
Cycle: 34
Cycle: 35
Cycle: 36
Cycle: 37
Cycle: 38
Cycle: 39
Cycle: 40
Cycle: 41
Cycle: 42
Cycle: 43
Cycle: 44
Cycle: 45
Cycle: 46
Cycle: 47
Cycle: 48
Cycle: 49
Cycle: 50
Cycle: 51
Cycle: 52
Cycle: 53
Cycle: 54
Cycle: 55
Cycle: 56
Cycle: 57
Cycle: 58
Cycle: 59
Cycle: 60
Cycle: 61
Cycle: 62
Cycle: 63
Cycle: 64
Cycle: 65
Cycle: 66
Cycle: 67
Cycle: 68
Cycle: 69
Cycle: 70
Cycle: 71
Cycle: 72
Cycle: 73
Cycle: 74
Cycle: 75
Cycle: 76
Cycle: 77
Cycle: 78
Cycle: 79
Cycle: 80
Cycle: 81
Cycle: 82
Cycle: 83
Cycle: 84
Cycle: 85
Cycle: 86
Cycle: 87
Cycle: 88
Cycle: 89
Cycle: 90
Cycle: 91
Cycle: 92
Cycle: 93
Cycle: 94
Cycle: 95
Cycle: 96
Cycle: 97
Cycle: 98
Cycle: 99
Cycle: 100
Cycle: 1

In [91]:
cycle_count = 0
qa_dataset_5[['context', 'difficulty']] = qa_dataset_5.apply(lambda row: pd.Series(generate_answer_for_row(row)), axis=1)
qa_dataset_5.to_json('qa_dataset_5.json', orient='records', indent=4)

Cycle: 1




TooManyRequests: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-exp:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota).

In [86]:
combined_qa_dataset = pd.concat([qa_dataset_1, qa_dataset_2, qa_dataset_3, qa_dataset_4], ignore_index=True)
print(combined_qa_dataset.shape)
combined_qa_dataset.to_json('combined_qa_dataset.json', orient='records', indent=4)

(1104, 6)


In [88]:
combined_qa_dataset.sample(15)

Unnamed: 0,question,type,options,intended_answer,context,difficulty
18,What is the contact person interested in?,MULTI_SELECT,"[100 Additive Manufacturing, 200 Automation, 3...","[100 Additive Manufacturing, 300 Advanced Manu...","Oh, I think they're interested in either '100 ...",easy
798,Customer type,SINGLE_SELECT,"[New customer, Existing customer, Partner, App...",[Existing customer],"Oh, customer type? Hmm, I guess I'd say I'm an...",easy
259,When does the contact person wish to receive a...,MULTI_SELECT,"[1 week, 2 weeks, 3 weeks]","[1 week, 3 weeks]","Oh, I think they'd like a follow up in either ...",easy
51,CRM-System,SINGLE_SELECT,"[Salesforce, Pipedrive, Close.io, Microsoft Dy...",[Pipedrive],"Hmm, a CRM system? I guess a good choice would...",easy
372,What is the contact person interested in?,MULTI_SELECT,"[100 Additive Manufacturing, 200 Automation, 3...","[200 Automation, 300 Advanced Manufacturing, 2...","Oh gosh, I'm not really sure. Maybe they like ...",easy
735,CRM-System,SINGLE_SELECT,"[Salesforce, Pipedrive, Close.io, Microsoft Dy...",[CAS],"Oh, hmm. I'm not sure what the options are, bu...",easy
599,Customer satisfaction,SINGLE_SELECT,"[Very satisfied, Satisfied, Unsatisfied, Very ...",[Satisfied],Well I suppose I'd have to say I'm satisfied. ...,easy
807,Customer satisfaction,SINGLE_SELECT,"[Very satisfied, Satisfied, Unsatisfied, Very ...",[Unsatisfied],Oh I'm not really sure but maybe unsatisfied i...,easy
591,What is the type of contact?,MULTI_SELECT,"[Existing customer, Supplier, New customer / P...","[Existing customer, Supplier, Competitor]","I'm not really sure, it could be an existing c...",easy
670,What is the size of your company?,SINGLE_SELECT,"[1-10, 11-50, 51-200, 201-2000, larger than 2000]",[1-10],Oh gosh I'm not sure about the exact size. I'd...,easy


In [None]:
qa_dataset.to_json('qa_dataset_large.json', orient='records', lines=False, indent=4)