<a href="https://colab.research.google.com/github/alexk2206/tds_capstone/blob/Alex-DEV/answer_combinations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Answer Combinations
Created by: Alexander Keßler

In [1]:
import pandas as pd
import json
import random
from itertools import chain, combinations
from datetime import datetime, timedelta

## Load questionnaires

In [2]:
# Download the five questionnaire JSON files and stack them into one DataFrame.
dfs = []

for i in range(1, 6):
    url = f'https://raw.githubusercontent.com/alexk2206/tds_capstone/refs/heads/main/questionnaires/questionnaire{i}.json'
    # Flatten each question's list of option dicts into one ', '-joined string of their 'option' values
    dfs.append(
        pd.read_json(url).assign(
            options=lambda frame: frame['options'].map(
                lambda entries: ', '.join(entry['option'] for entry in entries)
            )
        )
    )

all_questions = pd.concat(dfs, ignore_index=True)
print(f"all_questions shape: {all_questions.shape}")

all_questions shape: (25, 4)


## Define functions
- function for combination creation of MULTI_SELECT questions
- function for phone number creation
- function for date creation
- function for creation of note taking prompt
- function for processing different types of questions
- function to scale up the number of questions

In [3]:
def generate_combinations(options_list, max_size):
    """Return every combination of `options_list` with 0 up to `max_size` elements.

    The largest combination size is capped at the length of `options_list`.
    Combinations are returned as tuples, ordered by increasing size (the empty
    combination comes first).
    """
    largest_size = min(len(options_list), max_size)
    all_combinations = []
    for size in range(largest_size + 1):
        all_combinations.extend(combinations(options_list, size))
    return all_combinations



def generate_phone_number():
    """Build a random phone number and return it wrapped in a one-element list.

    The number is '01' followed by a random 3-digit number (100-999), then with
    probability 0.5 one extra random digit, then between 6 and 8 random digits.
    """
    pieces = ['01', str(random.randint(100, 999))]

    # Half of the time the prefix gets one additional digit
    if random.random() < 0.5:
        pieces.append(str(random.randint(0, 9)))

    # The length of the main number is drawn before its digits
    digit_count = random.randint(6, 8)
    for _ in range(digit_count):
        pieces.append(str(random.randint(0, 9)))

    return [''.join(pieces)]



def generate_date(today=None):
    """Return a one-element list with a random date from the last two weeks.

    The date lies between 0 and 13 days before `today` (the current date and
    time when `today` is None) and is formatted as 'YYYY-MM-DD'.
    """
    reference_day = datetime.today() if today is None else today
    days_back = timedelta(days=random.randint(0, 13))
    return [(reference_day - days_back).strftime('%Y-%m-%d')]



def generate_notes():
    """Return the placeholder answer for text-based questions as a one-element list."""
    placeholder = 'Add additional information here'
    return [placeholder]


In [4]:
# As we wanted to handle the selection and the free text questions separately, we created these two functions:

def process_selections(row, max_size):
    """Expand one selection-type question into its question-answer pairs.

    `row` must provide 'question', 'options' and 'type'.
    - MULTI_SELECT: one entry per combination returned by `generate_combinations`
      (called with `max_size`).
    - SINGLE_SELECT: one entry per individual option.
    - any other type: no entries.

    Returns a list of dicts with the keys 'question', 'type', 'options' and
    'intended_answer' (always a list).
    """
    question, options_list, question_type = row['question'], row['options'], row['type']

    if question_type == 'MULTI_SELECT':
        intended_answers = [list(combo) for combo in generate_combinations(options_list, max_size=max_size)]
    elif question_type == 'SINGLE_SELECT':
        intended_answers = [[option] for option in options_list]
    else:
        intended_answers = []

    return [
        {'question': question, 'type': question_type, 'options': options_list, 'intended_answer': answer}
        for answer in intended_answers
    ]



def process_freetext(row):
    """Create the question-answer pair for one free-text question.

    `row` must provide 'question', 'options' and 'type'. The intended answer is
    produced by `generate_notes` (TEXT), `generate_phone_number` (NUMBER) or
    `generate_date` (DATE). Any other type yields an empty list.

    Returns a list holding at most one dict with the keys 'question', 'type',
    'options' and 'intended_answer'.
    """
    question, options_list, question_type = row['question'], row['options'], row['type']

    # Guard clause: types without a generator produce no entry
    if question_type not in ('TEXT', 'NUMBER', 'DATE'):
        return []

    if question_type == 'TEXT':
        intended_answer = generate_notes()
    elif question_type == 'NUMBER':
        intended_answer = generate_phone_number()
    else:
        intended_answer = generate_date()

    return [{'question': question, 'type': question_type, 'options': options_list, 'intended_answer': intended_answer}]

In [5]:
def adjust_question_amount(df, column, random_state, min_amount=48, max_amount=64):
    """Resample each group of `column` to a random size between `min_amount` and `max_amount`.

    For every unique value in `column` a target size is drawn with
    `random.randint(min_amount, max_amount)`. Groups smaller than their target
    are sampled with replacement, all others without replacement.
    With the default range of 48 to 64 this was chosen to ensure at least a
    thousand questions in the Q&A dataset.

    Parameters:
        df: DataFrame to resample.
        column: name of the column whose unique values define the groups.
        random_state: seed used for the target sizes and for the row sampling.
        min_amount: smallest allowed group size (inclusive), default 48.
        max_amount: largest allowed group size (inclusive), default 64.

    Returns:
        A new DataFrame with all original columns, the resampled groups stacked
        in group order and a fresh 0..n-1 index.

    Note:
        Calls `random.seed(random_state)`, i.e. it re-seeds the module-level
        `random` generator as a side effect.
    """
    random.seed(random_state)

    def adjust_group(group):
        target_amount = random.randint(min_amount, max_amount)
        # Only sample with replacement when the group cannot supply enough distinct rows
        return group.sample(n=target_amount, replace=len(group) < target_amount, random_state=random_state)

    # Iterate over the groups explicitly instead of using groupby().apply():
    # apply() on the grouping column is deprecated in pandas and emits a warning,
    # while plain iteration always hands over the full group including `column`.
    resized_groups = [adjust_group(group) for _, group in df.groupby(column)]

    if not resized_groups:
        # Nothing to resample (empty input): return an empty frame with the same columns
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(resized_groups).reset_index(drop=True)

## Apply functions on the dataset
- split dataset into selection questions and free text questions
- create intended answers for selection type questions
- scale up free text questions
- create intended answers for free text questions
- append dataset

In [6]:
# Separate the selection questions (MULTI_SELECT and SINGLE_SELECT) from the free text questions (all others).
# For the selection questions, the 'options' string is split into a list and `process_selections` generates
# combinations for MULTI_SELECT and individual options for SINGLE_SELECT questions.
# The expanded data is normalized into a new DataFrame, `selection_intended_answers`, holding the question-answer pairs.

is_selection = all_questions['type'].isin(['MULTI_SELECT', 'SINGLE_SELECT'])

# Take explicit copies: the frames are modified below / in later cells, and writing into a
# boolean-mask slice of `all_questions` triggers pandas' chained-assignment warning.
selection_questions = all_questions[is_selection].copy()
freetext_questions = all_questions[~is_selection].copy()

selection_counts = selection_questions['type'].value_counts()
freetext_counts = freetext_questions['type'].value_counts()

print(f'selection_questions shape: {selection_questions.shape}, counts per type:\n{selection_counts}')
print(f'freetext_questions shape: {freetext_questions.shape}, counts per type:\n{freetext_counts}')

selection_questions['options'] = selection_questions['options'].str.split(', ')
# max_size=6 caps the number of options combined into one MULTI_SELECT answer
expanded_data = selection_questions.apply(lambda row: process_selections(row, max_size=6), axis=1).explode()
selection_intended_answers = pd.json_normalize(expanded_data)
selection_intended_answers.head(15)

selection_questions shape: (21, 4), counts per type:
type
SINGLE_SELECT    12
MULTI_SELECT      9
Name: count, dtype: int64
freetext_questions shape: (4, 4), counts per type:
type
TEXT      2
DATE      1
NUMBER    1
Name: count, dtype: int64


Unnamed: 0,question,type,options,intended_answer
0,Data processing consent,SINGLE_SELECT,"[Yes, No]",[Yes]
1,Data processing consent,SINGLE_SELECT,"[Yes, No]",[No]
2,Customer group,SINGLE_SELECT,"[End User, Wholesaler, Distributor, Consultant...",[End User]
3,Customer group,SINGLE_SELECT,"[End User, Wholesaler, Distributor, Consultant...",[Wholesaler]
4,Customer group,SINGLE_SELECT,"[End User, Wholesaler, Distributor, Consultant...",[Distributor]
5,Customer group,SINGLE_SELECT,"[End User, Wholesaler, Distributor, Consultant...",[Consultant]
6,Customer group,SINGLE_SELECT,"[End User, Wholesaler, Distributor, Consultant...",[Planner]
7,Customer group,SINGLE_SELECT,"[End User, Wholesaler, Distributor, Consultant...",[Architect]
8,Customer group,SINGLE_SELECT,"[End User, Wholesaler, Distributor, Consultant...",[R&D]
9,Products interested in,MULTI_SELECT,"[MY-SYSTEM, Notion, JTS, JS EcoLine, AKW100, A...",[]


In [7]:
# Scale the `selection_intended_answers` DataFrame.
# The questions can already be scaled here, because every intended answer is already unique.
# `adjust_question_amount` ensures that each unique question occurs between 48 and 64 times.

selection_intended_answers_scaled = adjust_question_amount(
    df=selection_intended_answers,
    column='question',
    random_state=1,
)

selection_intended_answers_scaled_counts = selection_intended_answers_scaled.loc[:, 'type'].value_counts()
print(f'selection_questions shape: {selection_intended_answers_scaled.shape}, counts per type:\n{selection_intended_answers_scaled_counts}')

selection_questions shape: (1172, 4), counts per type:
type
SINGLE_SELECT    663
MULTI_SELECT     509
Name: count, dtype: int64


  return df.groupby(column, group_keys=False).apply(adjust_group).reset_index(drop=True)


In [8]:
# Scale the `freetext_questions` DataFrame into `freetext_questions_scaled`.
# Here we have to scale before generating intended answers in order to get different intended answers.
# If we had generated them beforehand, every intended answer would have been the same.
# `adjust_question_amount` ensures that each unique question occurs between 48 and 64 times.

freetext_questions_scaled = adjust_question_amount(freetext_questions, 'question', 1)

freetext_questions_scaled_counts = freetext_questions_scaled['type'].value_counts()
# Label the output with the frame that is actually reported (was mislabelled as selection_questions)
print(f'freetext_questions_scaled shape: {freetext_questions_scaled.shape}, counts per type:\n{freetext_questions_scaled_counts}')

selection_questions shape: (209, 4), counts per type:
type
TEXT      102
NUMBER     56
DATE       51
Name: count, dtype: int64


  return df.groupby(column, group_keys=False).apply(adjust_group).reset_index(drop=True)


In [9]:
# Split the comma-separated 'options' strings into lists.
# The result is kept in its own Series instead of overwriting the column of
# `freetext_questions_scaled`, so re-running this cell does not try to split lists again.
freetext_options = freetext_questions_scaled['options'].str.split(', ')

# Accumulate one question-answer record per scaled free-text row
expanded_rows = []

for question, question_type, options_list in zip(
    freetext_questions_scaled['question'],
    freetext_questions_scaled['type'],
    freetext_options,
):
    row = {'question': question, 'type': question_type, 'options': options_list}

    # `process_freetext` generates a phone number (NUMBER), the default note (TEXT)
    # or a random date (DATE) and returns an empty list for every other type.
    records = process_freetext(row)

    # For other types (i.e., if no specific condition matched), use the options list as the intended answer
    if not records:
        records = [{'question': question, 'type': question_type, 'options': options_list, 'intended_answer': options_list}]

    expanded_rows.extend(records)

freetext_intended_answer_scaled = pd.DataFrame(expanded_rows)

In [10]:
# Combine the selection and free text intended answer DataFrames into one.
# Shuffle the combined DataFrame (frac=1 means all rows are shuffled) with a fixed seed and reset the index.

combined_df = pd.concat([selection_intended_answers_scaled, freetext_intended_answer_scaled], ignore_index=True)
answer_combinations = combined_df.sample(frac=1, random_state=1).reset_index(drop=True)

print(f'answer_combinations shape: {answer_combinations.shape}')
# Seed the preview too, so the displayed rows are identical on every run
answer_combinations.sample(25, random_state=1)

answer_combinations shape: (1381, 4)


Unnamed: 0,question,type,options,intended_answer
645,What type of company is it?,SINGLE_SELECT,"[Construction company, Craft enterprises, Scaf...",[Education sector]
598,Productinterests,MULTI_SELECT,"[BusinessCards, DataEnrichment, VisitReport, D...","[VisitReport, Data Cleansing]"
365,Products interested in,MULTI_SELECT,"[MY-SYSTEM, Notion, JTS, JS EcoLine, AKW100, A...","[Notion, JTS, AKW100, AX100]"
1106,What is the contact person interested in?,MULTI_SELECT,"[100 Additive Manufacturing, 200 Automation, 3...","[100 Additive Manufacturing, 300 Advanced Manu..."
306,What kind of follow up is planned,MULTI_SELECT,"[Email, Phone, Schedule a Visit, No action]","[Email, Phone, No action]"
821,Would you like to receive marketing informatio...,SINGLE_SELECT,"[Yes, No]",[Yes]
1263,Next steps,SINGLE_SELECT,"[Offer, Meeting, Call]",[Offer]
1322,Which language is wanted for communication?,SINGLE_SELECT,"[German, Italian, Japanese , English, Spanish]",[German]
458,What is the contact person interested in?,MULTI_SELECT,"[100 Additive Manufacturing, 200 Automation, 3...","[100 Additive Manufacturing, 300 Advanced Manu..."
1130,What is the size of your company?,SINGLE_SELECT,"[1-10, 11-50, 51-200, 201-2000, larger than 2000]",[11-50]


In [11]:
# Write the final dataset to disk as a JSON array with one object per row
answer_combinations.to_json(path_or_buf='answer_combinations.json', orient='records')