<a href="https://colab.research.google.com/github/alexk2206/tds_capstone/blob/Alex-DEV/answer_combinations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [57]:
import pandas as pd
import json
import random
from itertools import chain, combinations
from datetime import datetime, timedelta

In [58]:
# Download the questionnaire JSON files 1..5 and stack them into one frame.
dfs = []

for i in range(1, 6):
    url = (
        "https://raw.githubusercontent.com/alexk2206/tds_capstone/"
        f"refs/heads/main/questionnaires/questionnaire{i}.json"
    )
    df = pd.read_json(url)
    # Each cell of `options` is a list of dicts; keep only the 'option' text
    # of every entry and flatten it into one comma-separated string.
    df['options'] = df['options'].map(
        lambda entries: ", ".join(entry['option'] for entry in entries)
    )
    dfs.append(df)

all_questions = pd.concat(dfs, ignore_index=True)

In [59]:
def generate_combinations(options_list, max_size):
    """Return every combination of ``options_list`` with 0 to ``max_size`` items.

    Combinations are emitted as tuples, ordered by size first (starting with
    the empty tuple) and, within one size, in the order produced by
    ``itertools.combinations``. The size is capped at ``len(options_list)``.
    """
    largest_size = min(len(options_list), max_size)
    subsets = []
    for size in range(largest_size + 1):
        subsets.extend(combinations(options_list, size))
    return subsets

def generate_phone_number():
    """Build a random phone number string made of digits only.

    Layout: the literal "01", three random digits (100-999), an optional
    extra digit (added with probability 0.5), then 6 to 8 random digits.
    Uses the module-level ``random`` generator.
    """
    parts = ["01", str(random.randint(100, 999))]
    # Half of the numbers get a four-digit instead of a three-digit prefix tail
    if random.random() < 0.5:
        parts.append(str(random.randint(0, 9)))

    main_length = random.randint(6, 8)
    for _ in range(main_length):
        parts.append(str(random.randint(0, 9)))

    return "".join(parts)

def generate_date(today=None):
    """Return a random date from the last 14 days as a ``YYYY-MM-DD`` string.

    Args:
        today: Reference ``datetime``; defaults to ``datetime.today()``.

    Returns:
        The reference date moved back by 0 to 13 days (inclusive), formatted
        with ``"%Y-%m-%d"``.
    """
    reference = datetime.today() if today is None else today
    days_back = random.randint(0, 13)
    return (reference - timedelta(days=days_back)).strftime("%Y-%m-%d")


In [60]:
# Turn the comma-separated option strings into lists of options.
# Only string values are split: values that are already lists (because this
# cell, or another cell performing the same split, ran before) are kept as-is.
# Calling `.str.split` on list values would silently replace them with NaN.
all_questions['options'] = all_questions['options'].apply(
    lambda value: value.split(', ') if isinstance(value, str) else value
)

expanded_rows = []

# Iterate over all rows of the DataFrame and emit one row per intended answer
for question, options_list, question_type in zip(
    all_questions['question'], all_questions['options'], all_questions['type']
):
    if question_type == 'MULTI_SELECT':
        # Every subset of the options with at most 6 entries (including the empty one)
        for combo in generate_combinations(options_list, max_size=6):
            expanded_rows.append({
                'question': question,
                'type': question_type,
                'options': options_list,
                'intended_answer': list(combo),
            })

    elif question_type == 'SINGLE_SELECT':
        # Exactly one option per intended answer
        for option in options_list:
            expanded_rows.append({
                'question': question,
                'type': question_type,
                'options': options_list,
                'intended_answer': [option],
            })

    # Questions of any other type produce no rows in this cell.

answer_combinations = pd.DataFrame(expanded_rows)

In [61]:
# Inspect the expanded frame: column dtypes and non-null counts first,
# then the (rows, columns) shape, and finally the frame itself as cell output.
answer_combinations.info()
print(answer_combinations.shape)
answer_combinations

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4439 entries, 0 to 4438
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   question         4439 non-null   object
 1   type             4439 non-null   object
 2   options          4439 non-null   object
 3   intended_answer  4439 non-null   object
dtypes: object(4)
memory usage: 138.8+ KB
(4439, 4)


Unnamed: 0,question,type,options,intended_answer
0,Data processing consent,SINGLE_SELECT,"[Yes, No]",[Yes]
1,Data processing consent,SINGLE_SELECT,"[Yes, No]",[No]
2,Customer group,SINGLE_SELECT,"[End User, Wholesaler, Distributor, Consultant...",[End User]
3,Customer group,SINGLE_SELECT,"[End User, Wholesaler, Distributor, Consultant...",[Wholesaler]
4,Customer group,SINGLE_SELECT,"[End User, Wholesaler, Distributor, Consultant...",[Distributor]
...,...,...,...,...
4434,Searches a solution for,MULTI_SELECT,"[Scan business cards, Clean up CRM, Extract da...","[Clean up CRM, Extract data from emails, Impro..."
4435,Searches a solution for,MULTI_SELECT,"[Scan business cards, Clean up CRM, Extract da...","[Scan business cards, Clean up CRM, Extract da..."
4436,Next steps,SINGLE_SELECT,"[Offer, Meeting, Call]",[Offer]
4437,Next steps,SINGLE_SELECT,"[Offer, Meeting, Call]",[Meeting]


In [52]:
def adjust_question_amount(df, column, random_state, min_amount=32, max_amount=48):
    """Resample every group of ``df`` to a randomly chosen number of rows.

    The frame is grouped by ``column``. For each group (in sorted key order) a
    target size is drawn uniformly from ``[min_amount, max_amount]``; the group
    is then sampled down to that size, or sampled with replacement when it has
    fewer rows than the target.

    Args:
        df: Input DataFrame.
        column: Column name (or list of names) to group by.
        random_state: Seed used both for drawing the target sizes and for
            ``DataFrame.sample``.
        min_amount: Smallest possible number of rows per group (default 32).
        max_amount: Largest possible number of rows per group (default 48).

    Returns:
        A new DataFrame with all columns of ``df`` (including the grouping
        column) and a fresh 0..n-1 index. An empty input yields an empty frame
        with the same columns.

    Note:
        This reseeds the module-level ``random`` generator, so later calls to
        ``random`` functions in the same session are affected as well.
    """
    random.seed(random_state)

    def adjust_group(group):
        target_amount = random.randint(min_amount, max_amount)
        # Groups smaller than the target can only reach it by sampling with replacement
        return group.sample(
            n=target_amount,
            replace=len(group) < target_amount,
            random_state=random_state,
        )

    # Iterate over the groups explicitly instead of using `groupby(...).apply`:
    # applying a function to whole groups including the grouping column is
    # deprecated in pandas, and newer versions exclude that column from the
    # result. Each group here is a plain slice of `df`, so the grouping column
    # is always retained.
    resampled_groups = [adjust_group(group) for _, group in df.groupby(column)]

    if not resampled_groups:
        # `pd.concat` refuses an empty list; return an empty frame with the same columns
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(resampled_groups).reset_index(drop=True)

In [53]:
# Resample each question's rows to a random target size (seed 1 for reproducibility)
answer_combinations_limited = adjust_question_amount(
    df=answer_combinations,
    column='question',
    random_state=1,
)

  return df.groupby(column, group_keys=False).apply(adjust_group).reset_index(drop=True)


In [54]:
# Turn the comma-separated option strings into lists of options.
# Only string values are split: values that are already lists (because this
# cell, or another cell performing the same split, ran before) are kept as-is.
# Calling `.str.split` on list values would silently replace them with NaN.
all_questions['options'] = all_questions['options'].apply(
    lambda value: value.split(', ') if isinstance(value, str) else value
)

expanded_rows = []

# Iterate over all rows of the DataFrame and emit exactly one row per question
for question, options_list, question_type in zip(
    all_questions['question'], all_questions['options'], all_questions['type']
):
    if question_type == 'NUMBER':
        intended_answer = generate_phone_number()
    elif question_type == 'TEXT':
        intended_answer = "Free text"
    elif question_type == 'DATE':
        intended_answer = generate_date()
    else:
        # Any other type: the full option list is used as the intended answer
        intended_answer = options_list

    expanded_rows.append({
        'question': question,
        'type': question_type,
        'options': options_list,
        'intended_answer': intended_answer,
    })

# NOTE(review): this rebinds `answer_combinations`, which an earlier cell also
# assigns -- confirm that replacing that frame here is intended.
answer_combinations = pd.DataFrame(expanded_rows)

In [55]:
# Compare how many rows of each question type both frames contain
type_count_reports = [
    ('answer_combinations type counts:', answer_combinations),
    ('answer_combinations_limited type counts:', answer_combinations_limited),
]
for heading, frame in type_count_reports:
    print(heading)
    print(frame['type'].value_counts())

answer_combinations type counts:
type
SINGLE_SELECT    12
MULTI_SELECT      9
TEXT              2
DATE              1
NUMBER            1
Name: count, dtype: int64
answer_combinations_limited type counts:
type
SINGLE_SELECT    475
MULTI_SELECT     350
TEXT              80
NUMBER            39
DATE              32
Name: count, dtype: int64


In [62]:
# Keep only the rows of the limited frame that belong to NUMBER questions
is_number_question = answer_combinations_limited['type'].eq("NUMBER")
filtered_combinations = answer_combinations_limited.loc[is_number_question]
filtered_combinations

Unnamed: 0,question,type,options,intended_answer
688,What phone number can we use for contact?,NUMBER,[phone number],[phone number]
689,What phone number can we use for contact?,NUMBER,[phone number],[phone number]
690,What phone number can we use for contact?,NUMBER,[phone number],[phone number]
691,What phone number can we use for contact?,NUMBER,[phone number],[phone number]
692,What phone number can we use for contact?,NUMBER,[phone number],[phone number]
693,What phone number can we use for contact?,NUMBER,[phone number],[phone number]
694,What phone number can we use for contact?,NUMBER,[phone number],[phone number]
695,What phone number can we use for contact?,NUMBER,[phone number],[phone number]
696,What phone number can we use for contact?,NUMBER,[phone number],[phone number]
697,What phone number can we use for contact?,NUMBER,[phone number],[phone number]


In [37]:
# Write the frame to 'answer_combinations.json' in the working directory as a
# JSON array with one object per row (orient='records').
# NOTE(review): this exports `answer_combinations`, not
# `answer_combinations_limited` -- confirm which frame is meant to be saved.
answer_combinations.to_json('answer_combinations.json', orient='records')