In [1]:
import requests as r
import pandas as pd
import json
import random
from faker import Faker

In [2]:
# Shared Faker instance used by the answer generators for names, paragraphs and dates.
fake = Faker()
# Form-definition endpoint; the form id is appended directly to this URL.
base_url = 'https://form-service.akvotest.org/api/form/'
# Excel workbook holding one sheet of metadata rows per form (sheet name = form["meta"]).
geopoints_file = './random_households_geo.xlsx'
# NOTE(review): not referenced by any cell visible here -- one datapoint is
# generated per metadata row instead. Confirm whether this is still needed.
number_of_datapoints = 10

In [3]:
# Forms to generate fake submissions for. Each entry provides:
#   id           - form id, appended to base_url to fetch the form definition
#                  and used as the prefix of the output CSV file name
#   name         - form name, used as the suffix of the output CSV file name
#   registration - preset answers keyed by question id (as a string); these
#                  questions keep the given value instead of a generated one
#   meta         - sheet of geopoints_file that supplies the metadata rows
forms = [{
    "id":1699353915355,
    "name": "Households",
    "registration": {
        "1699354220734": "New"
    },
    "meta": "random_households_geo"
},{
    "id": 1701757876668,
    "name": "Wash in Schools",
    "registration": {
        "1701757914033": "New"
    },
    "meta": "base_village"
},{
    "id": 1699354006503,
    "name": "Community",
    "registration": {
        "1699354006535": "New"
    },
    "meta": "base_village"
}]

In [4]:
def generate_fake_data_for_question(question, answers, geo_value):
    """Produce a fake answer for a single form question.

    Args:
        question: Question definition dict. Reads ``type``, optionally
            ``dependency`` (a list of dicts with ``id`` and ``options``) and,
            for option questions, ``option`` (a list of dicts with ``name``).
        answers: Answers generated so far, keyed by question id. Used to
            decide whether the question's dependencies are satisfied.
        geo_value: Value returned unchanged for ``geo`` questions.

    Returns:
        The fake answer, or ``None`` when a dependency is not satisfied or an
        option question has no options to pick from. Unknown question types
        yield the type name itself as a placeholder.
    """
    q_type = question['type']

    # Every listed dependency must be satisfied, otherwise the question is
    # left unanswered.
    for dependency in question.get('dependency') or []:
        dependent_question_id = dependency['id']
        if dependent_question_id not in answers:
            return None
        allowed = dependency.get('options') or []
        given = answers[dependent_question_id]
        # multiple_option answers are lists; comparing the whole list against
        # the allowed names can never match, so test each selected name and
        # accept the dependency when at least one of them is allowed.
        selected = given if isinstance(given, (list, tuple, set)) else [given]
        if not any(choice in allowed for choice in selected):
            return None

    if q_type == 'input':
        return fake.name()
    if q_type == 'text':
        return fake.paragraph()
    if q_type == 'number':
        return random.randint(1, 100)
    if q_type in ('option', 'multiple_option'):
        names = [option['name'] for option in question.get('option') or []]
        if not names:
            # Nothing to choose from: leave unanswered instead of crashing
            # inside random.choice / random.randint.
            return None
        if q_type == 'option':
            return random.choice(names)
        return random.sample(names, random.randint(1, len(names)))
    if q_type == 'date':
        return fake.date()
    if q_type == 'geo':
        return geo_value
    if q_type == 'autofield':
        return "AUTOFIELD"
    # Unsupported type: emit the type name so the gap is visible in the output.
    return q_type

In [5]:
def generate_datapoint(questions, geo_value, registration):
    """Build one fake submission for a form.

    Args:
        questions: Question definition dicts, each with at least an ``id``.
        geo_value: Value handed to the answer generator for ``geo`` questions.
        registration: Optional mapping of question id (string or int) to a
            preset answer. Preset questions keep their given value; it may be
            ``None`` or empty.

    Returns:
        Dict of answers keyed by integer question id. Questions whose answer
        is ``None`` (for example because of an unmet dependency) are omitted.
    """
    # Registration ids are written as strings, while the questions are matched
    # against their integer form. Normalise once so the lookup below, the
    # dependency checks in the answer generator and the returned dict all use
    # the same key type. The caller's mapping is not modified.
    preset_answers = {
        int(question_id): value
        for question_id, value in (registration or {}).items()
    }
    fake_datapoint = dict(preset_answers)
    for question in questions:
        question_id = question["id"]
        if question_id in preset_answers:
            answer = preset_answers[question_id]
        else:
            # Pass the answers collected so far so dependencies can be checked.
            answer = generate_fake_data_for_question(question, fake_datapoint, geo_value)
        if answer is not None:
            fake_datapoint[question_id] = answer
    return fake_datapoint

In [6]:
def generate_datapoints(form_id, registration, metadata, timeout=30):
    """Generate one fake submission per metadata row for the given form.

    Args:
        form_id: Id of the form; appended to ``base_url`` to fetch the form
            definition.
        registration: Preset answers forwarded to ``generate_datapoint``.
        metadata: DataFrame with one row per datapoint. Must provide
            ``Latitude`` and ``Longitude`` columns.
        timeout: Seconds to wait for the form service before giving up.

    Returns:
        List of dicts, each combining a metadata row with the generated
        answers (answers win on key collisions).

    Raises:
        requests.HTTPError: If the form service answers with a 4xx/5xx status.
    """
    response = r.get(f"{base_url}{form_id}", timeout=timeout)
    # Fail here with a clear HTTP error rather than with a confusing KeyError
    # or JSON decode error further down.
    response.raise_for_status()
    form_definition = response.json()
    # Flatten the groups into one question list, tagging each question with
    # the id of the group it came from.
    questions = [
        {**question, 'qg_id': group['id']}
        for group in form_definition['question_group']
        for question in group['question']
    ]
    fake_data = []
    for i in range(metadata.shape[0]):
        meta_row = dict(metadata.iloc[i])
        geo_value = {
            'lat': float(meta_row.get('Latitude')),
            'lng': float(meta_row.get('Longitude')),
        }
        fake_datapoint = generate_datapoint(questions, geo_value, registration)
        fake_data.append({**meta_row, **fake_datapoint})
    return fake_data

In [7]:
def get_metadata(form):
    """Load the metadata sheet configured for a form.

    The sheet named by ``form['meta']`` is read from ``geopoints_file`` and
    every column whose name contains ``code`` is cast to ``int``.

    Args:
        form: Form configuration dict; its ``meta`` entry selects the sheet.

    Returns:
        The sheet as a DataFrame with the code columns converted to integers.
    """
    sheet = form.get('meta')
    frame = pd.read_excel(geopoints_file, sheet)
    code_columns = [name for name in list(frame) if 'code' in name]
    for name in code_columns:
        frame[name] = frame[name].astype(int)
    return frame

In [8]:
# Generate fake submissions for every configured form and export each set
# to "<form id>-<form name>.csv" in the working directory.
for form in forms:
    form_id, form_name = form['id'], form['name']
    registration = form.get("registration")
    metadata = get_metadata(form)
    datapoints = generate_datapoints(form_id, registration, metadata)
    output_name = f"{form_id}-{form_name}.csv"
    datapoints_frame = pd.DataFrame(datapoints)
    datapoints_frame.to_csv(output_name, index=False)