<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Global-Configurations" data-toc-modified-id="Global-Configurations-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Global Configurations</a></span></li><li><span><a href="#Filter-Columns-&amp;-Rows" data-toc-modified-id="Filter-Columns-&amp;-Rows-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Filter Columns &amp; Rows</a></span><ul class="toc-item"><li><span><a href="#Ignore-Suffix" data-toc-modified-id="Ignore-Suffix-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Ignore Suffix</a></span></li><li><span><a href="#Fill-Empty-Integer-Values" data-toc-modified-id="Fill-Empty-Integer-Values-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Fill Empty Integer Values</a></span></li><li><span><a href="#Replace-Empty-String-Values" data-toc-modified-id="Replace-Empty-String-Values-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Replace Empty String Values</a></span></li><li><span><a href="#Get-Time-Series-Values" data-toc-modified-id="Get-Time-Series-Values-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>Get Time Series Values</a></span></li><li><span><a href="#Get-LatLong-Values" data-toc-modified-id="Get-LatLong-Values-2.5"><span class="toc-item-num">2.5&nbsp;&nbsp;</span>Get LatLong Values</a></span></li><li><span><a href="#Replace-Akvo-Flow-Column-Names" data-toc-modified-id="Replace-Akvo-Flow-Column-Names-2.6"><span class="toc-item-num">2.6&nbsp;&nbsp;</span>Replace Akvo Flow Column Names</a></span></li><li><span><a href="#Replace-Datetime-to-String" data-toc-modified-id="Replace-Datetime-to-String-2.7"><span class="toc-item-num">2.7&nbsp;&nbsp;</span>Replace Datetime to String</a></span></li></ul></li><li><span><a href="#Generate-Settings" data-toc-modified-id="Generate-Settings-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Generate Settings</a></span><ul class="toc-item"><li><span><a href="#JSON-Config" data-toc-modified-id="JSON-Config-3.1"><span 
class="toc-item-num">3.1&nbsp;&nbsp;</span>JSON Config</a></span></li><li><span><a href="#Replace-Dataset-Columns" data-toc-modified-id="Replace-Dataset-Columns-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Replace Dataset Columns</a></span></li><li><span><a href="#Define-Categories" data-toc-modified-id="Define-Categories-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Define Categories</a></span></li><li><span><a href="#Overview" data-toc-modified-id="Overview-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Overview</a></span></li></ul></li><li><span><a href="#Record-Data" data-toc-modified-id="Record-Data-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Record Data</a></span></li></ul></div>

In [1]:
import pandas as pd
import string
import numpy as np
import sqlalchemy as db
import json
import os
from datetime import datetime
import requests as r
# Use the fully qualified option name: the bare 'max_columns' pattern is
# ambiguous on pandas versions that also define 'styler.render.max_columns'.
pd.set_option('display.max_columns', 200)

# class
from handler import handler

## Global Configurations

In [2]:
# Survey to import, and the local API endpoint that serves its configuration.
survey_id = 308090988
apiURL = 'http://localhost:8000/api/custom/{}'.format(survey_id)

# Fail right here on an HTTP error instead of later with an opaque
# JSON-decode or KeyError when the configuration is first read.
config_response = r.get(apiURL)
config_response.raise_for_status()
php_config = config_response.json()

In [3]:
# Values taken from the 'survey_detail' section of the fetched configuration.
survey_detail = php_config['survey_detail']
instance = survey_detail['instance']
form_id = survey_detail['form_id']
monitoring_id = survey_detail['monitoring_id']

# List of 'id|name' column keys of GEO questions; filled by the import loop.
geo = []

# Base URL of the Flow API for this instance.
url = "https://api-auth0.akvo.org/flow/orgs/{}".format(instance)
# Form-instance URLs are not built here; the import loop reads them from each
# form's 'formInstancesUrl' field. Equivalent manual form:
#   '{}/form_instances?survey_id={}&form_id={}'.format(url, survey_id, form_id)

In [4]:
# Tunables for the import. Values read from the API config come first.
survey_detail = php_config['survey_detail']
source = survey_detail['survey_name']      # string: default source name of a record
center_map = survey_detail['center_map']   # array of numbers, stored as config['center']

akvoflow = True                  # boolean: column keys use the 'id|name' layout
max_category = 5                 # integer: max distinct values for a lookup category
ignore_suffix = ''               # string: columns containing it are dropped ('' disables)
empty_string_value = 'No Answer' # string: placeholder for missing text answers
timeseries = ''                  # string: name of the time-series column ('' disables)

## Auth0

In [5]:
# Form payload for the Auth0 password grant; credentials come from the
# environment so they never appear in the notebook.
auth = dict(
    client_id=os.environ["AUTH0_CLIENT_ID_PROD"],
    username=os.environ["AUTH0_USER_PROD"],
    password=os.environ["AUTH0_PWD_PROD"],
    grant_type="password",
    scope="openid email",
)

In [6]:
# Exchange the credentials for an id token. Check the HTTP status first so
# rejected credentials raise a clear HTTPError instead of KeyError('id_token').
auth_response = r.post("https://akvo.eu.auth0.com/oauth/token", data=auth)
auth_response.raise_for_status()
token = auth_response.json()["id_token"]

In [7]:
# Headers sent with every Flow API request made through getData().
bearer_token = "Bearer {}".format(token)
headers = {
    "Content-Type": "application/json",
    "Accept": "application/vnd.akvo.flow.v2+json",
    "Authorization": bearer_token,
}

## Function

In [8]:
def getData(endpoint):
    """GET ``endpoint`` with the module-level ``headers`` and print the HTTP status.

    Returns the decoded JSON body when the status is 200; for any other
    status the integer status code itself is returned.
    """
    response = r.get(endpoint, headers=headers)
    status = response.status_code
    print(status)
    if status != 200:
        return status
    return response.json()

In [9]:
def collectData(url, collections=None):
    """Collect the ``formInstances`` of every page, starting at ``url``.

    Pages are fetched with ``getData`` and followed through ``nextPageUrl``
    until a page has no instances or no next-page link.

    Parameters:
        url: URL of the first page.
        collections: optional list to append to. A fresh list is created when
            omitted, so separate calls never share results.

    Returns:
        The list holding all collected form instances (``collections`` itself
        when one was passed in).

    Raises:
        RuntimeError: when ``getData`` returns a status code instead of a
            JSON object for one of the pages.
    """
    if collections is None:
        collections = []
    next_url = url
    while next_url:
        page = getData(next_url)
        if not isinstance(page, dict):
            # getData hands back the HTTP status code on failure.
            raise RuntimeError(
                "Fetching {} failed with HTTP status {}".format(next_url, page))
        instances = page.get("formInstances") or []
        if not instances:
            break
        collections.extend(instances)
        next_url = page.get("nextPageUrl")
    return collections

In [10]:
def transform(data, forms, dataPointId):
    """Flatten one form instance's responses into a single ``{column: answer}`` dict.

    Parameters:
        data: responses of the instance, indexed as ``data[group_id][0][question_id]``.
        forms: a form definition with ``questionGroups`` and their ``questions``.
        dataPointId: stored under the ``data_point_id`` key of the result.

    Returns:
        A dict with ``data_point_id`` plus one ``'<question id>|<question name>'``
        key per question of the form. The value is ``handler(answer, type)``,
        or ``None`` when the answer is missing or cannot be handled.
    """
    answers = {"data_point_id": dataPointId}
    for qgroup in forms.get('questionGroups'):
        gid = qgroup['id']
        for question in qgroup.get('questions'):
            qid = question['id']
            qname = question['name'].strip()  # drop leading/trailing whitespace
            qtype = question['type']
            key = '{}|{}'.format(qid, qname)
            try:
                # Only the first entry ([0]) of the group's response list is read.
                answers[key] = handler(data[gid][0][qid], qtype)
            except Exception:
                # Best effort: an unanswered or unparsable question becomes None.
                # Narrower than a bare except, so KeyboardInterrupt/SystemExit
                # still stop the run.
                answers[key] = None
    return answers

In [11]:
def toList(datas):
    """Return the items of the iterable ``datas`` as a new list, in iteration order."""
    return [item for item in datas]

In [12]:
def defineCategories(x):
    """Append a category descriptor for column ``x`` to the global ``categories`` list.

    Reads the notebook globals ``df``, ``configs``, ``config_ids``,
    ``questionIds``, ``categories_config`` and ``max_category``.

    Nothing is appended when the column's question id is not listed in
    ``questionIds``. Otherwise:
      * an ``object`` column with at most ``max_category`` distinct values
        gets an entry that includes the distinct values as ``lookup``;
      * an ``int32`` column gets an entry without ``lookup``;
      * any other column is skipped.

    Always returns True.
    """
    value_counts = df.groupby(df[x]).size()
    label = configs[x]
    question_ref = config_ids[x]
    dtype_name = str(df[x].dtype)

    # 'data_point_id' has no numeric question id; it is looked up as 0.
    if question_ref == 'data_point_id':
        question_ref = 0
    question_id = int(question_ref)

    if question_id not in questionIds:
        return True

    matching_types = [
        item.get('type')
        for item in categories_config
        if item.get('question_id') == question_id
    ]
    category_type = matching_types[0]
    # Translate the configured type names to the short names stored with the category.
    if category_type == 'option':
        category_type = 'list'
    elif category_type == 'number':
        category_type = 'num'

    if len(value_counts) <= max_category and dtype_name == 'object':
        categories.append({
            'id': x,
            'type': category_type,
            'lookup': list(value_counts.index),
            'name': label,
        })
    elif dtype_name == 'int32':
        categories.append({
            'id': x,
            'type': category_type,
            'name': label,
        })
    return True

## Get, Transform and Insert Data

### Get Data from Flow API

In [13]:
# Fetch the survey definition, which includes its forms.
survey_endpoint = "{}/surveys/{}".format(url, survey_id)
surveys = getData(survey_endpoint)

200


In [14]:
# Fetch the survey's data points.
# NOTE(review): dataPoints is not referenced by any other cell shown in this notebook.
data_points_endpoint = '{}/data_points?survey_id={}'.format(url, survey_id)
dataPoints = getData(data_points_endpoint)

200


In [15]:
# Form definitions of the survey; each carries its question groups, questions
# and a 'formInstancesUrl'.
forms = surveys.get('forms')

In [16]:
# Display the form definitions for inspection.
forms

[{'id': '322230993',
  'name': 'COVID Test',
  'questionGroups': [{'id': '310430997',
    'name': 'Covid Test',
    'repeatable': True,
    'questions': [{'id': '294141015',
      'name': 'Test Result',
      'type': 'OPTION',
      'order': 1,
      'variableName': None,
      'personalData': None,
      'createdAt': '2020-05-05T06:35:55.827Z',
      'modifiedAt': '2020-05-05T06:36:47.125Z'},
     {'id': '286420988',
      'name': 'Test Location',
      'type': 'OPTION',
      'order': 2,
      'variableName': None,
      'personalData': None,
      'createdAt': '2020-05-05T06:36:50.912Z',
      'modifiedAt': '2020-05-05T06:37:36.038Z'}],
    'createdAt': '2020-05-05T06:34:28.252Z',
    'modifiedAt': '2020-05-11T05:05:50.912Z'}],
  'createdAt': '2020-05-05T06:33:55.156Z',
  'modifiedAt': '2020-05-11T05:23:06.737Z',
  'formInstancesUrl': 'https://api-auth0.akvo.org/flow/orgs/seap/form_instances?survey_id=308090988&form_id=322230993'},
 {'id': '308090989',
  'name': 'Registration',
  'q

### Open Connection to DB

In [17]:
# Connect to the local MySQL database 'akvo-map'; the password comes from the
# environment. Host and port belong to the authority part of the URL: the
# previous '?host=localhost?port=3306' query was malformed (second '?').
db_url = 'mysql+pymysql://phpmyadmin:{}@localhost:3306/akvo-map'.format(os.environ['SQL_PWD'])
engine = db.create_engine(db_url)
connection = engine.connect()

# Reflect the existing 'data_sources' table and prepare the insert statement
# used for every imported form.
metadata = db.MetaData(bind=engine)
data_sources = db.Table('data_sources', metadata, autoload=True, autoload_with=engine)
insert = db.insert(data_sources)

### Transform and Insert Data

In [18]:
# For every form of the survey (in reverse order of the API listing):
#   1. download its form instances and flatten the responses into a table,
#   2. clean the table (fill gaps, build the PTS lat/long column),
#   3. rename the columns to letters and build the config/categories settings,
#   4. insert one record per form into the data_sources table.

# Filled only on the time-series path below; nothing in this cell reads it.
not_category = []

for form in reversed(forms):
    print("Start")
    formId = int(form.get('id'))
    print(formId)
    qgroups = form.get('questionGroups')
    qtype = ""
    for qgroup in qgroups:
        questions = qgroup.get('questions')
        for question in questions:
            # Only the registration form contributes the geo column.
            if formId == form_id and question.get('type') == 'GEO':
                qtype = 'GEO'
                # Build the key exactly like transform() does (name stripped),
                # and do not add it twice when this cell is re-run.
                geo_key = '{}|{}'.format(question.get('id'), question['name'].strip())
                if geo_key not in geo:
                    geo.append(geo_key)

    formInstanceURL = form.get('formInstancesUrl')
    # Pass a fresh list so rows of a previously processed form can never leak
    # into this one through collectData's default argument.
    rowData = collectData(formInstanceURL, [])
    if not rowData:
        print("No form instances for form {}, skipping".format(formId))
        continue

    df = pd.DataFrame(rowData)
    df['responses'] = df[['responses', 'dataPointId']].apply(lambda x : transform(x['responses'], form, x['dataPointId']), axis=1)
    print(df['responses'])
    responses = toList(df['responses'])
    results = pd.DataFrame(responses)

    # The lettering below assumes the question columns come first and
    # 'data_point_id' directly after them. transform() inserts data_point_id
    # first, so on pandas versions that keep dict insertion order it has to
    # be moved explicitly (key-sorting versions already placed it last).
    question_columns = [c for c in results.columns if c != 'data_point_id']
    results = results[question_columns + ['data_point_id']].copy()

    if (qtype == 'GEO'):
        ##### Split the "lat,long" geo answer into separate columns
        point = pd.Series(results[geo].values.flatten())
        loc = point.str.split(",", n = 1, expand = True)
        # part before the first comma
        results["lat"] = loc[0]
        # part after the first comma
        results["long"] = loc[1]
        # the combined geo column is no longer needed
        results.drop(columns = geo, inplace = True)

    res = results.dropna(how='all').reset_index(drop=True)

    ##### Save file
    output_filename = '{}.xlsx'.format(formId)
    results.to_excel(output_filename, index=False)

    df = res


    ## Filter Columns & Rows ======================================================
    print("Filter Columns & Rows")
    ### Ignore Suffix
    if ignore_suffix:
        df = df[[c for c in df.columns if ignore_suffix not in c]]

    ### Fill Empty Integer Values
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    if (qtype == 'GEO'):
        lat_long = ['long', 'lat'] # Array String # default, do not change
    # NOTE(review): values outside the int32 range do not survive this cast
    # (the recorded output shows a phone number stored as -2147483648).
    # defineCategories keys on the 'int32' dtype name, so widening the type
    # needs a matching change there.
    df_num = df.select_dtypes(include=numerics)
    df_num = df_num.fillna(0.0).astype(np.int32)
    if len(df_num.columns) > 0:
        df[list(df_num)] = df_num

    ### Replace Empty String Values
    df_str = df.select_dtypes(include=['object']).fillna(empty_string_value)
    df[list(df_str)] = df_str

    ### Get Time Series Values
    # NOTE(review): the lettering below does not reserve a slot for 'TMS'
    # (see the commented-out index.append), so this path looks unfinished.
    if timeseries:
        df[timeseries] = df[timeseries].apply(lambda x: x.replace(' UTC',''))
        df[timeseries] = pd.to_datetime(df[timeseries], format='%d-%m-%Y %H:%M:%S')
        df['TMS'] = df[timeseries]
        df = df.drop(columns=[timeseries])
        not_category.append('TMS')

    ### Get LatLong Values
    if (qtype == 'GEO'):
        # NOTE(review): lat/long are strings produced by str.split, and the
        # recorded output shows unrounded PTS values, so this round() appears
        # to have no effect. Converting to numbers would change the stored type.
        df[lat_long] = df[lat_long].round({lat_long[0]: 3, lat_long[1]: 3})
        df['PTS'] = df[lat_long].values.tolist()
        df = df.drop(columns=lat_long)

    ### Replace Akvo Flow Column Names
    # Column keys look like '<question id>|<question name>'; keys without '|'
    # (data_point_id, PTS) are kept whole.
    raw_columns = list(df)
    column_ids = [c.split('|')[0] for c in raw_columns]
    if akvoflow:
        column_names = [c.lower() if c.find("|") == -1 else c.split('|')[1].lower().replace("--other--"," other") for c in raw_columns]
    else:
        column_names = raw_columns

    ### Replace Datetime to String
    for c in list(df):
        if 'time' in str(df[c].dtype):
            df[c] = df[c].astype('str')



    ## Generate Settings ======================================================
    print("Generate Settings")
    ### JSON Config
    # Spreadsheet-style column letters: A..Z followed by AA..ZZ.
    chars = list(string.ascii_uppercase)
    chars_col = chars + [x+y for x in chars for y in chars]
    # Map each new column key to the value at the same position in y;
    # 'data_point_id' always maps to itself.
    keyname = lambda x,y: {a:y[b] if (a != 'data_point_id') else 'data_point_id' for b, a in enumerate(x)}

    columns_length = len(list(df)) - 1 # -1 because of data_point_id
    if (qtype == 'GEO'):
        columns_length = len(list(df)) - 2 # -2 because of data_point_id and PTS

    index = chars_col[:columns_length]
    index.append('data_point_id')
    configs = keyname(index, column_names)
    config_ids = keyname(index, column_ids)

    if (qtype == 'GEO'):
        index.append('PTS')

    ### Replace Dataset Columns
    df.columns = index

    ### Define Categories
    # defineCategories() reads df, configs, config_ids, questionIds,
    # categories_config and appends to categories, so these names must stay.
    categories_config = php_config['categories'][str(formId)]['list'] # array from config
    questionIds = [item.get('question_id') for item in categories_config]
    categories = []

    columns = list(df.columns)
    if (qtype == 'GEO'):
        columns = list(df.drop(columns=['PTS']).columns)
    for column in columns:
        defineCategories(column)

    ### Overview
    print(categories)
    cat = pd.DataFrame(categories)
    selected_cat = php_config['categories'][str(formId)]['selected_category'] # string from config
    selected_cat = selected_cat.lower()
    popup_name = php_config['categories'][str(formId)]['popup_name'] # String from config
    # Column letter of the 'list' category whose name is the configured selection.
    first_cat = cat[(cat['type'] == 'list') & (cat['name'] == selected_cat)].reset_index().loc[0].to_dict()['id']
    # Column letter whose display name is the configured popup name.
    conf_series = pd.Series(configs).to_frame('name')
    popup_name = conf_series[conf_series['name'] == popup_name.lower()].index.tolist()[0]
    configs.update({'center':center_map,'name':first_cat,'popup':popup_name})



    ## Record Data ======================================================
    print("Record Data")
    data = list(df.T.to_dict().values())
    # Sample row for eyeballing; forms with a single row have no data[1].
    if len(data) > 1:
        print(data[1])
    print(configs)

    templates = php_config['categories'][str(formId)]['template']
    html = [item.get('html') for item in templates]
    js = [item.get('js') for item in templates]

    # Monitoring forms are stored as children of the registration form.
    parentId = None
    sourceName = source
    if (formId in monitoring_id):
        parentId = form_id
        sourceName = form.get('name')

    # Named 'record' rather than 'db' so the sqlalchemy module alias `db`
    # stays usable after this cell has run.
    record = {
        "id": formId,
        "parent_id": parentId,
        "source": sourceName,
        "config": configs,
        "categories": categories,
        "data": data,
        "html": html,
        "js": js
    }

    connection.execute(insert.values(record))
    print("End")

Start
308090989
200
200
0    {'data_point_id': '320260997', '310430991|Name...
1    {'data_point_id': '316081001', '310430991|Name...
2    {'data_point_id': '306190990', '310430991|Name...
3    {'data_point_id': '310440990', '310430991|Name...
Name: responses, dtype: object
Filter Columns & Rows
Generate Settings
[{'id': 'F', 'type': 'list', 'lookup': ['Male'], 'name': 'gender'}, {'id': 'H', 'type': 'list', 'lookup': ['Director', 'Manager'], 'name': 'job title'}]
Record Data
{'A': '2020-05-04 06:27:04.801', 'B': 'https://akvoflow-62.s3.amazonaws.com/images/97be0f9c-d9f4-48a5-9be0-4ad812923df4.jpg', 'C': 'No Answer', 'D': 'Deden Bangkit', 'E': -2147483648, 'F': 'Male', 'G': 'BALI:BALI|KAB. BULELENG:KAB. BULELENG', 'H': 'Director', 'data_point_id': '316081001', 'PTS': ['-122.08400000000002', '37.421998333333335']}
{'A': 'birth date', 'B': 'take a picture of yourself', 'C': 'question for female', 'D': 'name', 'E': 'phone number', 'F': 'gender', 'G': 'address', 'H': 'job title', 'data_poin

### Close Connection

In [19]:
# Release the database connection used for the inserts above.
connection.close()