In [1]:
import sys
import getpass
sys.path.append('..')

In [2]:
import pandas as pd
import os
import re
import time
from collections import ChainMap
from sqlalchemy import create_engine, text, inspect, MetaData, ForeignKey, Table, Column, Integer, Float, String, Text, Boolean
from sqlalchemy.orm import sessionmaker, relationship
from sqlalchemy.ext.declarative import declarative_base
from geoalchemy2 import Geography, WKTElement
from FlowHandler import FlowHandler
from Akvo import Flow

In [3]:
# PostgreSQL connection settings. The OS login name is used as the database
# user and the password is read from the KEYCLOAK_PWD environment variable
# (a KeyError is raised here if that variable is not set).
PSQL_USER = getpass.getuser()
PSQL_PWD = os.environ['KEYCLOAK_PWD']
# NOTE: PSQL_DB is assigned twice; only the last value ('angkorsalad') is used.
PSQL_DB = 'uduma'
PSQL_DB = 'angkorsalad'

In [4]:
# Akvo Flow instance to read from.
# NOTE: instanceURI is assigned twice; only the last value ('angkorsalad') is used.
instanceURI = 'udumamali'
instanceURI = 'angkorsalad'
# Base URL that the folder request is built on.
requestURI = 'https://api.akvo.org/flow/orgs/' + instanceURI
# Accumulator filled by getFolders() with one 'surveysUrl' per visited folder.
surveysUrl = []

In [5]:
# Engine for the local PostgreSQL database selected above.
# NOTE(review): the password is interpolated into the URL unescaped; characters
# such as '@' or '/' in it would presumably break URL parsing - confirm.
engine = create_engine("postgresql://{}:{}@localhost/{}".format(PSQL_USER, PSQL_PWD, PSQL_DB))
# Two independent sessions on the same engine: `session` is used by writeData()
# and the later count/export queries, `getter` by the question lookup that runs
# while answers are being inserted.
session = sessionmaker(bind=engine)()
getter = sessionmaker(bind=engine)()
# Declarative base class shared by all ORM models defined below.
Base = declarative_base()

## Helpers

In [6]:
# Reference timestamp (seconds since the epoch) that getTime() measures from.
then = time.time()

In [7]:
def getTime(start=None, now=None):
    """Return the elapsed time formatted as '<M> Minutes, <S> Seconds'.

    Args:
        start: Reference timestamp in seconds. Defaults to the notebook-level
            ``then`` value.
        now: Timestamp to measure up to, in seconds. Defaults to
            ``time.time()``.

    Returns:
        A string such as ``'1 Minutes, 1 Seconds'``. Fractions of a second are
        dropped and a negative interval is reported as zero.
    """
    if start is None:
        start = then
    if now is None:
        now = time.time()
    # Work on whole seconds with divmod. Going through a float number of
    # minutes (61 / 60 - 1) * 60 yields 0.999..., which truncates to 0 and
    # silently loses a second.
    elapsed_seconds = max(int(now - start), 0)
    minutes, seconds = divmod(elapsed_seconds, 60)
    return '{} Minutes, {} Seconds'.format(minutes, seconds)

## Classes

In [8]:
class Forms(Base):
    """ORM model for the ``form`` table.

    Each row points at its parent survey through ``survey_id`` and exposes the
    related ``Surveys`` and ``DataPoints`` objects as relationships.
    """

    __tablename__ = "form"

    # Columns
    id = Column(Integer, primary_key=True)
    survey_id = Column(Integer, ForeignKey('survey.id'))
    name = Column(Text)

    # Relationships
    survey = relationship("Surveys")
    datapoint = relationship("DataPoints")

    def __init__(self, data):
        """Build a row from a mapping holding 'id', 'survey_id' and 'name'."""
        # Both integer columns share their name with the input key.
        for column in ('id', 'survey_id'):
            setattr(self, column, int(data[column]))
        self.name = data['name']

    def __repr__(self):
        """Return a debug representation listing every column value."""
        fields = (self.id, self.survey_id, self.name)
        return "<Forms(id={}, survey_id={}, name={})>".format(*fields)
    
class Surveys(Base):
    """ORM model for the ``survey`` table."""

    __tablename__ = "survey"

    # Columns
    id = Column(Integer, primary_key=True)
    name = Column(Text)
    registration_id = Column(Integer, nullable=True)

    # Relationships
    forms = relationship("Forms")

    def __init__(self, data):
        """Build a row from a mapping holding 'id', 'name' and 'registrationFormId'.

        'id' and 'registrationFormId' are converted with ``int()``, so both
        must hold int-convertible values.
        """
        self.id, self.name, self.registration_id = (
            int(data['id']),
            data['name'],
            int(data['registrationFormId']),
        )

    def __repr__(self):
        """Return a debug representation listing every column value."""
        fields = (self.id, self.name, self.registration_id)
        return "<Survey(id={}, name={}, registration_id={})>".format(*fields)

class DataPoints(Base):
    """ORM model for the ``datapoint`` table (one row per form instance)."""

    __tablename__ = "datapoint"

    # Columns
    id = Column(Integer, primary_key=True)
    identifier = Column(Text)
    form_id = Column(Integer, ForeignKey('form.id'))
    submitter = Column(Text)
    survey_time = Column(Integer)

    # Relationships
    form = relationship('Forms')
    answers = relationship('Answers')

    def __init__(self, data):
        """Build a row from a form-instance mapping.

        Reads the keys 'id', 'identifier', 'formId', 'submitter' and
        'surveyalTime'; 'id' and 'formId' are converted with ``int()``.
        """
        (
            self.id,
            self.identifier,
            self.form_id,
            self.submitter,
            self.survey_time,
        ) = (
            int(data['id']),
            data['identifier'],
            int(data['formId']),
            data['submitter'],
            data['surveyalTime'],
        )

    def __repr__(self):
        """Return a debug representation listing every column value."""
        template = "<DataPoints(id={}, identifier={}, form_id={}, submitter={}, survey_time={})>"
        return template.format(
            self.id, self.identifier, self.form_id, self.submitter, self.survey_time)

        
class QuestionGroups(Base):
    """ORM model for the ``question_group`` table."""

    __tablename__ = "question_group"

    # Columns
    id = Column(Integer, primary_key=True)
    form_id = Column(Integer, ForeignKey('form.id'))
    repeat = Column(Boolean)
    name = Column(Text)

    # Relationships
    questions = relationship('Questions')
    form = relationship('Forms')

    def __init__(self, data):
        """Build a row from a mapping holding 'id', 'form_id', 'repeatable' and 'name'."""
        # Integer columns whose input key matches the column name.
        for column in ('id', 'form_id'):
            setattr(self, column, int(data[column]))
        self.repeat = data['repeatable']
        self.name = data['name']

    def __repr__(self):
        """Return a debug representation listing every column value."""
        fields = (self.id, self.form_id, self.repeat, self.name)
        return "<QuestionGroups(id={}, form_id={}, repeat={}, name={})>".format(*fields)
        
class Questions(Base):
    """ORM model for the ``question`` table."""

    __tablename__ = "question"

    # Columns
    id = Column(Integer, primary_key=True)
    form_id = Column(Integer, ForeignKey('form.id'))
    question_group_id = Column(Integer, ForeignKey('question_group.id'))
    name = Column(Text)
    type = Column(Text)

    # Relationships
    question_group = relationship('QuestionGroups')
    form = relationship('Forms')

    def __init__(self, data):
        """Build a row from a mapping holding 'id', 'form_id', 'group_id', 'name' and 'type'."""
        (
            self.id,
            self.form_id,
            self.question_group_id,
            self.name,
            self.type,
        ) = (
            int(data['id']),
            int(data['form_id']),
            int(data['group_id']),
            data['name'],
            data['type'],
        )

    def __repr__(self):
        """Return a debug representation listing every column value."""
        template = "<Questions(id={}, form_id={}, question_group_id={}, name={}, type={})>"
        return template.format(
            self.id, self.form_id, self.question_group_id, self.name, self.type)
    
class Answers(Base):
    """ORM model for the ``answer`` table.

    The primary key is auto-incremented, so it is not taken from the input.
    """

    __tablename__ = "answer"

    # Columns
    id = Column(Integer, primary_key=True, autoincrement=True)
    datapoint_id = Column(Integer, ForeignKey('datapoint.id'))
    question_id = Column(Integer, ForeignKey('question.id'))
    value = Column(Text)

    # Relationships
    question = relationship('Questions')
    datapoint = relationship('DataPoints')

    def __init__(self, data):
        """Build a row from a mapping holding 'datapoint_id', 'question_id' and 'value'."""
        # Both foreign keys share their name with the input key.
        for column in ('datapoint_id', 'question_id'):
            setattr(self, column, int(data[column]))
        # The value is always stored as text.
        self.value = str(data['value'])

    def __repr__(self):
        """Return a debug representation of the stored (non-key) columns."""
        fields = (self.datapoint_id, self.question_id, self.value)
        return "<Answers(datapoint_id={}, question_id={}, value={})>".format(*fields)

In [None]:
def writeData(input_data, info, log):
    """Add one ORM object to the shared ``session`` and commit it right away.

    Args:
        input_data: Mapped object to insert.
        info: Accepted for the callers' convenience; not used here.
        log: Label printed with the progress and error messages. A falsy
            value silences both messages.

    Raises:
        Whatever ``session.add`` / ``session.commit`` raised, after the
        session has been rolled back.
    """
    verbose = bool(log)
    try:
        session.add(input_data)
        if verbose:
            print('INSERTING: {}'.format(log))
        session.commit()
    except BaseException:
        # Leave the session usable for the next insert, then surface the error.
        if verbose:
            print('ERROR: ABORTING {}'.format(log))
        session.rollback()
        raise

## Main Function

In [None]:
def getFolders(items):
    """Recursively collect the 'surveysUrl' of every folder into ``surveysUrl``.

    Args:
        items: Folder listing with a 'folders' key holding folder mappings.
            Each folder is expected to provide 'surveysUrl' and 'foldersUrl'.

    The crawl is best-effort: a folder that cannot be processed is reported
    and skipped so the remaining folders are still visited. Only ordinary
    errors are handled that way; KeyboardInterrupt and SystemExit propagate so
    a long crawl can still be stopped.
    """
    for folder in items['folders']:
        try:
            url = folder['surveysUrl']
            surveysUrl.append(url)
            childs = Flow.getResponse(folder['foldersUrl'])
            getFolders(childs)
        except Exception as err:
            # Report instead of silently discarding, so broken folders are visible.
            print('SKIPPING FOLDER: {!r}'.format(err))
        
# Crawl the folder tree starting at the instance's top-level folder listing;
# getFolders() appends every folder's 'surveysUrl' to `surveysUrl`.
print('GETTING FOLDER LIST: ' + getTime())
parents = Flow.getResponse(requestURI + '/folders')
getFolders(parents)
print('FOLDER IS POPULATED: ' + getTime())

In [None]:
print('GETTING SURVEY LIST: ' + getTime())
# Flatten the per-folder survey listings into a single list of survey entries.
surveys = []
for surveyUrl in surveysUrl:
    surveys.extend(Flow.getResponse(surveyUrl)['surveys'])
print('SURVEY IS POPULATED: ' + getTime())

In [None]:
print('RECORDING SURVEY: ' + getTime())
# One entry per form: where to fetch its instances and which form they belong to.
formInstanceUrls = []
for url in surveys:
    # Fetch the survey definition, then insert parents before children so the
    # foreign keys (survey -> form -> question group -> question) resolve.
    data = Flow.getResponse(url['surveyUrl'])
    print('GETTING {}: {}'.format(data['name'],getTime()))
    if data['registrationFormId'] == "":
        # Surveys() converts this field with int(), which rejects ''.
        data['registrationFormId'] = 0
    writeData(Surveys(data), data, "SURVEY")
    for form in data['forms']:
        form['survey_id'] = data['id']
        writeData(Forms(form), form, False)
        for qgroup in form['questionGroups']:
            qgroup['form_id'] = form['id']
            writeData(QuestionGroups(qgroup), qgroup, False)
            for question in qgroup['questions']:
                question.update({'group_id': qgroup['id'], 'form_id': form['id']})
                writeData(Questions(question), question, False)
        formInstanceUrls.append(dict(
            formInstancesUrl=form['formInstancesUrl'],
            form_id=form['id'],
        ))
print('SURVEY IS RECORDED: ' + getTime())

In [None]:
print('GETTING DATAPOINTS: ' + getTime())
for data in formInstanceUrls:
    # Fetch the first page of form instances, then follow 'nextPageUrl' until
    # the API stops returning one, accumulating every page in one list.
    formInstances = Flow.getResponse(data['formInstancesUrl'])
    formInstancesData = formInstances['formInstances']
    while 'nextPageUrl' in formInstances:
        nextPageData = Flow.getResponse(formInstances['nextPageUrl'])
        formInstancesData += nextPageData['formInstances']
        formInstances = nextPageData
    for datapoint in formInstancesData:
        # The datapoint row must exist before its answers (foreign key).
        input_data = DataPoints(datapoint)
        writeData(input_data, datapoint, "DATAPOINT")
        # 'responses' maps a group id to a list of mappings keyed by question id.
        answers = datapoint['responses']
        for group_id in [*answers]:
            for group in answers[group_id]:
                for qid in [*group]:
                    # The question type decides how FlowHandler decodes the value.
                    question = getter.query(Questions).filter(Questions.id == int(qid)).first()
                    if question is None:
                        # .first() returns None when the question was never
                        # recorded; skip it instead of failing on
                        # `question.type` in the middle of the import.
                        print('SKIPPING ANSWER: UNKNOWN QUESTION {} (DATAPOINT {})'.format(
                            qid, datapoint['id']))
                        continue
                    answer_value = FlowHandler(group, qid, question.type)
                    answer_value = str(answer_value)
                    answer = {
                        'datapoint_id': datapoint['id'],
                        'question_id': qid,
                        'value': answer_value
                    }
                    input_data = Answers(answer)
                    writeData(input_data, answer, False)
print('DATAPOINTS RECORDED: ' + getTime())

In [None]:
# Roll back any transaction still open on the write session.
# NOTE(review): presumably a manual recovery step after a failed import cell -
# confirm whether it is still needed in a clean top-to-bottom run.
session.rollback()

In [None]:
# Row counts per table, reported by the summary cell below.
(
    total_surveys,
    total_forms,
    total_question_groups,
    total_questions,
    total_datapoints,
) = [
    session.query(model).count()
    for model in (Surveys, Forms, QuestionGroups, Questions, DataPoints)
]

In [None]:
# Final report: elapsed time followed by the number of rows per table.
print(
    (
        'SUCCESS : {}\n\n'
        '{} SURVEYS\n'
        '{} FORMS\n'
        '{} QUESTION GROUPS\n'
        '{} QUESTIONS\n'
        '{} DATAPOINTS\n'
    ).format(
        getTime(),
        total_surveys,
        total_forms,
        total_question_groups,
        total_questions,
        total_datapoints,
    )
)

In [129]:
def qregex(name, id):
    """Build an identifier of the form '<id>_<normalized name>'.

    The characters ``, . ! ? ( ) "`` are removed from ``name``, the result is
    lower-cased and every space is replaced by an underscore.

    Args:
        name: Human-readable name (form or question name).
        id: Value placed in front of the normalized name.

    Returns:
        The combined string, e.g. ``qregex('Hello, World!', 5)`` gives
        ``'5_hello_world'``.
    """
    # Raw pattern: writing '\.' in a normal string literal is an invalid
    # escape sequence and triggers a warning on recent Python versions. Inside
    # a character class the dot needs no escaping, so the set is unchanged.
    cleaned = re.sub(r'[,.!?()"]', '', name)
    return '{}_{}'.format(id, cleaned.lower().replace(' ', '_'))

In [131]:
# Export one wide table per form: a row per datapoint, a column per question.
form_list = session.query(Forms).all()
for fm in form_list:
    table_name = qregex(fm.name, fm.id)
    rows = []
    data = session.query(DataPoints).filter(DataPoints.form_id == fm.id)
    for dt in data:
        # One single-entry mapping per answer; ChainMap keeps the first value
        # when the same question appears more than once for a datapoint.
        row = list(map(lambda x: {qregex(x.question.name, x.question.id):x.value}, dt.answers))
        row = dict(ChainMap(*row))
        # The '0_' prefix makes the metadata columns sort ahead of the
        # '<question id>_<name>' columns.
        row.update({
            '0_identifier':dt.identifier,
            '0_submitter':dt.submitter,
            '0_survey_time':dt.survey_time
        })
        rows.append(row)
    df = pd.DataFrame(rows)
    df = df[sorted(list(df))]
    # Strip only the leading sort prefix. A plain str.replace('0_', '') would
    # also rewrite question columns whose id ends in 0
    # (e.g. '12340_name' -> '1234name').
    column_name = {col: col[len('0_'):] for col in list(df) if col.startswith('0_')}
    df = df.rename(columns=column_name)
    # Uses pandas' default if_exists behaviour, i.e. this raises when the
    # table already exists.
    df.to_sql(table_name, engine)