In [1]:
import pandas as pd
import requests as r
from collections import Counter
import json
import re
import collections

In [2]:
pd.options.display.max_rows = 100
pd.options.display.max_columns = 999

In [3]:
def removeRegex(stc):
    stc = stc.replace('(','')
    stc = stc.replace(')','')
    return stc

In [4]:
def nested_dict():
    return collections.defaultdict(nested_dict)

In [5]:
FORMS_URL = "http://unep.localhost/json/unep-forms-with-variables.json"
COUNTRY_QUESTION = "Country"
COUNTRY_FILTER = "Member State"
DATA_SOURCE = "data-production_2020-03-16.csv"

In [6]:
def generate_code(x,y,t):
    for i, a in enumerate(x):
        code = y +'#' + str(i)
        if t == COUNTRY_QUESTION:
            a.update({
                'code':a['v']
            })
        else:
            a.update({
                'code':code
            })
    return x

In [7]:
dataframe = pd.read_csv('./source/' + DATA_SOURCE)
forms = pd.DataFrame(r.get(FORMS_URL).json())
forms['code'] = forms['q'].apply(lambda x:x.split(' ')[0].replace('.','').upper())
forms['a'] = forms.apply(lambda x: generate_code(x['a'], x['code'], x['s']), axis=1)
col_names = forms[['q','code']].to_dict('records')
col_names = {q['q']:q['code'] for q in col_names}

In [8]:
questions = forms['code'].to_list()
dataframe = dataframe.rename(columns=col_names)

In [9]:
#DEBUG
#dataframe = dataframe.loc[0:1]

In [10]:
def collect(sentence, options):
    data = []
    if sentence == sentence:
        sentence = removeRegex(sentence)
        for option in options:
            option_value = removeRegex(option['v'])
            if re.findall(option_value, str(sentence)):
                data.append(option['code'])
    return data

In [11]:
country_qcode = forms[forms['s'] == COUNTRY_QUESTION].reset_index()
country_qcode = country_qcode['code'][0]

In [12]:
metadatas = ['Unique Response Number','StartDate','CompletionDate']
questionlist = []
dataset = []
for index, row in dataframe.iterrows():
    cdata = {}
    for mdata in metadatas:
        cdata.update({mdata: row[mdata]})
    for qdata in list(dataframe):
        for q in questions:
            if qdata == q:
                qst = forms[forms['code'] == q]
                qtype = qst['t'].values[0]
                qopt = qst['a'].values[0]
                if qtype == "OPTIONS":
                    cdata.update({q: collect(row[q], qopt)})
                    questionlist.append(q)
                else:
                    cdata.update({q: row[q]})
    dataset.append(cdata)

In [13]:
baked = pd.DataFrame(dataset)

In [14]:
#qobject = {}
#for question in questionlist:
#    if question != country_qcode:
#        dataopt = {}
#        options = forms[forms['code'] == question]['a'].values[0]
#        for option in options:
#            country_value = option['code']
#            dataopt.update({country_value:0})
#        qobject.update({question:dataopt})

In [15]:
#results = nested_dict()
countrylist = forms[forms['code'] == country_qcode]['a'].values[0]
#for country in countrylist:
#    results.update({country['v']:qobject})

In [16]:
results = nested_dict()

In [17]:
for data in baked.to_dict('records'):
    for country in data[country_qcode]:
        for question in questionlist:
            if question != country_qcode:
                for answer in data[question]:
                    try:
                        results[country][question][answer] += 1
                    except:
                        results[country][question][answer] = 0

In [18]:
with open('./results/transformed-' + DATA_SOURCE.replace('csv','json'), 'w') as outfile:
    json.dump(results, outfile)

In [19]:
## INPUT CONNECTION

In [20]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import numpy as np
from app.models import Countries, Groups, Values, CountryGroups, CountryValues
from app.connection import engine_url, write_data

In [21]:
engine_url = engine_url()
engine = create_engine(engine_url)
session = sessionmaker(bind=engine)()

In [22]:
values = forms[['q','s','t']].rename(columns={'q':'description','s':'name','t':'type'})
values['code'] = values['description'].apply(lambda x:x.split(' ')[0].replace('.','').upper())
values = values[values['name'] != COUNTRY_QUESTION].reset_index()
for value in values.to_dict('records'):
    value['parent_id'] = None
    value['units'] = None
    input_data = Values(value)
    write_data(session, input_data, value, False)

In [23]:
saved_values = pd.read_sql_table('values', engine_url)

In [24]:
child_values = forms[forms['t'] == 'OPTIONS'].reset_index()
child_values['code'] = child_values['q'].apply(lambda x:x.split(' ')[0].replace('.','').upper())

In [25]:
saved_values = saved_values.merge(child_values, on='code', how='left')
option_values = saved_values[saved_values['type'] == 'OPTIONS']
option_values = option_values[['id','type','code','units','description','a','color']].to_dict('records')

In [26]:
for option_value in option_values:
    for index, child in enumerate(option_value['a']):
        child_value = {
            'parent_id': option_value['id'],
            'units': option_value['units'],
            'code': option_value['code'] + '#' + str(index),
            'description': child['v'] + ' - ' + option_value['description'],
            'type': 'OPTION',
            'name': child['s']
        }
        input_data = Values(child_value)
        write_data(session, input_data, child_value, False)

In [27]:
unep_countries = pd.read_csv('./source/country_shapefile.csv')
unep_countries = unep_countries.rename(columns={'Terr_Name':'name','STATUS':'status'})

In [28]:
countries = unep_countries[unep_countries['status'] == 'Member State'].rename(columns={'ISO3_CODE':'code'})
countries = countries[['code','name','status']].to_dict('records')
for country in countries:
    input_data = Countries(country)
    write_data(session, input_data, country, False)

In [29]:
data_groups = pd.read_csv('./source/list_of_regional_seas_country.csv')
data_groups = data_groups[['Group','Country']].rename(columns={'Country':'name','Group':'group'})

In [30]:
# DEBUG GROUPS
countries = unep_countries[unep_countries['status'] == 'Member State'].rename(columns={'ISO3_CODE':'code'})
country_groups = countries.merge(data_groups, on='name', how='left', indicator=True)
groups = country_groups[country_groups['_merge'] == 'both'].groupby('group').first().reset_index()['group'].to_frame('name')
groups['parent_id'] = None

In [31]:
import wikipedia

In [32]:
def wiki(search):
    description = None
    #try:
    #    description = wikipedia.summary(search, sentences=1)
    #except:
    #    pass
    return description

def groupcode(x):
    x = x.replace('/',' ').replace('(','').replace(')','').split(' ')
    code = ''
    if len(x) == 1:
        x = x[0][0:3]
    for y in x:
        code += y[0].upper()
    return code

In [33]:
groups['description'] = groups['name'].apply(lambda x:wiki(x))
groups['code'] = groups['name'].apply(lambda x: groupcode(x))

In [34]:
for group in groups.to_dict('records'):
    input_data = Groups(group)
    write_data(session, input_data, group, False)

In [35]:
error_country = pd.DataFrame(countrylist)
error_country = error_country[['v']]
error_country = error_country.rename(columns={'v':'name'})
error_country = error_country.merge(unep_countries, on='name', how='left', indicator=True)
error_country = error_country[error_country['ISO3_CODE'] != error_country['ISO3_CODE']]
error_country.to_csv('./results/error-combined_source-and-shapefile.csv')

In [36]:
saved_groups = pd.read_sql_table('groups', engine_url)
country_groups = country_groups.rename(columns={'name':'country','group':'name','id':'group_id'})
country_groups = country_groups[country_groups['_merge'] == 'both'][['name','country']]
saved_groups = saved_groups.merge(country_groups, on='name', how='left', indicator=True).rename({'id':'group_id'})

In [37]:
saved_groups

Unnamed: 0,id,parent_id,name,code,description,country,_merge
0,1,,Abidjan,ABI,,Togo,both
1,1,,Abidjan,ABI,,South Africa,both
2,2,,Antarctic,ANT,,Australia,both
3,2,,Antarctic,ANT,,Bulgaria,both
4,2,,Antarctic,ANT,,Greece,both
...,...,...,...,...,...,...,...
128,18,,Wider Caribbean,WC,,Panama,both
129,18,,Wider Caribbean,WC,,Suriname,both
130,18,,Wider Caribbean,WC,,Trinidad and Tobago,both
131,18,,Wider Caribbean,WC,,United States of America,both


In [38]:
saved_countries = pd.read_sql_table('countries', engine_url).rename(columns={'name':'country','id':'country_id'})
saved_countries = saved_countries[['country','country_id']]

In [39]:
saved_countries

Unnamed: 0,country,country_id
0,Afghanistan,1
1,Angola,2
2,Albania,3
3,Andorra,4
4,United Arab Emirates,5
...,...,...
188,South Africa,189
189,Zambia,190
190,Zimbabwe,191
191,Sudan,192


In [40]:
country_groups = saved_groups.merge(saved_countries, on='country',how='left')
country_groups = country_groups.rename(columns={'id':'group_id'})
country_groups = country_groups[['group_id','country_id']].to_dict('records')

In [41]:
for country_group in country_groups:
    input_data = CountryGroups(country_group)
    write_data(session, input_data, country_group, False)

In [42]:
for res in results:
    country = session.query(Countries).filter(Countries.name == res).first()
    data = {}
    if country:
        for question in results[res]:
            for answer in results[res][question]:
                option = session.query(Values).filter(Values.code == answer).first()
                country_value = {
                    'country_id':country.id,
                    'value_id':option.id,
                    'value':results[res][question][answer],
                    'description': None
                }
                input_data = CountryValues(country_value)
                write_data(session, input_data, country_value, False)

In [43]:
results

defaultdict(<function __main__.nested_dict()>,
            {'France': defaultdict(<function __main__.nested_dict()>,
                         {'9': defaultdict(<function __main__.nested_dict()>,
                                      {'9#1': 947, '9#2': 315}),
                          '9B': defaultdict(<function __main__.nested_dict()>,
                                      {'9B#0': 631,
                                       '9B#1': 947,
                                       '9B#2': 631,
                                       '9B#3': 473}),
                          '9BII': defaultdict(<function __main__.nested_dict()>,
                                      {'9BII#0': 631,
                                       '9BII#1': 631,
                                       '9BII#2': 631,
                                       '9BII#3': 631,
                                       '9BII#4': 631,
                                       '9BII#5': 157}),
                          '9BIII': defaultdi