In [1]:
import pandas as pd
import requests as r
from collections import Counter
import json
import re

In [2]:
pd.options.display.max_rows = 100
pd.options.display.max_columns = 999

In [3]:
def removeRegex(stc):
    stc = stc.replace('(','')
    stc = stc.replace(')','')
    return stc

In [4]:
FORMS_URL = "http://unep.localhost/json/unep-forms-with-variables.json"
COUNTRY_QUESTION = "Country"
COUNTRY_FILTER = "Member State"
DATA_SOURCE = "data-production_2020-03-16.csv"

In [5]:
def generate_code(x,y,t):
    for i, a in enumerate(x):
        code = y +'#' + str(i)
        if t == COUNTRY_QUESTION:
            a.update({
                'code':a['v']
            })
        else:
            a.update({
                'code':code
            })
    return x

In [6]:
dataframe = pd.read_csv('./source/' + DATA_SOURCE)
forms = pd.DataFrame(r.get(FORMS_URL).json())
forms['code'] = forms['q'].apply(lambda x:x.split(' ')[0].replace('.','').upper())
forms['a'] = forms.apply(lambda x: generate_code(x['a'], x['code'], x['s']), axis=1)
col_names = forms[['q','code']].to_dict('records')
col_names = {q['q']:q['code'] for q in col_names}

In [7]:
forms

Unnamed: 0,q,s,a,t,code
0,1. I agree to share the information provided i...,Personal Agreements,"[{'v': 'YES', 's': 'Yes', 'code': '1#0'}, {'v'...",OPTIONS,1
1,2. I will provide the answers that reflect the...,Personal Agreements 2,"[{'v': 'YES', 's': 'Yes', 'code': '2#0'}, {'v'...",OPTIONS,2
2,3. I have received adequate information about ...,Personal Agreements 3,"[{'v': 'YES', 's': 'Yes', 'code': '3#0'}, {'v'...",OPTIONS,3
3,4. Your name,Submitter,[],FREE,4
4,4.a. Your role,Role,[],FREE,4A
5,4.b. Contact details (preferably e-mail address),Submitter Contact,[],FREE,4B
6,4.c. Second/alternative contact detail.,Submitter Contact II,[],FREE,4C
7,4.d. Your organisation (If you are not affilia...,Submitter Organisation,[],FREE,4D
8,5. Are you reporting as an individual or on be...,Submitter Reporting,[],FREE,5
9,"5.a. If you selected Other, please specify:",Submitter Reporting Other,[],FREE,5A


In [8]:
questions = forms['code'].to_list()
dataframe = dataframe.rename(columns=col_names)

In [9]:
#DEBUG
#dataframe = dataframe.loc[0:10]

In [10]:
def collect(sentence, options):
    data = []
    if sentence == sentence:
        sentence = removeRegex(sentence)
        for option in options:
            option_value = removeRegex(option['v'])
            if re.findall(option_value, str(sentence)):
                data.append(option['code'])
    return data

In [11]:
country_qcode = forms[forms['s'] == COUNTRY_QUESTION].reset_index()
country_qcode = country_qcode['code'][0]

In [12]:
metadatas = ['Unique Response Number','StartDate','CompletionDate']
questionlist = []
dataset = []
for index, row in dataframe.iterrows():
    cdata = {}
    for mdata in metadatas:
        cdata.update({mdata: row[mdata]})
    for qdata in list(dataframe):
        for q in questions:
            if qdata == q:
                qst = forms[forms['code'] == q]
                qtype = qst['t'].values[0]
                qopt = qst['a'].values[0]
                if qtype == "OPTIONS":
                    cdata.update({q: collect(row[q], qopt)})
                    questionlist.append(q)
                else:
                    cdata.update({q: row[q]})
    dataset.append(cdata)

In [13]:
baked = pd.DataFrame(dataset)

In [14]:
baked

Unnamed: 0,Unique Response Number,StartDate,CompletionDate,4D,5,5A,5B,6,6A,7,8,9,9A,9AI,9B,9BI,9BII,9BIIA,9BIII,9BIIIA,9BIIIB,9BIIIBI,9C,9CI,9CII,9CIIA,9D,9DI,9DII,9DIII,9DIV,9DV,9DVI,9DVII,9DVIII,9DVIIIA,9DVIIIB,10,11,11A,12,12A,12B,12BI,12C,12CI,12D,12DI,13,13A,13AI,13B,13BI,14,14A,15,15A,16,16A,17,17A,18,18A,19,19A,20,20A,21,21A,21B,21BI,21C,21CI,22,22A,23,24,25,26,27,27A,27B,28,28A,29,29A,29B,29C,29D,29E,30
0,545728-545719-53376637,2019-12-20 17:36:36 UTC,2019-12-30 15:42:40 UTC,The SeaCleaners,On behalf of an organisation,,The SeaCleaners,Yes,4.0,AWARENESS RAISING,"-awareness raising activities in schools, trad...",[9#1],[],,"[9B#0, 9B#1, 9B#2, 9B#3]",,"[9BII#0, 9BII#1, 9BII#2, 9BII#3, 9BII#4]",,"[9BIII#1, 9BIII#4]",,[],,[],,[],,[],,,,,,,,[],,,[],[11#2],,[12#2],,[],,[],,[12D#0],,[13#1],[],,[13B#2],,[14#1],,[15#2],,"[France, Germany, Indonesia, Switzerland]",,"[17#8, 17#9, 17#10, 17#11]",,"[18#1, 18#2, 18#3, 18#4, 18#5, 18#6, 18#7, 18#8]",,"[19#0, 19#1, 19#2]",,"[20#0, 20#1, 20#2, 20#3, 20#4, 20#5, 20#6]",,[21#0],,[21B#12],,[],,[22#16],,300000.0,euros,500000.0,euros,"[27#1, 27#3]",,private sponsorship,[28#2],,https://www.theseacleaners.org/en/educational-...,,,,,,
1,545728-545719-53649934,2020-01-07 22:32:59 UTC,2020-01-07 23:05:20 UTC,The Institute of Environmental Science and Res...,As an individual,,,Yes,4.0,New Zealand Plastic bag ban,Government ban of the use of single-use plasti...,[9#0],"[9A#0, 9A#1, 9A#7]",,[],,[],,[],,[],,[],,[],,[],,,,,,,,[],,,[10#1],[11#0],,"[12#0, 12#1]",,[12B#1],,"[12C#1, 12C#2]",,[],,[13#1],[],,[13B#4],I am not involved in the process,[14#2],results of the ban are being indirectly monito...,[15#3],,[Other],New Zealand,[17#13],,[18#4],,[19#0],,[20#6],,[21#0],,[21B#1],,[],,[22#13],,,,,,[27#6],,,[28#3],,https://www.mfe.govt.nz/waste/single-use-plast...,,,,,,
2,545728-545719-53650563,2020-01-07 23:05:28 UTC,2020-01-07 23:17:25 UTC,The Institute of Environmental Science and Res...,As an individual,,,"No, I am returning to the survey to report on ...",,Litter Intellegence,Citizen science Beach Litter surveys,[],[],,[],,[],,[],,[],,[],,[],,[9D#3],,,,,,,,[],,,[10#1],[11#4],no role,[12#2],,[],,[],,"[12D#0, 12D#1]",,"[13#1, 13#2]",[],,[],,[14#0],,[15#2],,[Other],New Zealand - the lead but other Pacific islan...,"[17#5, 17#8]",,[18#7],,"[19#0, 19#1, 19#2]",,[20#6],,[21#0],,[21B#12],,[],,"[22#0, 22#1, 22#2, 22#3, 22#6, 22#7, 22#8, 22#...",,,,,,[],,,[28#3],,https://litterintelligence.org/,,,,,,
3,545728-545719-53668499,2020-01-08 13:54:19 UTC,2020-01-10 15:05:07 UTC,"Secretariat of the Basel, Rotterdam and Stockh...",On behalf of an organisation,,"Secretariat of the Basel, Rotterdam and Stockh...",Yes,3.0,Updating the technical guidelines for the iden...,"At its sixth meeting held in December 2002, th...",[9#0],"[9A#1, 9A#4]",,[],,[],,[],,[],,[],,[],,[],,,,,,,,[],,,[10#2],[11#4],We are the supporting Parties and others to up...,"[12#0, 12#1, 12#2, 12#3]",The technical guidelines if adopted at the COP...,"[12B#0, 12B#1, 12B#2]",,"[12C#0, 12C#1, 12C#2]",,"[12D#0, 12D#1, 12D#2]",,[13#0],[13A#0],,[],,[14#1],,[15#0],,[All],,[17#13],,"[18#5, 18#6]",,"[19#0, 19#2]",,[20#0],,"[21#0, 21#2]",,[21B#12],,[21C#2],,[22#16],,,,10000.0,USD,[27#7],"Staff time, IT resources for online meetings",Core funds of the convention,[28#2],,http://www.basel.int/Implementation/Plasticwas...,,,,,,
4,545728-545719-53741002,2020-01-10 14:06:52 UTC,2020-01-10 16:21:59 UTC,Dirección del Parque Nacional Galápagos,On behalf of an organisation,,Dirección del Parque Nacional Galápagos,Yes,1.0,LIMPIEZA COSTERA EN GALÁPAGOS,Se realizan viajes en diferentes embarcaciones...,[9#1],[],,[9B#4],Limpieza y recolección de basura plástica mari...,[],,[],,[],,[],,[],,[],,,,,,,,[],,,[],[11#1],,[12#0],,[12B#1],,[],,[],,[13#0],[13A#0],,[],,[14#0],,[15#4],,[Ecuador],,[17#8],,"[18#6, 18#7]",,[19#0],,"[20#1, 20#2, 20#3, 20#4]",,[21#0],,"[21B#0, 21B#1, 21B#2, 21B#5, 21B#8, 21B#11]",,[21C#4],Pellets,"[22#8, 22#10]",,50000.0,dólar americano,50000.0,dólar americano,"[27#2, 27#3]",,Fondos Coca Cola,[28#3],,http://www.galapagos.gob.ec/46-toneladas-de-ba...,http://www.ambiente.gob.ec/limpieza-costera-un...,http://www.galapagos.gob.ec/galapagos-se-sumo-...,https://conservation.org.ec/boletines/iniciati...,,,
5,545728-545719-53853866,2020-01-14 12:48:24 UTC,2020-01-14 13:33:37 UTC,"Institute of Oceanography and Fisheries, Split...",As an individual,,,Yes,1.0,Fishing for litter,Fisheries is a sector that can make a signific...,[9#1],[],,"[9B#0, 9B#1, 9B#2]",,"[9BII#0, 9BII#1, 9BII#2, 9BII#3, 9BII#4]",,"[9BIII#2, 9BIII#4, 9BIII#5]",,[],,[],,[],,[],,,,,,,,[],,,[10#0],[11#2],,[12#0],,[12B#2],,[],,[],,[13#0],[13A#1],,[],,[14#0],,[15#3],,[Croatia],,[17#9],,[18#8],,[19#0],,"[20#0, 20#1, 20#2, 20#3, 20#4, 20#5, 20#6]",,[21#0],,[21B#12],,[],,[22#16],,50000.0,Euro,0.0,Euro,[27#7],European Union funds through InterReg project,Interreg funding,[28#2],,http://www.ml-repair.eu/en,http://www.defishgear.net/desert2/itemlist/tag...,https://www.jutarnji.hr/vijesti/hrvatska/alarm...,https://net.hr/danas/hrvatska/ponosni-ste-na-p...,http://www.ml-repair.eu/attachment/preview/5dc...,https://komunalne-djelatnosti.hr/wp-content/up...,"Before the DeFishGear project started in 2013,..."
6,545728-545719-53855495,2020-01-14 13:15:32 UTC,2020-01-14 14:43:51 UTC,UNEP,On behalf of an organisation,,UNEP Law Division,Yes,1.0,legislative guidance to countries on marine li...,legislative assistance to countries,[9#0],[],,[],,[],,[],,[],,[],,[],,[],,,,,,,,[],,,[10#1],[11#3],,[12#0],,[12B#1],,[],,[],,[13#0],[13A#0],,[],,[14#0],,[15#0],,"[All, Argentina, Mexico]",,[17#15],"lifecycle of pastics (production, distribution...","[18#8, 18#10]",It depends on the country needs 1) a general ...,[19#4],see comment under Q18,"[20#6, 20#8]",Depends of country needs and priorities,[21#0],,"[21B#12, 21B#14]",Depends on country needs and priorities,[21C#4],Depends on country needs and priorities,"[22#0, 22#1, 22#2, 22#10, 22#11, 22#15, 22#18]","Mainly the sectors checked, but it depends on ...",150000.0,USD,50000.0,USD,[27#1],,,[28#3],,https://www.unenvironment.org/resources/report...,,,,,,- ongoing provision of technical assistance to...
7,545728-545719-53915703,2020-01-15 19:44:36 UTC,2020-01-15 20:23:41 UTC,National Geographic Society,On behalf of an organisation,,National Geographic Society,Yes,2.0,'Sea to Source: Ganges' Expedition,The “Sea to Source: Ganges” river expeditions ...,[],[],,[],,[],,[],,[],,[],,[],,"[9D#1, 9D#3, 9D#4, 9D#5, 9D#6, 9D#7, 9D#8, 9D#9]","Assessing litter on land, socioeconomic survey...",,30 litres of surface river water were directly...,,Dr. Jenna Jambeck's circularity assessment pro...,"Opportunistic sampling of gut, tissue, feces",Total atmospheric fallout collected through a ...,[9DVIII#0],,To come (still in analysis phase),[10#1],[11#1],,[12#3],All of the above,[],,[],,[],,[13#0],[13A#1],,[],,[14#0],,[15#2],,"[Bangladesh, India]",,"[17#1, 17#5, 17#6, 17#7, 17#8, 17#15]",rural communities,[18#8],,"[19#0, 19#1, 19#2]",,"[20#0, 20#1, 20#2, 20#3, 20#5]",,[21#0],,[21B#12],,[21C#2],,[22#18],To be determined by findings and the appropria...,2000000.0,USD,10000.0,USD,[27#4],,National Geographic Society,[28#5],To be determined - likely continuous over 3+ y...,https://www.nationalgeographic.org/projects/pl...,https://www.nationalgeographic.org/projects/pl...,,,,,
8,545728-545719-53934963,2020-01-16 11:15:57 UTC,2020-01-16 12:00:32 UTC,Clean Up Kenya,On behalf of an organisation,,Clean Up Kenya,Yes,1.0,Ban the bottle campaign,The Ban the Bottle Campaign is designed to pre...,[9#1],[],,"[9B#0, 9B#1, 9B#2]",,"[9BII#0, 9BII#1, 9BII#2, 9BII#3, 9BII#4, 9BII#...",,"[9BIII#2, 9BIII#5]",,[],,[],,[],,[],,,,,,,,[],,,[10#0],[11#1],,"[12#1, 12#2]",,[],,"[12C#1, 12C#2]",,"[12D#0, 12D#1]",,[13#0],[13A#1],,[],,[14#0],,[15#3],,[Kenya],,"[17#5, 17#6, 17#7, 17#8, 17#9]",,"[18#2, 18#3, 18#4, 18#5, 18#6, 18#7]",,"[19#0, 19#2]",,[20#6],,[21#0],,[21B#0],,[],,[22#10],,450000.0,Kenya Shillings,200000.0,Kenya shillings,[27#1],,Donations from individuals and organisations,[28#2],,https://www.facebook.com/notes/clean-up-kenya/...,https://www.facebook.com/1553146668276531/post...,https://www.facebook.com/1553146668276531/post...,https://www.facebook.com/1553146668276531/post...,,,
9,545728-545719-53944718,2020-01-16 14:07:54 UTC,2020-01-16 16:07:42 UTC,National Environment Commission,As an individual,,,Yes,1.0,Survey in marine litter and microplactics,"Regulatory, implementation, enforcement",[9#0],"[9A#0, 9A#1, 9A#2, 9A#4, 9A#5, 9A#6, 9A#7, 9A#8]",,[],,[],,[],,[],,[],,[],,[],,,,,,,,[],,,[10#1],[11#2],,"[12#0, 12#1]",,[12B#1],,"[12C#1, 12C#2]",,[],,[13#0],[13A#1],,[],,[14#0],,[15#3],,[Bhutan],,"[17#1, 17#2, 17#3, 17#5, 17#6, 17#7]",,"[18#4, 18#5, 18#6, 18#7]",,"[19#0, 19#1, 19#2]",,"[20#0, 20#1, 20#2, 20#4]",,[21#0],,"[21B#0, 21B#1, 21B#2, 21B#3, 21B#7, 21B#8, 21B...",,[],,"[22#0, 22#3, 22#6, 22#7, 22#10, 22#11, 22#12, ...",,550000.0,USD,50000.0,USD,"[27#2, 27#3]",,Government & UNEP,[28#3],,www.nec.gov.bt wwf bhutan,,,,,,Ban on certain single use plastics underway. N...


In [15]:
qobject = {}
for question in questionlist:
    if question != country_qcode:
        dataopt = {}
        options = forms[forms['code'] == question]['a'].values[0]
        for option in options:
            country_value = option['code']
            dataopt.update({country_value:0})
        qobject.update({question:dataopt})

In [16]:
results = {}
countrylist = forms[forms['code'] == country_qcode]['a'].values[0]
for country in countrylist:
    results.update({country['v']:qobject})

In [17]:
for data in baked.to_dict('records'):
    for country in data[country_qcode]:
        for question in questionlist:
            if question != country_qcode:
                for answer in data[question]:
                    results[country][question][answer] += 1

In [18]:
with open('./results/transformed-' + DATA_SOURCE, 'w') as outfile:
    json.dump(results, outfile)

In [19]:
## INPUT CONNECTION

In [20]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import numpy as np
from app.models import Countries, Groups, Values, CountryGroups, CountryValues
from app.connection import engine_url, write_data

In [21]:
engine_url = engine_url()
engine = create_engine(engine_url)
session = sessionmaker(bind=engine)()

In [22]:
values = forms[['q','s','t']].rename(columns={'q':'description','s':'name','t':'type'})
values['code'] = values['description'].apply(lambda x:x.split(' ')[0].replace('.','').upper())
values = values[values['name'] != COUNTRY_QUESTION].reset_index()
for value in values.to_dict('records'):
    value['parent_id'] = None
    value['units'] = None
    input_data = Values(value)
    write_data(session, input_data, value, False)

In [23]:
saved_values = pd.read_sql_table('values', engine_url)

In [24]:
child_values = forms[forms['t'] == 'OPTIONS'].reset_index()
child_values['code'] = child_values['q'].apply(lambda x:x.split(' ')[0].replace('.','').upper())

In [25]:
saved_values = saved_values.merge(child_values, on='code', how='left')
option_values = saved_values[saved_values['type'] == 'OPTIONS']
option_values = option_values[['id','type','code','units','description','a','color']].to_dict('records')

In [26]:
for option_value in option_values:
    for index, child in enumerate(option_value['a']):
        child_value = {
            'parent_id': option_value['id'],
            'units': option_value['units'],
            'code': option_value['code'] + '#' + str(index),
            'description': child['v'] + ' - ' + option_value['description'],
            'type': 'OPTION',
            'name': child['s']
        }
        input_data = Values(child_value)
        write_data(session, input_data, child_value, False)

In [27]:
unep_countries = pd.read_csv('./source/country_shapefile.csv')
unep_countries = unep_countries.rename(columns={'Terr_Name':'name','STATUS':'status'})

In [28]:
countries = unep_countries[unep_countries['status'] == 'Member State'].rename(columns={'ISO3_CODE':'code'})
countries = countries[['code','name','status']].to_dict('records')
for country in countries:
    input_data = Countries(country)
    write_data(session, input_data, country, False)

In [29]:
data_groups = pd.read_csv('./source/list_of_regional_seas_country.csv')
data_groups = data_groups[['Group','Country']].rename(columns={'Country':'name','Group':'group'})

In [30]:
# DEBUG GROUPS
countries = unep_countries[unep_countries['status'] == 'Member State'].rename(columns={'ISO3_CODE':'code'})
country_groups = countries.merge(data_groups, on='name', how='left', indicator=True)
groups = country_groups[country_groups['_merge'] == 'both'].groupby('group').first().reset_index()['group'].to_frame('name')
groups['parent_id'] = None

In [31]:
import wikipedia

In [32]:
def wiki(search):
    description = None
    #try:
    #    description = wikipedia.summary(search, sentences=1)
    #except:
    #    pass
    return description

def groupcode(x):
    x = x.replace('/',' ').replace('(','').replace(')','').split(' ')
    code = ''
    if len(x) == 1:
        x = x[0][0:3]
    for y in x:
        code += y[0].upper()
    return code

In [33]:
groups['description'] = groups['name'].apply(lambda x:wiki(x))
groups['code'] = groups['name'].apply(lambda x: groupcode(x))

In [34]:
for group in groups.to_dict('records'):
    input_data = Groups(group)
    write_data(session, input_data, group, False)

In [35]:
error_country = pd.DataFrame(countrylist)
error_country = error_country[['v']]
error_country = error_country.rename(columns={'v':'name'})
error_country = error_country.merge(unep_countries, on='name', how='left', indicator=True)
error_country = error_country[error_country['ISO3_CODE'] != error_country['ISO3_CODE']]
error_country.to_csv('./results/error-combined_source-and-shapefile.csv')

In [36]:
saved_groups = pd.read_sql_table('groups', engine_url)
country_groups = country_groups.rename(columns={'name':'country','group':'name','id':'group_id'})
country_groups = country_groups[country_groups['_merge'] == 'both'][['name','country']]
saved_groups = saved_groups.merge(country_groups, on='name', how='left', indicator=True).rename({'id':'group_id'})

In [37]:
saved_groups

Unnamed: 0,id,parent_id,name,code,description,country,_merge
0,1,,Abidjan,ABI,,Togo,both
1,1,,Abidjan,ABI,,South Africa,both
2,2,,Antarctic,ANT,,Australia,both
3,2,,Antarctic,ANT,,Bulgaria,both
4,2,,Antarctic,ANT,,Greece,both
...,...,...,...,...,...,...,...
128,18,,Wider Caribbean,WC,,Panama,both
129,18,,Wider Caribbean,WC,,Suriname,both
130,18,,Wider Caribbean,WC,,Trinidad and Tobago,both
131,18,,Wider Caribbean,WC,,United States of America,both


In [38]:
saved_countries = pd.read_sql_table('countries', engine_url).rename(columns={'name':'country','id':'country_id'})
saved_countries = saved_countries[['country','country_id']]

In [39]:
saved_countries

Unnamed: 0,country,country_id
0,Afghanistan,1
1,Angola,2
2,Albania,3
3,Andorra,4
4,United Arab Emirates,5
...,...,...
188,South Africa,189
189,Zambia,190
190,Zimbabwe,191
191,Sudan,192


In [40]:
country_groups = saved_groups.merge(saved_countries, on='country',how='left')
country_groups = country_groups.rename(columns={'id':'group_id'})
country_groups = country_groups[['group_id','country_id']].to_dict('records')

In [41]:
for country_group in country_groups:
    input_data = CountryGroups(country_group)
    write_data(session, input_data, country_group, False)

In [43]:
for res in results:
    country = session.query(Countries).filter(Countries.name == res).first()
    data = {}
    if country:
        for question in results[res]:
            for answer in results[res][question]:
                option = session.query(Values).filter(Values.code == answer).first()
                country_value = {
                    'country_id':country.id,
                    'value_id':option.id,
                    'value':results[res][question][answer],
                    'description': None
                }
                input_data = CountryValues(country_value)
                write_data(session, input_data, country_value, False)