In [1]:
import pandas as pd
import numpy as np
import glob
import json
import requests as r
import math
pd.set_option('display.max_rows', 500)

In [2]:
files = glob.glob("./0*.csv")
output_folder = "./json-source-v2"

In [3]:
files

['./02_resources.csv',
 './03_events.csv',
 './05_technologies.csv',
 './04_policies-modified.csv',
 './06_tags.csv',
 './09.country_groups.csv',
 './04_policies.csv']

### GET ALL ORGANISTIONS

In [4]:
orglist = []
for file in files:
    data = pd.read_csv(file)
    try:
        orgs = list(data[data['organisation'] == data['organisation']]['organisation'])
        for org in orgs:
            org = org.replace('"','').replace(',',';').split(';')
            for og in org:
                og = og.strip()
                if og != '':
                    orglist.append(og.strip())
    except:
        pass
orglist = list(np.unique(orglist))
orglist = [{"name":org} for org in orglist]
with open(output_folder + '/organisations.json', 'w') as file:
    file.write(json.dumps(orglist, indent=1))

### LIST ALL MEA

In [5]:
#get unique reqgional coverage
mea = []
for file in files:
    df = pd.read_csv(file)
    try:
        df = df[df['geo_coverage'] == df['geo_coverage']]
        df['geo_coverage_type'] = df['geo_coverage'].apply(lambda x: x.split(':')[0].lower() if ':' in x else np.nan)
        df['geo_coverage'] = df['geo_coverage'].apply(lambda x: x.split(':')[1] if ':' in x else x)
        df = df[df['geo_coverage_type'] == df['geo_coverage_type']]
        df = df[['geo_coverage_type','geo_coverage']].to_dict('records')
        for d in df:
            if 'specific areas' in d['geo_coverage_type'] or d['geo_coverage_type'] == 'regional':
                if ';' in d['geo_coverage']:
                    dd = d['geo_coverage'].split(';')
                    for gc in dd:
                        mea.append({'coverage':gc.strip(),'type': d['geo_coverage_type']})
                else:
                    mea.append(mea.append({'coverage':d['geo_coverage'].strip(),'type':d['geo_coverage_type']}))
    except:
        print(file)
mea = [i for i in mea if i]
pd.DataFrame(mea).drop_duplicates(subset=['coverage'])
new = pd.read_csv('./country_group.csv')
new['source'] = 'new'
old = pd.DataFrame(r.get("http://unep.localhost/api/public/groups").json())
old['source_api'] = 'unep.tc'
old = old[['name','source_api']]
new['duplicates'] = new['name'].apply(lambda x: old.loc[old['name'].str.contains(x)].shape[0])

./06_tags.csv
./09.country_groups.csv


  return func(self, *args, **kwargs)


### GET ALL TAGS

In [6]:
tags = pd.read_csv('./06_tags.csv')
tags['category'] = tags['category'].apply(lambda x: x.split('_')[1])
tags = tags[['category','tag']]
tags = tags.groupby('category')['tag'].apply(lambda g: g.values.tolist()).to_dict()
with open(output_folder + '/tags.json', 'w') as file:
    file.write(json.dumps(tags, indent=1))

### GET COUNTRY GROUPS

In [7]:
groups = pd.read_csv('./09.country_groups.csv')
groups['country'] = groups['country'].apply(lambda x: x.strip())
countryGroup = groups.groupby('group')['country'].apply(lambda g: g.values.tolist()).to_dict()
with open(output_folder + '/country_group.json', 'w') as file:
    file.write(json.dumps(countryGroup, indent=1))

### GET ALL RESOURCES

In [8]:
def get_currency(x):
    if "€" in x or "EUR" in x:
        return "EUR"
    if "$" in x or "USD" in x:
        return "USD"
    if "NOK" in x:
        return "NOK"
    if "CAD" in x:
        return "CAD"
    if "GBP" in x or "£":
        return "GBP"
    return None

In [9]:
def get_value_currency(x):
    cur = [int(s) for s in x.split() if s.isdigit()]
    if len(cur) == 1:
        if "milli" in x:
            return cur[0] * 1000000
        if "billi" in x:
            return cur[0] * 1000000000
        return cur[0]
    return None

In [10]:
def generate_theme(theme):
    srcs = pd.read_csv(theme)
    taglist = pd.read_csv('./06_tags.csv')
    taglist = [t.strip() for t in list(taglist['tag'])]
    coverage_type = ['global',
                     'regional',
                     'national',
                     'transnational',
                     'sub-national',
                     'global with elements in specific areas']
    srcs = srcs.to_dict('records')
    resources = []
    false_tags = []
    for src in srcs:
        res = {}
        for s in src:
            res.update({s:src[s]})
            if type(src[s]) == float:
                if math.isnan(src[s]):
                    res.update({s: None})
            if s in ["publish_year","valid_from","valid_to"]:
                if res[s] is not None:
                    res.update({s: int(res[s])})
            if s == "url":
                if res[s] is not None:
                    res.update({"url": [v.strip() for v in res[s].splitlines(True)]})
            if s == "languages":
                languages = []
                if res[s] is not None:
                    lang = []
                    if ":" in res[s]:
                        lang = res[s].replace("http://","").replace("https://","").split(';')
                    if len(lang) > 0:
                        for ln in lang:
                            ln = ln.split(":")
                            if len(ln) > 1:
                                languages.append({"language":ln[0].strip(),"url":"https://{}".format(ln[1].strip())})
                if len(languages) > 0:
                    res.update({"resource_language_url": languages})
                else:
                    res.update({"resource_language_url": None})
                #del res[s]
            if type(res[s]) == str:
                v = res[s].replace('\n','').replace('"','').replace('‘','').replace('’','').replace('\xa0',' ')
                v = v.strip()
                res.update({s: v})
            ## Should we do data cleaning for value?
            if s == "value":
                if res[s] is not None:
                    res.update({"value_currency": get_currency(res[s])})
                    res.update({s: get_value_currency(res[s])})
                else:
                    res.update({"value_currency": None})
            if s in ["tags","organisation"]:
                if type(res[s]) == str:
                    vl = []
                    if res[s] is not None:
                        sep = [';' if ';' in src[s] else ':']
                        vl = res[s].split(sep[0])
                        vl = [k.replace('"','').strip() for k in vl]
                        if s == "tags":
                            nv = []
                            for tg in taglist:
                                for v in vl:
                                    if v == tg:
                                        nv.append(v)
                                    if v not in taglist:
                                        false_tags.append(v)
                            #if len(vl) != len(nv):
                            #    res.update({"error_tags":True})
                            #else:
                            #    res.update({"error_tags":False})
                            vl = nv
                    if len(vl) == 0:
                        vl = None
                    res.update({s: vl})
            if s == "geo_coverage":
                if res[s] is not None:
                    gt = res[s].split(":")
                    ct = gt[0].lower().strip()
                    if ct in coverage_type:
                        res.update({'geo_coverage_type': ct})
                    else:
                        res.update({'geo_coverage_type': None})
                    if len(gt) > 1:
                        gc = gt[1].split(';')
                        gc = [g.replace('.','').strip().title() for g in gc]
                        res.update({s: gc})
                    else:
                        res.update({s: None})
            if s == "attachments":
                if res[s] is not None:
                    res.update({s: res[s].split(' ')})
                else:
                    res.update({s: []})
            if s == "country":
                if res[s] is not None:
                    if "," in res[s]:
                        country = res[s].split(',')
                    elif ";" in res[s]:
                        country = res[s].split(';')
                    else:
                        country = [res[s]]
                    res.update({"country": country[0]})
                else:
                    res.update({"country": None})
        resources.append(res)
    results = pd.DataFrame(resources).to_dict('records')
    #df = df.fillna(dict(publish_year=999)).replace(dict(publish_year={999: None}))
    for res in results:
        for s in res:
            if type(res[s]) == float:
                if math.isnan(res[s]):
                    res.update({s: None})
                else:
                    res.update({s: int(res[s])})
    resources = pd.DataFrame(results)
    with open('{}/{}.json'.format(output_folder, theme.replace('.csv','')), 'w') as file:
        file.write(json.dumps(results, indent=1))
    print("theme {}.json generated".format(theme.replace('.csv','')))

In [11]:
generate_theme("05_technologies.csv")
generate_theme("04_policies.csv")
generate_theme("02_resources.csv")

theme 05_technologies.json generated
theme 04_policies.json generated
theme 02_resources.json generated


#### NOTES
- Do we need to analyze value and it's currency?
- No Country Table
- Country also has **global with elements in specific areas**
- Some of the country has different separator
- Some of the geo_coverage has different separator
- Some of the tags has different separator
- languages separator is using colon while the url is also using colon