In [None]:
import pandas as pd
import numpy as np
import glob
import json
import requests as r
import math
pd.set_option('display.max_rows', 500)
import requests
from openpyxl import load_workbook
import re
import os
from datetime import datetime

In [None]:
# Base URL that is prepended to every image link written into the generated JSON.
bucketurl = "https://storage.googleapis.com/akvo-unep-gpml"
# Base URL of the API whose export endpoints are read by this notebook.
# Swap the two lines below to target the other host.
#API_URL = "https://unep.tc.akvo.org"
API_URL = "http://unep.localhost"
# Directory prefix for generated JSON files. Paths are built by plain string
# concatenation, so the trailing slash is required.
output_folder = "../backend/dev/resources/files/"

## UPDATE COUNTRY GROUPS

In [None]:
def update_country_groups():
    """Merge the country groups exported by the API into the local JSON files.

    Reads ``country_group_countries.json`` and ``country_group.json`` from
    ``output_folder`` and fetches ``/api/export/groups`` from ``API_URL``, then:

    * adds every exported group that is missing from the local
      group -> countries mapping;
    * adds a ``country_group`` record (next sequential id) for every exported
      group whose name is not listed yet. The record is typed ``'region'``
      when the name is one of the hard-coded regional groups, ``'mea'``
      otherwise.

    Group names present in the mapping but absent from the group list are
    printed. Both JSON files are then rewritten in place.

    Returns:
        dict: the merged group-name -> countries mapping.
    """
    # Names that get type 'region'; every other new group is typed 'mea'.
    regional = {'Black Sea', 'South Asia Seas', 'South East Pacific', 'Pacific',
                'Artic', 'Antarctic', 'Baltic Sea', 'North East Pacific'}

    # Context managers close the input files (the handles used to leak).
    with open(output_folder + 'country_group_countries.json') as f:
        country_group_countries = json.load(f)
    country_group_countries_old = r.get(API_URL + '/api/export/groups').json()
    with open(output_folder + 'country_group.json') as f:
        country_groups = pd.DataFrame(json.load(f))

    known_names = set(country_groups['name'])
    last_cg_id = int(country_groups['id'].iloc[-1])
    new_country_groups = []
    for cgc in country_group_countries_old:
        if cgc not in country_group_countries:
            country_group_countries[cgc] = country_group_countries_old[cgc]
        if cgc not in known_names:
            last_cg_id += 1
            rtype = 'region' if cgc in regional else 'mea'
            new_country_groups.append({'id': last_cg_id, 'name': cgc, 'type': rtype})

    if new_country_groups:
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        country_groups = pd.concat(
            [country_groups, pd.DataFrame(new_country_groups)],
            ignore_index=True,
        )

    # Report mapping entries that still have no country_group record.
    listed_names = set(country_groups['name'])
    for cs in country_group_countries:
        if cs not in listed_names:
            print(cs)

    with open(output_folder + 'country_group_countries.json', 'w') as f:
        json.dump(country_group_countries, f, indent=2, ensure_ascii=False)
        print(f'Updated {f.name}')

    with open(output_folder + 'country_group.json', 'w') as f:
        json.dump(country_groups.to_dict('records'), f, indent=2, ensure_ascii=False)
        print(f'Updated {f.name}')

    return country_group_countries

In [None]:
def collectCountries(cgname):
    """Expand a country-group name into its member countries.

    ``cgname`` is looked up in the module-level ``country_group_countries``
    mapping. A name that is not a known group is returned unchanged, wrapped
    in a single-element list.
    """
    known_groups = list(country_group_countries)
    return country_group_countries[cgname] if cgname in known_groups else [cgname]

In [None]:
def create_csvs_from_xls(filename):
    """Export the required workbook sheets to CSV files in the working directory.

    Each required sheet is written to ``<NN>_<sheet>.csv`` where ``NN`` is the
    sheet's 1-based position in the workbook (zero-padded to two digits) and
    ``<sheet>`` is its stripped, lower-cased name. Column headers are stripped
    and lower-cased as well. Sheets listed as problematic only trigger a
    reminder on stdout and are not exported.

    Returns:
        The workbook's sheet names, in workbook order.
    """
    # FIXME: remove after they are fixed and move to the exported set
    manual_sheets = {'policies'}
    exported_sheets = {'events', 'resources', 'technologies', 'tags'}

    workbook = load_workbook(filename=filename)
    for position, raw_name in enumerate(workbook.sheetnames, start=1):
        normalised = raw_name.strip().lower()
        if normalised in manual_sheets:
            print(f'NOTE: Data in {raw_name} must be downloaded manually!!!!')
            print('^' * 50)
        if normalised in exported_sheets:
            frame = pd.read_excel(filename, sheet_name=raw_name)
            frame.columns = [header.strip().lower() for header in frame.columns]
            frame.to_csv(f'{position:-02}_{normalised}.csv', index=False)
    return workbook.sheetnames

### GET ALL ORGANISATIONS

In [None]:
def update_organisations():
    """Collect every organisation named in the theme CSVs into ``organisations.json``.

    Iterates the module-level ``files`` list, reads each CSV and splits every
    non-empty ``organisation`` cell on ``;`` and ``,`` (double quotes are
    removed first). The distinct, non-blank names are written, sorted, as a
    list of ``{"name": ...}`` objects to ``output_folder``.

    CSVs without an ``organisation`` column and cells that are not strings are
    skipped explicitly. Previously a bare ``except`` hid every error and
    silently dropped the rest of a file after the first bad cell.
    """
    names = set()
    for file in files:
        data = pd.read_csv(file)
        if 'organisation' not in data.columns:
            continue
        for cell in data['organisation'].dropna():
            if not isinstance(cell, str):
                # A non-text cell cannot be split into organisation names.
                continue
            for part in cell.replace('"', '').replace(',', ';').split(';'):
                part = part.strip()
                if part:
                    names.add(part)

    orglist = [{"name": name} for name in sorted(names)]
    with open(output_folder + 'organisations.json', 'w') as f:
        json.dump(orglist, f, indent=2, ensure_ascii=False)

    print(f'Updated {f.name}')

### GET ALL TAGS

In [None]:
def update_tags():
    """Rebuild ``tags.json`` from ``./06_tags.csv``.

    Rows without a category are ignored. A category label is reduced to the
    text between its first and second underscore (``label.split('_')[1]``),
    and the tags are grouped per category. The resulting
    ``{category: [tag, ...]}`` mapping is written to ``output_folder``.
    """
    sheet = pd.read_csv('./06_tags.csv', skip_blank_lines=True)
    categorised = sheet.dropna(subset=['category'])
    categorised = categorised.assign(
        category=categorised['category'].map(lambda label: label.split('_')[1])
    )[['category', 'tag']]
    tags = {
        category: members.values.tolist()
        for category, members in categorised.groupby('category')['tag']
    }
    with open(output_folder + 'tags.json', 'w') as f:
        json.dump(tags, f, indent=2, ensure_ascii=False)
    print(f'Updated {f.name}')

### GET ALL RESOURCES

In [None]:
def get_currency(x):
    """Guess the ISO currency code mentioned in a free-text value.

    Checks are made in a fixed order (EUR, USD, NOK, CAD, GBP) and the first
    match wins, so a string containing ``$`` is reported as USD even when it
    also mentions CAD.

    Args:
        x: text such as ``"€ 5 million"`` or ``"100 USD"``.

    Returns:
        The matching currency code, or ``None`` when no known symbol or code
        is present.
    """
    if "€" in x or "EUR" in x:
        return "EUR"
    if "$" in x or "USD" in x:
        return "USD"
    if "NOK" in x:
        return "NOK"
    if "CAD" in x:
        return "CAD"
    # The old condition `"GBP" in x or "£"` was always true because a
    # non-empty string literal is truthy; the symbol must be tested against x.
    if "GBP" in x or "£" in x:
        return "GBP"
    return None

In [None]:
def get_value_currency(x):
    """Extract a single integer amount from a free-text value.

    Only whitespace-separated tokens made purely of digits are considered.
    The amount is scaled by one million when the text contains ``"milli"``
    and by one billion when it contains ``"billi"`` (case-sensitive, million
    checked first).

    Returns:
        The amount as an int, or ``None`` unless exactly one digit-only token
        is present.
    """
    amounts = [int(token) for token in x.split() if token.isdigit()]
    if len(amounts) != 1:
        return None
    amount = amounts[0]
    for marker, factor in (("milli", 1000000), ("billi", 1000000000)):
        if marker in x:
            return amount * factor
    return amount

In [None]:
def get_images():
    """Index the files under ``./images/<dir>/`` by topic and numeric id.

    A file is expected to be named ``<topic>_<id>[...].<ext>``. Files whose
    base name ends in ``logo`` are stored under the ``'logo'`` key, all others
    under ``'image'``. Each stored value is the URL
    ``<bucketurl>/images/<file name>``.

    Files that do not follow the naming scheme (no underscore, or a
    non-integer id) are skipped. Previously a name without an underscore
    raised ``ValueError`` and aborted the whole scan.

    Returns:
        dict: ``{topic: {id: {'image': url, 'logo': url}}}`` (keys present
        only when a matching file exists).

    Raises:
        RuntimeError: when no usable image was found.
    """
    images_map = {}
    for image in glob.glob("./images/**/*.*"):
        basename = os.path.basename(image)
        name = os.path.splitext(basename)[0]
        parts = name.split('_')
        if len(parts) < 2:
            # No "<topic>_<id>" prefix, so this is not a topic image.
            continue
        topic = parts[0]
        try:
            topic_id = int(parts[1])
        except ValueError:
            continue
        key = 'logo' if name.endswith('logo') else 'image'
        topic_id_map = images_map.setdefault(topic, {}).setdefault(topic_id, {})
        topic_id_map[key] = f"{bucketurl}/images/{basename}"

    if not images_map:
        raise RuntimeError('No images found')

    return images_map

In [None]:
def revertGeoToBeforeFormat(x):
    """Rebuild the legacy ``'<coverage type>:<coverage values>'`` geo_coverage string.

    IMPORTANT: geo_coverage used to hold ``'geo_coverage_type':'geo_coverage_values'``
    in a single column. Since 24th February UNEP moved the values into the
    ``country`` column, so the two are joined back here to keep the downstream
    parsing unchanged. If the data returns to the old format, stop calling
    this function from generate_theme.

    Args:
        x: a row exposing ``'country'`` and ``'geo_coverage'``.

    Returns:
        ``geo_coverage + ':' + country`` when neither value is NaN, otherwise
        the ``geo_coverage`` value as-is.
    """
    country = x['country']
    coverage = x['geo_coverage']
    # NaN is the only value that is not equal to itself.
    both_present = country == country and coverage == coverage
    return coverage + ':' + country if both_present else coverage

In [None]:
def policy_date_transformer(date):
    """Normalise a date cell to ``YYYY-MM-DD``.

    * empty / falsy input     -> ``None``
    * ``'18/18/2018'``        -> ``'2018-01-01'`` (special-cased literal)
    * ``'D/M/YYYY'``          -> ``'YYYY-MM-DD'`` with zero-padded month and day
    * four-character string   -> that year's 1 January
    * anything else           -> returned unchanged
    """
    if not date:
        return None
    if date == '18/18/2018':
        return '2018-01-01'
    if '/' not in date:
        return f'{date}-01-01' if len(date) == 4 else date
    day, month, year = date.split('/')
    return '-'.join((year, month.zfill(2), day.zfill(2)))
        

In [None]:
def generate_theme(theme):
    """Convert one theme CSV into a cleaned JSON file in ``output_folder``.

    Args:
        theme: path of a CSV named ``<prefix>_<topic>.csv`` (for example
            ``'01_resources.csv'``). The file name is parsed to derive both
            the image topic and the output file name.

    The function reads ``./06_tags.csv`` for the list of valid tags, calls
    ``get_images()`` for image/logo URLs and, through ``collectCountries``,
    relies on the module-level ``country_group_countries`` mapping.

    Per row it normalises missing values, URLs, languages, free-text strings,
    value/currency, tags, organisations, geo coverage, attachments and
    country, then writes the records as JSON and prints the output path.
    Nothing is returned.

    Raises:
        RuntimeError: propagated from ``get_images()`` when no image is found.
    """
    # Date columns are normalised to YYYY-MM-DD while the CSV is parsed.
    converters = {
        'first_publication_date': policy_date_transformer, 
        'latest_amendment_date': policy_date_transformer,
        'start_date': policy_date_transformer,
        'end_date': policy_date_transformer
    }
    srcs = pd.read_csv(theme, converters=converters)
    # Sheets without a country column get an empty one so the geo_coverage
    # rewrite below can run unconditionally.
    if 'country' not in list(srcs):
        srcs['country'] = ''
    srcs['geo_coverage'] = srcs.apply(lambda x: revertGeoToBeforeFormat(x),axis=1)
    # Drop unnamed columns
    columns = [col for col in srcs.columns if not col.startswith('unnamed')]
    srcs = srcs[columns]
    # Drop rows with no data
    srcs = srcs[~srcs['id'].isna()]
    # Valid tag names; tags not in this list are dropped from each row below.
    taglist = pd.read_csv('./06_tags.csv')
    taglist = taglist[~taglist['tag'].isna()]
    taglist = [t.strip() for t in list(taglist['tag'])]
    coverage_type = ['global',
                     'regional',
                     'national',
                     'transnational',
                     'sub-national',
                     'global with elements in specific areas']
    srcs = srcs.to_dict('records')
    resources = []
    # Collects tags that are not in taglist; it is never read again in this function.
    false_tags = []
    for src in srcs:
        # `res` is the cleaned copy of the row; `src` keeps the raw cell values.
        res = {}
        for s in src:
            res.update({s:src[s]})
            # Missing cells arrive as float NaN; store them as None.
            if type(src[s]) == float:
                if math.isnan(src[s]):
                    res.update({s: None})
            # NOTE(review): this re-stores the value it just read, so it has no effect.
            if s in ["publish_year","valid_from","valid_to"]:
                if res[s] is not None:
                    res.update({s: res[s]})
            # A url cell may hold several ';'-separated URLs; store them as a list.
            if s == "url":
                if res[s] is not None:
                    res.update({"url": [v.strip() for v in res[s].split(";")]})
            # Build "resource_language_url": one {language, url} entry per URL.
            if s == "languages":
                languages = []
                # Re-stores the raw cell, which undoes the NaN -> None step above
                # for this key; the final float pass below turns NaN into None again.
                res.update({s:src[s]})
                urls = []
                lang = ['English']
                # `x == x` is False only for NaN, i.e. this means "url is present".
                if src['url'] == src['url']:
                    urls = src['url'].replace("http://","").replace("https://","").split(';')
                    if src['languages'] == src['languages']:
                        if ";" in src['languages']:
                            lang = src['languages'].split(';')
                        else:
                            # Single entry: keep only the text before the first ':'.
                            lang = [src['languages'].split(':')[0]]
                    if len(urls) > len(lang):
                        # Pads with the first language, once per URL. This yields more
                        # entries than URLs; the surplus is dropped below when urls[i]
                        # raises and the exception is swallowed.
                        for i, u in enumerate(urls):
                            if len(lang) > 0:
                                lang.append(lang[0])
                            else:
                                lang.append("English")
                    if len(lang) > 0:
                        for i, lan in enumerate(lang):
                            try:
                                languages.append({"language":lan.strip(),"url":"https://{}".format(urls[i].strip())})
                            except:
                                # Bare except: any error (e.g. no URL at index i)
                                # silently skips this language.
                                pass
                if len(languages) > 0:
                    res.update({"resource_language_url": languages})
                else:
                    res.update({"resource_language_url": None})
                #del res[s]
            # Text cleanup: remove newlines, double quotes and curly single quotes,
            # turn non-breaking spaces into spaces, then strip.
            if type(res[s]) == str:
                v = res[s].replace('\n','').replace('"','').replace('‘','').replace('’','').replace('\xa0',' ')
                v = v.strip()
                res.update({s: v})
            ## Should we do data cleaning for value?
            # Split the free-text value into a currency code and a numeric amount.
            if s == "value":
                if res[s] is not None:
                    res.update({"value_currency": get_currency(res[s])})
                    res.update({s: get_value_currency(res[s])})
                else:
                    res.update({"value_currency": None})
            # value_amount is taken from the raw row, bypassing the cleanup above.
            if s == "value_amount":
                res.update({"value_amount": src[s]})
            # Multi-valued cells: split on ';' when the raw cell contains one, else on ':'.
            if s in ["tags","organisation"]:
                if type(res[s]) == str:
                    vl = []
                    if res[s] is not None:
                        sep = [';' if ';' in src[s] else ':']
                        vl = res[s].split(sep[0])
                        vl = [k.replace('"','').strip() for k in vl]
                        if s == "tags":
                            # Keep only known tags. The outer loop runs over taglist,
                            # so the kept tags follow taglist order.
                            nv = []
                            for tg in taglist:
                                for v in vl:
                                    if v == tg:
                                        nv.append(v)
                                    if v not in taglist:
                                        false_tags.append(v)
                            vl = nv
                    if len(vl) == 0:
                        vl = None
                    res.update({s: vl})
            # geo_coverage is "<type>:<area>;<area>...": the type goes to
            # geo_coverage_type, the areas replace geo_coverage as a list.
            if s == "geo_coverage":
                if res[s] is not None:
                    gt = res[s].split(":")
                    ct = gt[0].lower().strip()
                    if ct in coverage_type:
                        res.update({'geo_coverage_type': ct})
                    else:
                        res.update({'geo_coverage_type': None})
                    if len(gt) > 1:
                        gc = gt[1].split(';')
                        gc = [g.replace('.','').strip() for g in gc]
                        if ct == 'transnational':
                            # Expand country groups into their members. gc grows while
                            # it is iterated, so appended members are visited too.
                            for g in gc:
                                cc = collectCountries(g)
                                for c in cc:
                                    if c.strip() not in gc:
                                        gc.append(c)
                        res.update({s: gc})
                    else:
                        res.update({s: None})
            # Attachments are separated by single spaces; a missing cell becomes [].
            if s == "attachments":
                if res[s] is not None:
                    res.update({s: res[s].split(' ')})
                else:
                    res.update({s: []})
            # Only the first ';'-separated country is kept.
            if s == "country":
                if res[s] is not None:
                    if ";" in res[s]:
                        country = res[s].split(';')
                    else:
                        country = [res[s]]
                    res.update({"country": country[0]})
                else:
                    res.update({"country": None})
        resources.append(res)
    results = pd.DataFrame(resources)
    images = get_images()
    # Derive the image topic from the file name: '04_policies.csv' -> 'policy',
    # '01_resources.csv' -> 'resource'; 'event' is then mapped to 'events'.
    topic_name = theme.replace('ies.csv','y').replace('s.csv','').split('_')[1]
    if topic_name == 'event':
        topic_name = 'events'
    results['image'] = results['id'].apply(lambda x: images.get(topic_name, {}).get(int(x), {}).get('image'))
    if topic_name == 'technology':
        results['logo'] = results['id'].apply(lambda x: images.get(topic_name, {}).get(int(x), {}).get('logo'))
    results = results.to_dict('records')
    # The DataFrame round trip reintroduces float NaN for missing values.
    # NOTE(review): every remaining float is cast with int(), which also
    # truncates genuine fractional values.
    for res in results:
        for s in res:
            if type(res[s]) == float:
                if math.isnan(res[s]):
                    res.update({s: None})
                else:
                    res.update({s: int(res[s])})
    # NOTE(review): `resources` is rebound here but not used afterwards.
    resources = pd.DataFrame(results)
    # Output name is the part after the first '_' with .json instead of .csv.
    output_file = theme.split('_', 1)[1].replace('.csv','.json')
    with open(output_folder + output_file, 'w') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f'Updated {f.name}')

In [None]:
def update_countries():
    """Regenerate ``countries.json`` from ``countries.geojson``.

    For every feature, the ``name`` property is emitted as the record's
    ``"code"`` and the ``cd`` property as its ``"name"``. Features whose
    ``name`` starts with ``'disputed'`` are skipped. When a code occurs more
    than once the first entry is kept, unless that entry contains ``'Island'``
    and the newer one does not; rejected duplicates are printed. Two extra
    records, ``All`` and ``Other`` (both with a ``None`` code), are appended
    before the list is written to ``output_folder``.
    """
    with open('countries.geojson', 'r') as f:
        geo = json.load(f)

    name_by_code = {}
    for feature in geo['features']:
        props = feature['properties']
        code = props['name']
        if code.startswith('disputed'):
            continue
        if code in name_by_code:
            current = name_by_code[code]
            if 'Island' in current and 'Island' not in props['cd']:
                name_by_code[code] = props['cd']
            else:
                print(f'{code} already mapped to {current}. Not adding {props}')
        else:
            name_by_code[code] = props['cd']

    countries = [{"name": label, "code": code} for code, label in name_by_code.items()]
    countries.extend([{'name': "All", "code": None}, {'name': "Other", "code": None}])

    with open(output_folder + 'countries.json', 'w') as f:
        json.dump(countries, f, indent=2, ensure_ascii=False)
    print(f'Updated {f.name}')

# Import/Update Projects

In [None]:
# Download the exported project-action records from the API. This needs the
# API at API_URL to be reachable; each record is read as a dict further down.
actions = json.loads(requests.get(API_URL + '/api/export/project-actions').content)

In [None]:
# Print one "<code>: '<name>'," line per action whose parent is 116, ready to
# paste into a dict literal. The name is lower-cased and cut at the first '('.
# NOTE(review): presumably 116 is the geo-coverage parent action - confirm.
for action in actions:
    if action['parent_id'] != 116:
        continue
    print(f"{action['code']}: '{action['name'].lower().split('(')[0].strip()}',")
        
# Project action code -> geo coverage label.
geo_coverage_codes = dict((
    (105885227, 'global'),
    (105885347, 'regional'),
    (105885443, 'transnational'),
    (105885568, 'national'),
    (105885616, 'sub-national'),
    (999999001, 'global with elements in specific areas'),
    (105994502, 'other'),
))
# Reverse lookup: geo coverage label -> project action code.
geo_coverage_names = dict(zip(geo_coverage_codes.values(), geo_coverage_codes.keys()))

In [None]:
def transform_project_geo_coverage(project):
    """Set ``project['geo_coverage_type']`` from its countries and action codes.

    The project dict is modified in place; nothing is returned.

    * ``'All'`` among the countries -> ``'global'`` and the country list is emptied.
    * more than one country, or the transnational action code -> ``'transnational'``.
    * exactly one country:
        - national / regional / other action code -> ``'national'``
        - sub-national action code -> ``'sub-national'``
        - ``"Narrative Submission"`` in the title -> ``'national'``
        - country ``'Other'`` with the global action code -> ``'global'`` and
          the country list is emptied
        - otherwise ``None``
    * no countries -> ``None``.
    """
    names = geo_coverage_names
    country_set = set(project['countries'])
    codes = set(project['action_codes'])
    coverage = None
    clear_countries = False

    if 'All' in country_set:
        coverage, clear_countries = 'global', True
    elif len(country_set) > 1 or names['transnational'] in codes:
        coverage = 'transnational'
    elif len(country_set) == 1:
        national_like = {names['national'], names['regional'], names['other']}
        if not national_like.isdisjoint(codes):
            coverage = 'national'
        elif names['sub-national'] in codes:
            coverage = 'sub-national'
        elif "Narrative Submission" in project['title']:
            coverage = 'national'
        elif 'Other' in country_set and names['global'] in codes:
            coverage, clear_countries = 'global', True

    if clear_countries:
        project['countries'] = []
    project['geo_coverage_type'] = coverage

In [None]:
from pprint import pprint

def update_projects():
    """Export the API's projects to ``projects.json`` in ``output_folder``.

    Steps:

    1. Download ``/api/export/projects`` and ``/api/countries`` from ``API_URL``.
    2. Attach an ``image`` URL to every project that has a matching
       ``./images/activity/activity_<uuid>.<ext>`` file (``None`` otherwise).
    3. Copy title, summary and the first URL found in the URL action detail
       onto the project (``url`` is ``None`` when no URL is found).
    4. Translate country names from the API's list to the names in the local
       ``countries.json`` (matched by code). Unknown names are reported and dropped.
    5. Derive ``geo_coverage_type`` with ``transform_project_geo_coverage``.

    ``countries.json`` must already exist, so run ``update_countries`` first.
    """
    project_data = json.loads(requests.get(API_URL + '/api/export/projects').content)
    country_data = json.loads(requests.get(API_URL + '/api/countries').content)

    # Index the available activity images by uuid once, instead of scanning the
    # whole list for every project. setdefault keeps the first file seen per
    # uuid, matching the previous first-match behaviour.
    image_prefix = "./images/activity/activity_"
    image_ext_by_uuid = {}
    for image in glob.glob("{}*".format(image_prefix)):
        root, ext = os.path.splitext(image)
        image_ext_by_uuid.setdefault(root.replace(image_prefix, ''), ext)
    for p in project_data:
        uuid = p.get('uuid')
        if uuid in image_ext_by_uuid:
            p['image'] = "{}/images/activity_{}{}".format(bucketurl, uuid, image_ext_by_uuid[uuid])
        else:
            p['image'] = None

    TITLE_ACTION_CODE = 43374800
    SUMMARY_ACTION_CODE = 43374829
    URL_ACTION_CODE = 43374839

    URL_REGEX = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

    old_names = {c['name']: c['code'] for c in country_data}
    with open(output_folder + 'countries.json', 'r') as f:
        new_names = {c['code']: c['name'] for c in json.load(f)}

    for project in project_data:
        project['url'] = None
        for action_detail in project['action_details']:
            detail_code = action_detail['action_detail_code']
            if detail_code == TITLE_ACTION_CODE:
                title = action_detail['value']
                # Handle one ugly piece of data
                if title.startswith('1.\t'):
                    title = title[3:]
                project['title'] = title
            elif detail_code == SUMMARY_ACTION_CODE:
                project['summary'] = action_detail['value']
            elif detail_code == URL_ACTION_CODE:
                found = re.findall(URL_REGEX, action_detail['value'])
                if found:
                    # The regex has several groups; group 0 of the first match
                    # is the full URL. The scheme is normalised to https.
                    project['url'] = "https://" + found[0][0].replace("https://", "").replace("http://", "")

        # Replace "old" country names with new country names
        countries = []
        for name in project['countries']:
            if name not in old_names:
                print(project)
                print(f"{name} missing in old country list!")
                continue
            code = old_names[name]
            if code is not None:
                new_name = new_names.get(code)
                if new_name is None:
                    print(f"{code} ({name}) not found in new country name list")
                    continue
            else:
                # Entries without a code keep their name.
                new_name = name
            countries.append(new_name)
        project['countries'] = countries

        transform_project_geo_coverage(project)

    # Write next to the other generated files instead of a hard-coded path, so
    # changing output_folder moves this file too.
    with open(output_folder + 'projects.json', 'w') as f:
        json.dump(project_data, f, indent=2, ensure_ascii=False)
    print(f'Updated {f.name}')

In [None]:
# Export the required workbook sheets to CSV, then pick up every "./0*.csv"
# file for update_organisations (which reads the module-level `files`).
data_file = 'flat data-structure_2021-04-13.xlsx'
sheetnames = create_csvs_from_xls(data_file)
files = glob.glob("./0*.csv")
print(files)

# update_countries()  # NOTE: This is slightly slower, because we open the huge geojson
# update_organisations()
# update_tags()
country_group_countries = update_country_groups()
# NOTE: create_csvs_from_xls does not export the policies sheet, so
# 04_policies.csv must already be present in the working directory.
for theme_csv in ('01_resources.csv', '03_events.csv', '04_policies.csv', '05_technologies.csv'):
    generate_theme(theme_csv)
# NOTE: Always call update_projects after update_countries
# update_projects()

In [None]:
#datetime.strptime('04/03/2021', '%d/%m/%Y').strftime("%m/%d/%Y, %H:%M:%S")

#### NOTES
- Do we need to analyze the value and its currency?
- No Country Table
- Country also has **global with elements in specific areas**
- Some of the country values use a different separator
- Some of the geo_coverage values use a different separator
- Some of the tags use a different separator
- The languages separator is a colon, but the URLs also contain colons