In [1]:
# Load in packages
import pandas as pd
import os.path as osp
import os
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import re

In [2]:
# Resolve the project folders, which are siblings of the current working directory
currentdir = os.getcwd()
inputdir, moogaldir, outputdir = (
    osp.realpath(osp.join(currentdir, osp.pardir, folder))
    for folder in ('inputData', 'MOOGALdefs', 'outputData')
)

In [3]:
# Read the core input tables from inputdir

# Survey index, keyed by (countryISO3, year)
indexfile = pd.read_csv(inputdir + '/indexfile.csv').set_index(['countryISO3', 'year'])

# Country table (used later for its region_code / country_iso3 columns)
country_name_database = pd.read_csv(inputdir + '/country_regions.csv')

# Demography table, keyed the same way as the survey index
demography = pd.read_csv(inputdir + '/demography.csv').set_index(['countryISO3', 'year'])

# Youth model and its uncertainty table: the same three columns are renamed in both
youth, youth_uncertainty = (
    pd.read_csv(inputdir + csv_name).rename(columns={
        'education': 'education_research',
        'active_recreation': 'active_rec',
        'work_employed': 'work_employment',
    })
    for csv_name in ('/youth_model.csv', '/youth_model_unc.csv')
)

In [4]:
# Read every TUS file flagged with include == 1 in the index file.
# TUS_data maps "<country>_<year>" -> DataFrame with tier1, tier2, hours_per_day
TUS_data = {}

for (iso3, raw_year), meta in indexfile.iterrows():

    survey_year = int(raw_year)

    # Skip surveys that are not flagged for inclusion
    if not (meta['include'] == 1):
        continue

    survey = pd.read_csv(inputdir + '/' + meta['TUS_file'], sep=',')
    TUS_data[iso3 + '_' + str(survey_year)] = survey[['tier1', 'tier2', 'hours_per_day']]

In [5]:
# Rescale each TUS so its activities total 24 h/day. A total at or above 24.01
# (or at or below 23.99) is corrected by spreading the gap over all activities
# in proportion to their hours; totals in between are left untouched.
TUS_dict = {}

for key, survey in TUS_data.items():

    # Keys look like ISO_YYYY: characters 0-2 are the country code, 4 onwards the year
    iso3 = key[0:3]
    survey_year = int(key[4:])

    # Work on a copy restricted to the three columns used downstream
    tus = survey[['tier1', 'tier2', 'hours_per_day']].copy()
    reported = tus['hours_per_day']
    hours_sum = reported.sum()

    if hours_sum >= 24.01:
        surplus = hours_sum - 24.0
        tus['hours_per_day'] = reported - (surplus * reported / hours_sum)
    elif hours_sum <= 23.99:
        shortfall = 24.0 - hours_sum
        tus['hours_per_day'] = reported + (shortfall * reported / hours_sum)

    # Lowercase the tier labels and strip surrounding spaces (needed in every case)
    for tier in ('tier1', 'tier2'):
        tus[tier] = tus[tier].str.lower().str.strip(' ')

    TUS_dict[iso3 + '_' + str(survey_year)] = tus

In [6]:
# Load the MLdef file of every survey and split each "fraction;uncertainty"
# cell into two tables: MLdef (fractions) and MLunc (uncertainties).
# Both dicts are keyed like TUS_dict ("<country>_<year>").
MLdef_dict = {}
MLunc_dict = {}

for k in TUS_dict:

    country = k[0:3]
    year = int(k[4:])

    # Find MLdef file in indexfile
    file = indexfile.loc[country].loc[year]['TUS_MLdef']

    # Take first 30 cols of MLdef (avoids taking empty cols)
    combined_MLdef = pd.read_csv(moogaldir + '/' + file, usecols=np.arange(0, 30))

    # Drop hours_per_day col since it will be replaced directly with TUS data
    combined_MLdef = combined_MLdef.drop('hours_per_day', axis=1)

    # Lowercase all strings and remove spaces on either side to avoid errors when merging
    tier1 = combined_MLdef.iloc[:, 0].str.lower().str.strip(' ')
    tier2 = combined_MLdef.iloc[:, 1].str.lower().str.strip(' ')
    def_cols = {'tier1': tier1, 'tier2': tier2}
    unc_cols = {'tier1': tier1, 'tier2': tier2}

    for col in combined_MLdef.columns[2:]:

        # Split the column on the ; -- 1st item is MLdef frac, 2nd item is MLunc.
        # The .str accessor passes missing cells through as NaN, and a cell without
        # a ';' simply has no 2nd item (NaN) instead of raising.
        parts = combined_MLdef[col].str.split(';')

        # A literal 'nan' fraction counts as 0; everything else is forced to numeric
        # because some items are numbers in string form (thx excel)
        def_cols[col] = pd.to_numeric(parts.str[0].replace('nan', 0), errors='coerce')

        # 'nan' (and anything else non-numeric) becomes NaN through the coercion itself,
        # so no explicit replacement is needed here
        unc_cols[col] = pd.to_numeric(parts.str[1], errors='coerce')

    MLdef = pd.DataFrame(def_cols)
    MLunc = pd.DataFrame(unc_cols)
    MLdef_dict[k] = MLdef
    MLunc_dict[k] = MLunc

In [7]:
# Convert TUS classification to MOOGAL: for each survey, build a table of
# hours_per_day and uncertainty indexed by subcategory
TUS_M24 = {}
# Baseline uncertainty looked up by the survey's tus_quality value in the index file
baseline_uncertainty = {'A':0.05, 'B':0.1, 'C':0.2}

for key, survey in TUS_dict.items():

    iso3 = key[0:3]
    survey_year = int(key[4:])

    # Attach the MLdef fractions to each survey activity.
    # NOTE(review): this is an inner join, so survey rows without a matching
    # tier1/tier2 pair in MLdef are dropped without notice - confirm that is intended.
    with_fracs = survey.merge(MLdef_dict[key], on=['tier1','tier2'])

    # hours (1 x n) times fractions (n x categories) -> hours per category (1 x categories)
    hours_row = with_fracs['hours_per_day'].to_numpy().reshape((1, len(with_fracs)))
    frac_matrix = with_fracs.iloc[:, 3:].to_numpy()
    category_hours = np.dot(hours_row, frac_matrix)

    # The last category is "unknown": share it out over the others in proportion
    # to their hours, then leave it out of the result.
    # NOTE(review): the denominator includes the unknown hours themselves, so the
    # rescaled categories sum to less than known + unknown - confirm this is intended.
    unknown_hours = category_hours[0, -1]
    known_hours = np.delete(category_hours, -1)
    h = unknown_hours * known_hours / (known_hours.sum() + unknown_hours) + known_hours

    # Uncertainty: same join, this time against the MLunc table
    with_unc = survey.merge(MLunc_dict[key], on=['tier1','tier2'])
    survey_meta = indexfile.loc[iso3].loc[survey_year]
    u_base = baseline_uncertainty[survey_meta['tus_quality']]

    unc_fracs = with_unc.iloc[:, 3:]
    # Add the MLunc variance to the baseline variance, each weighted by the squared
    # hours (combined_var = w^2 * sigma^2), and sum over activities ignoring NaN terms
    weight_sq = hours_row.T**2
    variance = np.nansum(weight_sq * unc_fracs**2 + weight_sq * u_base**2, axis=0)
    variance = np.delete(variance, -1)

    # Hours and uncertainty side by side, indexed by subcategory (unknown excluded)
    subcategories = with_unc.columns[3:-1]
    TUS_M24[key] = pd.DataFrame(data=zip(h, variance), index=subcategories,
                                columns=['hours_per_day','uncertainty'])

In [8]:
# Integrate youth model

# First, clean it up
def _index_by_country_age(model):
    """Return `model` indexed by (countryISO3, age), both parsed from its first column.

    Characters 2-4 of each first-column value are taken as the country code and the
    first run of digits as the age (presumably values look like "('ABC', 12)" -
    TODO confirm against the CSV). The 'Unnamed: 0' column is dropped afterwards.

    A frame that already carries the (countryISO3, age) index is returned unchanged,
    so the cell can be re-run safely.
    """
    if list(model.index.names) == ['countryISO3', 'age']:
        return model

    labels = model.iloc[:, 0]
    return (
        model
        .assign(
            countryISO3=labels.str[2:5],
            # Raw string: "\d" in a normal string literal is an invalid escape sequence
            age=labels.apply(lambda x: int(re.findall(r"\d+", x)[0])),
        )
        .drop('Unnamed: 0', axis=1)
        .set_index(['countryISO3', 'age'])
    )


youth = _index_by_country_age(youth)
youth_uncertainty = _index_by_country_age(youth_uncertainty)

In [9]:
# Combine the youth model with each TUS according to the population covered by each.
#
# For a survey whose minimum age is 0 the TUS result is used as is. Otherwise the
# youth-model hours for ages 0 .. min_age-1 are population-weighted together with the
# TUS hours, and the variances are combined with squared population-share weights.
# Results are keyed by country code only, so a later survey of the same country
# replaces an earlier one.

# Relative baseline uncertainty applied to every youth-model estimate
YOUTH_BASELINE_UNCERTAINTY = 0.25

TUS_combined_M24 = {}
TUS_country_year = {}

for k, v in TUS_M24.items():

    country = k[0:3]
    year = int(k[4:])

    survey_meta = indexfile.loc[country].loc[year]
    min_age = survey_meta['TUS_age_min']

    if min_age == 0:
        # Survey already starts at age 0: nothing to add from the youth model
        TUS_combined_M24[country] = v
        TUS_country_year[country] = year
        continue

    # Cast to int so the values are valid positions even if the CSV column was read as float
    min_age = int(min_age)
    max_age = int(survey_meta['TUS_age_max']) + 1

    # Population row for this country-year. Ages are addressed by POSITION (.iloc),
    # which is what plain integer indexing on this string-labelled row did implicitly.
    # NOTE(review): assumes the demography columns are single-year ages starting at 0 - confirm.
    pop_by_age = demography.loc[country].loc[year]
    pop_in_TUS = pop_by_age.iloc[min_age:max_age].sum()
    pop_total = pop_by_age.iloc[:max_age].sum()

    person_hours_TUS = v['hours_per_day'] * pop_in_TUS
    TUS_uncertainty = v['uncertainty'] * ((pop_in_TUS / pop_total)**2)

    youth_hrs_list = []
    youth_unc_list = []

    for age in range(min_age):

        # Row of activities for given country and age; nothing in unknown so drop it
        youth_hours = youth.loc[country].loc[age].drop('unknown')

        # Get population of age, multiply by avg hours
        pop_youth = pop_by_age.iloc[age]
        youth_hrs_list.append(youth_hours * pop_youth)

        # Get uncertainty for age (units already in hrs)
        youth_unc = youth_uncertainty.loc[country].loc[age].drop('unknown')
        # Youth baseline variance
        unc_base = YOUTH_BASELINE_UNCERTAINTY**2 * youth_hours**2
        # Add weighted variances
        youth_unc_list.append((youth_unc**2 + unc_base) * (pop_youth / pop_total)**2)

    # Sum across ages to get youth total person-hours, add to TUS person-hours,
    # then divide by total pop.
    # NOTE(review): the TUS hours are added by position (.values), which assumes the
    # youth columns are in the same order as the TUS subcategories - confirm.
    total_youth_hours = pd.DataFrame(youth_hrs_list).sum()
    combined_hours = (person_hours_TUS.values + total_youth_hours) / pop_total

    # Add variances from TUS and youth together. This sum aligns on labels and may
    # come back in a different order, so line it up with the hours index explicitly
    # before pairing the two columns.
    total_youth_uncertainty = pd.DataFrame(youth_unc_list).sum()
    combined_uncertainty = (TUS_uncertainty + total_youth_uncertainty).reindex(combined_hours.index)

    TUS_combined_M24[country] = pd.DataFrame({'hours_per_day': combined_hours,
                                              'uncertainty': combined_uncertainty})
    TUS_country_year[country] = year

In [10]:
# Split the combined dict into two wide tables: one row per country, one column per subcategory
country_codes = list(TUS_combined_M24)
hours_wide = pd.DataFrame([table['hours_per_day'] for table in TUS_combined_M24.values()],
                          index=country_codes)
uncertainty = pd.DataFrame([table['uncertainty'] for table in TUS_combined_M24.values()],
                           index=country_codes)

# Reshape hours to long form (country, subcategory, hoursPerDay) for interpolation
hours_long = (
    hours_wide
    .reset_index()
    .rename(columns={'index':'countryISO3'})
    .melt(id_vars=['countryISO3'], var_name='subcategory', value_name='hoursPerDay')
)

# Attach each country's region_code; the duplicate key column from the lookup is dropped
region_lookup = country_name_database[['region_code','country_iso3']]
hours = hours_long.merge(region_lookup, how='left', left_on='countryISO3', right_on='country_iso3')
hours = hours.drop('country_iso3', axis=1)

In [11]:
# Interpolate TUS data over regions.
#
# For every (region, subcategory) group, list all countries of the region, keep the
# observed hours where a TUS exists and fill the others with the population-weighted
# regional mean. Missing uncertainties are filled from the regional spread, or from
# the global spread plus an observed uncertainty when the region has 3 or fewer
# observations. Countries without population data are recorded in no_pop_data and dropped.

# Year whose population is used for countries that have no TUS
FALLBACK_POPULATION_YEAR = 2019

hours_region_group = hours.groupby(['region_code','subcategory'])
no_pop_data = []
M24_interpolated = []

# Loop-invariant lookups, computed once instead of on every iteration
countries_with_population = set(demography.index.get_level_values(0))
global_stdev_by_subcategory = hours.groupby('subcategory')['hoursPerDay'].std()

for name, grp in hours_region_group:

    region, subcategory = name

    in_region = country_name_database['region_code'] == region
    countries_in_region = list(country_name_database.loc[in_region, 'country_iso3'])
    df = pd.DataFrame({'countryISO3': countries_in_region}).merge(grp, on='countryISO3', how='left')
    df = df.fillna({'region_code': region, 'subcategory': subcategory})

    populations = []
    pop_with_data = []
    unc = []

    for country in df['countryISO3'].unique():

        # No pop data for protectorates and unrecognized entities. Exclude.
        if country not in countries_with_population:
            no_pop_data.append(country)
            populations.append(np.nan)
            unc.append(np.nan)

        elif country in TUS_country_year:
            year = TUS_country_year[country]
            max_age = int(indexfile.loc[country].loc[year]['TUS_age_max']) + 1
            # Population up to the survey's maximum age (positional slice over the age columns)
            covered_pop = demography.loc[country].loc[year].iloc[:max_age].sum()
            populations.append(covered_pop)
            pop_with_data.append(covered_pop)
            unc.append(uncertainty.loc[country, subcategory])

        else:
            populations.append(demography.loc[country].loc[FALLBACK_POPULATION_YEAR].sum())
            unc.append(np.nan)

    df['population'] = populations
    pop_with_data = np.sum(pop_with_data)

    # Rows with hours (>= 0) are observed; NaN rows are the ones to interpolate
    df['dataStatus'] = np.where(df['hoursPerDay'] >= 0.0, 'observed', 'interpolated')

    # Spread of the observed hours in this region, taken before any filling
    region_stdev = df['hoursPerDay'].std()
    df['uncertainty'] = unc

    if df['dataStatus'].value_counts()['observed'] <= 3:
        # Too few observations for a regional spread: use twice the global stdev,
        # squared, plus the first observed uncertainty in the region
        global_stdev = global_stdev_by_subcategory.loc[subcategory]
        u = df['uncertainty'].dropna().iloc[0]
        df['uncertainty'] = df['uncertainty'].fillna((2*global_stdev)**2 + u)

    else:
        df['uncertainty'] = df['uncertainty'].fillna(region_stdev**2)

    # Population-weighted mean of the observed hours fills the gaps
    region_mean = (df['hoursPerDay'] * df['population']).sum() / pop_with_data
    df['hoursPerDay'] = df['hoursPerDay'].fillna(region_mean)

    # Drop rows that still contain NaN (e.g. countries without population data)
    df = df.dropna(axis=0)

    M24_interpolated.append(df)

In [12]:
# Stack the per-group tables into one frame and fix the output column order
output_columns = ['countryISO3','region_code','population',
                  'subcategory','hoursPerDay','uncertainty','dataStatus']
M24_interpolated = pd.concat(M24_interpolated).loc[:, output_columns]

In [13]:
# Write the result to outputdir, stamping the file name with today's date (YYMMDD)
current_date = f"{datetime.today():%y%m%d}"
M24_interpolated.to_csv(f"{outputdir}/M24_TUS_and_youth_{current_date}.csv", index=False)