In [1]:
import os
from os import listdir
from zipfile import ZipFile

import pandas as pd


# Process the downloaded data
## read and merge zipped files

In [2]:
def find_zip_filenames( path_to_dir, suffix=".zip" ):
    """Return the names of the entries in ``path_to_dir`` that end with ``suffix``.

    Despite the name, any suffix can be requested (the notebook also uses it
    for ``.parquet`` files).

    Parameters
    ----------
    path_to_dir : str
        Directory to list (not searched recursively).
    suffix : str, default ".zip"
        Only names ending with this string are returned.

    Returns
    -------
    list of str
        Bare file names (no directory prefix), sorted alphabetically.
        ``os.listdir`` returns entries in arbitrary order, so sorting keeps the
        downstream merge order reproducible between runs and machines.
    """
    return sorted(name for name in listdir(path_to_dir) if name.endswith(suffix))

In [3]:
def change_df_columnnames(df): # change columnnames if download data with id and names
    """Normalise the column names of a download that has both id and name columns.

    Columns whose last ``_``-separated token is ``name`` are renamed to their
    first token (e.g. ``measure_name`` -> ``measure``); afterwards every column
    whose last token is ``id`` is dropped. All other columns are kept as is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string column labels. It is NOT modified: the previous
        implementation overwrote ``df.columns`` on the caller's object.

    Returns
    -------
    pandas.DataFrame
        New frame with the renamed columns and without the id columns.
    """
    def _display_name(col):
        tokens = col.split("_")
        return tokens[0] if tokens[-1] == "name" else col

    # rename() returns a new frame, so the caller's column labels stay untouched.
    renamed = df.rename(columns=_display_name)
    id_columns = [col for col in renamed.columns if col.split("_")[-1] == "id"]
    return renamed.drop(columns=id_columns)

In [4]:
!ls ../../bigdata/level3_diabetes_zipped_data

IHME-GBD_2019_DATA-0862adbd-1.zip IHME-GBD_2019_DATA-96f16e6b-1.zip
IHME-GBD_2019_DATA-178ef5e8-1.zip IHME-GBD_2019_DATA-dca0b348-1.zip
IHME-GBD_2019_DATA-1844a1c6-1.zip IHME-GBD_2019_DATA-f11c19de-1.zip
IHME-GBD_2019_DATA-5d5cdf1d-1.zip IHME-GBD_2019_DATA-f2ec3a9c-1.zip
IHME-GBD_2019_DATA-74788d53-1.zip javascript.level3
IHME-GBD_2019_DATA-8a8e36bd-1.zip


In [5]:
# file_dir = './Level Three Zipped Data/'
file_dir = '../../bigdata/level3_diabetes_zipped_data/'
# Sorted so the archives are merged in the same order on every run.
files = sorted(find_zip_filenames(file_dir))
if not files:
    # pd.concat([]) would otherwise fail below with a far less helpful message.
    raise FileNotFoundError('No .zip archives found in %s' % file_dir)

# Archives are merged in chunks of CHUNK_SIZE; each chunk is written as one parquet piece.
CHUNK_SIZE = 100
# Number of pieces, derived from the file count so that no archive is silently
# skipped when the folder holds more than CHUNK_SIZE files.
MAXSIZE = (len(files) + CHUNK_SIZE - 1) // CHUNK_SIZE
for i in range(MAXSIZE):
    print(i)
    pieces = []
    for file in files[i * CHUNK_SIZE:(i + 1) * CHUNK_SIZE]:
        # Context managers close the archive and the member handle even if parsing fails.
        with ZipFile(file_dir + file) as zip_file:
            # The first .csv member of the archive is the data table.
            csv_file = [text_file.filename for text_file in zip_file.infolist() if text_file.filename.endswith('.csv')][0]
            with zip_file.open(csv_file) as csv_handle:
                df = pd.read_csv(csv_handle)
        print(len(df))
        df = change_df_columnnames(df)
        pieces.append(df)
    df = pd.concat(pieces)
    df.to_parquet('../../bigdata/piece%s.parquet'%i)
    # Free the merged chunk before the next one is built.
    del df, pieces

0
117504
117504
117504
117504
117504
117504
117504
117504
117504
117504


In [6]:
# Alternative selection with five-year groups all the way up to '95 plus' (disabled):
# ages_to_use = ['Under 5', '5 to 9', '10 to 14', '15 to 19', '20 to 24',
#        '25 to 29', '30 to 34', '35 to 39', '40 to 44', '45 to 49',
#        '50 to 54', '55 to 59', '60 to 64', '65 to 69', '70 to 74',
#        '75 to 79', '80 to 84', '85 to 89', '90 to 94',
#        '95 plus']

# Age groups kept by the filtering cell: 'Under 5', the five-year groups
# '5 to 9' ... '70 to 74', and the open-ended '75 plus' group.
ages_to_use = (
    ['Under 5']
    + ['%d to %d' % (start, start + 4) for start in range(5, 75, 5)]
    + ['75 plus']
)

# get level 3 data 

In [7]:
# Output folder for every file this notebook writes.
data_folder = '../../bigdata/data_diabetes/'
# exist_ok=True replaces the separate exists() check and keeps the cell safe to
# re-run. This needs the `os` module itself, not only `listdir`, to be imported.
os.makedirs(data_folder, exist_ok=True)

In [8]:
# get cause
causes_to_use = ["Diabetes mellitus"]

file_dir = '../../bigdata/'
files = find_zip_filenames(file_dir, suffix='.parquet')
pieces = []
for file in files:
    print(file)
    df = pd.read_parquet(file_dir+file)
    # df = df[df['cause'].isin(causes_to_use)]
    df = df[df['age'].isin(ages_to_use)]
    df = df[df['year']>=2010]
    pieces.append(df)
df = pd.concat(pieces)

piece0.parquet


In [9]:
# Sanity check: column labels and total row count of the combined, filtered frame.
print(df.columns, len(df))

Index(['measure', 'location', 'sex', 'age', 'cause', 'metric', 'year', 'val',
       'upper', 'lower'],
      dtype='object') 1175040


## convert IHME country names to WB codes

In [10]:
# Add a 'Country Code' column by looking up each IHME location name in the
# country table (its `country` column -> its `Country Code` column).
countries = pd.read_csv('../../data/dl1_countrycodeorg_country_name.csv')
code_map = dict(zip(countries.country, countries['Country Code']))
# df = df[df['location'].isin(countries.country)]
# df['location'] = df['location'].apply(lambda x:country_map[x])

# Fail early and name every location without a code, instead of stopping at the
# first unknown name with a bare KeyError from a row-wise lookup.
unmapped = df.loc[~df['location'].isin(list(code_map)), 'location'].unique().tolist()
if unmapped:
    raise KeyError('No country code for location(s): %s' % unmapped)
# Vectorised lookup; every location is known to be a key at this point.
df['Country Code'] = df['location'].map(code_map)
df.to_csv(data_folder+'IHME_p_details.csv',index=False)

# RUN MAIN FUNCTION FROM HERE 
## process the raw data

In [11]:
## MAIN FUNCTION - input
## OUTPUT FILE
data_folder = '../../bigdata/data_diabetes/'

## INPUT FILE
df_input = pd.read_csv(data_folder+'IHME_p_details.csv')

pop_data = pd.read_csv('../../data/population_gbd.csv')
countries_pri = pd.read_csv('../../data/dl1_countrycodeorg_country_name.csv')


# # map of ages to dx+
# '65 to 69', '70 to 74' and '75 plus' all map to 'd65'; the groupby below
# therefore sums these three groups into a single 65+ bucket.
ages = {'Under 5': 'd0', '5 to 9': 'd5', '10 to 14': 'd10', '15 to 19': 'd15', 
        '20 to 24': 'd20', '25 to 29': 'd25', '30 to 34': 'd30', '35 to 39': 'd35', 
        '40 to 44': 'd40', '45 to 49': 'd45', '50 to 54': 'd50', '55 to 59': 'd55', 
        '60 to 64': 'd60', '65 to 69': 'd65', '70 to 74': 'd65', '75 plus': 'd65'}

# ages = {    'Under 5': 'd0',   '5 to 9': 'd05', '10 to 14': 'd10', '15 to 19': 'd15'
#         , '20 to 24': 'd20', '25 to 29': 'd25', '30 to 34': 'd30', '35 to 39': 'd35'
#         , '40 to 44': 'd40', '45 to 49': 'd45', '50 to 54': 'd50', '55 to 59': 'd55'
#         , '60 to 64': 'd60', '65 to 69': 'd65', '70 to 74': 'd70', '75 to 79': 'd75'
#         , '80 to 84': 'd80', '85 to 89': 'd85', '90 to 94': 'd90', '95 plus': 'd95' }
## Process 
print(len(df_input))
# Keep absolute numbers only and drop the aggregate 'All Ages' / 'Both' rows.
keep = (
    (df_input['metric'] == 'Number')
    & (df_input['age'] != 'All Ages')
    & (df_input['sex'] != 'Both')
)
# .copy() makes `df` an independent frame. The previous
# `df['sex'].replace(..., inplace=True)` acted on a column of a filtered slice,
# which raises SettingWithCopyWarning and leaves `df` unchanged when pandas
# Copy-on-Write is active; assigning the result back is reliable in both modes.
df = df_input.loc[keep].copy()
df['sex'] = df['sex'].replace({'Female': 'F', 'Male': 'M'})
# Dictionary lookup on purpose: an unexpected age label raises KeyError rather
# than being dropped silently.
df['age'] = df['age'].apply(lambda x: ages[x])
df = df.rename(columns={"cause": "disease"})
print(len(df)) ##
print('Please calculate the numbers and verify!')
# Sum val/upper/lower over rows that share all key columns (this merges the
# age groups that were mapped to the same code above).
gb = df.groupby(['measure', 'location', 'sex', 'age', 'disease', 'metric', 'year', 'Country Code']).sum()
print(len(gb))
df_numbers = gb.reset_index()

1175040


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


391680
Please calculate the numbers and verify!
342720


In [12]:
# Turn the summed case numbers into rates per 100,000 population, separately
# for each measure and for each of the three estimate columns.
pop = pop_data.set_index(['Country Code', 'sex', 'age'])
pieces = []
for measure in df_numbers['measure'].unique():
    measure_rows = df_numbers[df_numbers['measure'] == measure]
    rates = []
    for scen in ['val', 'upper', 'lower']:
        # Wide table: one row per (Country Code, sex, age), one column per year.
        dff = pd.pivot_table(measure_rows, columns=['year'], index=['Country Code', 'sex', 'age'], values=scen)
        # NOTE(review): this division is purely positional (.values on both sides),
        # so it assumes the population table has the same rows and year columns in
        # the same order as the pivot - TODO confirm against population_gbd.csv.
        rate = pd.DataFrame(dff.values / pop.values * 100000, index=dff.index, columns=dff.columns)
        # Back to long format with a single column named after the estimate.
        rate = rate.stack().to_frame(name=scen)
        rates.append(rate)
    piece = pd.concat(rates, axis=1).reset_index()
    piece['measure'] = measure
    piece['disease'] = 'Diabetes mellitus'
    pieces.append(piece)
df_rate_adjust = pd.concat(pieces)

In [13]:
# df = df_numbers
df = df_rate_adjust
diseases = sorted(df['disease'].unique())

# Lookups between country names and country codes, in both directions.
countrymap = dict(zip(countries_pri.country, countries_pri['Country Code'])) 
codemap = dict(zip(countries_pri['Country Code'], countries_pri.country)) 
countries = df['Country Code'].unique()

# this the latest year of data available, should be updated if it changes.
latest_year_available = max(df["year"]) # 2019
print (latest_year_available)

print(df.columns)
print(df['measure'].unique())

# Number of distinct values in each key column.
n_unique = {
    column: len(df[column].unique())
    for column in ['measure', 'Country Code', 'disease', 'age', 'year', 'sex']
}
print(n_unique['measure'])
print(len(countries))
print(n_unique['Country Code'])
print(n_unique['disease'])
print(n_unique['age'])
print(n_unique['year'])

# l is the actual row count, m the size of the full key grid; the two are
# equal when every key combination is present exactly once.
l = len(df)
m = 1
for count in n_unique.values():
    m *= count
print(l, m)


2019
Index(['Country Code', 'sex', 'age', 'year', 'val', 'upper', 'lower',
       'measure', 'disease'],
      dtype='object')
['DALYs (Disability-Adjusted Life Years)' 'Deaths' 'Incidence'
 'Prevalence' 'YLDs (Years Lived with Disability)'
 'YLLs (Years of Life Lost)']
6
204
204
1
14
10
342720 342720


In [14]:
# Column-wise maximum and minimum, shown as a quick look at the range of the computed rates.
df_rate_adjust.max(), df_rate_adjust.min()

(Country Code                          ZWE
 sex                                     M
 age                                   d65
 year                                 2019
 val                          80273.474243
 upper                        83805.556761
 lower                        77033.637116
 measure         YLLs (Years of Life Lost)
 disease                 Diabetes mellitus
 dtype: object,
 Country Code                                       AFG
 sex                                                  F
 age                                                 d0
 year                                              2010
 val                                            0.00657
 upper                                         0.008819
 lower                                         0.000017
 measure         DALYs (Disability-Adjusted Life Years)
 disease                              Diabetes mellitus
 dtype: object)

# Get mortality and the other measures, projected to 2050

In [15]:
def get_IHME(df, scen=None, base_year=2010, last_year=2019, end_year=2050,
             min_ratio=0.87, max_ratio=1.13):
    """Pivot the rates to one column per year and extrapolate them to ``end_year``.

    The ratio ``last_year / base_year`` is clipped to ``[min_ratio, max_ratio]``
    and converted to a constant yearly growth factor (its
    ``(last_year - base_year)``-th root). Each year after ``last_year`` is the
    previous year multiplied by that factor. Finally everything is divided by
    100000.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with the columns ``measure``, ``disease``,
        ``Country Code``, ``sex``, ``age``, ``year`` and the column named by
        ``scen``.
    scen : str, optional
        Name of the value column to pivot (``'val'``, ``'lower'`` or ``'upper'``
        in this notebook). When omitted, the notebook-level variable ``scen`` is
        used, which keeps existing ``get_IHME(df)`` calls working.
    base_year, last_year : int
        Years whose ratio defines the growth factor (defaults 2010 and 2019).
    end_year : int
        Last projected year, inclusive (default 2050).
    min_ratio, max_ratio : float
        Bounds applied to the ``last_year / base_year`` ratio *before* the root
        is taken, i.e. they limit the total change over the whole period.

    Returns
    -------
    pandas.DataFrame
        Indexed by (measure, disease, Country Code, sex, age), one column per
        year from the input plus ``last_year + 1`` ... ``end_year``.

    Raises
    ------
    NameError
        If ``scen`` is not passed and no notebook-level ``scen`` exists.
    """
    if scen is None:
        # Backward compatibility: older cells call get_IHME(df) inside a
        # `for scen in ...` loop and rely on that loop variable.
        try:
            scen = globals()['scen']
        except KeyError:
            raise NameError("pass scen=... or define a notebook-level 'scen'") from None

    dff = pd.pivot_table(df, columns=['year'], index=['measure', 'disease', 'Country Code', 'sex', 'age'], values=scen)
    # Total change over the observed period, limited to the allowed band.
    exp = (dff[last_year] / dff[base_year]).clip(lower=min_ratio, upper=max_ratio)
    # Constant yearly factor that reproduces the (clipped) total change.
    exp = exp ** (1.0 / (last_year - base_year))
    print('The max and min rate is', exp.max(), exp.min())
    for year in range(last_year + 1, end_year + 1):
        dff[year] = dff[year - 1] * exp
    # dff = dff.fillna(0)
    dff = dff / 100000
    return dff

def get_index(dff):
    """Return the full cartesian MultiIndex of the key values found in ``dff``.

    ``dff`` must expose ``measure``, ``disease``, ``Country Code``, ``sex`` and
    ``age`` after ``reset_index()``. The result contains every combination of
    the distinct values of these five keys (in order of first appearance), with
    the levels named after the keys.
    """
    level_names = ['measure', 'disease', 'Country Code', 'sex', 'age']
    flat = dff.reset_index()
    level_values = [flat[name].unique() for name in level_names]
    return pd.MultiIndex.from_product(level_values, names=level_names)
    
def save_IHME(dff, scen=None, out_dir=None):
    """Write one CSV per measure, plus a morbidity ratio file, for one scenario.

    Files written to ``out_dir`` (``<name>_<scen>.csv``): incidence, prevalence,
    mortality (from 'Deaths'), DALY, YLL, YLD and morbidity, where morbidity is
    YLD / YLL with missing values replaced by 0.

    Parameters
    ----------
    dff : pandas.DataFrame
        Table whose first index level holds the measure names used below.
    scen : str, optional
        Scenario label used in the file names. Defaults to the notebook-level
        variable ``scen`` so existing ``save_IHME(dff)`` calls keep working.
    out_dir : str, optional
        Target folder. Defaults to the notebook-level ``data_folder``.

    Raises
    ------
    NameError
        If an omitted argument has no notebook-level fallback.
    KeyError
        If one of the expected measures is missing from ``dff``.
    """
    if scen is None:
        try:
            scen = globals()['scen']
        except KeyError:
            raise NameError("pass scen=... or define a notebook-level 'scen'") from None
    if out_dir is None:
        try:
            out_dir = globals()['data_folder']
        except KeyError:
            raise NameError("pass out_dir=... or define a notebook-level 'data_folder'") from None

    def _write(frame, stem):
        # Same path layout as before: <out_dir>/<stem>_<scen>.csv
        frame.reset_index().to_csv('%s/%s_%s.csv' % (out_dir, stem, scen), index=False)

    _write(dff.loc['Incidence'], 'incidence')
    _write(dff.loc['Prevalence'], 'prevalence')
    _write(dff.loc['Deaths'], 'mortality')
    _write(dff.loc['DALYs (Disability-Adjusted Life Years)'], 'DALY')
    YLL = dff.loc['YLLs (Years of Life Lost)']
    YLD = dff.loc['YLDs (Years Lived with Disability)']
    _write(YLL, 'YLL')
    _write(YLD, 'YLD')
    # fillna(0) covers missing values (including 0/0); a non-zero YLD over a zero
    # YLL is infinite rather than missing, so it is written unchanged.
    MORBIDITY = (YLD / YLL).fillna(0)
    _write(MORBIDITY, 'morbidity')


In [16]:
# Project and export every estimate column in turn.
# The loop variable must be named `scen`: get_IHME() and save_IHME() are called
# without a scenario argument, so the scenario is supplied through this name.
for scen in ['val', 'lower', 'upper']:
    print (scen)
    dff = get_IHME(df_rate_adjust)
    # Optional step (disabled): reindex to the full key grid and fill gaps with 0.
    # full_index = get_index(dff)
    # dff_full = dff.reindex(full_index, fill_value=0)
    dff_full = dff
    print('The counts of data:',len(dff_full))
    save_IHME(dff_full)

val
The max and min rate is 1.0136723603897777 0.9846455374452312
The counts of data: 34272
lower
The max and min rate is 1.0136723603897777 0.9846455374452312
The counts of data: 34272
upper
The max and min rate is 1.0136723603897777 0.9846455374452312
The counts of data: 34272
