# Goal:
1. This notebook uses the WHO's DALY data to re-interpolate the road-injury-induced DALYs
2. Also interpolate the road injury data from the WRS

In [2]:
import pandas as pd
import os
import numpy as np
import gspread
# Folder holding the raw road-injury / DALY workbooks, and folder receiving the transformed tables.
ROOTFOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_raw/road_injury"
EXPORTFOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_road_dalys"
# exist_ok=True creates the folder only when it is missing, without a separate
# (race-prone) existence check beforehand.
os.makedirs(EXPORTFOLDER, exist_ok=True)
# File-name template of the yearly DALY workbooks; formatted with `year`.
DALY_FILE = "DALYs_country_{year}.xlsx"
# INJURY_FILE = ""
# Path to the Google service-account key file used to authorise gspread.
serviceaccount = "../../google_drive_personal.json"
gc = gspread.service_account(filename=serviceaccount)
# Google Sheets document that read_url() opens by key.
GC_URL = "https://docs.google.com/spreadsheets/d/1o5gFmZPUoDwrrbfE6M26uJF3HnEZll02ivnOxP6K6Xw/edit?usp=sharing"


In [3]:
def read_url(url, SHEET_NAME):
    """Read one worksheet of a Google Sheets document into a DataFrame.

    Parameters
    ----------
    url : str
        Spreadsheet URL. The spreadsheet key is taken as the sixth
        "/"-separated component of this string.
    SHEET_NAME : str
        Title of the worksheet to open.

    Returns
    -------
    tuple
        ``(df_spread, worksheet)``: the worksheet records as a DataFrame and
        the gspread worksheet object they were read from.
    """
    sheet_key = url.split("/")[5]
    worksheet = gc.open_by_key(sheet_key).worksheet(SHEET_NAME)
    df_spread = pd.DataFrame(worksheet.get_all_records())
    return df_spread, worksheet

In [4]:
# Country labels to rename when the DALY tables are loaded
# (label found in the workbook -> label used afterwards).
namemapping = {
    "Bolivia (Plurinational State of)": "Bolivia",
}

# Descriptive notes about the data.
# NOTE(review): presumably "1000" means population is reported in thousands - confirm.
metadata = {
    "population": "1000",
    "other variables": "disability-adjusted life year (DALY)",
}

In [43]:
SHEETNAME = "select_city_classifier"
city_meta, other_worksheet = read_url(GC_URL, SHEETNAME)

# Discard rows whose City cell is empty and renumber the remaining rows.
has_city = city_meta["City"] != ""
city_meta = city_meta.loc[has_city].reset_index(drop=True)

# Both US spellings found in the sheet are mapped to a single label;
# every other country name is kept unchanged.
is_us = city_meta["Country"].isin(["USA", "United States"])
city_meta["country_clean"] = city_meta["Country"].mask(is_us, "United States of America")

In [9]:
def get_daly_transform(year):
    """Load one year's DALY workbook and reshape it to one row per country.

    The workbook ``DALY_FILE`` (formatted with ``year``) is read from
    ``ROOTFOLDER``, indexed by its "variables" column and transposed, so that
    every original column becomes a row and every variable becomes a column.

    Parameters
    ----------
    year : int
        Year used both to build the file name and to fill the ``year`` column.

    Returns
    -------
    pandas.DataFrame
        One row per original workbook column, with a ``country`` column
        (renamed through ``namemapping``), ``pop_country`` (renamed from
        "population"), the remaining variables, and ``year``.
    """
    path = os.path.join(ROOTFOLDER, DALY_FILE.format(year=year))
    raw = pd.read_excel(path, header=0).set_index("variables")
    # Some workbooks (at least 2019) carry an extra "year" column, which would
    # turn into a bogus "country" row after transposing. Drop it whenever it
    # is present instead of special-casing a single year.
    raw = raw.drop(columns="year", errors="ignore")
    out = raw.T.reset_index().rename(
        columns={"index": "country", "population": "pop_country"}
    )
    # Rename the labels listed in namemapping; leave all others untouched.
    out["country"] = out["country"].map(lambda name: namemapping.get(name, name))
    out["year"] = year
    print(f"year {year} done")
    return out
# Load 2019 first and 2015 second, then stack them with 2015 on top
# under a fresh 0..n-1 index.
yearly = {yr: get_daly_transform(yr) for yr in (2019, 2015)}
df_2019, df_2015 = yearly[2019], yearly[2015]
df = pd.concat([df_2015, df_2019], ignore_index=True)
df.head()

year 2019 done
year 2015 done


variables,country,pop_country,Diabetes mellitus,Cardiovascular diseases,Road injury,Mental and substance use disorders,year
0,Afghanistan,34414.0,,,,,2015
1,Albania,2891.0,14.0,216.0,26.3,53.9,2015
2,Algeria,39728.0,275.9,1577.9,494.9,877.6,2015
3,Angola,27884.0,175.3,753.4,473.9,495.7,2015
4,Antigua and Barbuda,94.0,1.6,4.4,0.1,2.1,2015


In [51]:
# load population sheet
# pop_sheet = "urban_expansion"
# pop_meta, pop_worksheet = read_url(GC_URL, pop_sheet)
# pop_meta.rename(columns = {"City Name":"city", "Urban Extent Population":"urban_pop"}, inplace = True)

# test = city_meta\
#     .merge(df, left_on = ["country_clean"], right_on = "country", how = "left")\
#     .merge(pop_meta[["city", "urban_pop"]], right_on = ["city"], left_on = ["City"], how = "left")
# test.drop(["city", "country", "country_clean"], axis = 1, inplace = True)
# test = test.fillna("")

In [62]:
# other_worksheet.update(
#     [test.columns.values.tolist()] + test.values.tolist()
# )

{'spreadsheetId': '1o5gFmZPUoDwrrbfE6M26uJF3HnEZll02ivnOxP6K6Xw',
 'updatedRange': 'select_city_classifier!A1:S128',
 'updatedRows': 128,
 'updatedColumns': 19,
 'updatedCells': 2432}

## Reread all above

In [21]:
SHEETNAME = "select_city_classifier"
city_meta, other_worksheet = read_url(GC_URL, SHEETNAME)

# Discard rows whose City cell is empty and renumber the remaining rows.
has_city = city_meta["City"] != ""
city_meta = city_meta.loc[has_city].reset_index(drop=True)

# City name without anything after the first comma (e.g. "Gainesville, FL" -> "Gainesville").
city_meta["city_clean"] = city_meta["City"].map(lambda name: name.partition(",")[0])

# Both US spellings found in the sheet are mapped to a single label;
# every other country name is kept unchanged.
is_us = city_meta["Country"].isin(["USA", "United States"])
city_meta["country_clean"] = city_meta["Country"].mask(is_us, "United States of America")

# Lower-case every column name.
city_meta = city_meta.rename(columns=str.lower)

In [11]:
# update_cols = ['pop_country',
#        'Diabetes mellitus', 'Mental and substance use disorders',
#        'Cardiovascular diseases', 'Road injury']
# city_meta.drop(update_cols, axis = 1)

Unnamed: 0,City,Country,State/Province,center_lat,center_lng,label,num_panoid,GSV Downloaded,Road Injury literature,Road injury data,left,bottom,right,top,urban_pop,pop source,country_clean,county_ls
0,Killeen,United States,Texas,31.117119,-97.727796,57,182310,87621,,,-97.821563,31.031499,-97.651934,31.140409,159172,2022,United States of America,[Bell]
1,Modesto,United States,California,37.639260,-120.997001,76,363968,90716,,,-121.073547,37.611227,-120.906042,37.711337,218069,2022,United States of America,[Stanislaus]
2,"Gainesville, FL",United States,Florida,29.651956,-82.324998,37,411255,105991,,,-82.415603,29.603327,-82.256350,29.767622,145214,2022,United States of America,[Alachua]
3,Miami,USA,Florida,25.761700,-80.191800,73,806659,123133,1,,-80.318595,25.709243,-80.145902,25.855475,449514,2022,United States of America,[Miami-Dade]
4,San Francisco,USA,California,37.774900,-122.419400,104,1346152,129104,1,,-122.514283,37.707959,-122.349796,37.832246,808437,2022,United States of America,[San Francisco]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,Milan,Italy,Lombardy,45.465776,9.187412,74,1161816,4646129,,,9.049037,45.389791,9.276110,45.535152,1352000,2017,Italy,[]
123,Nagoya,Japan,Aichi,35.174745,136.907539,81,1399215,5595654,,,136.792485,35.034771,137.055066,35.259008,2296000,2020,Japan,[]
124,Rio de Janeiro,Brazil,Rio de Janeiro,-22.911502,-43.181365,99,1452099,5806835,,,-43.793416,-23.075966,-43.150970,-22.783303,6748000,2020,Brazil,[]
125,Jakarta,Indonesia,Jakarta,-6.193973,106.822511,48,1675418,6701389,,,106.686105,-6.372598,106.973751,-6.074263,10560000,2020,Indonesia,[]


In [73]:
"""
Current definition of DALYs: DALYs combine the years of potential life lost due to premature
mortality and the years of productive life lost due to disability, expressed per 100 000 population.
To produce DALYs at city level, we first run a regression of DALYs ~ country population, then predict with city population.
!!! The DALYs in this table are still country-level totals and should be divided by population first.
"""


Unnamed: 0,City,Country,center_lat,center_lng,label,num_panoid,GSV Downloaded,Road Injury literature,Road injury data,left,...,Mental and substance use disorders,Cardiovascular diseases,Road injury,urban_pop,pop source,country_clean,diabetes_mellitus_city,mental_and_substance_use_disorders_city,cardiovascular_diseases_city,road_injury_city
0,Buenos Aires,Argentina,-34.599589,-58.380564,18,606938,2365132,,,-58.531449,...,1018.869452,1824.364079,379.257170,10568200,,Argentina,83.398899,240.450551,430.545197,89.503710
1,Sydney,Australia,-33.870453,151.208755,112,3411953,1413083,,,150.264995,...,882.731132,695.293632,79.588929,2871961,,Australia,22.280393,100.589985,79.230893,9.069408
2,Vienna,Austria,48.208166,16.371864,122,1166126,319435,,,16.182084,...,257.412530,467.253286,26.668338,1551706,,Austria,14.936479,44.603972,80.964794,4.621041
3,Saidpur,Bangladesh,25.778031,88.897626,102,4442,12721,https://journals-sagepub-com.eproxy.lib.hku.hk...,,88.851607,...,3231.301718,6247.642763,1732.543632,104935,,Bangladesh,0.704860,2.079638,4.020929,1.115050
4,Rajshahi,Bangladesh,24.374650,88.600367,95,240371,186022,https://journals-sagepub-com.eproxy.lib.hku.hk...,,88.017099,...,3231.301718,6247.642763,1732.543632,26943,,Bangladesh,0.180979,0.533966,1.032409,0.286299
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,New York,USA,40.748459,-73.988593,83,2449048,757884,,,-74.255283,...,15385.702740,16900.440260,2376.142369,16235289,,United States of America,210.922085,759.094192,833.827760,117.233246
123,Los Angeles,USA,34.052235,-118.243683,65,2312065,940535,,,-118.668150,...,15385.702740,16900.440260,2376.142369,12355295,,United States of America,160.514826,577.681905,634.555255,89.216234
124,Houston,USA,29.760400,-95.369800,43,4574777,2261113,,,-95.784863,...,15385.702740,16900.440260,2376.142369,2739736,,United States of America,35.593504,128.098594,140.710026,19.783334
125,Philadelphia,USA,39.952583,-75.165222,91,754285,2997840,,,-75.279793,...,15385.702740,16900.440260,2376.142369,4760536,,United States of America,61.846893,222.582747,244.496237,34.375310


In [22]:
# Cause columns to process, and the matching "<snake_case>_cap" column names.
yls = [
    "Diabetes mellitus",
    "Mental and substance use disorders",
    "Cardiovascular diseases",
    "Road injury",
]
yls_cap = [f"{cause.lower().replace(' ', '_')}_cap" for cause in yls]

In [26]:
# longitudinal data: note that Taiwan only has 2019 available
# Default (inner) join: each city row is repeated once per matching country-year row of df.
city_cols = ["city", "city_clean", "country_clean", "urban_pop"]
long_daly_1519 = pd.merge(
    city_meta[city_cols],
    df,
    left_on="country_clean",
    right_on="country",
)

In [27]:
# Turn each cause column into a rate per 100,000 of pop_country, stored as "<cause>_cap".
# NOTE(review): assumes the cause columns and pop_country use the same unit - confirm.
country_pop = long_daly_1519["pop_country"]
for cause in yls:
    rate_col = f'{cause.lower().replace(" ", "_")}_cap'
    long_daly_1519[rate_col] = long_daly_1519[cause] / country_pop * 100_000

In [31]:
# Keep the identifier columns plus the per-capita rates, ordered by urban population,
# and write the table to the export folder without the index.
export_cols = ["city", "city_clean", "country_clean", "urban_pop", "pop_country", "year", *yls_cap]
y_clean = long_daly_1519[export_cols].sort_values(by="urban_pop")
out_path = os.path.join(EXPORTFOLDER, "t_dalys_no_weight.csv")
y_clean.to_csv(out_path, index=False)