In [103]:
import pandas as pd
import numpy as np


In [104]:
# The human freedom index dataset for countries.
hfi = pd.read_csv('../data/human-freedom.csv')

# The quality of life dataset for a subset of cities.
qol = pd.read_csv('../data/quality-of-life.csv')

# Climate data for cities around the world.
climate = pd.read_json('../data/wiki-climate.json')

for df in [hfi, qol, climate]:
    df.columns = df.columns.str.lower().str.replace(' ', '_').str.replace('&', 'and')


### Human freedom index dataset

This dataset contains freedom indices for countries, including human freedom (HF), personal freedom (PF) and economic freedom (EF).


In [105]:
hfi = hfi.rename(columns={'countries': 'country'})
hfi = hfi[hfi['year'] == 2020]
hfi = hfi[['country', 'hf_rank', 'ef_rank', 'pf_rank']]
hfi = hfi.astype({'hf_rank': 'int', 'ef_rank': 'int', 'pf_rank': 'int'})
hfi = hfi.rename(columns={'hf_rank': 'human_freedom',
                 'ef_rank': 'economic_freedom', 'pf_rank': 'personal_freedom'})
# strip whitespace from country names
hfi['country'] = hfi['country'].str.strip()
hfi.head()


Unnamed: 0,country,human_freedom,economic_freedom,personal_freedom
0,Albania,47,26,58
1,Algeria,154,157,138
2,Angola,122,138,114
3,Argentina,74,161,29
4,Armenia,26,11,37


### Quality of life dataset

The QoL dataset contains information about specific cities (which is of interest to our application). However, the range of countries described by these cities is smaller than that of the Human Freedom Index dataset. Only the countries within the QoL dataset should be used within the final processed dataset.


In [106]:
qol = qol.rename(columns={'ua_name': 'city', 'ua_country': 'country'})
qol = qol[['city', 'country', 'housing', 'cost_of_living', 'startups', 'venture_capital',
           'travel_connectivity', 'commute', 'business_freedom', 'safety',
           'healthcare', 'education', 'environmental_quality', 'economy',
           'taxation', 'internet_access', 'leisure_and_culture', 'tolerance',
           'outdoors']]
# replace all numeric columns with ranks
qol[qol.columns[2:]] = qol[qol.columns[2:]].rank(
    axis=0, method='min', ascending=False).astype('int32')
qol['city'] = qol['city'].str.strip()
qol['country'] = qol['country'].str.strip()
qol = qol.round(3)
qol.head()


Unnamed: 0,city,country,housing,cost_of_living,startups,venture_capital,travel_connectivity,commute,business_freedom,safety,healthcare,education,environmental_quality,economy,taxation,internet_access,leisure_and_culture,tolerance,outdoors
0,Aarhus,Denmark,169,225,213,124,114,6,4,6,17,58,78,155,103,14,236,1,176
1,Adelaide,Australia,159,203,196,112,192,104,17,83,81,76,42,86,136,190,208,82,61
2,Albuquerque,United States,106,107,158,177,217,135,69,266,165,147,93,16,151,133,178,144,207
3,Almaty,Kazakhstan,23,18,233,193,72,35,215,123,241,198,215,145,9,232,240,172,64
4,Amsterdam,Netherlands,239,228,27,31,5,12,64,55,83,30,81,147,106,179,16,49,93


### Wikipedia Climate dataset

Climate data is very noisy and sparse. It is consistent only for larger cities. To sanitise, we remove extra characters and accents from city names and extract only the desired columns.


In [107]:


climate = climate.rename(columns={'name': 'city'})
climate = climate.dropna(subset=['city'])
climate['city'] = climate['city'].str.normalize('NFKD').str.encode(
    'ascii', errors='ignore').str.decode('utf-8')
climate['city'] = climate['city'].apply(
    lambda x: x.split(',')[0].split('(')[0])
climate['city'] = climate['city'].str.strip()
climate['country'] = climate['country'].str.strip()
climate = climate.rename(columns={'year_high_c': 'max_temp', 'year_precipitation_mm': 'precipitation',
                         'year_low_c': 'min_temp', 'year_mean_c': 'mean_temp', 'gps_lon': 'lon', 'gps_lat': 'lat', 'city_wd': 'wikidata'})
climate = climate[['city', 'country', 'population', 'wikidata', 'lon',
                   'lat', 'max_temp', 'mean_temp', 'min_temp', 'precipitation']]
climate[['mean_temp', 'min_temp', 'max_temp']] = climate[['mean_temp', 'min_temp', 'max_temp']].replace('', np.nan).astype('float')
climate['mean_temp'] = climate['mean_temp'].fillna(
    (climate['max_temp'] + climate['min_temp']) / 2)
climate['precipitation'] = climate['precipitation'].fillna(0)

climate = climate.sort_values(by='population', ascending=False)
climate = climate.drop_duplicates(subset=['city', 'country'], keep='first')
print(f"Columns: {climate.columns}")
print(f"Shape: {climate.shape}")
climate = climate.sort_values(by='population', ascending=False)
climate.head()

Columns: Index(['city', 'country', 'population', 'wikidata', 'lon', 'lat', 'max_temp',
       'mean_temp', 'min_temp', 'precipitation'],
      dtype='object')
Shape: (9671, 10)


Unnamed: 0,city,country,population,wikidata,lon,lat,max_temp,mean_temp,min_temp,precipitation
0,Delhi,India,26495000,http://www.wikidata.org/entity/Q1353,77.216667,28.666667,31.2,25.1,18.9,790.0
1,Shanghai,China,23390000,http://www.wikidata.org/entity/Q8686,121.466667,31.166667,20.6,17.1,14.1,1166.1
2,Beijing,China,21710000,http://www.wikidata.org/entity/Q956,116.391389,39.905,17.8,12.5,7.2,571.8
3,Lagos,Nigeria,21324000,http://www.wikidata.org/entity/Q8673,3.4,6.45,30.8,26.8,22.8,1506.6
4,Karachi,Pakistan,14910352,http://www.wikidata.org/entity/Q8660,67.01,24.86,31.7,26.0,20.3,0.0


### Creating a combined dataset

We combine all of the sanitised datasets using QoL city name as a primary key for the climate data, and QoL country as a primary key for the human freedom data.


In [108]:
combined = pd.merge(qol, hfi, on='country', how='left')
combined = pd.merge(combined, climate, on=['city', 'country'], how='left')
combined = combined.dropna()
combined.to_csv('../data/preprocessed.csv', index=False)
combined.head()

Unnamed: 0,city,country,housing,cost_of_living,startups,venture_capital,travel_connectivity,commute,business_freedom,safety,...,economic_freedom,personal_freedom,population,wikidata,lon,lat,max_temp,mean_temp,min_temp,precipitation
0,Aarhus,Denmark,169,225,213,124,114,6,4,6,...,5.0,8.0,269022.0,http://www.wikidata.org/entity/Q25319,10.209722,56.156389,11.4,7.7,3.8,725.0
1,Adelaide,Australia,159,203,196,112,192,104,17,83,...,6.0,17.0,1300000.0,http://www.wikidata.org/entity/Q5112,138.583333,-34.933333,22.4,17.35,12.3,0.0
2,Albuquerque,United States,106,107,158,177,217,135,69,266,...,7.0,33.0,559277.0,http://www.wikidata.org/entity/Q34804,-106.616667,35.116667,20.4,13.9,7.5,240.1
3,Almaty,Kazakhstan,23,18,233,193,72,35,215,123,...,49.0,119.0,1703500.0,http://www.wikidata.org/entity/Q35493,76.9,43.25,15.8,10.0,5.0,684.0
4,Amsterdam,Netherlands,239,228,27,31,5,12,64,55,...,17.0,10.0,851573.0,http://www.wikidata.org/entity/Q727,4.9,52.383333,13.8,10.2,6.4,838.2
