In [869]:
import pandas as pd
import numpy as np


In [870]:
# The human freedom index dataset for countries.
hfi = pd.read_csv('../data/human-freedom.csv')

# The quality of life dataset for a subset of cities.
qol = pd.read_csv('../data/quality-of-life.csv')

# Climate data for cities around the world.
climate = pd.read_json('../data/wiki-climate.json')

# A list of the United States, required to filter qol data that considers each state as a country.
states = pd.read_csv('../data/states.csv')


### Human freedom index dataset

This dataset contains freedom indices for countries, including human freedom (HF), personal freedom (PF) and economic freedom (EF).


In [871]:
hfi = hfi.rename(columns={'countries': 'country'})
hfi = hfi[hfi['year'] == 2020]
hfi = hfi[['country', 'hf_rank', 'ef_rank', 'pf_rank']]
hfi = hfi.astype({'hf_rank': 'int32', 'ef_rank': 'int32', 'pf_rank': 'int32'})
hfi = hfi.rename(columns={'hf_rank': 'human_freedom',
                 'ef_rank': 'economic_freedom', 'pf_rank': 'personal_freedom'})
hfi.head()


Unnamed: 0,country,human_freedom,economic_freedom,personal_freedom
0,Albania,47,26,58
1,Algeria,154,157,138
2,Angola,122,138,114
3,Argentina,74,161,29
4,Armenia,26,11,37


In [872]:
countries = hfi['country'].unique()
print(
    f"This dataset contains {len(countries)} countries, including {np.random.choice(countries, 5)}")


This dataset contains 165 countries, including ['Poland' 'Barbados' 'Israel' 'Tunisia' 'Lesotho']


### Quality of life dataset

The QoL dataset contains information about specific cities (which is of interest to our application). However, the range of countries described by these cities is smaller than that of the Human Freedom Index dataset. Only the countries within the QoL dataset should be used within the final processed dataset.


In [873]:
qol = qol.rename(columns={'UA_Name': 'city', 'UA_Country': 'country'})
qol['country'] = qol['country'].str.strip()
qol['country'] = qol['country'].apply(
    lambda x: 'United States' if x in states['State'].values else x)
qol = qol.drop(columns=['UA_Continent'])
qol = qol.round(3)
qol = qol.iloc[:, 1:]
qol.head()


Unnamed: 0,city,country,Housing,Cost of Living,Startups,Venture Capital,Travel Connectivity,Commute,Business Freedom,Safety,Healthcare,Education,Environmental Quality,Economy,Taxation,Internet Access,Leisure & Culture,Tolerance,Outdoors
0,Aarhus,Denmark,6.132,4.015,2.827,2.512,3.536,6.312,9.94,9.617,8.704,5.366,7.633,4.887,5.068,8.373,3.187,9.739,4.13
1,Adelaide,Australia,6.31,4.692,3.136,2.64,1.777,5.336,9.4,7.926,7.937,5.142,8.331,6.07,4.588,4.341,4.328,7.822,5.531
2,Albuquerque,United States,7.262,6.059,3.772,1.493,1.456,5.056,8.671,1.344,6.43,4.152,7.32,6.514,4.346,5.396,4.89,7.028,3.516
3,Almaty,Kazakhstan,9.282,9.333,2.458,0.0,4.592,5.871,5.568,7.309,4.546,2.283,3.857,5.269,8.522,2.886,2.937,6.54,5.5
4,Amsterdam,Netherlands,3.053,3.824,7.972,6.107,8.324,6.118,8.837,8.504,7.907,6.18,7.597,5.053,4.955,4.523,8.874,8.368,5.307


In [874]:

countries = qol['country'].unique()
countries = np.sort(countries)
print(
    f"This dataset contains {len(countries)} countries, including {np.random.choice(countries, 5)}.")
print(f"This dataset contains {len(qol['city'].unique())} cities.")


This dataset contains 97 countries, including ['China' 'Colombia' 'Ireland' 'India' 'Belarus'].
This dataset contains 264 cities.


### Wikipedia Climate dataset

Climate data is very noisy and sparse. It is consistent only for larger cities. To sanitise, we remove extra characters and accents from city names and extract only the desired columns.


In [875]:
climate = climate.rename(columns={'name': 'city'})
climate = climate.dropna(subset=['city'])
climate['city'] = climate['city'].str.normalize('NFKD').str.encode(
    'ascii', errors='ignore').str.decode('utf-8')
climate['city'] = climate['city'].apply(
    lambda x: x.split(',')[0].split('(')[0])
climate['city'] = climate['city'].str.strip()
climate['country'] = climate['country'].str.strip()
climate = climate[['city', 'country', 'population', 'city_wd', 'gps_lon',
                   'gps_lat', 'year high C', 'year mean C', 'year low C', 'year precipitation mm']]
climate = climate.sort_values(by='population', ascending=False)
climate = climate.drop_duplicates(subset=['city', 'country'], keep='first')
print(f"Columns: {climate.columns}")
print(f"Shape: {climate.shape}")
climate = climate.sort_values(by='population', ascending=False)
climate.head()


Columns: Index(['city', 'country', 'population', 'city_wd', 'gps_lon', 'gps_lat',
       'year high C', 'year mean C', 'year low C', 'year precipitation mm'],
      dtype='object')
Shape: (9672, 10)


Unnamed: 0,city,country,population,city_wd,gps_lon,gps_lat,year high C,year mean C,year low C,year precipitation mm
0,Delhi,India,26495000,http://www.wikidata.org/entity/Q1353,77.216667,28.666667,31.2,25.1,18.9,790.0
1,Shanghai,People's Republic of China,23390000,http://www.wikidata.org/entity/Q8686,121.466667,31.166667,20.6,17.1,14.1,1166.1
2,Beijing,People's Republic of China,21710000,http://www.wikidata.org/entity/Q956,116.391389,39.905,17.8,,7.2,571.8
3,Lagos,Nigeria,21324000,http://www.wikidata.org/entity/Q8673,3.4,6.45,30.8,26.8,22.8,1506.6
4,Karachi,Pakistan,14910352,http://www.wikidata.org/entity/Q8660,67.01,24.86,31.7,26.0,20.3,


### Creating a combined dataset

We combine all of the sanitised datasets using QoL city name as a primary key for the climate data, and QoL country as a primary key for the human freedom data.


In [876]:
combined = pd.merge(qol, climate, on=['city', 'country'], how='inner')
combined.to_csv('../data/combined.csv', index=False)
combined = pd.merge(combined, hfi, on='country', how='inner')
combined.columns = combined.columns.str.lower(
).str.replace(' ', '_').str.replace('&', 'and')
combined.to_csv('../data/preprocessed.csv', index=False)
combined.head()


Unnamed: 0,city,country,housing,cost_of_living,startups,venture_capital,travel_connectivity,commute,business_freedom,safety,...,city_wd,gps_lon,gps_lat,year_high_c,year_mean_c,year_low_c,year_precipitation_mm,human_freedom,economic_freedom,personal_freedom
0,Aarhus,Denmark,6.132,4.015,2.827,2.512,3.536,6.312,9.94,9.617,...,http://www.wikidata.org/entity/Q25319,10.209722,56.156389,11.4,7.7,3.8,725.0,4,5,8
1,Copenhagen,Denmark,4.227,3.426,6.207,4.467,6.3,6.296,9.94,9.152,...,http://www.wikidata.org/entity/Q1748,12.568889,55.676111,12.2,9.2,6.2,522.6,4,5,8
2,Adelaide,Australia,6.31,4.692,3.136,2.64,1.777,5.336,9.4,7.926,...,http://www.wikidata.org/entity/Q5112,138.583333,-34.933333,22.4,,12.3,,11,6,17
3,Brisbane,Australia,5.212,5.177,6.343,3.24,2.434,4.618,9.4,7.904,...,http://www.wikidata.org/entity/Q34932,153.027778,-27.467778,26.5,,16.3,,11,6,17
4,Melbourne,Australia,4.83,4.798,8.204,5.02,2.847,4.872,9.4,7.132,...,http://www.wikidata.org/entity/Q3141,144.961389,-37.820556,20.4,16.0,11.4,,11,6,17
