In [24]:
import pandas as pd 

# Load the two published versions of the cost-of-living data set.
# v1 uses its first CSV column as the row index; v2 is read with a default RangeIndex.
v1 = pd.read_csv(
    filepath_or_buffer="data/unclean/cost-of-living.csv",
    index_col=0,
)
v2 = pd.read_csv(filepath_or_buffer="data/unclean/cost-of-living_v2.csv")

In [25]:
# Exploring differences between dataset versions.
# With no `on=` given, the outer merge joins on every column the two frames share,
# so a row is tagged "both" only when it matches in all of those columns.
outer = v1.merge(v2, how="outer", indicator=True)

# Rows per merge category (non-null `city` values only).
outer.groupby("_merge")["city"].count()


_merge
left_only     4030
right_only    4112
both           844
Name: city, dtype: int64

In [26]:
# At least some of these differences are due to different values across the
# quantitative columns: the same city can appear once per version with different values.
outer.sort_values(by="city").iloc[:2]

Unnamed: 0,city,country,x1,x2,x3,x4,x5,x6,x7,x8,...,x48,x49,x50,x51,x52,x53,x54,x55,data_quality,_merge
6833,'s-Hertogenbosch,Netherlands,18.97,63.22,9.11,4.35,3.16,3.06,2.83,2.29,...,964.18,753.43,1830.88,1448.9,4531.1,3301.74,2669.49,2.27,1,right_only
2147,'s-Hertogenbosch,Netherlands,18.7,62.32,8.98,4.28,3.12,3.02,2.79,2.26,...,950.41,742.67,1804.74,1428.21,4466.4,3254.58,2631.36,2.27,1,left_only


In [27]:
# Some of these differences are from cities being included in one version but not the other.
#
# A place is identified by (city, country), not by city name alone: matching on the
# name only would treat a v1 city as "present in v2" whenever a same-named city in a
# different country exists there. The keys are de-duplicated before merging so that
# repeated names do not trigger a many-to-many join that multiplies rows.
city_key = ["city", "country"]
city_diff = pd.merge(
    v1[city_key].drop_duplicates(),
    v2[city_key].drop_duplicates(),
    how="outer",
    on=city_key,
    indicator=True,
)
city_diff = city_diff[city_diff["_merge"] != "both"]

# (city, country) pairs present in v1 but not v2
dropped_keys = city_diff.loc[city_diff["_merge"] == "left_only", city_key]
dropped_cities = list(dropped_keys["city"])  # city names only, kept for backward compatibility

# Row-wise membership test on the composite key; keeps v1's original index and row order.
v1_only = v1.loc[
    pd.MultiIndex.from_frame(v1[city_key]).isin(pd.MultiIndex.from_frame(dropped_keys))
]

In [28]:
# The below file is from text copy-pasted from the kaggle page's description section:
#   https://www.kaggle.com/datasets/mvieira101/global-cost-of-living
# It is tab-separated; the escape "\t" is used instead of a literal (invisible) tab
# character so that an editor converting tabs to spaces cannot silently break parsing.
column_descriptions = pd.read_csv("data/column_descriptions.csv", sep="\t")

# Drop descriptions of columns whose names we don't want to change.
# NOTE(review): rows 1-55 are presumably the descriptions of x1..x55 -- confirm against the file.
column_descriptions = column_descriptions.iloc[1:56]

# Maps original column name -> descriptive name. The file's first line is used as the
# header, which is why the two columns are labelled 'city' and 'Name of the city'.
col_key = dict(zip(column_descriptions['city'], column_descriptions['Name of the city']))

# Combine the v1-only rows with all of v2, then apply the descriptive column names.
# Built in a single expression (no in-place mutation) so the cell is safe to re-run.
cost_living = pd.merge(v1_only, v2, how="outer").rename(columns=col_key)

cost_living.head(1)
# The original dummy columns were entirely uninformative, but now many of these are too long. Alas.

Unnamed: 0,city,country,"Meal, Inexpensive Restaurant (USD)","Meal for 2 People, Mid-range Restaurant, Three-course (USD)",McMeal at McDonalds (or Equivalent Combo Meal) (USD),"Domestic Beer (0.5 liter draught, in restaurants) (USD)","Imported Beer (0.33 liter bottle, in restaurants) (USD)","Cappuccino (regular, in restaurants) (USD)","Coke/Pepsi (0.33 liter bottle, in restaurants) (USD)","Water (0.33 liter bottle, in restaurants) (USD)",...,1 Pair of Men Leather Business Shoes (USD),Apartment (1 bedroom) in City Centre (USD),Apartment (1 bedroom) Outside of Centre (USD),Apartment (3 bedrooms) in City Centre (USD),Apartment (3 bedrooms) Outside of Centre (USD),Price per Square Meter to Buy Apartment in City Centre (USD),Price per Square Meter to Buy Apartment Outside of Centre (USD),Average Monthly Net Salary (After Tax) (USD),"Mortgage Interest Rate in Percentages (%), Yearly, for 20 Years Fixed-Rate",data_quality
0,Guigang,China,2.3,20.95,4.4,,2.79,2.1,0.49,0.3,...,130.37,,,,,,,,4.18,0


#### Cleaning the Data Set

In [30]:
# Duplicate check: shows cost_living with fully duplicated rows removed (first occurrence
# kept). The result is only displayed, not assigned, so cost_living itself is unchanged.
# Author's finding: no duplicates...beautiful
cost_living.drop_duplicates(subset=None, keep="first")


Unnamed: 0,city,country,"Meal, Inexpensive Restaurant (USD)","Meal for 2 People, Mid-range Restaurant, Three-course (USD)",McMeal at McDonalds (or Equivalent Combo Meal) (USD),"Domestic Beer (0.5 liter draught, in restaurants) (USD)","Imported Beer (0.33 liter bottle, in restaurants) (USD)","Cappuccino (regular, in restaurants) (USD)","Coke/Pepsi (0.33 liter bottle, in restaurants) (USD)","Water (0.33 liter bottle, in restaurants) (USD)",...,1 Pair of Men Leather Business Shoes (USD),Apartment (1 bedroom) in City Centre (USD),Apartment (1 bedroom) Outside of Centre (USD),Apartment (3 bedrooms) in City Centre (USD),Apartment (3 bedrooms) Outside of Centre (USD),Price per Square Meter to Buy Apartment in City Centre (USD),Price per Square Meter to Buy Apartment Outside of Centre (USD),Average Monthly Net Salary (After Tax) (USD),"Mortgage Interest Rate in Percentages (%), Yearly, for 20 Years Fixed-Rate",data_quality
0,Guigang,China,2.30,20.95,4.40,,2.79,2.10,0.49,0.30,...,130.37,,,,,,,,4.18,0
1,Sidi Bouzid,Tunisia,1.85,16.22,2.78,1.39,1.31,0.69,0.50,0.20,...,59.74,154.49,154.49,370.77,370.77,,,253.36,8.20,0
2,Zaria,Nigeria,1.24,5.52,5.63,1.13,1.58,2.14,0.23,0.25,...,74.35,112.65,225.29,337.94,901.16,,,157.70,17.67,0
3,Paulista,Brazil,4.10,18.63,5.59,1.12,1.86,1.18,0.93,0.37,...,40.90,108.08,93.17,195.65,121.12,,,195.65,9.25,0
4,Houma,United States,10.00,,10.00,,,5.00,,,...,89.50,950.00,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4965,Peterborough,Australia,,,,,,,,,...,,,,,,,,,,0
4966,Georgetown,Australia,,,,,,,,,...,,,,,,,,,,0
4967,Ixtapa Zihuatanejo,Mexico,5.16,30.94,12.89,0.98,,1.80,0.62,0.41,...,103.14,412.55,257.84,515.69,412.55,,,,,0
4968,Iqaluit,Canada,29.65,74.27,13.71,6.67,8.89,3.71,3.52,4.08,...,,,,2964.60,2964.60,,,,6.53,0


In [31]:
# Number of missing values per column, most incomplete columns first.
cost_living.isna().sum().sort_values(ascending=False)

Tennis Court Rent (1 Hour on Weekend) (USD)                                       2410
Price per Square Meter to Buy Apartment Outside of Centre (USD)                   2315
Price per Square Meter to Buy Apartment in City Centre (USD)                      2237
Monthly Pass (Regular Price) (USD)                                                2177
International Primary School, Yearly for 1 Child (USD)                            1687
Apartment (3 bedrooms) Outside of Centre (USD)                                    1519
One-way Ticket (Local Transport) (USD)                                            1514
Apartment (3 bedrooms) in City Centre (USD)                                       1488
Average Monthly Net Salary (After Tax) (USD)                                      1439
Apartment (1 bedroom) Outside of Centre (USD)                                     1436
Taxi 1hour Waiting (Normal Tariff) (USD)                                          1401
Fitness Club, Monthly Fee for 1 Adult (USD)

In [32]:

# Three views of cost_living, each keeping only rows with no missing values in a
# particular group of apartment-related columns.
sq_meter_cols = [
    'Price per Square Meter to Buy Apartment in City Centre (USD)',
    'Price per Square Meter to Buy Apartment Outside of Centre (USD)',
]
rent_cols = cost_living.columns[49:53]            # positional slice: relies on the current column order
all_apartment_cols = cost_living.columns[49:55]   # positional slice: relies on the current column order

# rows that have price/sqmeter data
apartments1 = cost_living.dropna(subset=sq_meter_cols)
# rows that have data for apartment rent by location and bedrooms
apartments2 = cost_living.dropna(subset=rent_cols)
# rows that have data in all apartment-related columns
apartments3 = cost_living.dropna(subset=all_apartment_cols)

# all_apartment_columns = list(cost_living.columns[cost_living.columns.str.contains("[aA]partment")]) --- all the apartment-related columns, including price/sqmeter.
### replaced with slicing above for visual clarity

In [43]:
# Display the rows that have data in every apartment-related column.
apartments3

Unnamed: 0,city,country,"Meal, Inexpensive Restaurant (USD)","Meal for 2 People, Mid-range Restaurant, Three-course (USD)",McMeal at McDonalds (or Equivalent Combo Meal) (USD),"Domestic Beer (0.5 liter draught, in restaurants) (USD)","Imported Beer (0.33 liter bottle, in restaurants) (USD)","Cappuccino (regular, in restaurants) (USD)","Coke/Pepsi (0.33 liter bottle, in restaurants) (USD)","Water (0.33 liter bottle, in restaurants) (USD)",...,1 Pair of Men Leather Business Shoes (USD),Apartment (1 bedroom) in City Centre (USD),Apartment (1 bedroom) Outside of Centre (USD),Apartment (3 bedrooms) in City Centre (USD),Apartment (3 bedrooms) Outside of Centre (USD),Price per Square Meter to Buy Apartment in City Centre (USD),Price per Square Meter to Buy Apartment Outside of Centre (USD),Average Monthly Net Salary (After Tax) (USD),"Mortgage Interest Rate in Percentages (%), Yearly, for 20 Years Fixed-Rate",data_quality
10,Smederevska Palanka,Serbia,,,3.54,,,,1.06,0.88,...,35.40,106.19,88.49,132.73,106.19,309.71,265.47,,,0
13,Szentendre,Hungary,5.94,25.57,4.32,1.65,1.48,1.45,1.05,0.86,...,85.23,485.82,375.02,852.32,639.24,2556.95,2173.41,895.12,7.42,0
14,Seoul,South Korea,7.68,53.78,6.15,3.07,4.99,3.93,1.48,0.79,...,110.36,742.54,557.52,2669.12,1731.08,22067.70,10971.90,2689.62,3.47,1
15,Shanghai,China,5.69,39.86,5.69,1.14,4.27,3.98,0.53,0.33,...,123.51,1091.93,569.88,2952.70,1561.59,17746.11,9416.35,1419.87,5.03,1
16,Guangzhou,China,4.13,28.47,4.98,0.85,1.71,3.54,0.44,0.33,...,43.89,533.28,317.45,1242.24,688.05,12892.82,5427.45,1211.68,5.19,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4952,Seferhisar,Turkey,3.22,18.78,1.61,2.15,2.15,2.33,0.54,0.21,...,67.08,321.99,268.33,429.33,321.99,1287.98,536.66,,,0
4955,Mackay,Australia,16.31,81.55,8.83,4.59,5.44,4.87,2.49,2.38,...,84.27,736.21,679.58,1359.16,1087.33,11213.06,10091.76,3737.69,5.80,0
4959,Tirupati,India,2.46,9.21,4.30,2.21,3.07,1.47,0.46,0.14,...,37.25,82.90,61.41,184.23,128.96,661.00,396.60,614.09,7.97,0
4961,Rockhampton,Australia,14.95,64.56,8.15,4.25,3.40,3.40,2.74,2.38,...,54.37,781.58,594.63,951.41,906.11,5097.49,4019.31,3481.06,5.80,0


In [48]:
# Narrow the price-per-square-meter rows down to the identifying columns, the
# city-centre purchase price, and the data-quality flag.
price_cols = [
    'city',
    'country',
    'Price per Square Meter to Buy Apartment in City Centre (USD)',
    'data_quality',
]
price_sq_meter = apartments1.loc[:, price_cols]


In [50]:
# Display the city-centre price-per-square-meter table.
price_sq_meter

Unnamed: 0,city,country,Price per Square Meter to Buy Apartment in City Centre (USD),data_quality
10,Smederevska Palanka,Serbia,309.71,0
13,Szentendre,Hungary,2556.95,0
14,Seoul,South Korea,22067.70,1
15,Shanghai,China,17746.11,1
16,Guangzhou,China,12892.82,1
...,...,...,...,...
4952,Seferhisar,Turkey,1287.98,0
4955,Mackay,Australia,11213.06,0
4959,Tirupati,India,661.00,0
4961,Rockhampton,Australia,5097.49,0


In [181]:
# World cities reference table (coordinates, population, ...).
# Keep one row per (city, country) pair -- the first one in file order -- so the pair
# can serve as a unique join key.
worldcities_raw = pd.read_csv("data/worldcities.csv")
worldcities = worldcities_raw.drop_duplicates(subset=["city", "country"], keep="first")
worldcities

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.6839,139.7744,Japan,JP,JPN,Tōkyō,primary,39105000.0,1392685764
1,Jakarta,Jakarta,-6.2146,106.8451,Indonesia,ID,IDN,Jakarta,primary,35362000.0,1360771077
2,Delhi,Delhi,28.6667,77.2167,India,IN,IND,Delhi,admin,31870000.0,1356872604
3,Manila,Manila,14.6000,120.9833,Philippines,PH,PHL,Manila,primary,23971000.0,1608618140
4,São Paulo,Sao Paulo,-23.5504,-46.6339,Brazil,BR,BRA,São Paulo,admin,22495000.0,1076532519
...,...,...,...,...,...,...,...,...,...,...,...
42900,Tukchi,Tukchi,57.3670,139.5000,Russia,RU,RUS,Khabarovskiy Kray,,10.0,1643472801
42901,Numto,Numto,63.6667,71.3333,Russia,RU,RUS,Khanty-Mansiyskiy Avtonomnyy Okrug-Yugra,,10.0,1643985006
42902,Nord,Nord,81.7166,-17.8000,Greenland,GL,GRL,Sermersooq,,10.0,1304217709
42903,Timmiarmiut,Timmiarmiut,62.5333,-42.2167,Greenland,GL,GRL,Kujalleq,,10.0,1304206491


In [159]:
# City-level prices joined with country-level happiness indicators.
# Every column except the three city-level ones gets a " (country)" suffix to make its
# granularity explicit; the redundant country-name column is then dropped.
city_level_cols = ("city", "country", "price_per_sq_meter")
combo = (
    pd.read_csv("data/cl_real_happiness.csv")
    .rename(columns=lambda name: name if name in city_level_cols else name + " (country)")
    .drop(columns="Country name (country)")
)
combo

Unnamed: 0,city,country,price_per_sq_meter,Ladder score (country),Logged GDP per capita (country),Social support (country),Healthy life expectancy (country),Freedom to make life choices (country),Generosity (country),Perceptions of corruption (country)
0,Carrollton,United States,9848.98,6.94,10.93,0.91,68.30,0.84,0.15,0.70
1,Dania Beach,United States,1776.05,6.94,10.93,0.91,68.30,0.84,0.15,0.70
2,New York,United States,14784.44,6.94,10.93,0.91,68.30,0.84,0.15,0.70
3,Los Angeles,United States,9219.44,6.94,10.93,0.91,68.30,0.84,0.15,0.70
4,Chicago,United States,3498.00,6.94,10.93,0.91,68.30,0.84,0.15,0.70
...,...,...,...,...,...,...,...,...,...,...
4959,Petion-Ville,Haiti,2960.08,3.72,7.41,0.59,55.60,0.54,0.40,0.68
4960,Curepipe,Mauritius,412.37,6.10,9.96,0.91,66.40,0.89,-0.02,0.81
4961,Santa Tecla,El Salvador,1068.00,6.35,8.91,0.81,66.11,0.83,-0.12,0.75
4962,Alajuela,Costa Rica,1219.89,7.12,9.66,0.90,71.30,0.93,-0.10,0.79


In [168]:
# Attach the worldcities columns (coordinates, population, ...) to each city;
# cities without a (city, country) match in worldcities are dropped by the inner join.
combo_locations = combo.merge(worldcities, on=["city", "country"], how="inner")

Unnamed: 0,city,country,price_per_sq_meter,Ladder score (country),Logged GDP per capita (country),Social support (country),Healthy life expectancy (country),Freedom to make life choices (country),Generosity (country),Perceptions of corruption (country),city_ascii,lat,lng,iso2,iso3,admin_name,capital,population,id
0,Carrollton,United States,9848.980000,6.94,10.93,0.91,68.30,0.84,0.15,0.70,Carrollton,32.9890,-96.8999,US,USA,Texas,,139248.0,1840019436
1,Dania Beach,United States,1776.050000,6.94,10.93,0.91,68.30,0.84,0.15,0.70,Dania Beach,26.0594,-80.1637,US,USA,Florida,,32271.0,1840015135
2,New York,United States,14784.440000,6.94,10.93,0.91,68.30,0.84,0.15,0.70,New York,40.6943,-73.9249,US,USA,New York,,18713220.0,1840034016
3,New York,United States,18919.719454,6.94,10.93,0.91,68.30,0.84,0.15,0.70,New York,40.6943,-73.9249,US,USA,New York,,18713220.0,1840034016
4,Los Angeles,United States,9219.440000,6.94,10.93,0.91,68.30,0.84,0.15,0.70,Los Angeles,34.1139,-118.4068,US,USA,California,,12750807.0,1840020491
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3151,Marsaxlokk,Malta,8611.130000,6.77,10.53,0.93,72.20,0.92,0.21,0.66,Marsaxlokk,35.8417,14.5447,MT,MLT,Marsaxlokk,admin,3534.0,1470596833
3152,Curepipe,Mauritius,412.370000,6.10,9.96,0.91,66.40,0.89,-0.02,0.81,Curepipe,-20.3162,57.5166,MU,MUS,Plaines Wilhems,,299975.0,1480343099
3153,Santa Tecla,El Salvador,1068.000000,6.35,8.91,0.81,66.11,0.83,-0.12,0.75,Santa Tecla,13.6742,-89.2899,SV,SLV,La Libertad,admin,135483.0,1222613306
3154,Alajuela,Costa Rica,1219.890000,7.12,9.66,0.90,71.30,0.93,-0.10,0.79,Alajuela,10.0311,-84.2041,CR,CRI,Alajuela,admin,48326.0,1188394508


In [187]:
import folium

# World map with one marker per city in combo_locations; clicking a marker shows the city name.
m = folium.Map(location=[20, 0], tiles="OpenStreetMap", zoom_start=2)

# Iterate over just the three needed columns instead of calling `.iloc[i]` three times
# per row, which builds a full row Series on every access.
for lat, lng, city in zip(
    combo_locations["lat"], combo_locations["lng"], combo_locations["city"]
):
    folium.Marker(location=[lat, lng], popup=city).add_to(m)

In [188]:
# Unfinished choropleth experiment.
# NOTE(review): per the folium docs, `geo_data` expects GeoJSON geometries (URL, file path,
# dict, geopandas object, ...). combo_locations is a plain pandas DataFrame with lat/lng
# columns, so this call presumably fails -- TODO supply a GeoJSON source and the
# `data` / `columns` / `key_on` arguments.
# NOTE(review): the resulting layer is never attached to a map via `.add_to(...)`.
folium.Choropleth(
  geo_data=combo_locations, 
  name="choropleth", 
  
)

Unnamed: 0,city,country,price_per_sq_meter,Ladder score (country),Logged GDP per capita (country),Social support (country),Healthy life expectancy (country),Freedom to make life choices (country),Generosity (country),Perceptions of corruption (country),city_ascii,lat,lng,iso2,iso3,admin_name,capital,population,id
0,Carrollton,United States,9848.980000,6.94,10.93,0.91,68.30,0.84,0.15,0.70,Carrollton,32.9890,-96.8999,US,USA,Texas,,139248.0,1840019436
1,Dania Beach,United States,1776.050000,6.94,10.93,0.91,68.30,0.84,0.15,0.70,Dania Beach,26.0594,-80.1637,US,USA,Florida,,32271.0,1840015135
2,New York,United States,14784.440000,6.94,10.93,0.91,68.30,0.84,0.15,0.70,New York,40.6943,-73.9249,US,USA,New York,,18713220.0,1840034016
3,New York,United States,18919.719454,6.94,10.93,0.91,68.30,0.84,0.15,0.70,New York,40.6943,-73.9249,US,USA,New York,,18713220.0,1840034016
4,Los Angeles,United States,9219.440000,6.94,10.93,0.91,68.30,0.84,0.15,0.70,Los Angeles,34.1139,-118.4068,US,USA,California,,12750807.0,1840020491
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3151,Marsaxlokk,Malta,8611.130000,6.77,10.53,0.93,72.20,0.92,0.21,0.66,Marsaxlokk,35.8417,14.5447,MT,MLT,Marsaxlokk,admin,3534.0,1470596833
3152,Curepipe,Mauritius,412.370000,6.10,9.96,0.91,66.40,0.89,-0.02,0.81,Curepipe,-20.3162,57.5166,MU,MUS,Plaines Wilhems,,299975.0,1480343099
3153,Santa Tecla,El Salvador,1068.000000,6.35,8.91,0.81,66.11,0.83,-0.12,0.75,Santa Tecla,13.6742,-89.2899,SV,SLV,La Libertad,admin,135483.0,1222613306
3154,Alajuela,Costa Rica,1219.890000,7.12,9.66,0.90,71.30,0.93,-0.10,0.79,Alajuela,10.0311,-84.2041,CR,CRI,Alajuela,admin,48326.0,1188394508


In [195]:
import pandas as pd

# US-states unemployment choropleth, using the example data hosted in the folium repository.
url = (
    "https://raw.githubusercontent.com/python-visualization/folium/master/examples/data"
)
state_geo = f"{url}/us-states.json"
state_unemployment = f"{url}/US_Unemployment_Oct2012.csv"
# Defined here so the cell no longer relies on a `state_data` left over in the kernel.
state_data = pd.read_csv(state_unemployment)

# NOTE: this rebinds `m`, replacing the world marker map built in an earlier cell.
m = folium.Map(location=[48, -102], zoom_start=3)

folium.Choropleth(
    geo_data=state_geo,
    name="choropleth",
    data=state_data,
    columns=["State", "Unemployment"],  # [key column, value column] taken from state_data
    key_on="feature.id",                # GeoJSON field matched against the "State" column
    fill_color="YlGn",
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name="Unemployment Rate (%)",
).add_to(m)

folium.LayerControl().add_to(m)

state_data

Unnamed: 0,State,Unemployment
0,AL,7.1
1,AK,6.8
2,AZ,8.1
3,AR,7.2
4,CA,10.1
5,CO,7.7
6,CT,8.4
7,DE,7.1
8,FL,8.2
9,GA,8.8


In [193]:
# Display the de-duplicated world cities reference table.
worldcities

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.6839,139.7744,Japan,JP,JPN,Tōkyō,primary,39105000.0,1392685764
1,Jakarta,Jakarta,-6.2146,106.8451,Indonesia,ID,IDN,Jakarta,primary,35362000.0,1360771077
2,Delhi,Delhi,28.6667,77.2167,India,IN,IND,Delhi,admin,31870000.0,1356872604
3,Manila,Manila,14.6000,120.9833,Philippines,PH,PHL,Manila,primary,23971000.0,1608618140
4,São Paulo,Sao Paulo,-23.5504,-46.6339,Brazil,BR,BRA,São Paulo,admin,22495000.0,1076532519
...,...,...,...,...,...,...,...,...,...,...,...
42900,Tukchi,Tukchi,57.3670,139.5000,Russia,RU,RUS,Khabarovskiy Kray,,10.0,1643472801
42901,Numto,Numto,63.6667,71.3333,Russia,RU,RUS,Khanty-Mansiyskiy Avtonomnyy Okrug-Yugra,,10.0,1643985006
42902,Nord,Nord,81.7166,-17.8000,Greenland,GL,GRL,Sermersooq,,10.0,1304217709
42903,Timmiarmiut,Timmiarmiut,62.5333,-42.2167,Greenland,GL,GRL,Kujalleq,,10.0,1304206491
