First, we import the necessary libraries and read the two available datasets.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import plotly.express as px

#import seaborn as sns


%matplotlib inline
# Load the two versions of the cost-of-living data; in both files the first
# CSV column is used as the row index.
df_1, df_2 = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('cost-of-living.csv', 'cost-of-living_v2.csv')
)


We will decide which dataset to use based on the number of missing values in df_1 and df_2.

In [None]:
# Which dataset has more missing values?
# Count the missing entries per column in each dataset.
missing_1 = df_1.isna().sum()
missing_2 = df_2.isna().sum()

# Per-column ratio of missing counts (df_1 / df_2), computed once.
# A column with no missing values in df_2 yields inf (or NaN for 0/0); the
# infinities are turned into NaN so that mean() and std(), which skip NaN,
# stay finite.
missing_ratio = (missing_1 / missing_2).replace([np.inf, -np.inf], np.nan)

proportion = missing_ratio.mean()
std = missing_ratio.std()

print('Data Set 1 has {:.2f} +- {:.2f} missing values with respect Data Set 2'.format(proportion, std))

We will use Data Set 1 since it has fewer missing values than Data Set 2.
The column 'data_quality' of the dataframe is 0 if Numbeo considers that more contributors are needed to increase data quality, and 1 otherwise. Hence, we will only keep the good-quality rows (data_quality equal to 1).

In [None]:
# Keep only the rows flagged as good quality (data_quality == 1).
# The explicit copy makes df an independent frame, so any later modification
# of df never acts on a slice of df_1.
df = df_1[df_1['data_quality'] == 1].copy()
df.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns
quality_summary = df.describe()
quality_summary

We rename the columns to meaningful names so that later visualizations and tables are easier to understand. We also drop the data_quality column, since it is 1 for every remaining row and carries no information.

In [None]:
# Readable names for the anonymous x1..x55 indicator columns.
column_names = {
    'x1': 'MealInexpensive',
    'x2': 'Meal2People',
    'x3': 'McMeal',
    'x4': 'BeerDomesticRes',
    'x5': 'BeerImportedRes',
    'x6': 'Cappuccino',
    'x7': 'Coke',
    'x8': 'Water',
    'x9': 'Milk',
    'x10': 'Bread',
    'x11': 'Rice',
    'x12': 'Eggs',
    'x13': 'Cheese',
    'x14': 'Chicken',
    'x15': 'Beef',
    'x16': 'Apples',
    'x17': 'Banana',
    'x18': 'Oranges',
    'x19': 'Tomato',
    'x20': 'Potato',
    'x21': 'Onion',
    'x22': 'Lettuce',
    # Previously also named 'Water', which produced two columns with the same
    # label. Presumably this is the market-bought bottle (it sits among the
    # grocery items) -- verify against the dataset's column descriptions.
    'x23': 'WaterMarket',
    'x24': 'Wine',
    'x25': 'BeerDomesticMarket',
    'x26': 'BeerImportedMarket',
    'x27': 'Cigarettes',
    'x28': 'OWTicket',
    'x29': 'MonthlyPass',
    'x30': 'TaxiStart',
    'x31': 'Taxi1km',
    'x32': 'Taxi1h',
    'x33': 'Gasoline',
    'x34': 'VWGolf',
    'x35': 'Corolla',
    'x36': 'Services',
    'x37': 'Mobile',
    'x38': 'Internet',
    'x39': 'GymMonth',
    'x40': 'Tennis1h',
    'x41': 'Cinema',
    'x42': 'PreschoolMonth',
    'x43': 'IntlPrimarySchoolYear',
    'x44': 'Jeans',
    'x45': 'SummerDress',
    'x46': 'NikeShoes',
    'x47': 'LeatherShoes',
    'x48': 'Apartment_1BR_C',
    'x49': 'Apartment_1BR_OC',
    'x50': 'Apartment_3BR_C',
    'x51': 'Apartment_3BR_OC',
    'x52': 'SquareMeter_C',
    'x53': 'SquareMeter_OC',
    'x54': 'Salary',
    'x55': 'Mortgage_IR',
}

# rename() and drop() both return new frames, so df is rebuilt by reassignment
# rather than mutated in place (df was obtained by filtering df_1).
# data_quality is dropped because only rows with data_quality == 1 were kept.
df = df.rename(columns=column_names).drop(columns='data_quality')

We will base our analysis on the cost of living by country. We create a new dataframe that groups the cities by country and averages their values.

In [None]:
# Average the city-level indicators within each country.
# numeric_only=True restricts the mean to numeric columns: without it, pandas
# >= 2.0 raises a TypeError if the frame holds any text column other than the
# grouping key (e.g. a city name, if present), while older versions silently
# dropped such columns.
# as_index=False keeps 'country' as a regular column, so no reset_index is needed.
df_countries = df.groupby('country', as_index=False).mean(numeric_only=True)
df_countries.head()


Let's see how many missing values are remaining and their proportions

In [None]:
# Share of missing values in every column, most affected columns first.
# isna().mean() is the fraction of NaN entries; the previous count()/n_rows
# expression gave the fraction of *non*-missing entries, contradicting the
# message printed below.
prop_countries = df_countries.isna().mean().sort_values(ascending=False)

# Total number of missing cells in the whole frame
n_mv = df_countries.isna().sum().sum()

print('The proportion of missing values for every column is: ')
print(prop_countries)

print('In total, there are {} missing values in the dataframe'.format(n_mv))


We fill the 9 missing values in the dataframe with the most common value of the column for each indicator.

In [None]:
# Only the float (indicator) columns are imputed; 'country' is re-attached below.
df_countries_2 = df_countries.select_dtypes(include='float')


def fill_mode(col):
    """Return `col` with its NaN entries replaced by the column's most common value.

    If several values are tied, the first one returned by `Series.mode()` is
    used. A column with no valid value at all has no mode and is returned
    unchanged instead of raising.
    """
    modes = col.mode()
    if modes.empty:
        return col
    return col.fillna(modes.iloc[0])


# axis=0 applies fill_mode to each column (one indicator at a time). The
# previous axis=1 worked row by row, i.e. it filled a gap with the most common
# value among a country's *other*, unrelated indicators.
new_df = df_countries_2.apply(fill_mode, axis=0)
n_mv = new_df.isna().sum().sum()

print('Number of missing values now: {}'.format(n_mv))

df_countries = pd.concat([df_countries['country'],new_df],axis=1)

We use the geopandas library to visualize the indicators of each country on a world map. Parts of the following code are taken from: https://www.kaggle.com/code/lauman/data-visualization-on-cost-of-living-dataset


In [None]:
import geopandas as gpd
SHAPEFILE = 'ne_10m_admin_0_countries.shp'
geo_df = gpd.read_file(SHAPEFILE)

# Rename the shapefile's country names so that they match the 'country' values
# used in the cost-of-living data.
# The Tanzania entry previously used the key 'United States of Tanzania', which
# is not a country name, so that replacement could never match;
# Natural Earth lists the country as 'United Republic of Tanzania'.
geo_df['ADMIN'] = geo_df['ADMIN'].replace({
    'United States of America': 'United States',
    'Hong Kong S.A.R.': 'Hong Kong',
    'United Republic of Tanzania': 'Tanzania',
    'Republic of Serbia': 'Serbia'})

# Keep only the geometry, indexed by country name, so it can be matched against
# the 'country' column when plotting.
geo_df = geo_df[['ADMIN', 'geometry']].set_index('ADMIN')


Let's take a look at the average salary of each country on the map.

In [None]:
import plotly.figure_factory as ff # Plotting distribution using ff.distplot

# Indicator shown on the map
col = 'Salary'

# Both colour-scale bounds come from the country-level averages that are
# actually plotted. The upper bound was previously taken from the city-level
# frame `df`, a different frame from the one being drawn.
fig = px.choropleth_mapbox(df_countries, geojson=geo_df.geometry, locations='country', color=col,
                           color_continuous_scale="Viridis",
                           range_color=(df_countries[col].min(), df_countries[col].max()),
                           mapbox_style="carto-positron",
                           zoom=0, center={"lat": 22.3193, "lon": 114.1694},
                           opacity=0.5,
                           labels={col:'Average Monthly Net Salary (After Tax) (USD)'}
                          )
# Remove the outer margins so the map fills the whole output area
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()


It is noticeable that salaries are higher in North America and Europe than in Latin America, Asia, and Africa. However, living costs may also be higher in the countries of these continents. Let's take a look at some indicators.

Note: In the following plot, it is recommended to zoom in to get a better picture of the data distribution.

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

x = df_countries['country']
indicators = ['MealInexpensive', 'Gasoline','Services','SquareMeter_C']

fig = make_subplots(rows=2, cols=2)

# One bar chart per indicator, filling the 2x2 grid row by row.
# add_trace(row=, col=) replaces the deprecated append_trace and the four
# copy-pasted trace definitions.
for position, indicator in enumerate(indicators):
    grid_row, grid_col = divmod(position, 2)
    fig.add_trace(
        go.Bar(x=x, y=df_countries[indicator], name=indicator),
        row=grid_row + 1,
        col=grid_col + 1,
    )

# Order the countries in every panel from the highest to the lowest bar
fig.update_xaxes(categoryorder='total descending')
fig.update_layout(height=1000, width=1000,title='Overview of some cost indicators among countries')
fig.show()


It seems that the countries at the right tail of the bar plots are located in Latin America and Africa, which suggests that even though people there earn less, they may also spend less. In order to understand the relation between salary and costs in each country, each cost should be expressed as a proportion of the salary.

Let's create a dataframe that contains the cost-to-salary ratio (each cost divided by the average salary) for each indicator and visualize some of them on the world map.

In [None]:

# Cost indicators only: everything except the country label and the salary
df_indicators = df_countries.drop(columns=['country', 'Salary'])

# Divide every indicator by the country's average salary (row-wise alignment),
# then put the country label back as the first column.
ratio_df = df_indicators.div(df_countries['Salary'], axis='index')
ratio_df.insert(0, 'country', df_countries['country'])


Let's visualize the cost-to-salary ratio for the same indicators as before: MealInexpensive, Gasoline, Services, and SquareMeter_C.

In [None]:
# Indicator shown on the map (cost divided by average salary)
col = 'MealInexpensive'

# Colour bounds are given in (min, max) order, the same order used for the
# SquareMeter_C map; they were previously passed as (max, min).
fig = px.choropleth_mapbox(ratio_df, geojson=geo_df.geometry, locations='country', color=col,
                           color_continuous_scale="Turbo",
                           range_color=(ratio_df[col].min(), ratio_df[col].max()),
                           mapbox_style="carto-positron",
                           zoom=0, center={"lat": 22.3193, "lon": 114.1694},
                           opacity=0.5,
                           labels={col:'Meal in an Inexpensive Restaurant and Salary ratio'}
                          )
# Remove the outer margins so the map fills the whole output area
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

In [None]:
# Indicator shown on the map (cost divided by average salary)
col = 'Gasoline'

# Colour bounds are given in (min, max) order, the same order used for the
# SquareMeter_C map; they were previously passed as (max, min).
fig = px.choropleth_mapbox(ratio_df, geojson=geo_df.geometry, locations='country', color=col,
                           color_continuous_scale="Turbo",
                           range_color=(ratio_df[col].min(), ratio_df[col].max()),
                           mapbox_style="carto-positron",
                           zoom=0, center={"lat": 22.3193, "lon": 114.1694},
                           opacity=0.5,
                           labels={col:'Gasoline and Salary ratio'}
                          )
# Remove the outer margins so the map fills the whole output area
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

In [None]:
# Indicator shown on the map (cost divided by average salary)
col = 'Services'

# Colour bounds are given in (min, max) order, the same order used for the
# SquareMeter_C map; they were previously passed as (max, min).
fig = px.choropleth_mapbox(ratio_df, geojson=geo_df.geometry, locations='country', color=col,
                           color_continuous_scale="Turbo",
                           range_color=(ratio_df[col].min(), ratio_df[col].max()),
                           mapbox_style="carto-positron",
                           zoom=0, center={"lat": 22.3193, "lon": 114.1694},
                           opacity=0.5,
                           labels={col:'Services and Salary ratio'}
                          )
# Remove the outer margins so the map fills the whole output area
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

In [None]:
# Indicator shown on the map (cost divided by average salary)
col = 'SquareMeter_C'

# Colour scale spans the observed range of the ratio
ratio_bounds = (ratio_df[col].min(), ratio_df[col].max())
legend_labels = {col: 'Price per Square Meter to Buy Apartment in City Centre and Salary Ratio'}

fig = px.choropleth_mapbox(
    ratio_df,
    geojson=geo_df.geometry,
    locations='country',
    color=col,
    color_continuous_scale="Turbo",
    range_color=ratio_bounds,
    mapbox_style="carto-positron",
    zoom=0,
    center={"lat": 22.3193, "lon": 114.1694},
    opacity=0.5,
    labels=legend_labels,
)
# Strip the outer margins so the map uses the full output area
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
fig.show()

From the visualizations, it is possible to note that there is a great difference between countries in terms of the proportion of the salary that must be used to pay for the indicators studied. In countries like Cuba, people need their entire salary to pay for basic services, while in the United States and Canada, this only represents 4% of their salary.

In [None]:
# Countries where an inexpensive meal takes the largest share of the average salary.
# The column selection needs a list (double brackets): the former
# ['country','MealInexpensive'] was interpreted as a single tuple key and
# raised a KeyError.
top_meal = ratio_df.sort_values(by='MealInexpensive', ascending=False)[['country', 'MealInexpensive']]
top_meal.head()

In [None]:
# Countries where gasoline takes the largest share of the average salary.
# Named after the indicator it ranks (the result was previously stored under
# `top_meal`) and limited to the two relevant columns so the table stays readable.
top_gasoline = ratio_df.sort_values(by='Gasoline', ascending=False)[['country', 'Gasoline']]
top_gasoline.head()

In [None]:
# Countries where basic services take the largest share of the average salary.
# Named after the indicator it ranks (the result was previously stored under
# `top_meal`) and limited to the two relevant columns so the table stays readable.
top_services = ratio_df.sort_values(by='Services', ascending=False)[['country', 'Services']]
top_services.head()

In [None]:
# Countries where a square meter in the city centre costs the most relative to
# the average salary.
# Named after the indicator it ranks (the result was previously stored under
# `top_meal`) and limited to the two relevant columns so the table stays readable.
top_square_meter = ratio_df.sort_values(by='SquareMeter_C', ascending=False)[['country', 'SquareMeter_C']]
top_square_meter.head()