In [None]:
# Import of all packages used in this notebook
import zipfile
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pycountry
from pycountry_convert.convert_country_alpha2_to_continent_code import country_alpha2_to_continent_code
from pycountry_convert.convert_continent_code_to_continent_name import convert_continent_code_to_continent_name

from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer, StandardScaler, PowerTransformer, QuantileTransformer 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import PredictionErrorDisplay

from column_names import get_column_names
from utils.data_handling import DataHandling


%matplotlib inline

# Setup

Install the packages listed in `requirements.txt`.

In [None]:
! pip install -r "requirements.txt"

Download and unzip dataset, if necessary.

In [None]:
filename = 'cost-of-living_v2.csv'

# check if file already exists
if os.path.exists(filename):
    print('File {} exists.'.format(filename))

else:
    
    zip_file = 'global-cost-of-living.zip'
    
    # check if kaggle zip-file already exists
    if os.path.exists(zip_file):
        print('File {} exists.'.format(zip_file))
    
    else:
        # Download files from kaggle
        ! kaggle datasets download -d mvieira101/global-cost-of-living
        

    # end if

    # Unpacking files
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall('')
    print('Unpacking {}.'.format(zip_file))

# end if

Import dataset

In [None]:
# notebook-wide constant; presumably the random seed for later cells
# (it is not used in this section) -- TODO confirm
global_random_state = 42

# read the csv and replace its header with the informative names
# provided by get_column_names()
df = pd.read_csv(filename).set_axis(get_column_names(), axis=1)

# Data Analysis

## Basics

Shape of the dataset

In [None]:
# (number of rows, number of columns) of the loaded dataset
df.shape

Let's look at the column names.

In [None]:
# column labels of the dataset (set from get_column_names() when loading)
df.columns

The dataset contains cities in several countries. For each city there are several costs, e.g. for groceries and beverages, transportation, leisure time, clothing, housing.
Additionally it contains information about salaries and mortgages.
The last column is called `data_quality` and contains a flag. It is 0 if Numbeo considers that more contributors are needed to increase data quality and 1 otherwise.

Let's see how many rows have sufficient data and drop the others.

In [None]:
# name of the flag column; report the dtype it was loaded with
col_quality = 'data_quality'
print('Column {} has type {}.'.format(col_quality, df[col_quality].dtype))

# store the flag as bool so that it can be used directly as a row mask
df[col_quality] = pd.Series(df[col_quality], dtype=bool)

# number of flagged rows compared to all rows
print('{} of {} cities have good data quality!'.format(df[col_quality].sum(), len(df)))

# keep only the flagged rows and remove the flag column from the result
df_quality = df.loc[df[col_quality], :].drop(columns=col_quality)

Let's count the missing values.

In [None]:
# number of missing values per column, largest count first
df_missing = df_quality.isna().sum().sort_values(ascending=False)

# Count the complete columns with a boolean sum: looking up
# value_counts()[0] raises a KeyError as soon as every column
# has at least one missing value.
print('{} of {} columns do not have any missing values\n'.format(
    (df_missing == 0).sum(),
    df_missing.shape[0]
))

print('Missing values by columns:\n{}'.format(
    df_missing
))

In [None]:
# implementation of a nan checker we will need later
def na_check(df, cols):
    """
    Tell whether the columns `cols` of the DataFrame `df` hold any na value.

    Returns True if at least one cell in the selected columns is na,
    False otherwise.
    """
    # cell-wise na mask of the selected columns
    na_mask = df.loc[:, cols].isna()
    # per row: does any selected column hold an na value?
    rows_with_na = na_mask.any(axis=1)
    return rows_with_na.any()
    

Let's look at the column datatypes

In [None]:
# number of columns per dtype
df_quality.dtypes.value_counts()

In [None]:
# labels of the columns that are stored with dtype object
df_quality.select_dtypes(include='object').columns

This shows that all columns except `city` and `country` are numerical.

Let's take a look at the distributions of all columns next. First we will have a look at the countries.

In [None]:
def plot_categorical_hist(categorical, limit=0):
    """
    Draw a bar chart with the value counts of `categorical`.

    The x tick labels are rotated by 90°. Only the `limit` highest counts
    are drawn; with limit=0 every category is drawn.
    Returns the Axes the bars were drawn on.
    """
    counts = categorical.value_counts()

    # 0 stands for "no limit"
    if limit == 0:
        limit = counts.shape[0]

    top_counts = counts.iloc[:limit]

    fig, ax = plt.subplots(1, 1)
    ax.bar(top_counts.index.tolist(), top_counts.tolist())
    ax.tick_params(axis='x', labelrotation=90)

    return ax

# number of countries (by city count) shown in the chart
n_countries = 20

ax = plot_categorical_hist(df_quality['country'], limit=n_countries)
ax.set_title(f'Country distribution of the {n_countries} most frequent countries.')
ax.set_xlabel('Country')
# the trailing semicolon keeps the Text(...) repr of the last call out of the cell output
ax.set_ylabel('# of cities');



Looks like cities in the US and European cities are highly present in this dataset.

This raises the question of which continents the cities are located on.

In [None]:
def convert_country_to_continent(country_name):
    """
    Provide the continent name for a country name.

    The country is resolved with ``pycountry.countries.lookup`` first and,
    if that fails, with ``pycountry.countries.search_fuzzy`` (the first
    fuzzy match is used). Information about the fallback is printed.

    Parameters
    ----------
    country_name : str
        Name of the country to resolve.

    Returns
    -------
    str
        Name of the continent, or 'Unknown' if the country could not be
        resolved or no continent is available for it.
    """

    # initialize continent_name and country_obj
    continent_name = 'Unknown'
    country_obj = None

    # get the pycountry.Country object
    try:
        country_obj = pycountry.countries.lookup(country_name)

    except LookupError:
        # Print info
        print('Could not find {} with lookup function. Trying search_fuzzy.'.format(
            country_name
        ))

        # try search fuzzy instead
        try:
            country_list = pycountry.countries.search_fuzzy(country_name)

        except Exception:
            # Best effort: any failure of the fuzzy search counts as "not found".
            # Unlike a bare except this does not swallow KeyboardInterrupt/SystemExit.
            country_list = []
        # end try

        if country_list:
            # print information whether multiple results occured
            if len(country_list) > 1:
                print('Expected only one country during search_fuzzy, but got {}!.'.format(
                    len(country_list)
                ))

            else:
                print('search_fuzzy was successful with exactly one result!')
            # end if

            country_obj = country_list[0]
            print('Using "{}" for "{}".'.format(
                country_obj.name,
                country_name
            ))

        else:
            print('{} not found. Country will have no continent'.format(
                country_name
            ))
        # end if
    # end try

    # country_obj is only set when one of the lookups succeeded, so no
    # comparison against the string form of the class name is required
    if country_obj is not None:
        try:
            # convert alpha_2 value of country object into continent name
            continent_code = country_alpha2_to_continent_code(country_obj.alpha_2)
            continent_name = convert_continent_code_to_continent_name(continent_code)

        except KeyError:
            # pycountry_convert raises KeyError for codes it has no continent for;
            # keep 'Unknown' instead of aborting the whole conversion
            print('No continent available for "{}".'.format(
                country_obj.name
            ))
        # end try
    # end if

    return continent_name

# add a continent column to the dataset
# Every distinct country is resolved only once and the result is mapped onto
# the rows, so lookups and their printed messages are not repeated per city.
continent_by_country = {
    country: convert_country_to_continent(country)
    for country in df_quality['country'].unique()
}
df_quality['continent'] = df_quality['country'].map(continent_by_country)



In [None]:
ax = plot_categorical_hist(df_quality['continent'])
ax.set_title('Continent distribution')
ax.set_xlabel('Continent')
# the trailing semicolon keeps the Text(...) repr of the last call out of the cell output
ax.set_ylabel('# of cities');



This confirms our first impression: most of the cities are in Europe and North America. "Unknown" refers to countries for which the continent could not be assigned automatically.

Let's look at the other distributions

In [None]:
# one histogram per float column, laid out on a grid with n_cols panels per row
df_hist = df_quality.select_dtypes(include='float')
n_plots = df_hist.shape[1]
n_cols = 6
# at least one row, since plt.subplots cannot create an empty grid
n_rows = max(1, int(np.ceil(n_plots / n_cols)))

# squeeze=False always returns a 2-D array of Axes; ravel() flattens it
# without having to resize the array in place
fig, ax = plt.subplots(n_rows, n_cols, squeeze=False, figsize=(3*n_cols, 3*n_rows))
ax = ax.ravel()

for i in range(n_plots):
    df_hist.iloc[:, i].hist(ax=ax[i])
    ax[i].set_title(df_hist.columns[i])
# end for

# the grid usually has more panels than columns to plot: hide the unused ones
for unused_ax in ax[n_plots:]:
    unused_ax.set_visible(False)
# end for

# avoid overlapping titles and tick labels between neighbouring panels
fig.tight_layout()



## Further Analysis

Let's see where the rental prices for apartments are the highest!

In [None]:
# country plus all rent/price columns that are compared
cols = [
    'country',
    'apartment_rent_1_room_centre',
    'apartment_rent_1_room_outside',
    'apartment_rent_3_room_centre',
    'apartment_rent_3_room_outside',
    'apartment_price_centre',
    'apartment_price_outside',
]
print('Are there na values in the analyzed columns? {}'.format(
    na_check(df_quality, cols)
))

# drop the rows with na values
df_apartment_prices = df_quality.loc[:, cols].dropna(axis=0, how='any')

# mean apartment prices and rent grouped by country
mean_apartment_prices = df_apartment_prices.groupby('country').mean()
# a bare expression in the middle of a cell is not displayed, so print the preview
print(mean_apartment_prices.head())

# Which countries are in the Top 15 for all categories
n_largest = 15
countries_high_prices = set(mean_apartment_prices.index)
for col in cols[1:]:
    # keep only the countries that are also in the top n_largest of this column
    countries_high_prices &= set(mean_apartment_prices.nlargest(n_largest, col).index)
# end for

# print in sorted order: the iteration order of a set of strings is not reproducible between runs
print(sorted(countries_high_prices))

Looks like 6 countries are among the most expensive in all apartment categories!

Let's see where people pay the highest share of their salary for rent. 
For this, we will calculate the mean of the rents for 1- and 3-room-apartments in- and outside of the city centres.
This is suggested by the [numbeo methodology](https://www.numbeo.com/common/motivation_and_methodology.jsp).
The mean is divided by the salary.

Since neither the mean nor the division is meaningful with missing values, we drop the affected rows.

In [None]:
# the four rent columns that are averaged
rent_cols = [
    'apartment_rent_1_room_centre',
    'apartment_rent_1_room_outside',
    'apartment_rent_3_room_centre',
    'apartment_rent_3_room_outside',
]
# all columns needed for the share: the rents plus the salary
cols = rent_cols + ['salary']

print('Are there na values in the analyzed columns? {}'.format(
    na_check(df_quality, cols)
))

# from here on df_quality only holds rows with complete rent and salary data
df_quality = df_quality.dropna(axis=0, how='any', subset=cols)

# Average over the rent columns only. Averaging over `cols` would also
# include the salary in the numerator and distort the share.
# Note: rows with a salary of 0 yield inf here.
df_quality.loc[:, 'rent_salary_share'] = (
    df_quality.loc[:, rent_cols].mean(axis=1) / df_quality.loc[:, 'salary']
)

# mean over countries
mean_rent_salary_share = df_quality.loc[:, ['country', 'rent_salary_share', 'salary']].groupby('country').mean()

# the trailing semicolon keeps the array-of-Axes repr out of the cell output
mean_rent_salary_share.hist(bins=20);

In [None]:
# scatter plot with one point per country:
# mean salary on the x axis, mean rent_salary_share on the y axis
fig, ax = plt.subplots(1,1)
plt.scatter(mean_rent_salary_share['salary'], mean_rent_salary_share['rent_salary_share'])
# fixed y range from 0 to 1; NOTE(review): points with a share above 1 are not visible
ax.set_ylim((0,1))