In [None]:
# Import of all packages used in this notebook
import zipfile
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from column_names import get_column_names
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer, StandardScaler, PowerTransformer, QuantileTransformer 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import PredictionErrorDisplay
from utils.data_handling import DataHandling


%matplotlib inline

# Setup

Install the packages listed in `requirements.txt`.

In [None]:
! pip install -r "requirements.txt"

Download and unzip dataset, if necessary.

In [None]:
filename = 'cost-of-living_v2.csv'

# check if file already exists
if os.path.exists(filename):
    print('File {} exists.'.format(filename))

else:
    
    zip_file = 'global-cost-of-living.zip'
    
    # check if kaggle zip-file already exists
    if os.path.exists(zip_file):
        print('File {} exists.'.format(zip_file))
    
    else:
        # Download files from kaggle
        ! kaggle datasets download -d mvieira101/global-cost-of-living
        

    # end if

    # Unpacking files
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall('')
    print('Unpacking {}.'.format(zip_file))

# end if

Import dataset

In [None]:
# Load the CSV and, in the same expression, replace its header with the
# labels returned by get_column_names().
df = pd.read_csv(filename).set_axis(get_column_names(), axis=1)

# Notebook-wide seed value (presumably passed as random_state further down).
global_random_state = 42

# Data Analysis

## Basics

Shape of the dataset

In [None]:
# (number of rows, number of columns) of the loaded DataFrame
df.shape

Let's count the missing values per column

In [None]:
# Number of missing entries in each column, columns with the most gaps first
df.isna().sum().sort_values(ascending=False)

Let's look at the column datatypes

In [None]:
# How many columns there are of each dtype
df.dtypes.value_counts()

In [None]:
# Names of the columns stored with the generic object dtype
df.select_dtypes(include=['object']).columns

This shows that all columns except `city` and `country` are numerical.