# Start the system

In [331]:
#1. Libraries
import psycopg2 as pg
import pandas as pd
import os
import re
import pycountry_convert as pc
import matplotlib.pyplot as plt
import datetime
import math
import numpy as np

### Read CSV files

In [346]:
# Listings data: the host_* columns of this frame feed the host dimension built below.
df = pd.read_csv(filepath_or_buffer="../data/listings_al.csv")
# GDP-per-capita table: 'Country Name' and the last column are read by get_host_country_gdp.
df_gdp = pd.read_csv(filepath_or_buffer="../data/GDP per capita (worldbank).csv")

### Transformation methods

In [333]:
# Missing-value check:
def is_nan(x):
    """Return the result of testing ``x`` for inequality with itself.

    A float NaN is unequal to itself, so for scalar input the result is
    True for NaN and False for ordinary values such as strings, numbers
    or None.
    """
    unequal_to_itself = x != x
    return unequal_to_itself

def get_host_name(name):
    """Return ``name`` unchanged, or "Unknown" when is_nan() flags it as missing."""
    return "Unknown" if is_nan(name) else name

In [334]:
def get_host_membership_duration(date, now=None):
    """Bucket a host's membership length into a descriptive label.

    The length is the difference between the calendar year of ``now`` and
    the year found in the first four characters of ``date``; months and
    days are ignored.

    Parameters
    ----------
    date : str
        Sign-up date formatted as YYYY-MM-DD. A missing value (None or NaN)
        yields "Unknown".
    now : datetime.datetime, optional
        Reference point for the calculation. Defaults to the current time;
        pass a fixed value to get reproducible buckets.

    Returns
    -------
    str
        One of the "Member for ..." labels, or "Unknown".

    Raises
    ------
    ValueError
        If the first four characters of ``date`` are not an integer year.
    """
    # NaN is the only value that is unequal to itself.
    if date is None or date != date:
        return "Unknown"

    if now is None:
        now = datetime.datetime.now()

    membership_time = now.year - int(str(date)[0:4])
    if membership_time > 10: return "Member for more than 10 years"
    if membership_time > 5: return "Member for more than 5 years"
    if membership_time > 2: return "Member for more than 2 years"
    if membership_time < 1: return "Member for less than 1 year"
    # Year differences of 1 and 2 match none of the buckets above; without
    # this fallback the function would return None for those hosts.
    return "Member for 1 to 2 years"

In [335]:
# Country / city names, searched case-insensitively. Order matters: the first
# pattern found in the location string decides the country.
_HOST_COUNTRY_NAME_RULES = (
    ('po.*tugal', "Portugal"),
    ('lisbo', "Portugal"),
    ('spain', "Spain"),
    ('madrid', "Spain"),
    ('united kingdom', "United Kingdom"),
    # Northern Ireland belongs to the UK and must be tested before 'ireland'.
    ('northern ireland', "United Kingdom"),
    ('ireland', "Ireland"),
    ('denmark', "Denmark"),
    ('netherlands', "Netherlands"),
    ('germany', "Germany"),
    ('belgium', "Belgium"),
    ('united states', "United States"),
    ('canada', "Canada"),
    ('france', "France"),
    ('italy', "Italy"),
    ('switzerland', "Switzerland"),
    ('sweden', "Sweden"),
    ('poland', "Poland"),
    ('finland', "Finland"),
    ('czechia', "Czechia"),
    ('serbia', "Serbia"),
    ('austria', "Austria"),
    ('iceland', "Iceland"),
    ('norway', "Norway"),
    ('china', "China"),
    ('angola', "Angola"),
    ('australia', "Australia"),
    ('brazil', "Brazil"),
    ('peru', "Peru"),
    ('sri lanka', "Sri Lanka"),
    ('vietnam', "Vietnam"),
    ('united arab emirates', "United Arab Emirates"),
    ('south africa', "South Africa"),
    ('qatar', "Qatar"),
    ('japan', "Japan"),
    ('turkey', "Turkey"),
)

# Upper-case country codes, searched case-sensitively and only as whole words
# so that they do not fire inside longer upper-case words (e.g. 'ES' inside
# "LOS ANGELES").
_HOST_COUNTRY_CODE_RULES = (
    (r'\bPT\b', "Portugal"),
    (r'\bES\b', "Spain"),
    (r'\bUK\b', "United Kingdom"),
    (r'\bGB\b', "United Kingdom"),
    (r'\bNL\b', "Netherlands"),
    (r'\bDE\b', "Germany"),
    (r'\bUSA?\b', "United States"),
    (r'\bFR\b', "France"),
    (r'\bIT\b', "Italy"),
    (r'\bBR\b', "Brazil"),
)


def get_host_country(location):
    """Infer the host's country from a free-text location string.

    Parameters
    ----------
    location : object
        Host location; converted with ``str()`` first, so a missing value
        (NaN) becomes the text "nan" and matches no rule.

    Returns
    -------
    str
        The country name of the first matching rule, or "Unknown" when no
        rule matches.

    Notes
    -----
    Full names are tried before two-letter codes. A code such as "DE" or
    "NL" can also be a state or province abbreviation, so an explicit
    country name elsewhere in the string takes precedence
    (e.g. "Wilmington, DE, United States" -> "United States").
    """
    location = str(location)

    for pattern, country in _HOST_COUNTRY_NAME_RULES:
        if re.search(pattern, location, re.IGNORECASE):
            return country

    for pattern, country in _HOST_COUNTRY_CODE_RULES:
        if re.search(pattern, location):
            return country

    return "Unknown"

In [336]:
def get_host_continent(country):
    """Map a country name to the name of its continent.

    "Unknown" is passed through untouched. Any other name is converted to a
    two-letter country code and then to a continent code by
    pycountry_convert, and the continent code is looked up in a fixed
    code-to-name table.
    """
    if country == "Unknown":
        return "Unknown"

    # Country name -> alpha-2 country code -> continent code.
    alpha2_code = pc.country_name_to_country_alpha2(country, cn_name_format="default")
    continent_code = pc.country_alpha2_to_continent_code(alpha2_code)

    # Display names for the continent codes handled by this notebook.
    continent_labels = {
        'EU': 'Europe',
        'NA': 'North America',
        'SA': 'South America',
        'AS': 'Asia',
        'OC': 'Australia',
        'AF': 'Africa',
    }
    return continent_labels[continent_code]

In [337]:
def get_host_country_gdp(country, gdp_table=None):
    """Classify a country's GDP per capita into a coarse wealth bin.

    Parameters
    ----------
    country : str
        Country name as returned by get_host_country(); "Unknown" is passed
        through.
    gdp_table : pandas.DataFrame, optional
        Table with a 'Country Name' column whose last column holds the GDP
        value to use. Defaults to the notebook-level ``df_gdp``.

    Returns
    -------
    str
        "Below 20k", "20k - 35k", "35k - 50k" or "Above 50k"; "Unknown" when
        the country is "Unknown", absent from the table, or has no value in
        the last column.
    """
    if country == "Unknown": return "Unknown"

    if gdp_table is None:
        gdp_table = df_gdp

    # Always use the last column of the GDP table. The level of detail
    # required here is not very high, since the value is only used to pick a bin.
    latest_gdp = dict(zip(gdp_table['Country Name'], gdp_table.iloc[:, -1]))

    if country not in latest_gdp: return "Unknown"

    gdp = latest_gdp[country]
    # A missing value fails every "<=" comparison below and would otherwise
    # fall through to "Above 50k".
    if pd.isna(gdp): return "Unknown"

    if gdp <= 20000: return "Below 20k"
    if gdp <= 35000: return "20k - 35k"
    if gdp <= 50000: return "35k - 50k"
    return "Above 50k"

In [338]:
def get_host_response_time(response_time):
    """Return ``response_time`` unchanged, or "Unknown" when is_nan() flags it as missing."""
    return "Unknown" if is_nan(response_time) else response_time

In [339]:
def get_host_is_superhost(is_superhost):
    """Translate the 't'/'f' superhost flag into a readable label.

    Returns "Superhost" for 't', "Not Superhost" for 'f', and "Unknown" for
    anything else (for example a missing value), so the result is always a
    string.
    """
    if is_superhost == 't': return "Superhost"
    if is_superhost == 'f': return "Not Superhost"
    # Without this fallback an unexpected or missing flag would yield None.
    return "Unknown"

In [340]:
def get_host_identity_verified(identity):
    """Translate the 't'/'f' identity-verification flag into a readable label.

    Returns "Verified" for 't', "Unverified" for 'f', and "Unknown" for
    anything else (for example a missing value), so the result is always a
    string.
    """
    if identity == 't': return "Verified"
    if identity == 'f': return "Unverified"
    # Without this fallback an unexpected or missing flag would yield None.
    return "Unknown"

### Create clean dataframe

In [341]:
# Build one list per host attribute, each aligned with the rows of df.
# Host names go through get_host_name so that missing names become "Unknown"
# instead of being carried along as NaN.
names = [get_host_name(n) for n in df['host_name']]
membership_duration = [get_host_membership_duration(d) for d in df['host_since']]
country = [get_host_country(l) for l in df['host_location']]
# Continent and GDP bin are derived from the cleaned country, not from the raw location.
continent = [get_host_continent(c) for c in country]
gdp = [get_host_country_gdp(c) for c in country]
response_time = [get_host_response_time(t) for t in df['host_response_time']]
superhost = [get_host_is_superhost(b) for b in df['host_is_superhost']]
identity_verified = [get_host_identity_verified(i) for i in df['host_identity_verified']]

In [342]:
# Column labels, listed in the same order as the attribute lists stacked below.
columns = [
    'host_name',
    'host_membership_duration',
    'host_country',
    'host_continent',
    'host_country_gdp',
    'host_response_time',
    'host_is_superhost',
    'host_is_verified',
]
# Stack the eight equally long lists side by side: one row per row of df,
# one column per host attribute.
df_host_dimension = pd.DataFrame(
    data=np.stack(
        (names, membership_duration, country, continent,
         gdp, response_time, superhost, identity_verified),
        axis=-1,
    ),
    columns=columns,
)
df_host_dimension.shape

(17168, 8)

Remove duplicates:

In [343]:
# Collapse rows that are identical across every column, keeping the first occurrence.
df_host_dimension = df_host_dimension.drop_duplicates(keep="first")
df_host_dimension.shape

(5255, 8)

Add primary key:

In [344]:
# Surrogate key: consecutive integers 1..N in the current row order.
pks = list(range(1, df_host_dimension.shape[0] + 1))
# Guarded so that re-running this cell does not add a second "host_id" column.
if "host_id" not in df_host_dimension.columns:
    df_host_dimension.insert(0, "host_id", pks)

In [345]:
# Display the finished host dimension: host_id followed by the eight host attribute columns.
df_host_dimension

Unnamed: 0,host_id,host_name,host_membership_duration,host_country,host_continent,host_country_gdp,host_response_time,host_is_superhost,host_is_verified
0,1,Ellie,Member for more than 5 years,United States,North America,Above 50k,within an hour,Superhost,Verified
1,2,Bárbara,Member for more than 5 years,Portugal,Europe,20k - 35k,within an hour,Not Superhost,Unverified
2,3,Mónica,Member for more than 5 years,Portugal,Europe,20k - 35k,within an hour,Not Superhost,Unverified
3,4,Francisco,Member for more than 5 years,Portugal,Europe,20k - 35k,within a day,Superhost,Verified
5,5,Sara,Member for more than 5 years,United States,North America,Above 50k,within a day,Not Superhost,Verified
...,...,...,...,...,...,...,...,...,...
17149,5251,Circle,Member for less than 1 year,Spain,Europe,20k - 35k,Unknown,Not Superhost,Unverified
17152,5252,Alexander,Member for more than 5 years,Portugal,Europe,20k - 35k,within a day,Not Superhost,Unverified
17153,5253,Gonçalo,Member for less than 1 year,Portugal,Europe,20k - 35k,Unknown,Not Superhost,Unverified
17155,5254,Margarida,Member for less than 1 year,Portugal,Europe,20k - 35k,Unknown,Not Superhost,Unverified
