## 2.2. Preprocessing

In [1]:
# libraries
import psycopg2 as pg
import pandas as pd
import numpy as np
import os

# set environment variable for psycopg2 (for some systems)
os.environ["PGGSSENCMODE"] = "disable"

# get the stored passwords: the first line of the file holds the DI server
# password, the second line the local ("home") password.
# The context manager guarantees the file is closed even if a read fails.
with open("credentials_david.txt", "rt") as f:
    pwd = f.readline().strip()   # di   password
    hpwd = f.readline().strip()  # home password

In [2]:
# di test: WORKING
#host_di = "appserver-01.alunos.di.fc.ul.pt"
#db_di = "tpd012"
#user_di = "tpd012"

#conn = pg.connect(host = host_di,
#                  database = db_di,
#                  user = user_di,
#                  password = pwd)
#conn.close()

# local test: WORKING
# Connection settings for the local PostgreSQL instance (reused by later cells).
host_local, db_local, user_local = "localhost", "tpd012", "postgres"

# Smoke test: open a connection with the local credentials, then close it right away.
conn = pg.connect(
    host=host_local,
    database=db_local,
    user=user_local,
    password=hpwd,
)
conn.close()

In [3]:
# functions
def run_sql_command(sql, host, database, user, password):
    """Execute a single SQL statement on a fresh connection and commit it.

    Parameters
    ----------
    sql : str
        The SQL statement to execute.
    host, database, user, password : str
        Credentials forwarded as keyword arguments to ``pg.connect``.

    Raises
    ------
    Any exception raised by ``pg.connect``, ``cursor.execute`` or
    ``conn.commit`` propagates to the caller. The cursor and the connection
    are closed in every case, and the commit only happens when the statement
    executed successfully.
    """
    conn = pg.connect(host=host,
                      database=database,
                      user=user,
                      password=password)
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql)
        finally:
            # release the cursor whether or not the statement succeeded
            cur.close()
        conn.commit()
    finally:
        # never leak the connection, even when execute/commit raises
        conn.close()
    
def intTryParse(value):
    """Report whether ``value`` can be converted to an integer with ``int()``.

    Parameters
    ----------
    value : object
        Candidate value, typically a string such as a license number.

    Returns
    -------
    bool
        ``True`` when ``int(value)`` succeeds, ``False`` otherwise. Values
        that ``int()`` rejects with ``TypeError`` (e.g. ``None``) or
        ``OverflowError`` (e.g. infinity) also yield ``False`` instead of
        raising.
    """
    try:
        int(value)
    except (ValueError, TypeError, OverflowError):
        return False
    return True

## 1. Creating _Property_ dimension table in SQL

The _Property_ table defined in __Phase I__ is as follows:

<img src="property_sql.png" width="200" align="center"/>

The SQL statement to create the table is as follows:

In [None]:
# DDL for the Property dimension. Every attribute is a NOT NULL categorical
# column whose allowed values are enforced by a CHECK constraint.
# The BEDS constraint validates BEDS itself (it previously re-checked BATHROOMS,
# which left BEDS unconstrained).
# NOTE(review): 'Futton' in the BED_TYPE list is probably meant to be 'Futon' —
# confirm against the source data and the loading code before changing it.
create_table = """
CREATE TABLE Property (
    PROPERTY_ID SERIAL PRIMARY KEY NOT NULL,
    PROPERTY_TYPE VARCHAR(30) CHECK (PROPERTY_TYPE in ('Apartment', 'GuestHouse', 'House', 'Hostel', 'Room')) NOT NULL,
    ROOM_TYPE VARCHAR(30) CHECK (ROOM_TYPE in ('Entire Property', 'Private Room', 'Hotel Room', 'Shared Room')) NOT NULL,
    ACCOMMODATES VARCHAR(10) CHECK (ACCOMMODATES in ('0-2','2-4','4-6','>6')) NOT NULL,
    BATHROOMS VARCHAR(10) CHECK (BATHROOMS in ('0','1','2','3','>=4')) NOT NULL,
    BEDROOMS VARCHAR(10) CHECK (BEDROOMS in ('T0','T1','T2','T3','T>=4')) NOT NULL,
    BEDS VARCHAR(10) CHECK (BEDS in ('0','1','2','3','>=4')) NOT NULL,
    BED_TYPE VARCHAR(30) CHECK (BED_TYPE in ('Real Bed', 'Pull-out Sofa', 'Futton', 'Couch', 'Airbed')) NOT NULL,
    PRICE_SRQT VARCHAR(10) CHECK (PRICE_SRQT in ('Expensive', 'Medium', 'Cheap')) NOT NULL
)
"""

The `create_table` command contains the integrity constraints essential for modelling the dimension.

In [None]:
# Run the CREATE TABLE statement against the local database.
run_sql_command(sql=create_table, host=host_local, database=db_local, user=user_local, password=hpwd)

The dimension `Property` is ready for data __loading__. Now, it's necessary to __transform__ the data, which was previously __extracted__ from its source.

## 2. Pre-processing data table

### 2.1. Loading data into Python

In [4]:
# Locations of the two source CSV files, relative to the notebook.
al_file_path = '../data/Alojamento_Local.csv'
listings_file_path = '../data/airbnb/listings.csv'

# Read both files with pandas' default parser settings (AL registry first, then listings).
# NOTE(review): the output recorded under this cell looks like the tail of a pandas
# warning, presumably about mixed column dtypes — confirm, then consider explicit dtypes.
df_al = pd.read_csv(filepath_or_buffer=al_file_path)
df_listings = pd.read_csv(filepath_or_buffer=listings_file_path)

  interactivity=interactivity, compiler=compiler, result=result)


### 2.2. Merging _listings.csv_ data with _Alojamento_Local.csv_ data

In [5]:
# Keep the listings that declare a license, i.e. neither missing nor 'Exempt'.
has_license = df_listings['license'].notnull() & (df_listings['license'] != 'Exempt')
df_listings_with_license = df_listings[has_license].copy()

# Derive the registration number by stripping the '/AL' marker and any dots.
df_listings_with_license['NrRNAL'] = df_listings_with_license['license'].map(
    lambda license_text: license_text.replace('/AL', '').replace('.', '')
)

# Retain only the rows whose cleaned license number is accepted by intTryParse.
is_integer_like = list(map(intTryParse, df_listings_with_license['NrRNAL']))
df_listings_with_license = df_listings_with_license[is_integer_like]

# Cast the join key to int64 before merging the two DataFrames.
df_listings_with_license['NrRNAL'] = df_listings_with_license['NrRNAL'].astype(np.int64)

# Inner join: only listings whose NrRNAL also appears in df_al survive.
df_result = df_listings_with_license.merge(df_al, how='inner', on='NrRNAL')

In [46]:
# Listing attributes that feed the Property dimension.
df_columns = ['property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type']

# Drop rows with any missing attribute. reset_index() (without drop=True) keeps the
# previous row labels in an 'index' column.
property_candidates = df_result[df_columns]
kept_data = property_candidates.dropna().reset_index().copy()

# Share of rows that survive the removal, as a percentage rounded to two decimals.
kept_share = round((kept_data.shape[0] / property_candidates.shape[0]) * 100, 2)
print('Removing rows with missing data keeps {}% of the original data.'.format(kept_share))

Removing rows with missing data keeps 99.59% of the original data.


# Drafts

This will change in the future, with the addition of the price per square metre.

In [None]:
# convert columns to categorical variables and keep rows with only relevant categories

# Raw 'property_type' values grouped under the category they are collapsed into.
# NOTE(review): the labels 'Guesthouse' and 'Hotel/Hostel' are not among the values
# allowed by the PROPERTY_TYPE check constraint in `create_table` — reconcile before loading.
property_type_groups = {
    'Apartment': ["Apartment", "Serviced apartment", "Aparthotel", "Loft"],
    'House': ["House", "Townhouse", "Villa", "Dome house", "Vacation home", "Lighthouse",
              "Casa particular (Cuba)", "Tiny house", "Farm stay", "Cottage"],
    'Guesthouse': ["Guesthouse", "Guest suite"],
    'Hotel/Hostel': ["Hostel", "Bed and breakfast", "Boutique hotel", "Hotel"],
}
property_type_map = {
    raw_value: category
    for category, raw_values in property_type_groups.items()
    for raw_value in raw_values
}
# Map every target label onto itself so that re-running this cell is a no-op
# instead of deleting rows that were already recoded (e.g. 'Hotel/Hostel').
property_type_map.update({category: category for category in property_type_groups})

# change 'property_type' to four categories, exclude exceptions
# (values missing from the map become NaN and are dropped below)
property_type_recoded = kept_data['property_type'].map(property_type_map)

# Row labels of the excluded listings; kept as a plain list because the
# following recoding steps append to it.
delete_rows = kept_data.index[property_type_recoded.isna()].tolist()

kept_data = (
    kept_data
    .assign(property_type=property_type_recoded)
    .drop(index=delete_rows)
    # the 'index' column left by the earlier reset_index() is no longer needed;
    # errors='ignore' keeps the cell re-runnable once it has been removed
    .drop(columns='index', errors='ignore')
    .reset_index(drop=True)
)

# change 'bathrooms' to five categories, exclude exceptions
bathroom_labels = ['0 bathrooms', '1 bathroom', '2 bathrooms', '3 bathrooms', '4+ bathrooms']
bathroom_edges = [-np.inf, 0.5, 1.5, 2.5, 3.5, np.inf]

# Left-closed bins reproduce the thresholds: < 0.5, [0.5, 1.5), [1.5, 2.5), [2.5, 3.5), >= 3.5.
# Binning the whole column at once avoids writing strings into a numeric column
# while it is being iterated.
bathrooms_binned = pd.cut(kept_data['bathrooms'],
                          bins=bathroom_edges,
                          labels=bathroom_labels,
                          right=False)

# Rows without a usable value (missing values) are recorded in delete_rows.
# NOTE(review): nothing visible in this cell drops these rows afterwards — confirm
# where delete_rows is consumed.
delete_rows.extend(kept_data.index[bathrooms_binned.isna()].tolist())

# Store plain string labels (object dtype) rather than a Categorical.
kept_data['bathrooms'] = bathrooms_binned.astype(object)

# change 'bedrooms' to five categories, exclude exceptions
bedroom_labels = {0: 'T0', 1: 'T1', 2: 'T2', 3: 'T3', 4: 'T4+'}

# Cap the count at 4 so every value >= 4 shares the 'T4+' label, then translate
# the counts in one vectorised step instead of writing strings into the numeric
# column while iterating over it.
bedrooms_recoded = kept_data['bedrooms'].clip(upper=4).map(bedroom_labels)

# Values matching no category (missing, negative or fractional counts) become NaN
# and their row labels are recorded in delete_rows.
# NOTE(review): nothing visible in this cell drops these rows afterwards — confirm
# where delete_rows is consumed.
delete_rows.extend(kept_data.index[bedrooms_recoded.isna()].tolist())

kept_data['bedrooms'] = bedrooms_recoded
        
# Bin 'accommodates' into (up to) four quantile-based intervals; duplicate bin
# edges are dropped, so fewer than four bins may result.
accommodates_counts = kept_data['accommodates']
kept_data['accommodates_new'] = pd.qcut(x=accommodates_counts, q=4, duplicates='drop')