In [1]:
# libraries
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import psycopg2 as pg
import psycopg2.extras
from sqlalchemy import create_engine

# set environment variable for psycopg2 (for some systems)
os.environ["PGGSSENCMODE"] = "disable"

# get the stored passwords (first line: DI password, second line: home password)
# the "with" block closes the file even if one of the reads fails
with open("credentials.txt", "rt") as f:
    pwd = f.readline().strip()   # di   password
    hpwd = f.readline().strip()  # home password

In [2]:
# di test: WORKING
host_di = "appserver-01.alunos.di.fc.ul.pt"
db_di = "tpd012"
user_di = "tpd012"

# smoke test: open a connection to the DI database and close it straight away
conn = pg.connect(host=host_di, database=db_di, user=user_di, password=pwd)
conn.close()

# local test: WORKING
#host_local = "localhost"
#db_local = "tpd012"
#user_local = "postgres"

#conn = pg.connect(host = host_local,
#                  database = db_local,
#                  user = user_local,
#                  password=hpwd)
#conn.close()

In [None]:
# load the raw listings table; the path is relative to the notebook's working directory
listings_al_file_path = '../data/listings_al.csv'
df_listings_al = pd.read_csv(listings_al_file_path)

## 2.2. Preprocessing

The defined _'Property'_ dimension table in __Phase I__ is as follows:

<img src="property_schema.png" width="150" align="center"/>

# __CHANGE SCHEMA__

<img src="PropertyETL.png" align="center"/>

In [None]:
def delete_null_rows(df,columns):
    """Return a copy of ``df[columns]`` without the rows that contain null cells.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; it is not modified.
    columns : list-like of column labels
        Columns to keep and to inspect for null values.

    Returns
    -------
    pandas.DataFrame
        The selected columns, with every row holding at least one null value
        removed and the index renumbered from 0.

    A one-line summary (rows kept, rows deleted, percentage deleted) is printed.
    """
    selected = df[columns]
    total_rows = selected.shape[0]

    # dropna() is the vectorised equivalent of inspecting every row by hand;
    # reset_index(drop = True) renumbers the rows without materialising the old
    # index as a column, so a named index or a real 'index' column is handled correctly
    processed_df = selected.dropna(how = 'any').reset_index(drop = True)

    deleted_rows = total_rows - processed_df.shape[0]
    # an empty input has nothing to delete; avoid dividing by zero
    deleted_pct = round(deleted_rows*100/total_rows, 2) if total_rows else 0.0
    print('DataFrame contains {} rows. Deleted {} rows ({}% of total rows)'.format(processed_df.shape[0], deleted_rows, deleted_pct))
    return processed_df

In [None]:
# source columns needed for the Property dimension ('id' identifies the listing record)
property_columns = ['id', 'property_type', 'room_type', 'accommodates',
                    'bathrooms', 'bedrooms', 'beds', 'bed_type']

# keep only those columns and discard every listing with a missing value in any of them
df_property = delete_null_rows(df_listings_al, property_columns)

From `df_property`, we create `df_conv_facts`, a table containing one record for each fact ID, converted to the dimension format. `property_dimension` will be the _de facto_ property dimension, which lets us match each fact ID record to the appropriate dimension foreign key according to its set of attributes.

### _'property_type_category'_

We start by creating _'property_type_category'_, a set of four categories from _'property_type'_.

In [None]:
def get_property_type_category(value):
    """Map a raw 'property_type' value to one of four categories.

    Returns 'Apartment', 'House', 'Guesthouse' or 'Hotel/Hostel'; any other
    value (including missing ones) yields None.
    """
    categories = (
        ('Apartment', ("Apartment", "Serviced apartment", "Aparthotel", "Loft")),
        ('House', ("House", "Townhouse", "Villa", "Dome house", "Vacation home",
                   "Lighthouse", "Casa particular (Cuba)", "Tiny house", "Farm stay", "Cottage")),
        ('Guesthouse', ("Guesthouse", "Guest suite")),
        ('Hotel/Hostel', ("Hostel", "Bed and breakfast", "Boutique hotel", "Hotel")),
    )
    for category, property_types in categories:
        if value in property_types:
            return category
    # missing or unrecognised property type
    return None

### _'room_type'_

This attribute is already defined in satisfactory categories.

### _'accommodates'_

In [None]:
def get_accommodates(value):
    """Map a raw 'accommodates' value to one of four guest-capacity categories.

    1-2, 3-4 and 5-6 guests get their own label; every other value falls into
    'Up to 7 guests or more' (the column is expected to have no missing values).
    """
    capacity_bands = (
        ((1, 2), 'Up to 2 guests'),
        ((3, 4), 'Up to 4 guests'),
        ((5, 6), 'Up to 6 guests'),
    )
    for guest_counts, label in capacity_bands:
        if value in guest_counts:
            return label
    return 'Up to 7 guests or more'

### _'bathrooms'_

In [None]:
def get_bathrooms(value):
    """Map a raw 'bathrooms' value to one of five categories.

    Values are bucketed at the half-bathroom marks (0.5, 1.5, 2.5, 3.5);
    values that cannot be compared (e.g. NaN) yield None.
    """
    upper_bounds = (
        (0.5, 'No bathrooms'),
        (1.5, '1 bathroom'),
        (2.5, '2 bathrooms'),
        (3.5, '3 bathrooms'),
    )
    for bound, label in upper_bounds:
        if value < bound:
            return label
    if value >= 3.5:
        return '4+ bathrooms'
    # NaN fails every comparison above and ends up here
    return None

### _'bedrooms'_

In [None]:
def get_bedrooms(value):
    """Map a raw 'bedrooms' value to one of five categories ('T0' .. 'T3', 'T4+').

    Values that match none of the categories (e.g. NaN or negatives) yield None.
    """
    for bedroom_count, label in enumerate(('T0', 'T1', 'T2', 'T3')):
        if value == bedroom_count:
            return label
    if value >= 4:
        return 'T4+'
    # missing or invalid value
    return None

### _'beds'_

In [None]:
def get_beds(value):
    """Map a raw 'beds' value to one of five categories ('No beds' .. '4+ beds').

    Values that match none of the categories (e.g. NaN or negatives) yield None.
    """
    exact_labels = ('No beds', '1 bed', '2 beds', '3 beds')
    for bed_count, label in enumerate(exact_labels):
        if value == bed_count:
            return label
    if value >= 4:
        return '4+ beds'
    # missing or invalid value
    return None

In [None]:
def convert_facts_property(df_non_null_facts):
    """Convert fact records to the Property dimension format.

    Parameters
    ----------
    df_non_null_facts : pandas.DataFrame
        Fact records with the columns 'id', 'property_type', 'room_type',
        'accommodates', 'bathrooms', 'bedrooms', 'beds' and 'bed_type'.

    Returns
    -------
    pandas.DataFrame
        One row per fact with the columns 'ID', 'property_type_category',
        'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms',
        'beds' and 'bed_type'. Facts for which any converter returned None are
        removed, and the index is renumbered from 0 by `delete_null_rows`.
    """
    # read from the argument, not from the global `df_property`, so the function
    # converts whatever table the caller passes in
    facts = df_non_null_facts

    dimension = {
        'ID': list(facts['id']),
        'property_type_category': [get_property_type_category(value) for value in facts['property_type']],
        'property_type': list(facts['property_type']),
        'room_type': list(facts['room_type']),
        'accommodates': [get_accommodates(value) for value in facts['accommodates']],
        'bathrooms': [get_bathrooms(value) for value in facts['bathrooms']],
        'bedrooms': [get_bedrooms(value) for value in facts['bedrooms']],
        'beds': [get_beds(value) for value in facts['beds']],
        'bed_type': list(facts['bed_type']),
    }

    df_conv_facts = pd.DataFrame(dimension)
    # the converters return None for values outside the dimension's categories;
    # those facts cannot be mapped to a dimension row and are dropped here
    return delete_null_rows(df_conv_facts, df_conv_facts.columns)

In [None]:
# convert every listing that survived the null filter to the Property dimension format
df_conv_facts = convert_facts_property(df_property)

df_conv_facts

## 2.3. Creating the dimension table

In [None]:
# DDL for the Property dimension table.
# - IF NOT EXISTS makes the statement safe to run again on an existing database.
# - PROPERTY_ID is a SERIAL primary key.
# - Every attribute is NOT NULL and restricted by a CHECK constraint to a fixed list of labels;
#   for the derived attributes these are the labels returned by the get_* helper functions,
#   while PROPERTY_TYPE, ROOM_TYPE and BED_TYPE take values passed through from the source data.
create_table = """
CREATE TABLE IF NOT EXISTS Property (
    PROPERTY_ID SERIAL PRIMARY KEY NOT NULL,
    PROPERTY_TYPE_CATEGORY VARCHAR(30) CHECK (PROPERTY_TYPE_CATEGORY in ('Apartment', 'Guesthouse', 'House', 'Hotel/Hostel')) NOT NULL,
    PROPERTY_TYPE VARCHAR(30) CHECK (PROPERTY_TYPE in ('Apartment','Serviced apartment','Aparthotel','Loft','House','Townhouse','Villa','Dome house','Vacation home','Lighthouse','Casa particular (Cuba)','Tiny house','Farm stay','Cottage','Guesthouse','Guest suite','Hostel','Bed and breakfast','Boutique hotel','Hotel')) NOT NULL,
    ROOM_TYPE VARCHAR(30) CHECK (ROOM_TYPE in ('Entire home/apt', 'Private room', 'Hotel room', 'Shared room')) NOT NULL,
    ACCOMMODATES VARCHAR(30) CHECK (ACCOMMODATES in ('Up to 2 guests','Up to 4 guests','Up to 6 guests','Up to 7 guests or more')) NOT NULL,
    BATHROOMS VARCHAR(30) CHECK (BATHROOMS in ('No bathrooms','1 bathroom','2 bathrooms','3 bathrooms','4+ bathrooms')) NOT NULL,
    BEDROOMS VARCHAR(10) CHECK (BEDROOMS in ('T0','T1','T2','T3','T4+')) NOT NULL,
    BEDS VARCHAR(10) CHECK (BEDS in ('No beds','1 bed','2 beds','3 beds','4+ beds')) NOT NULL,
    BED_TYPE VARCHAR(30) CHECK (BED_TYPE in ('Real Bed', 'Pull-out Sofa', 'Futon', 'Couch', 'Airbed')) NOT NULL
)
"""

The `create_table` command contains the integrity constraints essential for modelling the dimension.

In [None]:
# functions
def run_sql_command(sql, host, database, user, password):
    """Execute a single SQL statement on a PostgreSQL database and commit it.

    Parameters
    ----------
    sql : str
        Statement to execute.
    host, database, user, password : str
        Connection credentials passed to ``psycopg2.connect``.

    Raises
    ------
    psycopg2.Error
        If connecting or executing fails; the transaction is rolled back and
        the connection is closed before the error propagates.
    """
    conn = pg.connect(host = host,
                      database = database,
                      user = user,
                      password = password)
    try:
        # "with conn" commits on success and rolls back if the statement fails;
        # "with conn.cursor()" closes the cursor when the block ends
        with conn:
            with conn.cursor() as cur:
                cur.execute(sql)
    finally:
        # psycopg2's "with conn" ends the transaction but does not close the connection
        conn.close()

In [None]:
# create the Property table on the DI database; because the statement uses
# CREATE TABLE IF NOT EXISTS, an already existing table is left untouched
run_sql_command(create_table, host_di, db_di, user_di, pwd)

The dimension table is created empty, to be populated later with incoming data.

### 2.3.1. Adding new data to dimension table

In [None]:
def create_property_dimension(df_conv_facts,columns):
    """Create a Property dimension table from a converted facts DataFrame.

    Parameters
    ----------
    df_conv_facts : pandas.DataFrame
        Facts already converted to the dimension format.
    columns : list-like of column labels
        Attribute columns that make up a dimension row.

    Returns
    -------
    pandas.DataFrame
        The distinct combinations of `columns`, in order of first appearance,
        indexed 1, 2, 3, ...
    """
    # reset_index(drop = True) discards the old row labels directly, so it also
    # works when the input index is named or an 'index' column exists
    property_dimension = df_conv_facts[columns].drop_duplicates().reset_index(drop = True)
    property_dimension.index += 1  # dimension rows are numbered from 1

    return property_dimension

In [None]:
# build the candidate dimension rows from every column except the first one (the fact 'ID')
property_dimension_new = create_property_dimension(df_conv_facts,list(df_conv_facts.columns[1:]))

property_dimension_new

In [None]:
# function to query table and convert it to pandas dataframe
def query_table(conn, table_name):
    """Fetch every row of a database table as a DataFrame.

    `conn` is an open database connection and `table_name` is inserted
    verbatim into the query text.
    """
    statement = "select * from {};".format(table_name)
    table_df = pd.read_sql_query(sql=statement, con=conn)
    return table_df

# for this function to run, the dataframes must have the same columns, in the same order
# NOTE(review): a three-argument `get_data_to_insert(df1, df2, columns)` is defined in a later
# cell of this notebook and replaces this definition as soon as that cell runs; the only call
# in the notebook passes three arguments, so this version appears to be unused.
def get_data_to_insert(df_etl, df_sql):
    """Returns data valid for insertion in dimension from a new ETL-processed DataFrame

    Cells of `df_etl` that `isin` reports as present in `df_sql` are blanked out,
    then rows in which every cell was blanked are dropped.
    """
    return df_etl[~df_etl.isin(df_sql)].dropna(how = 'all') # checks which rows are not yet in the dimension

# function for bulk insert
def insert_data(df, table_name, conn):
    """Bulk-insert the rows of a DataFrame into a database table.

    The DataFrame's column names are used as the target column names. The
    connection is always closed before returning, whether or not the insert
    succeeded.

    Returns True when the batch was committed, False when a database error
    occurred (the error is printed).
    """
    field_names = list(df)
    placeholders = ",".join("%s" for _ in field_names)
    values = "VALUES({})".format(placeholders)
    insert_stmt = "INSERT INTO {} ({}) {}".format(table_name, ",".join(field_names), values)
    try:
        cursor = conn.cursor()
        psycopg2.extras.execute_batch(cursor, insert_stmt, df.values)
        conn.commit()
    except pg.DatabaseError as error:
        print(error)
        return False
    else:
        return True
    finally:
        # runs before either return above takes effect
        if conn is not None:
            conn.close()

In [None]:
# retrieve Property dimension table
conn = pg.connect(host = host_di,
                  database = db_di,
                  user = user_di,
                  password = pwd)
try:
    property_dimension_old = query_table(conn, 'property')
finally:
    # release the connection even if the query fails
    conn.close()

# use the database key as the DataFrame index (no in-place mutation, so the cell can be re-run safely)
property_dimension_old = property_dimension_old.set_index('property_id')

property_dimension_old

In [None]:
# checks which rows from new data will be inserted into database dimension table
def get_data_to_insert(df1, df2, columns):
    """Return the rows of df1 whose values in `columns` do not occur as a row of df2.

    Rows are compared as tuples of their `columns` values; the returned rows
    keep df1's index labels and all of df1's columns.
    """
    candidate_rows = df1[columns].apply(tuple, axis=1)
    existing_rows = df2[columns].apply(tuple, axis=1)
    already_present = candidate_rows.isin(existing_rows)
    return df1[~already_present]

In [None]:
# compares the attribute rows of the new data with those of the database dimension;
# the keys live in the index of both frames, so every column takes part in the comparison
dimension_insert = get_data_to_insert(property_dimension_new,
                                      property_dimension_old,
                                      list(property_dimension_new.columns)).copy()

# number the new rows consecutively after the highest key already stored, so that a new
# key can never collide with an existing one (position + row count can, once keys have gaps)
last_property_id = int(property_dimension_old.index.max()) if len(property_dimension_old) else 0
dimension_insert.index = range(last_property_id + 1, last_property_id + 1 + len(dimension_insert))

In [None]:
# uploading new data to dimension in database
# - the URL scheme must be "postgresql://": SQLAlchemy 1.4+ no longer accepts the "postgres://" alias
# - user and password are URL-escaped so that characters such as '@' or '/' do not break the URL
engine = create_engine("postgresql://{user}:{password}@{host}:5432/{database}".format(user = quote_plus(user_di),
                                                                                      password = quote_plus(pwd),
                                                                                      host = host_di,
                                                                                      database = db_di))
# the DataFrame index holds the new keys and is written to the 'property_id' column
dimension_insert.to_sql('property',
                          con = engine,
                          if_exists = 'append',
                          index = True,
                          index_label = 'property_id')

## 2.4. Attributing dimension keys to facts

With the updated dimension DataFrame `property_dimension_updated` ready, we can now use our converted listings table, `df_conv_facts` (which contains the Property dimension attributes for all valid facts), to assign each fact its corresponding Property dimension foreign key.

<img src="PropertyETL2.png" align="center"/>

In [None]:
# retrieve updated Property dimension table for mapping
conn = pg.connect(host = host_di,
                  database = db_di,
                  user = user_di,
                  password = pwd)
try:
    property_dimension_updated = query_table(conn, 'property')
finally:
    # release the connection even if the query fails
    conn.close()

# make 'property_id' the first column and leave a plain 0-based index
property_dimension_updated = property_dimension_updated.set_index('property_id').reset_index()

property_dimension_updated

In [None]:
def key_mapping(df1, df2, pk1, pk2):
    """Merge two DataFrames and return a table linking their keys.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Tables to merge; they are joined on all the columns they share.
    pk1, pk2 : column labels
        Key columns to keep from the merged table.

    Returns
    -------
    pandas.DataFrame
        Two columns, `pk1` and `pk2`, with one row per matched pair of keys.
        Rows without a counterpart in the other table are removed, and each
        key column keeps the dtype it had in its source table.
    """
    df_merged = df1.merge(df2, how='outer')

    key_columns = list(dict.fromkeys((pk1, pk2)))  # de-duplicated, order preserved
    df_map = df_merged[key_columns]
    df_map = delete_null_rows(df_map, df_map.columns)

    # the outer join fills unmatched rows with NaN, which upcasts integer keys to
    # float; once those rows are gone the original dtypes can be restored
    key_dtypes = {}
    for key, source in ((pk1, df1), (pk2, df2)):
        if key in source.columns:
            key_dtypes[key] = source[key].dtype

    return df_map.astype(key_dtypes)

In [None]:
# match fact IDs with FKs in dimension
df_mapping = key_mapping(df_conv_facts, property_dimension_updated, 'ID', 'property_id')
# save the ID -> property_id mapping; to_csv also writes the DataFrame index as the first column
df_mapping.to_csv('df_listings_property.csv')

This table can then be merged with all other corresponding tables for the remaining dimensions to produce each fact record in the facts table. It will be used both in the _Listings_ and _Availability_ facts tables.