In [1]:
# libraries
import psycopg2 as pg
import pandas as pd
import numpy as np
import os

# set environment variable for psycog2 (for some systems)
os.environ["PGGSSENCMODE"] = "disable"

# get the stored passwords
f=open("credentials_david.txt", "rt")
pwd=f.readline().strip()  #di   password
hpwd=f.readline().strip() #home password
f.close()

In [24]:
# di test: WORKING
host_di = "appserver-01.alunos.di.fc.ul.pt"
db_di = "tpd012"
user_di = "tpd012"

conn = pg.connect(host = host_di,
                  database = db_di,
                  user = user_di,
                  password = pwd)
conn.close()

# local test: WORKING
host_local = "localhost"
db_local = "tpd012"
user_local = "postgres"

conn = pg.connect(host = host_local,
                  database = db_local,
                  user = user_local,
                  password=hpwd)
conn.close()

In [29]:
# functions
def run_sql_command(sql, host, database, user, password):
    """Executes a single SQL statement from a string variable and the database credentials"""
    conn = pg.connect(host = host,
                      database = database,
                      user = user,
                      password = password)
    cur = conn.cursor()
    cur.execute(sql)
    cur.close()
    conn.commit()
    conn.close() 

In [None]:
def intTryParse(value):
    """Parse a string to an integer"""
    try:
        a = int(value)
        return True
    except ValueError:
        return False

In [4]:
listings_al_file_path = '../data/listings_al.csv'
df_listings_al = pd.read_csv(listings_al_file_path)

  interactivity=interactivity, compiler=compiler, result=result)


## 2.2. Preprocessing

The defined _'Property'_ dimension table in __Phase I__ is as follows:

<img src="property_schema.png" width="150" align="center"/>

# __CHANGE SCHEMA__

In [5]:
def delete_null_rows(df,columns):
    """Removes all records with any empty cells from input DataFrame"""
    processed_df = df[columns].copy()
    total_rows = processed_df.shape[0]
    delete_rows = []
    
    if processed_df.isnull().values.any(): # if there are any null values in DataFrame, process DataFrame
        for index, row in enumerate(processed_df.itertuples(), start = 0):
            if (pd.Series(row).isnull().values.any()): # if row has any null value
                delete_rows.append(index) # add row index to delete list
    
    processed_df.drop(df.index[delete_rows], inplace = True) # delete rows fr
    processed_df = processed_df.reset_index().drop('index', axis = 1)
    print('DataFrame contains {} rows. Deleted {} rows ({}% of total rows)'.format(processed_df.shape[0], len(delete_rows), round(len(delete_rows)*100/total_rows, 2)))
    return processed_df

In [6]:
df_columns = ['id',
              'property_type',
              'room_type',
              'accommodates',
              'bathrooms',
              'bedrooms',
              'beds',
              'bed_type']

df_property = delete_null_rows(df_listings_al,df_columns)

DataFrame contains 17097 rows. Deleted 71 rows (0.41% of total rows)


`df_property` will be the table containing a record for each fact ID converted to the dimension format. `df_property_dimension` will be the _de facto_ property dimension from which we can convert each fact ID record to the dimension key according to its set of attributes.

### _'property_type_category'_

We start by creating _'property_type_category'_, a set of four categories from _'property_type'_.

In [7]:
def get_property_type_category(value):
# change 'property_type' to four categories, exclude exceptions
    if value in ["Apartment", "Serviced apartment", "Aparthotel","Loft"]:
        return 'Apartment'
    if value in ["House","Townhouse","Villa","Dome house","Vacation home","Lighthouse","Casa particular (Cuba)","Tiny house","Farm stay","Cottage"]:
        return 'House'
    if value in ["Guesthouse","Guest suite"]:
        return 'Guesthouse'
    if value in ["Hostel","Bed and breakfast","Boutique hotel","Hotel"]:
        return 'Hotel/Hostel'
    return None # for the case of missing or invalid values

### _'room_type'_

This attribute is already defined in satisfactory categories.

### _'accommodates'_

In [8]:
def get_accommodates(value):
# change 'property_type' to four categories, exclude exceptions
    if value in [1,2]: return 'Up to 2 guests'
    if value in [3,4]: return 'Up to 4 guests'
    if value in [5,6]: return 'Up to 6 guests'
    return 'Up to 7 guests or more' # no missing values in this column

### _'bathrooms'_

In [9]:
def get_bathrooms(value):
# change 'bathrooms' to five categories, exclude exceptions
    if value < 0.5: return 'No bathrooms'
    if value < 1.5: return '1 bathroom'
    if value < 2.5: return '2 bathrooms'
    if value < 3.5: return '3 bathrooms'
    if value >= 3.5: return '4+ bathrooms'
    return None # for the case of missing or invalid values

### _'bedrooms'_

In [10]:
def get_bedrooms(value):
# change 'bedrooms' to five categories, exclude exceptions
    if value == 0: return 'T0'
    if value == 1: return 'T1'
    if value == 2: return 'T2'
    if value == 3: return 'T3'
    if value >= 4: return 'T4+'
    return None # for the case of missing or invalid values

### _'beds'_

In [11]:
def get_beds(value):
# change 'beds' to five categories, exclude exceptions
    if value == 0: return 'No beds'
    if value == 1: return '1 bed'
    if value == 2: return '2 beds'
    if value == 3: return '3 beds'
    if value >= 4: return '4+ beds'
    return None # for the case of missing or invalid values

## 2.3. Creating the dimension table

In [12]:
dimension = {}

dimension['ID'] = [value for value in df_property['id']]
dimension['property_type_category'] = [get_property_type_category(value) for value in df_property['property_type']]
dimension['property_type'] = [value for value in df_property['property_type']]
dimension['room_type'] = [value for value in df_property['room_type']]
dimension['accommodates'] = [get_accommodates(value) for value in df_property['accommodates']]
dimension['bathrooms'] = [get_bathrooms(value) for value in df_property['bathrooms']]
dimension['bedrooms'] = [get_bedrooms(value) for value in df_property['bedrooms']]
dimension['beds'] = [get_beds(value) for value in df_property['beds']]
dimension['bed_type'] = [value for value in df_property['bed_type']]

In [13]:
df_property_dimension = pd.DataFrame(dimension)
df_property_dimension.index += 1
df_property_dimension

Unnamed: 0,ID,property_type_category,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type
1,25659,Apartment,Apartment,Entire home/apt,Up to 4 guests,1 bathroom,T1,1 bed,Real Bed
2,29248,Apartment,Apartment,Entire home/apt,Up to 4 guests,1 bathroom,T1,2 beds,Real Bed
3,29396,Apartment,Apartment,Entire home/apt,Up to 4 guests,1 bathroom,T1,1 bed,Real Bed
4,29720,Apartment,Apartment,Entire home/apt,Up to 7 guests or more,4+ bathrooms,T4+,4+ beds,Real Bed
5,27899698,Apartment,Apartment,Entire home/apt,Up to 4 guests,2 bathrooms,T2,2 beds,Real Bed
...,...,...,...,...,...,...,...,...,...
17093,41870065,Apartment,Apartment,Entire home/apt,Up to 2 guests,1 bathroom,T1,1 bed,Real Bed
17094,41879410,Apartment,Apartment,Entire home/apt,Up to 4 guests,1 bathroom,T1,3 beds,Real Bed
17095,41882911,Apartment,Apartment,Entire home/apt,Up to 4 guests,1 bathroom,T1,3 beds,Real Bed
17096,41879552,Apartment,Loft,Entire home/apt,Up to 2 guests,1 bathroom,T1,1 bed,Real Bed


In [14]:
df_property_dimension = delete_null_rows(df_property_dimension,df_property_dimension.columns)

DataFrame contains 16626 rows. Deleted 471 rows (2.75% of total rows)


In [15]:
# create df_property_dimension
property_dimension = df_property_dimension[['property_type_category','property_type','room_type','accommodates','bathrooms','bedrooms','beds','bed_type']].drop_duplicates().copy()
property_dimension = property_dimension.reset_index().drop('index', axis = 1)
property_dimension.index += 1

property_dimension

Unnamed: 0,property_type_category,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type
1,Apartment,Apartment,Entire home/apt,Up to 4 guests,1 bathroom,T1,1 bed,Real Bed
2,Apartment,Apartment,Entire home/apt,Up to 4 guests,1 bathroom,T1,2 beds,Real Bed
3,Apartment,Apartment,Entire home/apt,Up to 7 guests or more,4+ bathrooms,T4+,4+ beds,Real Bed
4,Apartment,Apartment,Entire home/apt,Up to 4 guests,2 bathrooms,T2,2 beds,Real Bed
5,Apartment,Apartment,Entire home/apt,Up to 4 guests,1 bathroom,T1,No beds,Real Bed
...,...,...,...,...,...,...,...,...
1020,House,Farm stay,Private room,Up to 6 guests,1 bathroom,T2,4+ beds,Real Bed
1021,Apartment,Apartment,Shared room,Up to 4 guests,1 bathroom,T1,No beds,Real Bed
1022,House,Casa particular (Cuba),Entire home/apt,Up to 4 guests,1 bathroom,T2,4+ beds,Real Bed
1023,Hotel/Hostel,Bed and breakfast,Private room,Up to 6 guests,2 bathrooms,T1,3 beds,Real Bed


## 2.4. Attributing dimension keys to facts

Having our dimension DataFrame ready, we can now use our converted listings table, `df_property` (which contains Property dimension attributes for all valid facts), to assign each fact its corresponding Property dimension foreign key.

In [16]:
# change df_property to property dimension ID
FK = []
for i, row in enumerate(df_property_dimension.itertuples(), start = 1): # creates enumerate object (counter, value1, value2, ...)
    row = list(row[2:]) # remove ID and index
    for j, dim_row in enumerate(property_dimension.itertuples(), start = 1):
        dim_row = list(dim_row[1:]) # remove index
        if row == dim_row: # checks which dimension row the fact is equal to
            FK.append(j)

In [17]:
df_property_dimension['Property'] = FK

In [18]:
df_property_dimension = df_property_dimension[['ID','Property']]
df_property_dimension.index += 1
df_property_dimension

Unnamed: 0,ID,Property
1,25659,1
2,29248,2
3,29396,1
4,29720,3
5,27899698,4
...,...,...
16622,41870065,6
16623,41879410,9
16624,41882911,9
16625,41879552,50


## 2.5. Creating the _Property_ dimension table in SQL

The SQL statement to create the table is as follows:

In [35]:
delete_table = """
DROP TABLE IF EXISTS Property;
"""

create_table = """
CREATE TABLE Property (
    PROPERTY_ID SERIAL PRIMARY KEY NOT NULL,
    PROPERTY_TYPE_CATEGORY VARCHAR(30) CHECK (PROPERTY_TYPE in ('Apartment', 'Guesthouse', 'House', 'Hotel/Hostel')) NOT NULL,
    PROPERTY_TYPE VARCHAR(30) CHECK (PROPERTY_TYPE in ('Apartment','Serviced apartment','Aparthotel','Loft','House','Townhouse','Villa','Dome house','Vacation home','Lighthouse','Casa particular (Cuba)','Tiny house','Farm stay','Cottage','Guesthouse','Guest suite','Hostel','Bed and breakfast','Boutique hotel','Hotel')) NOT NULL,
    ROOM_TYPE VARCHAR(30) CHECK (ROOM_TYPE in ('Entire Property', 'Private Room', 'Hotel Room', 'Shared Room')) NOT NULL,
    ACCOMMODATES VARCHAR(30) CHECK (ACCOMMODATES in ('Up to 2 guests','Up to 4 guests','Up to 6 guests','Up to 7 guests or more')) NOT NULL,
    BATHROOMS VARCHAR(30) CHECK (BATHROOMS in ('No bathrooms','1 bathroom','2 bathrooms','3 bathrooms','4+ bathrooms')) NOT NULL,
    BEDROOMS VARCHAR(10) CHECK (BEDROOMS in ('T0','T1','T2','T3','T4+')) NOT NULL,
    BEDS VARCHAR(10) CHECK (BATHROOMS in ('No beds','1 bed','2 beds','3 beds','4+ beds')) NOT NULL,
    BED_TYPE VARCHAR(30) CHECK (BED_TYPE in ('Real Bed', 'Pull-out Sofa', 'Futton', 'Couch', 'Airbed')) NOT NULL,
)
"""

In [36]:
#creating table
run_sql_command(delete_table, host_di, db_di, user_di, pwd)
run_sql_command(create_table, host_di, db_di, user_di, pwd)

DependentObjectsStillExist: cannot drop table property because other objects depend on it
DETAIL:  constraint listings_property_id_fkey on table listings depends on table property
constraint availability_property_id_fkey on table availability depends on table property
HINT:  Use DROP ... CASCADE to drop the dependent objects too.


The `create_table` command contains the integrity constraints essential for modelling the dimension.

The dimension `Property` is ready for data __loading__. Now, it's necessary to __transform__ the data, which was previously __extracted__ from its source.