## 0. Basic setup (library imports, database connection, function setup)

In [8]:
# libraries
import psycopg2 as pg
import pandas as pd
import numpy as np
import os

# set environment variable for psycog2 (for some systems)
os.environ["PGGSSENCMODE"] = "disable"

# get the stored passwords
f=open("credentials_david.txt", "rt")
pwd=f.readline().strip()  #di   password
hpwd=f.readline().strip() #home password
f.close()

In [None]:
# di test: WORKING
#host_di = "appserver-01.alunos.di.fc.ul.pt"
#db_di = "tpd012"
#user_di = "tpd012"

#conn = pg.connect(host = host_di,
#                  database = db_di,
#                  user = user_di,
#                  password = pwd)
#conn.close()

# local test: WORKING
host_local = "localhost"
db_local = "tpd012"
user_local = "postgres"

conn = pg.connect(host = host_local,
                  database = db_local,
                  user = user_local,
                  password=hpwd)
conn.close()

In [6]:
# functions
def run_sql_command(sql, host, database, user, password):
    """Executes a single SQL statement from a string variable and the database credentials"""
    conn = pg.connect(host = host,
                      database = database,
                      user = user,
                      password = password)
    cur = conn.cursor()
    cur.execute(sql)
    cur.close()
    conn.commit()
    conn.close() 
    
def intTryParse(value):
    """Parse a string to an integer"""
    try:
        a = int(value)
        return True
    except ValueError:
        return False

## 1. Creating _Property_ dimension table in SQL

The defined _Property_ table in __Phase I__ is as follows:

<img src="property_sql.png" width="200" align="center"/>

The SQL statement to create the table is as follows:

In [None]:
create_table = """
CREATE TABLE Property (
    PROPERTY_ID SERIAL PRIMARY KEY NOT NULL,
    PROPERTY_TYPE VARCHAR(30) CHECK (PROPERTY_TYPE in ('Apartment', 'GuestHouse', 'House', 'Hostel', 'Room')) NOT NULL,
    ROOM_TYPE VARCHAR(30) CHECK (ROOM_TYPE in ('Entire Property', 'Private Room', 'Hotel Room', 'Shared Room')) NOT NULL,
    ACCOMMODATES VARCHAR(10) CHECK (ACCOMMODATES in ('0-2','2-4','4-6','>6')) NOT NULL,
    BATHROOMS VARCHAR(10) CHECK (BATHROOMS in ('0','1','2','3','>=4')) NOT NULL,
    BEDROOMS VARCHAR(10) CHECK (BEDROOMS in ('T0','T1','T2','T3','T>=4')) NOT NULL,
    BEDS VARCHAR(10) CHECK (BATHROOMS in ('0','1','2','3','>=4')) NOT NULL,
    BED_TYPE VARCHAR(30) CHECK (BED_TYPE in ('Real Bed', 'Pull-out Sofa', 'Futton', 'Couch', 'Airbed')) NOT NULL,
    PRICE_SRQT VARCHAR(10) CHECK (PRICE_SRQT in ('Expensive', 'Medium', 'Cheap')) NOT NULL
)
"""

The `create_table` command contains the integrity constraints essential for modelling the dimension.

In [None]:
# creating table
run_sql_command(create_table,
                host_local,
                db_local,
                user_local,
                hpwd)

The dimension `Property` is ready for data __loading__. Now, it's necessary to __transform__ the data, which was previously __extracted__ from its source.

## 2. Pre-processing data table

### 2.1. Loading data into Python

In [5]:
listings_file_path = '../data/airbnb/listings.csv'
al_file_path = '../data/Alojamento_Local.csv'
df_al = pd.read_csv(al_file_path)
df_listings = pd.read_csv(listings_file_path)

  interactivity=interactivity, compiler=compiler, result=result)


### 2.2. Merging _listings.csv_ data with _Alojamento_Local.csv_ data

In [9]:
# get only listings where 'license' is not NaN and not 'Exempt'
df_listings_with_license = df_listings[(~df_listings['license'].isnull()) & (df_listings['license'] != 'Exempt')]

# string replace
df_listings_with_license['NrRNAL'] = [s.replace('/AL','').replace('.','') for s in df_listings_with_license['license']]

# get only records where license number can be converted to an integer 
df_listings_with_license = df_listings_with_license[[intTryParse(s) for s in df_listings_with_license['NrRNAL']]]

# convert NrRNAL to an integer before merging the two DataFrames
df_listings_with_license['NrRNAL'] = df_listings_with_license['NrRNAL'].astype(np.int64)

# inner join of both DataFrames
df_result = pd.merge(df_listings_with_license, df_al, how='inner', on='NrRNAL')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [46]:
# keep only columns which are relevant for Property dimension
df_columns = ['property_type',
              'room_type',
              'accommodates',
              'bathrooms',
              'bedrooms',
              'beds',
              'bed_type',
              'Modalidade']

# removing rows with missing data
kept_data = df_result[df_columns].dropna()
print('Removing rows with missing data keeps {}% of the original data.'.format( round( (kept_data.shape[0] / df_result[df_columns].shape[0]) * 100, 2) ) )

Removing rows with missing data keeps 99.59% of the original data.


In [72]:
unique_data = kept_data.drop_duplicates()
unique_data

Unnamed: 0,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,Modalidade
0,Apartment,Entire home/apt,3,1.0,1.0,1.0,Real Bed,Apartamento
1,Apartment,Entire home/apt,4,1.0,1.0,2.0,Real Bed,Apartamento
2,Apartment,Entire home/apt,4,1.0,1.0,1.0,Real Bed,Apartamento
3,Apartment,Entire home/apt,16,8.0,9.0,14.0,Real Bed,Estabelecimento de Hospedagem
4,Apartment,Entire home/apt,4,2.0,2.0,2.0,Real Bed,Estabelecimento de Hospedagem
...,...,...,...,...,...,...,...,...
17116,House,Entire home/apt,8,1.0,2.0,4.0,Real Bed,Apartamento
17117,Apartment,Entire home/apt,8,2.0,2.0,4.0,Real Bed,Moradia
17118,Villa,Entire home/apt,15,4.5,5.0,8.0,Real Bed,Moradia
17146,Apartment,Shared room,8,1.0,1.0,8.0,Real Bed,Moradia
