# ETL for the Review dimension

In [199]:
#1. Libraries
import psycopg2 as pg
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
import psycopg2.extras
import math
import numpy as np

In [203]:
#2. set environment variable for psycog2 (for some systems)
os.environ["PGGSSENCMODE"] = "disable"

#3. get the stored Passwords
f=open("credentials.txt", "rt")
pwd=f.readline().strip()  #di   pass 
f.close()

#4. test connections using the password
# FCUL server
host_di = "appserver-01.alunos.di.fc.ul.pt"
db_di = "tpd012"
user_di = "tpd012"

conn = pg.connect(host = host_di, database = db_di, user = user_di, password = pwd)
conn.close()

# Local server
#host = "localhost"
#database="teste"
#user="postgres"
#password="0000"

#conn = pg.connect(host=host,database=database, user=user, password=password)
#conn.close()


### Read csv files

In [106]:
df = pd.read_csv("../data/listings_al.csv")

### Transformation methods

In [107]:
# Check for missing values:
def is_nan(x):
    return (x != x)

In [108]:
def get_rating(score):
    if is_nan(score): return None
    if score < 88: return "Bellow average"
    if score < 93: return "Average"
    if score < 96: return "Good"
    if score < 99: return "Very good"
    if score < 100: return "Excelent"

In [109]:
def get_accuracy(score):
    if is_nan(score): return None
    if score < 9: return "Accurate description"
    return "Description is not accurate"

In [110]:
def get_cleanliness(score):
    if is_nan(score): return None
    if score < 9: return "Not clean"
    return "Clean"

In [111]:
def get_communication(score):
    if is_nan(score): return None
    if score < 9: return "Bad communication"
    return "Good communication"

In [112]:
def get_location(score):
    if is_nan(score): return None
    if score < 9: return "Bad location"
    return "Good location"

### Create clean dataframe

In [113]:
listing_id = [i for i in df['id']]
ratings = [get_rating(s) for s in df['review_scores_rating']]
accuracy = [get_accuracy(s) for s in df['review_scores_accuracy']]
cleanliness = [get_cleanliness(s) for s in df['review_scores_cleanliness']]
communication = [get_communication(s) for s in df['review_scores_communication']]
location = [get_location(s) for s in df['review_scores_location']]

In [100]:
columns = ['listing_id','rating','accuracy','cleanliness','communication','location']
df_review_dimension = pd.DataFrame(np.stack((listing_id,ratings, accuracy, cleanliness,communication,
                                          location),axis=-1), columns = columns)
df_review_dimension.shape

(17168, 6)

In [101]:
def delete_null_rows(df,columns):
    """Removes all records with any empty cells from input DataFrame"""
    processed_df = df[columns].copy()
    total_rows = processed_df.shape[0]
    delete_rows = []
    
    if processed_df.isnull().values.any(): # if there are any null values in DataFrame, process DataFrame
        for index, row in enumerate(processed_df.itertuples(), start = 0):
            if (pd.Series(row).isnull().values.any()): # if row has any null value
                delete_rows.append(index) # add row index to delete list
    
    processed_df.drop(df.index[delete_rows], inplace = True) # delete rows fr
    processed_df = processed_df.reset_index().drop('index', axis = 1)
    print('DataFrame contains {} rows. Deleted {} rows ({}% of total rows)'.format(processed_df.shape[0], len(delete_rows), round(len(delete_rows)*100/total_rows, 2)))
    return processed_df

In [122]:
df_review_dimension = delete_null_rows(df_review_dimension, ['listing_id','rating','accuracy','cleanliness','communication','location'])

DataFrame contains 12830 rows. Deleted 0 rows (0.0% of total rows)


Add primary key:

In [123]:
pk_bf_duplicates = [i for i in range(1, len(df_review_dimension)+1)]
df_review_dimension.insert(0,'pk_bf_duplicates', pk_bf_duplicates)

In [124]:
df_review_dimension

Unnamed: 0,pk_bf_duplicates,listing_id,rating,accuracy,cleanliness,communication,location
0,1,25659,Very good,Description is not accurate,Clean,Good communication,Good location
1,2,29248,Average,Description is not accurate,Clean,Good communication,Good location
2,3,29396,Very good,Description is not accurate,Clean,Good communication,Good location
3,4,29720,Very good,Description is not accurate,Clean,Good communication,Good location
4,5,29915,Good,Description is not accurate,Clean,Good communication,Good location
...,...,...,...,...,...,...,...
12825,12826,41251727,Bellow average,Accurate description,Clean,Good communication,Good location
12826,12827,41397838,Average,Description is not accurate,Clean,Good communication,Good location
12827,12828,41403386,Good,Description is not accurate,Clean,Good communication,Good location
12828,12829,41429051,Average,Description is not accurate,Clean,Good communication,Bad location


Remove duplicates:

In [127]:
columns = ['rating','accuracy','cleanliness','communication','location']

review_dimension_processed = df_review_dimension[columns].drop_duplicates().copy()
review_dimension_processed = review_dimension_processed.reset_index().drop('index', axis = 1)
review_dimension_processed.index += 1

Add new primary key:

In [130]:
pk_aft_duplicates = [i for i in range(1, len(review_dimension_processed)+1)]
review_dimension_processed.insert(0,'pk_aft_duplicates', pk_aft_duplicates)

## Attributing dimension keys to facts

In order to do a mapping between the listing_id (i.e. the facts PK) and the review_ids (FK), two keys were created: one for the dataset before we removed the duplicates and one for after. 

The primary keys created before the duplicate removal have a one-to-one relationship with the listings_id. Here we do an outer join in order to establish correspondence between the old PKs (and therefore the listings_id) and the new PKs.

In [197]:
def key_mapping(df1, df2, pk1, pk2, dimension_name):
    # This function merges two dataframes, and creates a map linking their keys
    df_merged = df1.merge(df2, how='outer')
    df_map = pd.DataFrame()
    df_map['listing_id']= df_merged['listing_id']
    # df_map[pk1]= df_merged[pk1] # Remove this comment if you want to see the middle key in the final df
    df_map[dimension_name+"_id"]= df_merged[pk2]
    
    return df_map

In [198]:
# Do the mapping
key_map = key_mapping(df_review_dimension, review_dimension_processed, "pk_bf_duplicates" , "pk_aft_duplicates", "review")

In [194]:
key_map

Unnamed: 0,listing_id,review_id
0,25659,1
1,29396,1
2,29720,1
3,65878,1
4,103252,1
...,...,...
12825,26582375,46
12826,33126271,46
12827,31012644,47
12828,35116421,47


In [201]:
df_mapping.to_csv('df_listings_review.csv')

### Load Host dimension

In [204]:
delete_table = """
DROP CASCADE TABLE IF EXISTS Review;
"""

create_table = """
CREATE TABLE Review (
    REVIEW_ID SERIAL PRIMARY KEY NOT NULL,
    RATING VARCHAR(30) CHECK (RATING in ('Bellow average','Average','Good','Very good','Excelent')) NOT NULL, 
    ACCURACY VARCHAR(30) CHECK (ACCURACY in ('Accurate description','Description is not accurate')) NOT NULL,
    CLEANLINESS VARCHAR(30) CHECK (CLEANLINESS in ('Clean','Not clean')) NOT NULL,
    COMMUNICATION VARCHAR(30) CHECK (COMMUNICATION in ('Good communication','Bad communication')) NOT NULL,
    LOCATION VARCHAR(30) CHECK (LOCATION in ('Good location','Bad location')) NOT NULL
    )
"""

In [205]:
def excuteSingleSQLstatement(sql, host, database, user, password):
    conn = pg.connect(host=host,database=database, user=user, password=password)
    cur = conn.cursor()
    cur.execute(sql)
    cur.close()
    conn.commit()
    conn.close() 

In [206]:
excuteSingleSQLstatement(delete_table, host_di, db_di, user_di, pwd)
excuteSingleSQLstatement(delete_table, host_di, db_di, user_di, pwd)

DependentObjectsStillExist: cannot drop table review because other objects depend on it
DETAIL:  constraint listings_review_id_fkey on table listings depends on table review
HINT:  Use DROP ... CASCADE to drop the dependent objects too.


In [67]:
#method for bulk insert
def insert_data(df, table_name, conn):
    df_columns = list(df)
    columns = ",".join(df_columns)
    values = "VALUES({})".format(",".join(["%s" for _ in df_columns])) 
    insert_stmt = "INSERT INTO {} ({}) {}".format(table_name,columns,values)
    success = True
    try:
        cursor = conn.cursor()
        pg.extras.execute_batch(cursor, insert_stmt, df.values)
        conn.commit()
        success = True
    except pg.DatabaseError as error:
        success = False
        print(error)
    finally:
        if conn is not None:
            conn.close()
    return success

In [68]:
if len(df_review_dimension) > 0:
    table_name = 'Review'
    conn = pg.connect(host=host, database=database, user=user, password=password)
    success = insert_data(df_review_dimension,table_name, conn)
    conn.close()
    if success == True: print('Data inserted succefully')
else: print('No data to insert')

Data inserted succefully
