# Start the system

In [61]:
#1. Libraries
import psycopg2 as pg
import pandas as pd
import os
import re
import pycountry_convert as pc
import matplotlib.pyplot as plt
import datetime
import psycopg2.extras
import math
import numpy as np


In [54]:
#2. set environment variable for psycog2 (for some systems)
os.environ["PGGSSENCMODE"] = "disable"

#3. get the stored Passwords
f=open("credentials.txt", "rt")
pwd=f.readline().strip()  #di   pass 
f.close()

#4. test connections using the password
# FCUL server
#host_di = "appserver-01.alunos.di.fc.ul.pt"
#db_di = "tpd012"
#user_di = "tpd012"

#conn = pg.connect(host = host_di, database = db_di, user = user_di, password = pwd)
#conn.close()

# Local server
host = "localhost"
database="teste"
user="postgres"
password="0000"

conn = pg.connect(host=host,database=database, user=user, password=password)
conn.close()


### Read csv files

In [5]:
df = pd.read_csv("../data/listings_al.csv")

  interactivity=interactivity, compiler=compiler, result=result)


### Transformation methods

In [38]:
# Check for missing values:
def is_nan(x):
    return (x != x)

In [39]:
def get_rating(score):
    if is_nan(score): return None
    if score < 88: return "Bellow average"
    if score < 93: return "Average"
    if score < 96: return "Good"
    if score < 99: return "Very good"
    if score < 100: return "Excelent"

In [40]:
def get_accuracy(score):
    if is_nan(score): return None
    if score < 9: return "Accurate description"
    return "Description is not accurate"

In [41]:
def get_cleanliness(score):
    if is_nan(score): return None
    if score < 9: return "Not clean"
    return "Clean"

In [42]:
def get_communication(score):
    if is_nan(score): return None
    if score < 9: return "Bad communication"
    return "Good communication"

In [43]:
def get_location(score):
    if is_nan(score): return None
    if score < 9: return "Bad location"
    return "Good location"

### Create clean dataframe

In [44]:
ratings = [get_rating(s) for s in df['review_scores_rating']]
accuracy = [get_accuracy(s) for s in df['review_scores_accuracy']]
cleanliness = [get_cleanliness(s) for s in df['review_scores_cleanliness']]
communication = [get_communication(s) for s in df['review_scores_communication']]
location = [get_location(s) for s in df['review_scores_location']]

In [45]:
columns = ['rating','accuracy','cleanliness','communication','location']
df_review_dimension = pd.DataFrame(np.stack((ratings, accuracy, cleanliness,communication,
                                          location),axis=-1), columns = columns)
df_review_dimension.shape

(17168, 5)

In [46]:
def delete_null_rows(df,columns):
    """Removes all records with any empty cells from input DataFrame"""
    processed_df = df[columns].copy()
    total_rows = processed_df.shape[0]
    delete_rows = []
    
    if processed_df.isnull().values.any(): # if there are any null values in DataFrame, process DataFrame
        for index, row in enumerate(processed_df.itertuples(), start = 0):
            if (pd.Series(row).isnull().values.any()): # if row has any null value
                delete_rows.append(index) # add row index to delete list
    
    processed_df.drop(df.index[delete_rows], inplace = True) # delete rows fr
    processed_df = processed_df.reset_index().drop('index', axis = 1)
    print('DataFrame contains {} rows. Deleted {} rows ({}% of total rows)'.format(processed_df.shape[0], len(delete_rows), round(len(delete_rows)*100/total_rows, 2)))
    return processed_df

In [47]:
df_review_dimension = delete_null_rows(df_review_dimension, ['rating','accuracy','cleanliness','communication','location'])

DataFrame contains 12830 rows. Deleted 4338 rows (25.27% of total rows)


Remove duplicates:

In [48]:
df_review_dimension = df_review_dimension.drop_duplicates()
df_review_dimension.shape

(47, 5)

Add primary key:

In [49]:
pks = [i for i in range(1,df_review_dimension.shape[0]+1)]
df_review_dimension.insert(0, "review_id", pks, True) 

In [50]:
df_review_dimension

Unnamed: 0,review_id,rating,accuracy,cleanliness,communication,location
0,1,Very good,Description is not accurate,Clean,Good communication,Good location
1,2,Average,Description is not accurate,Clean,Good communication,Good location
4,3,Good,Description is not accurate,Clean,Good communication,Good location
9,4,Bellow average,Description is not accurate,Clean,Good communication,Good location
12,5,Average,Description is not accurate,Clean,Bad communication,Good location
21,6,Excelent,Description is not accurate,Clean,Good communication,Good location
29,7,Average,Description is not accurate,Not clean,Good communication,Good location
33,8,Bellow average,Accurate description,Not clean,Good communication,Good location
39,9,Bellow average,Accurate description,Not clean,Bad communication,Bad location
55,10,Bellow average,Description is not accurate,Not clean,Good communication,Good location


### Load Host dimension

In [64]:
delete_table = """
DROP TABLE IF EXISTS Review;
"""

create_table = """
CREATE TABLE Review (
    REVIEW_ID SERIAL PRIMARY KEY NOT NULL,
    RATING VARCHAR(30) CHECK (RATING in ('Bellow average','Average','Good','Very good','Excelent')) NOT NULL, 
    ACCURACY VARCHAR(30) CHECK (ACCURACY in ('Accurate description','Description is not accurate')) NOT NULL,
    CLEANLINESS VARCHAR(30) CHECK (CLEANLINESS in ('Clean','Not clean')) NOT NULL,
    COMMUNICATION VARCHAR(30) CHECK (COMMUNICATION in ('Good communication','Bad communication')) NOT NULL,
    LOCATION VARCHAR(30) CHECK (LOCATION in ('Good location','Bad location')) NOT NULL
    )
"""

In [65]:
def excuteSingleSQLstatement(sql, host, database, user, password):
    conn = pg.connect(host=host,database=database, user=user, password=password)
    cur = conn.cursor()
    cur.execute(sql)
    cur.close()
    conn.commit()
    conn.close() 

In [66]:
excuteSingleSQLstatement(delete_table, host, database, user, password)
excuteSingleSQLstatement(create_table, host, database, user, password)

In [67]:
#method for bulk insert
def insert_data(df, table_name, conn):
    df_columns = list(df)
    columns = ",".join(df_columns)
    values = "VALUES({})".format(",".join(["%s" for _ in df_columns])) 
    insert_stmt = "INSERT INTO {} ({}) {}".format(table_name,columns,values)
    success = True
    try:
        cursor = conn.cursor()
        pg.extras.execute_batch(cursor, insert_stmt, df.values)
        conn.commit()
        success = True
    except pg.DatabaseError as error:
        success = False
        print(error)
    finally:
        if conn is not None:
            conn.close()
    return success

In [68]:
if len(df_review_dimension) > 0:
    table_name = 'Review'
    conn = pg.connect(host=host, database=database, user=user, password=password)
    success = insert_data(df_review_dimension,table_name, conn)
    conn.close()
    if success == True: print('Data inserted succefully')
else: print('No data to insert')

Data inserted succefully
