## <span style=color:blue>DISCUSSION 5 - BENCHMARKING and VISUALIZATION</span>

In [4]:
import csv
import json
import os
import pprint
import sys
import time
from datetime import datetime
from urllib.parse import quote_plus

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import yaml
from dotenv import load_dotenv
from sqlalchemy import create_engine, text as sql_text

In [5]:
# Create a utilities file util.py in a folder benchmarking and import it.
# The membership check keeps sys.path free of duplicate entries when this
# cell is re-run in the same kernel.
if 'benchmarking/' not in sys.path:
    sys.path.append('benchmarking/')
import util

In [6]:
# Smoke test: confirm that util.py has been imported correctly.
# The text printed below the cell comes from this call.
util.hello_world()

Hello World!


In [7]:
# Load the env file into the process environment.
# NOTE(review): variables.env is expected to define the DISC_4_* variables
# read in the next cell — confirm against the file itself.
# load_dotenv(...) is the last expression of the cell, so its return value is
# what gets displayed as the cell output.

dotenv_path = 'variables.env'
load_dotenv(dotenv_path=dotenv_path)

True

In [8]:
# Import the env variables

# Without a path, load_dotenv() looks for a default `.env` file; variables that
# are already set in the environment are not overridden by it.
load_dotenv()

schema = os.getenv('DISC_4_SCHEMA')
port = os.getenv('DISC_4_PORT')
host = os.getenv('DISC_4_HOST')
database = os.getenv('DISC_4_DB')

# os.getenv returns None for an unset variable, which would otherwise end up
# in the connection URL as the literal text "None". Fail here with a message
# that names the missing settings instead of failing later at connect time.
missing_vars = [
    name
    for name, value in (
        ('DISC_4_SCHEMA', schema),
        ('DISC_4_PORT', port),
        ('DISC_4_HOST', host),
        ('DISC_4_DB', database),
    )
    if not value
]
if missing_vars:
    raise RuntimeError(
        'Missing environment variable(s): ' + ', '.join(missing_vars)
        + '; check that the env file was found and defines them.'
    )
    

In [9]:
# Create the db engine
#
# Credentials default to the local setup used so far (postgres/postgres) but
# can be overridden with DISC_4_USER / DISC_4_PASSWORD in the env file, so a
# real password never has to be written into the notebook.
db_user = os.getenv('DISC_4_USER', 'postgres')
db_password = os.getenv('DISC_4_PASSWORD', 'postgres')

# quote_plus escapes characters such as '@' or ':' that would otherwise be
# parsed as URL delimiters.
db_url = (
    f"postgresql+psycopg2://{quote_plus(db_user)}:{quote_plus(db_password)}"
    f"@{host}:{port}/{database}"
)

# search_path makes unqualified table names resolve inside the chosen schema.
db_eng = create_engine(db_url,
                       connect_args={'options': '-csearch_path={}'.format(schema)},
                       isolation_level = 'SERIALIZABLE')

# create_engine() is lazy: no connection is opened until the engine is first
# used, so this message only confirms that the engine object was built.
print("Successfully created db engine.")

Successfully created db engine.


In [10]:
# Count the reviews per year (also a quick check that the reviews table is reachable)

q = """select left(to_char(date, 'YYYY-MM-DD'),4) as year, count(*)
from reviews
group by year
order by year"""

# Fetch the rows while the connection is still open: once the `with` block
# exits, the connection has been released and the result object may no longer
# be readable.
with db_eng.connect() as conn:
    result = conn.execute(sql_text(q))
    result_list = result.fetchall()

pprint.pp(result_list)

[('2009', 56),
 ('2010', 449),
 ('2011', 1905),
 ('2012', 3872),
 ('2013', 7317),
 ('2014', 14203),
 ('2015', 28465),
 ('2016', 42825),
 ('2017', 39464),
 ('2018', 41836),
 ('2019', 41273),
 ('2020', 10239),
 ('2021', 18463),
 ('2022', 26739),
 ('2023', 22383),
 ('2024', 511)]


In [None]:
# P1: write build_query_listings_reviews, a function that builds the reviews
# query for one year. The window below (calendar year 2015) is the sample
# input to develop it against.
date_start, date_end = '2015-01-01', '2015-12-31'

# BUILD THE QUERY HERE ------- P1



In [None]:
# Create a function to build the queries for the years 2009 to 2024 ------- P2



In [1]:
# Time repeated executions of one query from q_dict and summarise the timings.
# Requires q_dict (built in P2), db_eng and the util module to be defined.

count = 10

time_list = []
for _ in range(count):
    started_at = datetime.now()
    # A fresh connection is opened and released for every single execution
    with db_eng.connect() as conn:
        df = pd.read_sql(q_dict['listings_join_review_2015'], con=conn)
    finished_at = datetime.now()

    time_list.append(util.time_diff(started_at, finished_at))

pprint.pp(time_list)

# Label/value pairs, printed space-separated on one line
summary = (
    'mean', round(sum(time_list) / len(time_list), 4),
    'min', round(min(time_list), 4),
    'max', round(max(time_list), 4),
    'std', round(np.std(time_list), 4),
)
print(*summary)

NameError: name 'datetime' is not defined

In [None]:
# Adding and dropping indexes in the table reviews ---- P3

# WRITE THE QUERIES BELOW


# DB CONNECTION
# engine.begin() commits when the block exits without an error. Inside a plain
# connect() block SQLAlchemy 2.x rolls uncommitted work back on close, so the
# CREATE INDEX would be discarded and the "with index" measurements would
# silently run without it.
with db_eng.begin() as conn:
    conn.execute(sql_text(q_create_date_in_reviews))
    # Listed inside the same transaction, so the new index is already visible
    result_reviews_add = conn.execute(sql_text(q_show_indexes_for_reviews))
    print('The set of indexes on reviews is: ')
    print(result_reviews_add.all())

In [None]:
# Calculate statistics for each year ---- P4

# Each query is executed this many times
count = 20

# Results container; the 'with_bm' entry starts empty and is meant to be
# filled by the loop written below.
perf_details = {'with_bm': {}}

# Iterate through all the queries in q_dict --- BUILD THE QUERY BELOW


print(perf_details)
    

In [None]:
# Create a separate function for the above and put it in the util file. ------ P5
# Run it again to be sure.

# Repetition count handed to util.calc_time_diff_per_year further down
count = 20


In [None]:
# Drop the index on date in reviews

# engine.begin() commits on a clean exit. Inside a plain connect() block
# SQLAlchemy 2.x rolls the DROP INDEX back on close, which would leave the
# index in place for the "without index" measurements.
with db_eng.begin() as conn:
    conn.execute(sql_text(q_drop_date_in_reviews))

In [None]:
# Calculate the same metrics for reviews without the index

perf_details['without_bm'] = util.calc_time_diff_per_year(db_eng, count, q_dict)

# Only the pprint module is imported (there is no `pp` name), so use
# pprint.pp as the other cells do.
pprint.pp(perf_details)

In [None]:
# We need a way to save this data somewhere....save it in a json file (perf_data.json)

perf_file = 'perf_data.json'

try:
    old_perf_summary = util.fetch_perf_data(perf_file)
    print('Successfully read file perf_data/' + perf_file)
except Exception as exc:
    # `Exception` rather than a bare except, so that interrupting the kernel
    # (KeyboardInterrupt) is not swallowed; the reason is printed so that a
    # corrupt file is not mistaken for a missing one.
    print('Not successful in finding file perf_data/' + perf_file + '; so creating it')
    print('Reason:', repr(exc))
    old_perf_summary = {}

# Written exactly once, whether or not a previous file could be read
util.write_perf_data(perf_details, perf_file)

# With this we have completed the benchmarking test using a single index on the date column of the reviews table
# However let's include visualization to see our results

In [None]:
# Load JSON data from a file ------ P6
# Reads the saved performance data back from disk into `data`.
with open('perf_data/perf_data.json', 'r') as perf_fh:
    data = json.load(perf_fh)

# Show what was loaded, indented and with keys sorted
pretty_json = json.dumps(data, indent=4, sort_keys=True)
print(pretty_json)

In [None]:
# Convert the json into dataframes ----- P7
# Each scenario's nested dict becomes one frame; transposing puts the
# outer-level keys on the index and the inner-level keys in the columns.

df_with_bm = pd.DataFrame(data['with_bm']).T
df_without_bm = pd.DataFrame(data['without_bm']).T

for frame in (df_with_bm, df_without_bm):
    print(frame)

In [None]:
# Plot to compare the performances with and without indexes with respect to average time ----- P8
# Also include the average time comparison lines



In [None]:
# Plot to compare the performances with and without indexes with respect to standard deviation

plt.figure(figsize=(14, 7))

# Plot standard deviations. The colours are set explicitly so that each series
# matches its dashed average line below (and the scatter plot that follows);
# with the default colour cycle the lines could not be matched to their series.
plt.plot(df_with_bm.index, df_with_bm['std'], color='blue', label='With BM', marker='o')
plt.plot(df_without_bm.index, df_without_bm['std'], color='red', label='Without BM', marker='x')

# Calculate and plot the average standard deviation for with_bm and without_bm
avg_std_with_bm = df_with_bm['std'].mean()
avg_std_without_bm = df_without_bm['std'].mean()

plt.axhline(y=avg_std_with_bm, color='blue', linestyle='--', label=f'Avg. Std With BM ({avg_std_with_bm:.4f})')
plt.axhline(y=avg_std_without_bm, color='red', linestyle='--', label=f'Avg. Std Without BM ({avg_std_without_bm:.4f})')

plt.xlabel('Year')
plt.ylabel('Standard Deviation of Time')
plt.title('Standard Deviation Comparison Over Years')
plt.xticks(rotation=45)
plt.legend()
plt.show()

In [None]:
# Scatter plot of the maximum time per entry, with and without indexes,
# plus one dashed horizontal line per scenario marking the average maximum.

plt.figure(figsize=(14, 7))

# (frame, colour, legend label) for each scenario, drawn in this order
scatter_series = (
    (df_with_bm, 'blue', 'With BM'),
    (df_without_bm, 'red', 'Without BM'),
)
for frame, colour, caption in scatter_series:
    plt.scatter(frame.index, frame['max'], color=colour, label=caption)

# Average of the maximum times for with_bm and without_bm
avg_max_with_bm = df_with_bm['max'].mean()
avg_max_without_bm = df_without_bm['max'].mean()

reference_lines = (
    (avg_max_with_bm, 'blue', f'Avg. Max Time With BM ({avg_max_with_bm:.4f})'),
    (avg_max_without_bm, 'red', f'Avg. Max Time Without BM ({avg_max_without_bm:.4f})'),
)
for level, colour, caption in reference_lines:
    plt.axhline(y=level, color=colour, linestyle='--', label=caption)

plt.xlabel('Year')
plt.ylabel('Maximum Time')
plt.title('Maximum Time Scatter Plot Comparison')
plt.xticks(rotation=45)
plt.legend()
plt.show()