## <span style=color:blue>DISCUSSION 5 - BENCHMARKING and VISUALIZATION</span>

In [1]:
import sys
import json
import csv
import yaml

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

import time
from datetime import datetime

import pprint

import psycopg2
from sqlalchemy import create_engine, text as sql_text

In [2]:
# Create an utilities file util.py in a folder benchmarking and import it
sys.path.append('benchmarking/')
import util_main as util

In [3]:
# test that utils.py has been imported well
util.hello_world()

Hello World!


In [4]:
# Load the env file 

dotenv_path = 'variables.env'
load_dotenv(dotenv_path=dotenv_path)

True

In [5]:
# Import the env variables

load_dotenv()

schema = os.getenv('DISC_4_SCHEMA')
port = os.getenv('DISC_4_PORT')
host = os.getenv('DISC_4_HOST')
database = os.getenv('DISC_4_DB')
    

In [6]:
# Create the db engine 

db_eng = create_engine(f"postgresql+psycopg2://postgres:postgres@{host}:{port}/{database}",
                       connect_args={'options': '-csearch_path={}'.format(schema)},
                       isolation_level = 'SERIALIZABLE')

print("Successfully created db engine.")

Successfully created db engine.


In [7]:
# Check to see the count of tables 

q = """select left(to_char(date, 'YYYY-MM-DD'),4) as year, count(*)
from reviews
group by year
order by year"""

with db_eng.connect() as conn:
    result = conn.execute(sql_text(q))

result_list = result.fetchall()

pprint.pp(result_list)

[('2009', 56),
 ('2010', 449),
 ('2011', 1905),
 ('2012', 3872),
 ('2013', 7317),
 ('2014', 14203),
 ('2015', 28465),
 ('2016', 42825),
 ('2017', 39464),
 ('2018', 41836),
 ('2019', 41273),
 ('2020', 10239),
 ('2021', 18463),
 ('2022', 26739),
 ('2023', 22383),
 ('2024', 511)]


In [8]:
# Create a function to build a query for reviews for each year

date_start = '2015-01-01'
date_end = '2015-12-31'

q = util.build_query_listings_reviews(date_start, date_end)

print(q)

SELECT DISTINCT l.id, l.name
FROM listings l, reviews r 
WHERE l.id = r.listing_id
  AND r.date >= '2015-01-01'
  AND r.date <= '2015-12-31'
ORDER BY l.id;


In [9]:
# Create function to build queries from the year 2009 to 2024

q_dict = {}

for yr in range(2009,2025):
    q_name = 'listings_join_review_' + str(yr)
    date_start = str(yr) + '-01-01'
    date_end = str(yr) + '-12-31'
    q_dict[q_name] = util.build_query_listings_reviews(date_start, date_end)
    
pprint.pp(q_dict)


{'listings_join_review_2009': 'SELECT DISTINCT l.id, l.name\n'
                              'FROM listings l, reviews r \n'
                              'WHERE l.id = r.listing_id\n'
                              "  AND r.date >= '2009-01-01'\n"
                              "  AND r.date <= '2009-12-31'\n"
                              'ORDER BY l.id;',
 'listings_join_review_2010': 'SELECT DISTINCT l.id, l.name\n'
                              'FROM listings l, reviews r \n'
                              'WHERE l.id = r.listing_id\n'
                              "  AND r.date >= '2010-01-01'\n"
                              "  AND r.date <= '2010-12-31'\n"
                              'ORDER BY l.id;',
 'listings_join_review_2011': 'SELECT DISTINCT l.id, l.name\n'
                              'FROM listings l, reviews r \n'
                              'WHERE l.id = r.listing_id\n'
                              "  AND r.date >= '2011-01-01'\n"
                              "  A

In [10]:
# Create a function to calculate the performance of a single query from q_dict

count = 10

time_list = []
for i in range(0,count):
    time_start = datetime.now()
    # Open new db connection for each execution of the query to avoid multithreading
    with db_eng.connect() as conn:
        df = pd.read_sql(q_dict['listings_join_review_2015'], con=conn)

    time_end = datetime.now()
    diff = util.time_diff(time_start, time_end)
    time_list.append(diff)

pprint.pp(time_list)
print('mean', round(sum(time_list)/len(time_list), 4), 'min', \
        round(min(time_list), 4), 'max', \
        round(max(time_list), 4), 'std', \
        round(np.std(time_list), 4))

[0.111929,
 0.105735,
 0.108335,
 0.096723,
 0.091923,
 0.099697,
 0.119122,
 0.106385,
 0.083823,
 0.111343]
mean 0.1035 min 0.0838 max 0.1191 std 0.01


In [11]:
# Adding and dropping indexes in the table reviews 

q_create_date_in_reviews = '''
BEGIN TRANSACTION;
CREATE INDEX IF NOT EXISTS date_in_reviews
ON reviews(date);
END TRANSACTION;
'''

q_drop_date_in_reviews = '''
BEGIN TRANSACTION;
DROP INDEX IF EXISTS date_in_reviews;
END TRANSACTION;
'''

q_show_indexes_for_reviews = '''
select *
from pg_indexes
where tablename = 'reviews';
'''


with db_eng.connect() as conn:
    conn.execute(sql_text(q_create_date_in_reviews))
    result_reviews_add = conn.execute(sql_text(q_show_indexes_for_reviews))
    print('The set of indexes on reviews is: ')
    print(result_reviews_add.all())

The set of indexes on reviews is: 
[('new_york_city', 'reviews', 'date_in_reviews', None, 'CREATE INDEX date_in_reviews ON new_york_city.reviews USING btree (date)')]


In [12]:
# Calculate statistics for each year

#Initialize the count to 20
count = 20

perf_details = {}
perf_details['with_bm'] = {}

# Iterate through all the queries in q_dict
for year, sql_query in q_dict.items():
    time_list = []
    for i in range(count): 
        time_start = datetime.now()

        with db_eng.connect() as conn:
            df = pd.read_sql(sql_query, con=conn)

        time_end = datetime.now()
        # Calulate the time difference
        diff = util.time_diff(time_start, time_end)
        time_list.append(diff)

    # Splitting the string to get the year
    parts = year.split('_')
    curr_year = parts[-1]

    # Calulcate the metrics
    perf_profile = {
        'avg': round(sum(time_list) / len(time_list), 4),
        'min': round(min(time_list), 4),
        'max': round(max(time_list), 4),
        'std': round(np.std(time_list), 4)
    }

    # Add metrics according to the year
    perf_details['with_bm'][curr_year] = perf_profile

print(perf_details)
    

{'with_bm': {'2009': {'avg': 0.0111, 'min': 0.0, 'max': 0.024, 'std': 0.0076}, '2010': {'avg': 0.0126, 'min': 0.0, 'max': 0.026, 'std': 0.0081}, '2011': {'avg': 0.0246, 'min': 0.0147, 'max': 0.0508, 'std': 0.0105}, '2012': {'avg': 0.0178, 'min': 0.0153, 'max': 0.0231, 'std': 0.0021}, '2013': {'avg': 0.0354, 'min': 0.0157, 'max': 0.0674, 'std': 0.0149}, '2014': {'avg': 0.0489, 'min': 0.0301, 'max': 0.0815, 'std': 0.0165}, '2015': {'avg': 0.1093, 'min': 0.0648, 'max': 0.1529, 'std': 0.0242}, '2016': {'avg': 0.0874, 'min': 0.0482, 'max': 0.1904, 'std': 0.0298}, '2017': {'avg': 0.133, 'min': 0.094, 'max': 0.1778, 'std': 0.024}, '2018': {'avg': 0.0744, 'min': 0.0516, 'max': 0.0996, 'std': 0.012}, '2019': {'avg': 0.0511, 'min': 0.0313, 'max': 0.065, 'std': 0.0092}, '2020': {'avg': 0.0424, 'min': 0.0306, 'max': 0.0942, 'std': 0.0162}, '2021': {'avg': 0.0503, 'min': 0.0312, 'max': 0.0638, 'std': 0.0096}, '2022': {'avg': 0.0747, 'min': 0.0479, 'max': 0.1248, 'std': 0.0171}, '2023': {'avg': 0.05

In [13]:
# Create a seperate function for the above and put it in the util file.
# Run it again to be sure  

count = 20

perf_details = {}

perf_details['with_bm'] = util.calc_time_diff_per_year(db_eng, count, q_dict)

pp = pprint.PrettyPrinter(indent=4)

pp.pprint(perf_details)

{   'with_bm': {   '2009': {   'avg': 0.0125,
                               'max': 0.0312,
                               'min': 0.0,
                               'std': 0.0087},
                   '2010': {   'avg': 0.0101,
                               'max': 0.0236,
                               'min': 0.0,
                               'std': 0.0078},
                   '2011': {   'avg': 0.0151,
                               'max': 0.0309,
                               'min': 0.0,
                               'std': 0.006},
                   '2012': {   'avg': 0.0179,
                               'max': 0.0543,
                               'min': 0.0062,
                               'std': 0.0095},
                   '2013': {   'avg': 0.0277,
                               'max': 0.0324,
                               'min': 0.0156,
                               'std': 0.0061},
                   '2014': {   'avg': 0.0448,
                               'max': 0

In [14]:
# Drop the index on date in reviews

with db_eng.connect() as conn:
    conn.execute(sql_text(q_drop_date_in_reviews))

In [None]:
# Calculate the same metrics for review without the index

perf_details['without_bm'] = util.calc_time_diff_per_year(db_eng, count, q_dict)

pp.pprint(perf_details)

In [None]:
# We need a way to save this data somewhere....save it in a json file (pref_data.json)

perf_file = 'perf_data-main.json'

try:
    old_perf_summary = util.fetch_perf_data(perf_file)
    print('Successfully read file perf_data/' + perf_file)
except:
    print('Not successful in finding file perf_data/' + perf_file + '; so creating it')
    old_perf_summary = {}
    util.write_perf_data(perf_details, perf_file)
    
util.write_perf_data(perf_details, perf_file)

# With this we have completed the benchmarking test using a single index on the date column of the reviews table
# However let's include visualization to see our results

In [None]:
# Load JSON data from a file
with open('perf_data/perf_data-main.json', 'r') as file:
    data = json.load(file)

# Printing the loaded file
print(json.dumps(data, indent=4, sort_keys=True))

In [None]:
# Convert the json into dataframes

df_with_bm = pd.DataFrame(data['with_bm']).transpose()
df_without_bm = pd.DataFrame(data['without_bm']).transpose()

print(df_with_bm)
print(df_without_bm)

In [None]:
# Plot to compare the performances with and without indexes with respect to average time

plt.figure(figsize=(14, 7))
index = range(len(df_with_bm))
bar_width = 0.35

plt.bar(index, df_with_bm['avg'], bar_width, label='With BM')
plt.bar([p + bar_width for p in index], df_without_bm['avg'], bar_width, label='Without BM')

# Calculate and plot the average times for with_bm and without_bm
avg_time_with_bm = df_with_bm['avg'].mean()
avg_time_without_bm = df_without_bm['avg'].mean()

plt.axhline(y=avg_time_with_bm, color='blue', linestyle='--', label=f'Avg. Time With BM ({avg_time_with_bm:.4f})')
plt.axhline(y=avg_time_without_bm, color='red', linestyle='--', label=f'Avg. Time Without BM ({avg_time_without_bm:.4f})')

plt.xlabel('Year')
plt.ylabel('Average Time')
plt.title('Average Performance Comparison With and Without Benchmarking')
plt.xticks([p + bar_width / 2 for p in index], df_with_bm.index)
plt.legend()
plt.show()

In [None]:
# Plot to compare the performances with and without indexes with respect to standard deviation

plt.figure(figsize=(14, 7))

# Plot standard deviations
plt.plot(df_with_bm.index, df_with_bm['std'], label='With BM', marker='o')
plt.plot(df_without_bm.index, df_without_bm['std'], label='Without BM', marker='x')

# Calculate and plot the average standard deviation for with_bm and without_bm
avg_std_with_bm = df_with_bm['std'].mean()
avg_std_without_bm = df_without_bm['std'].mean()

plt.axhline(y=avg_std_with_bm, color='blue', linestyle='--', label=f'Avg. Std With BM ({avg_std_with_bm:.4f})')
plt.axhline(y=avg_std_without_bm, color='red', linestyle='--', label=f'Avg. Std Without BM ({avg_std_without_bm:.4f})')

plt.xlabel('Year')
plt.ylabel('Standard Deviation of Time')
plt.title('Standard Deviation Comparison Over Years')
plt.xticks(rotation=45)
plt.legend()
plt.show()

In [None]:
# Plot to compare the performances with and without indexes with respect to maximum time

plt.figure(figsize=(14, 7))
plt.scatter(df_with_bm.index, df_with_bm['max'], color='blue', label='With BM')
plt.scatter(df_without_bm.index, df_without_bm['max'], color='red', label='Without BM')

# Calculate and plot the average of maximum times for with_bm and without_bm
avg_max_with_bm = df_with_bm['max'].mean()
avg_max_without_bm = df_without_bm['max'].mean()

plt.axhline(y=avg_max_with_bm, color='blue', linestyle='--', label=f'Avg. Max Time With BM ({avg_max_with_bm:.4f})')
plt.axhline(y=avg_max_without_bm, color='red', linestyle='--', label=f'Avg. Max Time Without BM ({avg_max_without_bm:.4f})')

plt.xlabel('Year')
plt.ylabel('Maximum Time')
plt.title('Maximum Time Scatter Plot Comparison')
plt.xticks(rotation=45)
plt.legend()
plt.show()