In [2]:
import json
import os
import time
from datetime import datetime
import numpy as np
from sqlalchemy import create_engine, text as sql_text

# Function to calculate time difference
def time_diff(start_time, end_time):
    """Return the seconds elapsed between two points in time.

    The value is ``(end_time - start_time).total_seconds()``, so it is
    negative when ``end_time`` precedes ``start_time``.
    """
    elapsed = end_time - start_time
    return elapsed.total_seconds()

# Function to execute a query and measure time
def run_query(query, conn, n_runs=1):
    """Execute a SQL statement repeatedly and time every execution.

    Args:
        query: SQL string; it is wrapped with ``sql_text`` before execution.
        conn: Connection-like object whose ``execute`` method runs the statement.
        n_runs: How many timed executions to perform. Defaults to 1, the
            count the loop previously hard-coded.

    Returns:
        A list with ``n_runs`` elapsed durations, in seconds, as floats.

    Raises:
        ValueError: If ``n_runs`` is smaller than 1 (an empty timing list
            cannot be summarised by ``np.min`` / ``np.max`` later on).
    """
    if n_runs < 1:
        raise ValueError(f"n_runs must be at least 1, got {n_runs}")

    # Build the text clause once so its construction is not part of the timing.
    statement = sql_text(query)

    times = []
    for _ in range(n_runs):
        # perf_counter is a monotonic, high-resolution clock; unlike
        # datetime.now() it is unaffected by system clock adjustments.
        started = time.perf_counter()
        conn.execute(statement)
        times.append(time.perf_counter() - started)
    return times

# Function to compute performance metrics
def compute_metrics(times):
    """Summarise a list of execution times.

    Returns a dict holding the mean, minimum, maximum and standard
    deviation of ``times`` (each rounded to 4 decimal places), the number
    of samples, and the current local time formatted as
    ``YYYY-MM-DD HH:MM:SS``.
    """
    reducers = (
        ("avg", np.mean),
        ("min", np.min),
        ("max", np.max),
        ("std", np.std),
    )
    summary = {label: round(reduce(times), 4) for label, reduce in reducers}
    summary["exec_count"] = len(times)
    summary["timestamp"] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    return summary

# Fetch performance data
def fetch_perf_data(filename):
    """Load previously stored performance data from a JSON file.

    Returns the parsed JSON content, or an empty dict when the file does
    not exist or has a size of zero bytes.
    """
    stored = {}
    try:
        with open(filename) as handle:
            # A zero-byte file is not valid JSON, so skip parsing it.
            if os.stat(filename).st_size != 0:
                stored = json.load(handle)
    except FileNotFoundError:
        pass
    return stored

# Write performance data
def write_perf_data(data, filename):
    """Serialise ``data`` as JSON (4-space indent) into ``filename``.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(filename, mode='w') as sink:
        json.dump(data, sink, indent=4)

# Function to add or drop index
def add_drop_index(engine, action, column, table):
    """Create or drop the single-column index ``idx_<column>_in_<table>``.

    Args:
        engine: SQLAlchemy engine used to open a connection for the DDL.
        action: ``'add'`` to create the index (if it does not exist) or
            ``'drop'`` to remove it (if it exists).
        column: Column the index covers; also part of the index name.
        table: Table the index belongs to; also part of the index name.

    Raises:
        ValueError: If ``action`` is neither ``'add'`` nor ``'drop'``.

    Note:
        ``column`` and ``table`` are interpolated directly into the SQL
        because identifiers cannot be sent as bound parameters; only pass
        trusted names.
    """
    index_name = f"idx_{column}_in_{table}"
    if action == 'add':
        ddl = f"CREATE INDEX IF NOT EXISTS {index_name} ON {table}({column});"
    elif action == 'drop':
        ddl = f"DROP INDEX IF EXISTS {index_name};"
    else:
        # Previously an unknown action fell through to an unbound `query`.
        raise ValueError(f"action must be 'add' or 'drop', got {action!r}")

    # engine.begin() commits when the block exits, so the DDL is persisted
    # even where a plain engine.connect() would roll it back on close.
    with engine.begin() as conn:
        conn.execute(sql_text(ddl))

# Confirmation message emitted after the helper definitions above.
print("Setup and function definitions completed.")


Setup and function definitions completed.


In [3]:
# Ensure 'update_datetimes_query.json' exists
update_query_path = 'perf_data/update_datetimes_query.json'
# Create the parent directory first: open(..., 'w') raises FileNotFoundError
# when the directory itself is missing.
os.makedirs(os.path.dirname(update_query_path), exist_ok=True)
if not os.path.exists(update_query_path):
    with open(update_query_path, 'w') as f:
        json.dump({}, f)

# Previously recorded results (an empty dict on the first run); new results
# are merged into this mapping, keyed by query name.
perf_summary = fetch_perf_data(update_query_path)

# UPDATE statements to benchmark, keyed by the name used in the results file.
update_queries = {
    'update_datetimes_neigh_Manhattan': """
    UPDATE reviews r
    SET datetime = datetime + interval '5 days'
    FROM listings l
    WHERE l.id = r.listing_id
      AND l.neighbourhood_group = 'Manhattan'
    RETURNING 'done';
    """,
    'update_datetimes_neigh_Bedford-Stuyvesant': """
    UPDATE reviews r
    SET datetime = datetime - interval '5 days'
    FROM listings l
    WHERE l.id = r.listing_id
      AND l.neighbourhood = 'Bedford-Stuyvesant'
    RETURNING 'done';
    """
}

# Every [column, table] pair that may carry an index during the benchmark.
all_indexes = [['datetime', 'reviews'], ['neighbourhood', 'listings'], ['neighbourhood_group', 'listings']]
# Index configurations to test: every subset of all_indexes, from none to all three.
specs = [
    [],
    [['datetime', 'reviews']],
    [['neighbourhood', 'listings']],
    [['neighbourhood_group', 'listings']],
    [['datetime', 'reviews'], ['neighbourhood', 'listings']],
    [['datetime', 'reviews'], ['neighbourhood_group', 'listings']],
    [['neighbourhood', 'listings'], ['neighbourhood_group', 'listings']],
    [['datetime', 'reviews'], ['neighbourhood', 'listings'], ['neighbourhood_group', 'listings']]
]

# The connection URL can be overridden through the AIRBNB_DB_URL environment
# variable so credentials need not live in the notebook; the fallback is the
# original local-development URL.
db_url = os.environ.get('AIRBNB_DB_URL',
                        'postgresql+psycopg2://postgres:postgres@localhost:5432/airbnb')
db_eng = create_engine(db_url,
                       connect_args={'options': '-csearch_path={}'.format('new_york_city')},
                       isolation_level='SERIALIZABLE')

print("Data preparation and index definitions completed.")


Data preparation and index definitions completed.


In [4]:
# Connect to the database and run update queries with different specs.
# For every query and every index configuration: drop the indexes that are not
# part of the configuration, create the ones that are, time the query, and
# record the statistics under perf_summary[query_name][key_value].
with db_eng.connect() as conn:
    for query_name, query in update_queries.items():
        for spec in specs:
            print(f'Processing spec: {str(spec)} for {query_name}\n')

            for index in all_indexes:
                if index not in spec:
                    add_drop_index(db_eng, 'drop', index[0], index[1])
                    print(f'\nAfter dropping {str(index)}')

            for index in spec:
                add_drop_index(db_eng, 'add', index[0], index[1])
                print(f'\nAfter adding {str(index)}')

            # Run the timed UPDATE in an explicit transaction that is committed
            # before the next iteration. add_drop_index works through its own
            # connection, and PostgreSQL index DDL on `reviews` would wait on
            # locks still held by an unfinished UPDATE on this connection.
            with conn.begin():
                time_list = run_query(query, conn)

            perf_profile = compute_metrics(time_list)

            print('\nThe list of running times is as follows:')
            print(time_list)

            print('\nThe statistics on the list of running times are as follows:')
            print(perf_profile)

            # Key encodes the active index set, e.g. "__datetime_in_reviews__";
            # the empty configuration yields "____".
            key_value = "__" + "__".join([f"{index[0]}_in_{index[1]}" for index in spec]) + "__"
            if query_name not in perf_summary:
                perf_summary[query_name] = {}
            perf_summary[query_name][key_value] = perf_profile

            # Print the results for debugging
            print(f"\nResults for {query_name} {key_value}:")
            print(perf_profile)

            # Checkpoint after every configuration so an interrupted run keeps
            # the measurements gathered so far.
            write_perf_data(perf_summary, update_query_path)

write_perf_data(perf_summary, update_query_path)

print("JSON files created successfully.")


Processing spec: [] for update_datetimes_neigh_Manhattan


After dropping ['datetime', 'reviews']

After dropping ['neighbourhood', 'listings']

After dropping ['neighbourhood_group', 'listings']

The list of running times is as follows:
[35.18692]

The statistics on the list of running times are as follows:
{'avg': 35.1869, 'min': 35.1869, 'max': 35.1869, 'std': 0.0, 'exec_count': 1, 'timestamp': '2024-05-21 18:13:35'}

Results for update_datetimes_neigh_Manhattan ____:
{'avg': 35.1869, 'min': 35.1869, 'max': 35.1869, 'std': 0.0, 'exec_count': 1, 'timestamp': '2024-05-21 18:13:35'}
Processing spec: [['datetime', 'reviews']] for update_datetimes_neigh_Manhattan


After dropping ['neighbourhood', 'listings']

After dropping ['neighbourhood_group', 'listings']
