In [15]:

import json
import os
import pprint
import time
from datetime import datetime

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text as sql_text

# Configuration
# The connection string can be overridden with the DATABASE_URL environment
# variable so that real credentials never have to be written into the notebook.
# The fallback below is the local development default.
DATABASE_URL = os.environ.get(
    'DATABASE_URL',
    'postgresql+psycopg2://postgres:password@localhost:5432/airbnb',
)
SCHEMA = 'new_york_city'
PERF_DATA_DIR = 'perf_data'

# Connect to the database; search_path is set so unqualified table names
# resolve inside SCHEMA.
# NOTE(review): echo=True makes SQLAlchemy log every statement; that logging
# presumably adds overhead to the timed queries below. Consider echo=False
# for benchmark runs.
engine = create_engine(DATABASE_URL, connect_args={'options': f'-csearch_path={SCHEMA}'}, echo=True)

# Add a `datetime` column to reviews (if missing) and fill it with the review
# date at 12:00:00. engine.begin() commits on success and rolls back on error.
with engine.begin() as conn:
    conn.execute(sql_text("""
        ALTER TABLE reviews ADD COLUMN IF NOT EXISTS datetime TIMESTAMP;
        UPDATE reviews SET datetime = TO_TIMESTAMP(TO_CHAR(date, 'YYYY-MM-DD') || ' 12:00:00', 'YYYY-MM-DD HH24:MI:SS');
    """))

# Function to run query and collect performance data
def run_query_and_collect_data(query, engine, n_runs=50):
    """Execute a SQL query repeatedly and summarize its execution times.

    Args:
        query: SQL text of the query to benchmark.
        engine: SQLAlchemy engine used to open the connection.
        n_runs: Number of timed executions (default 50). Must be at least 1.

    Returns:
        dict with the keys "avg", "min", "max", "std" (seconds, as computed
        by NumPy), "exec_count" (number of runs) and "timestamp" (local time
        at which the summary was built, formatted '%Y-%m-%d %H:%M:%S').

    Raises:
        ValueError: If n_runs is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError(f"n_runs must be >= 1, got {n_runs}")

    # Build the statement once so that only the execution itself is timed.
    statement = sql_text(query)
    times = []
    with engine.connect() as conn:
        for _ in range(n_runs):
            # perf_counter is monotonic and high resolution; wall-clock time
            # (datetime.now) can jump when the system clock is adjusted.
            start = time.perf_counter()
            conn.execute(statement)
            times.append(time.perf_counter() - start)
    return {
        "avg": np.mean(times),
        "min": np.min(times),
        "max": np.max(times),
        "std": np.std(times),
        "exec_count": len(times),
        "timestamp": datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }

'datetime' column successfully added.


In [None]:
# Function to create or drop a single-column index on a table
def add_drop_index(engine, action, column, table):
    """Create or drop the single-column index ``idx_<column>_in_<table>``.

    Both operations are idempotent: adding an index that already exists and
    dropping one that does not are no-ops.

    Args:
        engine: SQLAlchemy engine used to run the DDL statement.
        action: Either 'add' or 'drop'.
        column: Name of the indexed column.
        table: Name of the table that owns the column.

    Raises:
        ValueError: If action is neither 'add' nor 'drop'.
    """
    index_name = f"idx_{column}_in_{table}"
    # Identifiers are interpolated directly into the SQL text, so column and
    # table must come from trusted code, never from user input.
    if action == 'add':
        statement = f"CREATE INDEX IF NOT EXISTS {index_name} ON {table}({column});"
    elif action == 'drop':
        statement = f"DROP INDEX IF EXISTS {index_name};"
    else:
        raise ValueError(f"action must be 'add' or 'drop', got {action!r}")

    # engine.begin() commits when the block exits normally. A plain
    # engine.connect() without an explicit commit would roll the DDL back
    # when the connection is closed.
    with engine.begin() as conn:
        conn.execute(sql_text(statement))

# Function to calculate time difference
def time_diff(start_time, end_time):
    """Return the number of seconds elapsed from start_time to end_time.

    Both arguments are datetime objects; the result is a float and is
    negative when end_time precedes start_time.
    """
    elapsed = end_time - start_time
    return elapsed.total_seconds()

# Function to execute a query and measure time
def run_query(query, conn, n_runs=1):
    """Execute a SQL query on an open connection and time each execution.

    Args:
        query: SQL text of the query to run.
        conn: Open SQLAlchemy connection on which the query is executed.
        n_runs: Number of timed executions (default 1).

    Returns:
        List with one elapsed time in seconds (float) per execution.
    """
    # Build the statement once so that only the execution itself is timed.
    statement = sql_text(query)
    times = []
    for _ in range(n_runs):
        # perf_counter is monotonic and high resolution; wall-clock time
        # (datetime.now) can jump when the system clock is adjusted.
        start = time.perf_counter()
        conn.execute(statement)
        times.append(time.perf_counter() - start)
    return times

# Function to compute performance metrics
def compute_metrics(times):
    """Summarize a list of running times.

    Args:
        times: Sequence of elapsed times in seconds.

    Returns:
        dict with "avg", "min", "max" and "std" (each rounded to 4 decimal
        places, computed with the NumPy functions of the same purpose),
        "exec_count" (number of samples) and "timestamp" (local time at
        which the summary was built, formatted '%Y-%m-%d %H:%M:%S').
    """
    reducers = (
        ("avg", np.mean),
        ("min", np.min),
        ("max", np.max),
        ("std", np.std),
    )
    metrics = {label: round(reduce_fn(times), 4) for label, reduce_fn in reducers}
    metrics["exec_count"] = len(times)
    metrics["timestamp"] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    return metrics

# Function to build the index description key
def build_index_description_key(all_indexes, spec):
    """Build the string key that describes which indexes are active.

    Args:
        all_indexes: Every candidate index as a [column, table] pair; the
            order of this list fixes the order of the parts in the key.
        spec: The subset of index pairs that is currently active.

    Returns:
        A string made of one "__<column>_in_<table>" part per active index,
        followed by a closing "__". With no active index the key is "__".
    """
    active_parts = [
        f"__{candidate[0]}_in_{candidate[1]}"
        for candidate in all_indexes
        if candidate in spec
    ]
    return "".join(active_parts) + "__"

# Fetch performance data
def fetch_perf_data(filename):
    """Load previously saved performance data from a JSON file.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON content, or an empty dict when the file does not
        exist, is empty, or contains only whitespace.

    Raises:
        json.JSONDecodeError: If the file has content that is not valid JSON.
    """
    try:
        with open(filename) as f:
            content = f.read()
    except FileNotFoundError:
        return {}
    # A blank file means "no results yet"; json would reject it otherwise.
    if not content.strip():
        return {}
    return json.loads(content)

# Write performance data
def write_perf_data(data, filename):
    """Write performance data to a JSON file, indented by 4 spaces.

    Args:
        data: JSON-serializable object to store.
        filename: Path of the file to (over)write.

    Raises:
        TypeError: If data contains a value that json cannot serialize. In
            that case the target file is left untouched.
    """
    # Serialize before opening: opening with 'w' truncates the file, so a
    # failure during encoding would otherwise destroy the existing results.
    serialized = json.dumps(data, indent=4)
    with open(filename, 'w') as fp:
        fp.write(serialized)

# Create queries for each year from 2009 to 2024
q_dict = {}
for yr in range(2009, 2025):
    q_name = 'listings_join_review_' + str(yr)
    date_start = str(yr) + '-01-01 00:00:00'
    date_end = str(yr) + '-12-31 23:59:59'
    # NOTE(review): build_query_listings_reviews is not defined in the cells
    # visible here; it must be defined or imported before this cell runs.
    q_dict[q_name] = build_query_listings_reviews(date_start, date_end)

pprint.pp(q_dict)

# Number of timed executions per (query, index configuration) pair.
RUNS_PER_CONFIG = 50

# Results live under PERF_DATA_DIR; create the directory so that the first
# write cannot fail on a fresh checkout.
os.makedirs(PERF_DATA_DIR, exist_ok=True)
perf_summary_path = os.path.join(PERF_DATA_DIR, 'listings_join_reviews.json')
if not os.path.exists(perf_summary_path):
    with open(perf_summary_path, 'w') as f:
        json.dump({}, f)

perf_summary = fetch_perf_data(perf_summary_path)

# Every index that may exist, and the index configurations to benchmark.
all_indexes = [['datetime','reviews'], ['id','listings']]
specs = [
    [['datetime','reviews'], ['id','listings']],
    [['datetime','reviews']],
    [['id','listings']],
    []
]

# Index setup depends only on the spec, so it is done once per spec and all
# queries are then timed under that configuration.
for spec in specs:
    print('Processing spec: ', str(spec), '\n')

    # Drop every index that is not part of this configuration...
    for index in all_indexes:
        if index not in spec:
            add_drop_index(engine, 'drop', index[0], index[1])
            print('\nAfter dropping', str(index))

    # ...and make sure every index that is part of it exists.
    for index in spec:
        add_drop_index(engine, 'add', index[0], index[1])
        print('\nAfter adding', str(index))

    key_value = build_index_description_key(all_indexes, spec)

    for query_name, query in q_dict.items():
        time_list = []
        for i in range(RUNS_PER_CONFIG):
            with engine.connect() as conn:
                times = run_query(query, conn)
            time_list.extend(times)

        perf_profile = compute_metrics(time_list)

        print('\nThe list of running times is as follows:')
        pprint.pp(time_list)

        print('\nThe statistics on the list of running times are as follows:')
        pprint.pp(perf_profile)

        print('\nThe new value for "' + key_value + '" will be', str(perf_profile))

        if query_name in perf_summary:
            perf_dict = perf_summary[query_name]
            print("\nBefore modifying perf_dict, the value of perf_summary[query_name] (if it existed) was: ")
            pprint.pp(perf_dict)
        else:
            perf_dict = {}
            print("\nBefore modifying perf_dict, the value of perf_summary[query_name] had empty value")
        print()
        perf_dict[key_value] = perf_profile
        perf_summary[query_name] = perf_dict

        print("\nAfter modifying perf_dict, the value of perf_summary[query_name] is: ")
        pprint.pp(perf_summary[query_name])
        print()

        print('\nThe full value of perf_summary is:')
        pprint.pp(perf_summary)

        # Persist after every measurement so an interrupted run keeps its
        # results so far.
        write_perf_data(perf_summary, perf_summary_path)

print("JSON files created successfully.")
