In [None]:
import json
import os
import pprint
import time
from datetime import datetime

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text as sql_text

# Ensure 'perf_data' directory exists (benchmark results are written there)
os.makedirs('perf_data', exist_ok=True)

# Database connection setup.
# The connection URL can be overridden through the AIRBNB_DB_URL environment
# variable so real credentials never have to be stored in the notebook; the
# fallback keeps the original local-development URL.
db_url = os.environ.get(
    'AIRBNB_DB_URL',
    'postgresql+psycopg2://postgres:postgres@localhost:5432/airbnb',
)
db_eng = create_engine(
    db_url,
    # Make unqualified table names resolve inside the 'new_york_city' schema.
    connect_args={'options': '-csearch_path={}'.format('new_york_city')},
    isolation_level='SERIALIZABLE',
)

# Adding and updating the 'datetime' column in the 'reviews' table.
# Every row gets its review date stamped at 12:00:00 (noon).
with db_eng.connect() as conn:
    conn.execute(sql_text("ALTER TABLE reviews ADD COLUMN IF NOT EXISTS datetime TIMESTAMP;"))
    conn.execute(sql_text("UPDATE reviews SET datetime = TO_TIMESTAMP(TO_CHAR(date, 'YYYY-MM-DD') || ' 12:00:00', 'YYYY-MM-DD HH24:MI:SS');"))
    conn.commit()

In [38]:
# Accumulator for every measurement taken by the benchmark.
results = dict()

# Calendar years 2009 through 2024 (the range end is exclusive).
years = range(2009, 2024 + 1)

# Each configuration is a list of (column, table) pairs to index before timing.
_datetime_on_reviews = ('datetime', 'reviews')
_id_on_listings = ('id', 'listings')
index_configs = [
    [],
    [_datetime_on_reviews],
    [_id_on_listings],
    [_datetime_on_reviews, _id_on_listings],
]

# One label per configuration, e.g. "__" for no indexes or
# "__datetime_in_reviews__" for a single index, built from the pairs above.
config_names = [
    "__" + "".join(f"{column}_in_{table}__" for column, table in config)
    for config in index_configs
]

def run_query(conn, query):
    """Execute ``query`` once on ``conn`` and return the elapsed seconds.

    Args:
        conn: An object exposing ``execute(statement)``; the notebook passes a
            SQLAlchemy connection.
        query: SQL text to run.

    Returns:
        float: Time spent inside ``conn.execute`` in seconds.
    """
    # Build the statement object before starting the clock so that only the
    # execution itself is measured.
    statement = sql_text(query)
    # perf_counter() is monotonic and high-resolution; the wall clock
    # (datetime.now()) can jump (NTP, DST) and yield negative or skewed timings.
    started = time.perf_counter()
    conn.execute(statement)
    return time.perf_counter() - started

def compute_metrics(times):
    """Summarise a sequence of timing samples.

    Args:
        times: Sequence of durations (numbers).

    Returns:
        dict: Keys "avg", "min", "max", "std" (NumPy's default, i.e. population,
        standard deviation), "exec_count" (number of samples) and "timestamp"
        (ISO-8601 string of the moment the summary was produced).
    """
    summary = {}
    summary["avg"] = np.mean(times)
    summary["min"] = np.min(times)
    summary["max"] = np.max(times)
    summary["std"] = np.std(times)
    summary["exec_count"] = len(times)
    summary["timestamp"] = datetime.now().isoformat()
    return summary

# Number of timed executions per (year, index configuration). With a single
# run the reported "std" is always 0.0; raise this for meaningful statistics.
runs_per_config = 1

for year in years:
    # Half-open range [Jan 1 of `year`, Jan 1 of the next year). Every
    # reviews.datetime value is stamped at 12:00:00, so an upper bound of
    # "<= '{year}-12-31'" (i.e. midnight) would drop all Dec 31 reviews.
    query = (
        "SELECT DISTINCT l.id, l.name "
        "FROM listings l JOIN reviews r ON l.id = r.listing_id "
        f"WHERE r.datetime >= '{year}-01-01' AND r.datetime < '{year + 1}-01-01' "
        "ORDER BY l.id;"
    )
    query_name = f"listings_join_reviews_{year}"
    results[query_name] = {}

    for indexes, config_name in zip(index_configs, config_names):
        # Index names for this configuration, computed once and used for both
        # creation and removal.
        index_specs = [
            (f"idx_{column}_on_{table}", table, column) for column, table in indexes
        ]

        # NOTE(review): no commit is issued on this connection, so the index DDL
        # presumably only lives inside this connection's transaction - confirm
        # against the SQLAlchemy version in use.
        with db_eng.connect() as conn:
            # Add required indexes
            for index_name, table, column in index_specs:
                conn.execute(sql_text(f"CREATE INDEX IF NOT EXISTS {index_name} ON {table} ({column});"))

            # Execute the query `runs_per_config` times and measure performance
            times = [run_query(conn, query) for _ in range(runs_per_config)]

            # Remove indexes after testing so the next configuration starts clean
            for index_name, _table, _column in index_specs:
                conn.execute(sql_text(f"DROP INDEX IF EXISTS {index_name};"))

            # Store results
            results[query_name][config_name] = compute_metrics(times)
            print(f"Results for {query_name} {config_name}: {json.dumps(results[query_name][config_name], indent=4)}")

# Persist the collected measurements as pretty-printed JSON
perf_summary_path = 'perf_data/listings_join_reviews.json'
with open(perf_summary_path, 'w') as out_file:
    out_file.write(json.dumps(results, indent=4))

print("All tests completed and data recorded.")

Results for listings_join_reviews_2009 __: {
    "avg": 7.12661,
    "min": 7.12661,
    "max": 7.12661,
    "std": 0.0,
    "exec_count": 1,
    "timestamp": "2024-05-24T18:38:42.532060"
}
Results for listings_join_reviews_2009 __datetime_in_reviews__: {
    "avg": 0.026932,
    "min": 0.026932,
    "max": 0.026932,
    "std": 0.0,
    "exec_count": 1,
    "timestamp": "2024-05-24T18:38:48.388295"
}
Results for listings_join_reviews_2009 __id_in_listings__: {
    "avg": 5.992614,
    "min": 5.992614,
    "max": 5.992614,
    "std": 0.0,
    "exec_count": 1,
    "timestamp": "2024-05-24T18:38:54.402891"
}
Results for listings_join_reviews_2009 __datetime_in_reviews__id_in_listings__: {
    "avg": 0.007413,
    "min": 0.007413,
    "max": 0.007413,
    "std": 0.0,
    "exec_count": 1,
    "timestamp": "2024-05-24T18:39:01.438906"
}
Results for listings_join_reviews_2010 __: {
    "avg": 5.661931,
    "min": 5.661931,
    "max": 5.661931,
    "std": 0.0,
    "exec_count": 1,
    "timesta