In [5]:
import json
import os
import time
from datetime import datetime

import numpy as np
from sqlalchemy import create_engine, text as sql_text

# Database configuration
# The connection URL can be overridden through the AIRBNB_DB_URL environment
# variable so that real credentials do not have to be stored in the notebook.
# The fallback keeps the original local-development settings.
db_eng = create_engine(
    os.environ.get('AIRBNB_DB_URL',
                   'postgresql+psycopg2://postgres:postgres@localhost:5432/airbnb'),
    # Sets search_path for every session so unqualified table names are looked
    # up in the new_york_city schema.
    connect_args={'options': '-csearch_path=new_york_city'},
    echo=False)

def add_drop_index(conn, action, index_name, column, table):
    """Create or drop a single-column index.

    Args:
        conn: Connection object whose ``execute`` method runs the statement.
        action: ``'add'`` to create the index, ``'drop'`` to remove it.
        index_name: Name of the index to create or drop.
        column: Column to index (only used when adding).
        table: Table that owns the column (only used when adding).

    Raises:
        ValueError: If ``action`` is neither ``'add'`` nor ``'drop'``. Failing
            loudly prevents a typo from silently benchmarking the wrong index
            configuration.

    Note:
        The identifiers are interpolated into the SQL text verbatim, so only
        trusted, hard-coded names should be passed in.
    """
    # Both statements are wrapped in an explicit BEGIN/COMMIT and are
    # idempotent thanks to IF [NOT] EXISTS.
    if action == 'add':
        statement = f"""
        BEGIN;
        CREATE INDEX IF NOT EXISTS {index_name} ON {table} ({column});
        COMMIT;
        """
    elif action == 'drop':
        statement = f"""
        BEGIN;
        DROP INDEX IF EXISTS {index_name};
        COMMIT;
        """
    else:
        raise ValueError(f"action must be 'add' or 'drop', got {action!r}")

    conn.execute(sql_text(statement))

def run_update_and_count_query(conn, update_query, count_query, count=1):
    """Time an update query followed by a count query, repeated ``count`` times.

    Each repetition executes ``update_query`` and then ``count_query`` on
    ``conn`` and records the combined elapsed time of the pair.

    Args:
        conn: Connection object whose ``execute`` method runs the statements.
        update_query: SQL text of the update statement.
        count_query: SQL text of the count statement run right after it.
        count: Number of repetitions; must be at least 1.

    Returns:
        dict with the keys ``'avg'``, ``'min'``, ``'max'`` and ``'std'``
        (seconds, rounded to 4 decimals; ``std`` is NumPy's default population
        standard deviation), ``'count'`` (the repetitions requested) and
        ``'timestamp'`` (local time when the summary was built, formatted as
        ``YYYY-MM-DD HH:MM:SS``).

    Raises:
        ValueError: If ``count`` is smaller than 1, because no statistics can
            be computed from zero measurements.
    """
    if count < 1:
        raise ValueError(f"count must be at least 1, got {count}")

    times = []
    for _ in range(count):
        # perf_counter is monotonic and high-resolution, so the measured
        # interval cannot be distorted by system clock adjustments.
        start = time.perf_counter()
        conn.execute(sql_text(update_query))  # Execute the update query
        conn.execute(sql_text(count_query))   # Execute the count query
        times.append(time.perf_counter() - start)

    return {
        'avg': np.round(np.mean(times), 4),
        'min': np.round(np.min(times), 4),
        'max': np.round(np.max(times), 4),
        'std': np.round(np.std(times), 4),
        'count': count,
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }

# Areas to benchmark, as (value, column) pairs: `column` names the listings
# column that is compared against `value` when the benchmark queries are built.
areas = [
    ('Bedford-Stuyvesant', 'neighbourhood'),
    ('Bronx', 'neighbourhood_group'),
    ('Fort Hamilton', 'neighbourhood'),
    ('Long Island City', 'neighbourhood'),
    ('Manhattan', 'neighbourhood_group'),
    ('New Springville', 'neighbourhood'),
    ('Queens', 'neighbourhood_group'),
    ('Staten Island', 'neighbourhood_group')
]


In [6]:
# Benchmark the update + count query pair for every area under four index
# configurations: no index, index on reviews.datetime, index on the area
# column of listings, and both indexes together.
results = {}
output_path = 'perf_data/update_datetimes_query.json'
datetime_index_name = "datetime_in_reviews"  # same index for every area

with db_eng.connect() as conn:
    for area, area_type in areas:
        update_key = f"update_datetimes_neigh_{area.replace(' ', '_')}"
        results[update_key] = {}

        # NOTE: the update is not idempotent; every timed run shifts the
        # matching reviews' datetime by a further 5 days.
        update_query = f"""
        UPDATE reviews SET datetime = datetime + interval '5 days'
        FROM listings WHERE listings.id = reviews.listing_id AND listings.{area_type} = '{area}';
        """
        count_query = f"""
        SELECT count(*) FROM reviews
        WHERE listing_id IN (SELECT id FROM listings WHERE {area_type} = '{area}');
        """

        # The area index (and therefore its result key) depends on whether
        # this area is matched on neighbourhood or neighbourhood_group.
        area_index_name = f"{area_type}_in_listings"
        area_index_key = f"__{area_index_name}__"
        combined_index_key = f"__{datetime_index_name}__{area_index_name}__"

        # No index
        results[update_key]['__'] = run_update_and_count_query(conn, update_query, count_query)

        # With datetime index
        add_drop_index(conn, 'add', datetime_index_name, 'datetime', 'reviews')
        results[update_key][f"__{datetime_index_name}__"] = run_update_and_count_query(conn, update_query, count_query)
        add_drop_index(conn, 'drop', datetime_index_name, 'datetime', 'reviews')

        # With area/area group index
        add_drop_index(conn, 'add', area_index_name, area_type, 'listings')
        results[update_key][area_index_key] = run_update_and_count_query(conn, update_query, count_query)
        add_drop_index(conn, 'drop', area_index_name, area_type, 'listings')

        # With both indexes
        add_drop_index(conn, 'add', datetime_index_name, 'datetime', 'reviews')
        add_drop_index(conn, 'add', area_index_name, area_type, 'listings')
        results[update_key][combined_index_key] = run_update_and_count_query(conn, update_query, count_query)
        add_drop_index(conn, 'drop', area_index_name, area_type, 'listings')
        add_drop_index(conn, 'drop', datetime_index_name, 'datetime', 'reviews')

        # Print results for each area
        print(f"{update_key}: {json.dumps(results[update_key], indent=4)}")

# Output results to a JSON file; create the target directory first so the
# measurements are not lost when it does not exist yet.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as f:
    json.dump(results, f, indent=4, sort_keys=True)

print("Update performance data recorded successfully.")


update_datetimes_neigh_Bedford-Stuyvesant: {
    "__": {
        "avg": 11.8403,
        "min": 8.7269,
        "max": 17.2456,
        "std": 3.8368,
        "count": 3,
        "timestamp": "2024-05-23 20:51:33"
    },
    "__datetime_in_reviews__": {
        "avg": 17.2783,
        "min": 16.2422,
        "max": 19.2654,
        "std": 1.4056,
        "count": 3,
        "timestamp": "2024-05-23 20:52:29"
    },
    "__neighbourhood_in_listings__": {
        "avg": 14.9379,
        "min": 11.4676,
        "max": 17.751,
        "std": 2.6069,
        "count": 3,
        "timestamp": "2024-05-23 20:53:15"
    },
    "__datetime_in_reviews__neighbourhood_in_listings__": {
        "avg": 21.2818,
        "min": 19.9904,
        "max": 22.3071,
        "std": 0.9643,
        "count": 3,
        "timestamp": "2024-05-23 20:54:25"
    }
}
update_datetimes_neigh_Bronx: {
    "__": {
        "avg": 14.03,
        "min": 12.6585,
        "max": 14.9256,
        "std": 0.9848,
        "count"