In [7]:
import json
import os
import time
from datetime import datetime

import numpy as np
from sqlalchemy import create_engine, text as sql_text

# Database configuration.
# The connection URL can be overridden with the AIRBNB_DB_URL environment
# variable so that real credentials do not have to be stored in the notebook;
# when the variable is unset the original local development URL is used.
db_url = os.environ.get(
    'AIRBNB_DB_URL',
    'postgresql+psycopg2://postgres:postgres@localhost:5432/airbnb',
)
db_eng = create_engine(db_url,
                       # Passes search_path=new_york_city as a connection option.
                       connect_args={'options': '-csearch_path=new_york_city'},
                       echo=False)

def add_drop_index(conn, action, index_name, column, table):
    """Create or drop an index, depending on ``action``.

    Args:
        conn: Object whose ``execute`` method receives the SQL statement.
        action: ``'add'`` to create the index, ``'drop'`` to remove it.
        index_name: Name of the index to create or drop.
        column: Column the index covers (only used when adding).
        table: Table the index is created on (only used when adding).

    Raises:
        ValueError: If ``action`` is neither ``'add'`` nor ``'drop'``.

    The identifiers are interpolated into the SQL text verbatim, so they must
    come from trusted code, never from user input.
    """
    # NOTE(review): BEGIN/COMMIT are sent as raw SQL. If the caller already has
    # a transaction open on this connection, the COMMIT presumably ends it as
    # well -- confirm that this is intended.
    if action == 'add':
        statement = f"""
        BEGIN;
        CREATE INDEX IF NOT EXISTS {index_name} ON {table} ({column});
        COMMIT;
        """
    elif action == 'drop':
        statement = f"""
        BEGIN;
        DROP INDEX IF EXISTS {index_name};
        COMMIT;
        """
    else:
        # A misspelled action used to be a silent no-op, which would leave a
        # benchmark running without the index it claims to measure.
        raise ValueError(f"action must be 'add' or 'drop', got {action!r}")
    conn.execute(sql_text(statement))

def reset_reviews_table(conn):
    """Rebuild the ``reviews`` table from a copy of its rows before a test.

    The rows are copied into ``copy_table``, the existing ``reviews`` table is
    dropped, and the copy is renamed back to ``reviews``.
    """
    # NOTE(review): the copy is taken from whatever ``reviews`` holds right
    # now, so rows changed by earlier UPDATEs are carried over -- confirm that
    # this is the intended meaning of "reset".
    rebuild_steps = (
        "CREATE TABLE copy_table AS SELECT * FROM reviews;",
        "DROP TABLE reviews;",
        "ALTER TABLE copy_table RENAME TO reviews;",
    )
    for step in rebuild_steps:
        conn.execute(sql_text(step))

def run_update_and_count_query(conn, update_query, count_query, count=50):
    """Time an update query followed by a count query over repeated runs.

    Args:
        conn: Object whose ``execute`` method receives each statement.
        update_query: SQL text of the update statement.
        count_query: SQL text of the count statement run right after it.
        count: Number of timed repetitions; must be at least 1.

    Returns:
        dict with the ``avg``, ``min``, ``max`` and ``std`` of the per-run
        durations in seconds (rounded to 4 decimals), the ``count`` of runs,
        and a ``timestamp`` string taken when the measurements finished.

    Raises:
        ValueError: If ``count`` is less than 1 (no timings to summarise).
    """
    if count < 1:
        raise ValueError(f"count must be at least 1, got {count!r}")

    # Build the statements once so that only query execution is timed.
    update_stmt = sql_text(update_query)
    count_stmt = sql_text(count_query)

    times = []
    for _ in range(count):
        # perf_counter is monotonic and high resolution, so the measurement
        # is unaffected by system clock adjustments.
        start = time.perf_counter()
        conn.execute(update_stmt)
        conn.execute(count_stmt)
        times.append(time.perf_counter() - start)

    return {
        'avg': np.round(np.mean(times), 4),
        'min': np.round(np.min(times), 4),
        'max': np.round(np.max(times), 4),
        'std': np.round(np.std(times), 4),
        'count': count,
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }

# Each entry pairs a value with the listings column it is matched against.
NEIGHBOURHOOD_COL = 'neighbourhood'
NEIGHBOURHOOD_GROUP_COL = 'neighbourhood_group'

areas = [
    ('Bedford-Stuyvesant', NEIGHBOURHOOD_COL),
    ('Bronx', NEIGHBOURHOOD_GROUP_COL),
    ('Fort Hamilton', NEIGHBOURHOOD_COL),
    ('Long Island City', NEIGHBOURHOOD_COL),
    ('Manhattan', NEIGHBOURHOOD_GROUP_COL),
    ('New Springville', NEIGHBOURHOOD_COL),
    ('Queens', NEIGHBOURHOOD_GROUP_COL),
    ('Staten Island', NEIGHBOURHOOD_GROUP_COL),
]


In [8]:
# Benchmark the datetime UPDATE (plus a follow-up COUNT) for every area under
# four index configurations: no index, index on reviews.datetime, index on the
# listings area column, and both indexes together.
results = {}
benchmark_completed = False  # set only once every area ran and the commit succeeded
datetime_index_name = "datetime_in_reviews"

with db_eng.connect() as conn:
    trans = conn.begin()
    try:
        for area, area_type in areas:

            update_key = f"update_datetimes_neigh_{area.replace(' ', '_')}"
            results[update_key] = {}

            # `area` and `area_type` come from the fixed `areas` list, not from
            # user input, so they are interpolated into the SQL text directly.
            update_query = f"""
            UPDATE reviews SET datetime = datetime + interval '5 days'
            FROM listings WHERE listings.id = reviews.listing_id AND listings.{area_type} = '{area}';
            """
            count_query = f"""
            SELECT count(*) FROM reviews
            WHERE listing_id IN (SELECT id FROM listings WHERE {area_type} = '{area}');
            """

            # (index_name, column, table) for each index that can be enabled.
            datetime_index = (datetime_index_name, 'datetime', 'reviews')
            area_index = (f"{area_type}_in_listings", area_type, 'listings')

            # Result key -> indexes present while that scenario is measured.
            scenarios = [
                ('__', []),
                ('__datetime_in_reviews__', [datetime_index]),
                ('__neigh_in_listings__', [area_index]),
                ('__datetime_in_reviews__neigh_in_listings__', [datetime_index, area_index]),
            ]

            for result_key, indexes in scenarios:
                reset_reviews_table(conn)
                for index in indexes:
                    add_drop_index(conn, 'add', *index)
                results[update_key][result_key] = run_update_and_count_query(conn, update_query, count_query)
                # Drop in reverse creation order so the next scenario starts without indexes.
                for index in reversed(indexes):
                    add_drop_index(conn, 'drop', *index)

            print(f"{update_key}: {json.dumps(results[update_key], indent=4)}")

        trans.commit()  # Commit the transaction if all operations were successful
        benchmark_completed = True
    except Exception as e:
        trans.rollback()  # Roll back the transaction in case of an error
        print(f"An error occurred: {e}")

# Output results to a JSON file. Whatever was measured before a failure is
# still written, because a full run takes a long time to repeat.
os.makedirs('perf_data', exist_ok=True)
with open('perf_data/update_datetimes_query.json', 'w') as f:
    json.dump(results, f, indent=4, sort_keys=True)

if benchmark_completed:
    print("Update performance data recorded successfully.")
else:
    # Do not report success when the run was interrupted by an error.
    print("Benchmark did not complete; only partial performance data was recorded.")

update_datetimes_neigh_Bedford-Stuyvesant: {
    "__": {
        "avg": 7.5524,
        "min": 3.0716,
        "max": 14.1244,
        "std": 2.5688,
        "count": 50,
        "timestamp": "2024-05-24 03:23:36"
    },
    "__datetime_in_reviews__": {
        "avg": 8.2498,
        "min": 1.713,
        "max": 20.1522,
        "std": 4.821,
        "count": 50,
        "timestamp": "2024-05-24 03:30:46"
    },
    "__neigh_in_listings__": {
        "avg": 4.7111,
        "min": 0.9016,
        "max": 19.1962,
        "std": 4.3694,
        "count": 50,
        "timestamp": "2024-05-24 03:35:15"
    },
    "__datetime_in_reviews__neigh_in_listings__": {
        "avg": 8.1511,
        "min": 1.8341,
        "max": 14.2975,
        "std": 4.1339,
        "count": 50,
        "timestamp": "2024-05-24 03:42:26"
    }
}
update_datetimes_neigh_Bronx: {
    "__": {
        "avg": 1.5506,
        "min": 0.6549,
        "max": 4.9441,
        "std": 1.0727,
        "count": 50,
        "timest