In [1]:
import json
import os
import pprint
import time
from datetime import datetime

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text as sql_text
from sqlalchemy.pool import QueuePool

# Ensure 'perf_data' directory exists
os.makedirs('perf_data', exist_ok=True)

# Connection settings. The URL can be supplied through the AIRBNB_DB_URL
# environment variable so that credentials do not have to live in the notebook;
# the fallback keeps the original local-development default.
DB_URL = os.environ.get(
    'AIRBNB_DB_URL',
    'postgresql+psycopg2://postgres:postgres@localhost:5432/airbnb',
)
DB_SCHEMA = 'new_york_city'

# Connect to the database with connection pooling
db_eng = create_engine(
    DB_URL,
    # Passed to the driver as a libpq option so every connection uses DB_SCHEMA
    # as its search_path.
    connect_args={'options': f'-csearch_path={DB_SCHEMA}'},
    isolation_level='SERIALIZABLE',
    poolclass=QueuePool,
    pool_size=10,
    max_overflow=20,
    pool_timeout=30,    # seconds to wait for a free pooled connection
    pool_recycle=1800   # seconds after which a pooled connection is replaced
)


In [2]:
# Function to add or drop the benchmark index on a table column
def add_drop_index(engine, action, column, table):
    """Create or drop the index named ``idx_<column>_in_<table>``.

    Args:
        engine: SQLAlchemy engine used to run the DDL statement.
        action: ``'add'`` to create the index, ``'drop'`` to remove it.
        column: Column to index. ``'comments_tsv'`` gets a GIN index, every
            other column a btree index.
        table: Table that owns the column.

    Raises:
        ValueError: If ``action`` is neither ``'add'`` nor ``'drop'``.

    Both statements are idempotent (``IF NOT EXISTS`` / ``IF EXISTS``), so the
    function can be called regardless of the current state of the index.
    ``column`` and ``table`` are interpolated into the statement unescaped and
    must therefore come from trusted code, never from user input.
    """
    # Validate first so a typo fails loudly instead of surfacing as an
    # unbound-variable error further down.
    if action not in ('add', 'drop'):
        raise ValueError(f"action must be 'add' or 'drop', got {action!r}")

    index_name = f"idx_{column}_in_{table}"

    if action == 'add':
        index_type = 'GIN' if column == 'comments_tsv' else 'btree'
        query = sql_text(
            f"CREATE INDEX IF NOT EXISTS {index_name} "
            f"ON {table} USING {index_type}({column});"
        )
    else:
        query = sql_text(f"DROP INDEX IF EXISTS {index_name};")

    # engine.begin() commits when the block exits without error, so the DDL is
    # persisted instead of being rolled back when the connection is released.
    with engine.begin() as conn:
        conn.execute(query)

# Elapsed seconds between two datetime values
def time_diff(start_time, end_time):
    """Return the number of seconds from ``start_time`` to ``end_time`` as a float."""
    elapsed = end_time - start_time
    return elapsed.total_seconds()

# Function to execute a query several times and time each execution
def run_query(query, conn, repeats=2):
    """Execute ``query`` ``repeats`` times and return the individual durations.

    Args:
        query: SQL string to execute.
        conn: Open SQLAlchemy connection the statement is executed on.
        repeats: Number of executions to time (defaults to 2).

    Returns:
        A list with one elapsed time in seconds (float) per execution. The
        samples are returned as-is; aggregation is left to the caller.
    """
    statement = sql_text(query)
    times = []
    for _ in range(repeats):
        # perf_counter is monotonic and high resolution, unlike wall-clock time,
        # which can jump when the system clock is adjusted.
        started = time.perf_counter()
        conn.execute(statement)
        times.append(time.perf_counter() - started)
    return times

# Summarise a list of timings into a small statistics dictionary
def compute_metrics(times):
    """Return avg/min/max/std (rounded to 4 decimals), the sample count and a timestamp.

    Args:
        times: Sequence of elapsed times.

    Returns:
        dict with the keys ``avg``, ``min``, ``max``, ``std``, ``exec_count``
        and ``timestamp`` (formatted ``YYYY-MM-DD HH:MM:SS``).
    """
    reducers = (
        ("avg", np.mean),
        ("min", np.min),
        ("max", np.max),
        ("std", np.std),
    )
    summary = {label: round(reducer(times), 4) for label, reducer in reducers}
    summary["exec_count"] = len(times)
    summary["timestamp"] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    return summary

# Build the key that describes which indexes were active for a measurement
def build_index_description_key(all_indexes, spec):
    """Return ``__<col>_in_<table>`` for every index of ``all_indexes`` present in ``spec``.

    The parts are concatenated in the order of ``all_indexes`` and the result
    always ends with ``__`` (so an empty ``spec`` yields just ``"__"``).
    """
    active_parts = [
        f"__{entry[0]}_in_{entry[1]}"
        for entry in all_indexes
        if entry in spec
    ]
    return "".join(active_parts) + "__"

# Load previously stored performance data
def fetch_perf_data(filename):
    """Return the JSON content of ``filename``.

    A missing file and an empty file both yield an empty dict; any other
    content is parsed as JSON.
    """
    try:
        with open(filename) as source:
            raw = source.read()
    except FileNotFoundError:
        return {}
    return json.loads(raw) if raw else {}

# Persist performance data as JSON
def write_perf_data(data, filename):
    """Overwrite ``filename`` with ``data`` serialised as JSON (4-space indent)."""
    with open(filename, mode='w') as sink:
        json.dump(data, sink, indent=4)


In [3]:
# Step 3b: full-text search benchmark configuration
# Maps each search word to the tsvector predicate used in the WHERE clause.
text_search_queries = {
    term: f"comments_tsv @@ to_tsquery('{term}')"
    for term in ('awesome', 'horrible', 'apartment')
}

# Review years that are benchmarked (one query per word and year).
years = [2009, 2010, 2011, 2012, 2013, 2014, 2017, 2019, 2023]

# Results are accumulated in this JSON file; start it as an empty object if it
# does not exist yet, then load whatever has been recorded so far.
perf_summary_text_search_path = 'perf_data/text_search_query.json'
summary_file_missing = not os.path.exists(perf_summary_text_search_path)
if summary_file_missing:
    with open(perf_summary_text_search_path, 'w') as f:
        json.dump({}, f)

perf_summary_text_search = fetch_perf_data(perf_summary_text_search_path)


In [4]:
# Every index that may take part in the benchmark, as [column, table] pairs.
all_indexes = [['datetime', 'reviews'], ['comments_tsv', 'reviews']]

# Index combinations to benchmark: none, each index alone, both together.
specs_text_search = [
    [],
    [['datetime', 'reviews']],
    [['comments_tsv', 'reviews']],
    [['datetime', 'reviews'], ['comments_tsv', 'reviews']]
]

# Number of measurement rounds per query; each round opens one connection and
# times both query variants on it (run_query repeats each variant internally).
N_ROUNDS = 50


def _build_text_search_queries(word, condition, year):
    """Return the two COUNT(*) queries for ``word`` restricted to ``year``.

    The first query filters with ``condition`` (the tsvector predicate), the
    second with a case-insensitive substring match on ``comments``.
    """
    date_start = f"{year}-01-01 00:00:00"
    date_end = f"{year}-12-31 23:59:59"
    query_with_index = f"""
        SELECT count(*)
        FROM reviews r
        WHERE {condition}
          AND datetime >= '{date_start}'
          AND datetime <= '{date_end}'
        """
    # NOTE(review): inside an f-string '%%' stays two literal percent signs;
    # confirm the doubling is intended for the driver's percent escaping.
    query_without_index = f"""
        SELECT count(*)
        FROM reviews r
        WHERE comments ILIKE '%%{word}%%'
          AND datetime >= '{date_start}'
          AND datetime <= '{date_end}'
        """
    return query_with_index, query_without_index


# The index configuration only depends on the spec, so it is set up once per
# spec (outer loop) instead of once per (word, year) combination.
for spec in specs_text_search:
    print('Processing text search spec: ', str(spec), '\n')

    # Start every spec from a clean slate: drop all benchmark indexes, then
    # create exactly the ones listed in the spec. This also avoids trying to
    # create an index that is still present from the previous spec.
    for index in all_indexes:
        add_drop_index(db_eng, 'drop', index[0], index[1])
        print('\nAfter dropping', str(index))

    for index in spec:
        add_drop_index(db_eng, 'add', index[0], index[1])
        print('\nAfter adding', str(index))

    key_prefix = build_index_description_key(all_indexes, spec)
    key_value_with_index = key_prefix + 'with_index'
    key_value_without_index = key_prefix + 'without_index'

    for word, condition in text_search_queries.items():
        for year in years:
            query_name = f"{word}_{year}"
            query_with_index, query_without_index = _build_text_search_queries(
                word, condition, year
            )

            time_list_with_index = []
            time_list_without_index = []
            for _ in range(N_ROUNDS):
                with db_eng.connect() as conn:
                    time_list_with_index.extend(run_query(query_with_index, conn))
                    time_list_without_index.extend(run_query(query_without_index, conn))

            perf_profile_with_index = compute_metrics(time_list_with_index)
            perf_profile_without_index = compute_metrics(time_list_without_index)

            # Only the summary statistics are printed; the raw samples and the
            # whole accumulated summary would flood the cell output.
            print('\nQuery:', query_name)
            print('\nThe new value for "' + key_value_with_index + '" will be', str(perf_profile_with_index))
            print('\nThe new value for "' + key_value_without_index + '" will be', str(perf_profile_without_index))

            # Merge into the results recorded for this query so far and persist
            # after every measurement, so an interrupted run keeps its progress.
            perf_dict = perf_summary_text_search.setdefault(query_name, {})
            perf_dict[key_value_with_index] = perf_profile_with_index
            perf_dict[key_value_without_index] = perf_profile_without_index

            write_perf_data(perf_summary_text_search, perf_summary_text_search_path)

print("Text search JSON files created successfully.")


{'update_datetimes_neigh_group_Manhattan_add': '\n'
                                               '    UPDATE reviews r\n'
                                               '    SET datetime = datetime + '
                                               "interval '5 days'\n"
                                               '    FROM listings l\n'
                                               '    WHERE l.id = r.listing_id\n'
                                               '      AND '
                                               'l.neighbourhood_group = '
                                               "'Manhattan'\n"
                                               "    RETURNING 'done';\n"
                                               '    ',
 'update_datetimes_neigh_group_Manhattan_subtract': '\n'
                                                    '    UPDATE reviews r\n'
                                                    '    SET datetime = '
                                          