In [1]:
import numpy as np
from sqlalchemy import create_engine, text as sql_text
import json
from datetime import datetime
from collections import OrderedDict

# Database configuration
db_eng = create_engine('postgresql+psycopg2://postgres:postgres@localhost:5432/airbnb',
                       connect_args={'options': '-csearch_path=new_york_city'},
                       echo=False)


In [7]:
with db_eng.connect() as conn:
    # Add a column if it does not exist
    conn.execute(sql_text("""
    ALTER TABLE reviews ADD COLUMN IF NOT EXISTS comments_tsv tsvector;
    """))
    # Update the new column with data
    conn.execute(sql_text("""
    UPDATE reviews SET comments_tsv = to_tsvector(comments);
    """))
    # Create a GIN index if it does not exist
    conn.execute(sql_text("""
    CREATE INDEX IF NOT EXISTS comments_tsv_in_reviews ON reviews USING GIN (comments_tsv);
    """))
    conn.commit()  # Ensure all changes are committed to the database

In [8]:
def add_drop_index(conn, action, index_name, column, table):
    """Add or drop an index based on the action."""
    if action == 'add':
        query = f"""
        BEGIN;
        CREATE INDEX IF NOT EXISTS {index_name} ON {table} ({column});
        COMMIT;
        """
    elif action == 'drop':
        query = f"""
        BEGIN;
        DROP INDEX IF EXISTS {index_name};
        COMMIT;
        """

# Function to run a query multiple times and collect performance data
def run_query_and_collect_data(query, conn, count=50):
    times = []
    for _ in range(count):
        start = datetime.now()
        conn.execute(sql_text(query))
        end = datetime.now()
        times.append((end - start).total_seconds())
    return {
        'avg': np.round(np.mean(times), 4),
        'min': np.round(np.min(times), 4),
        'max': np.round(np.max(times), 4),
        'std': np.round(np.std(times), 4),
        'count': count,
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }

words = ['horrible', 'awesome', 'apartment']
years = [2009, 2010, 2011, 2012, 2013, 2014, 2017, 2019, 2023]
results = {}

with db_eng.connect() as conn:
    for year in years:
        for word in words:
            query_key = f"{word}_{year}"
            results[query_key] = {}
            
            # Define the index name
            index_name = f"datetime_idx_on_{year}"
            
            ts_query = f"SELECT count(*) FROM reviews WHERE comments_tsv @@ to_tsquery('{word}') AND datetime BETWEEN '{year}-01-01' AND '{year}-12-31';"
            ilike_query = f"SELECT count(*) FROM reviews WHERE comments ILIKE '%%{word}%%' AND datetime BETWEEN '{year}-01-01' AND '{year}-12-31';"
            
            # Drop datetime index for the LIKE query
            add_drop_index(conn, 'drop', index_name, 'datetime', 'reviews')
            results[query_key]['__'] = run_query_and_collect_data(ilike_query, conn)
            results[query_key]['__comments_tsv_in_reviews__'] = run_query_and_collect_data(ts_query, conn)

            # Re-create datetime index for the indexed queries
            add_drop_index(conn, 'add', index_name, 'datetime', 'reviews')
            results[query_key]['__datetime_in_reviews__'] = run_query_and_collect_data(ilike_query, conn)
            results[query_key]['__datetime_in_reviews__comments_tsv_in_reviews__'] = run_query_and_collect_data(ts_query, conn)
            print(f"{query_key}: {json.dumps(results[query_key], indent=4)}")


# Sorting the dictionary by keys in the order of words and then years
sorted_keys = sorted(results.keys(), key=lambda x: (x.split('_')[0], int(x.split('_')[1])))
sorted_results = OrderedDict((k, results[k]) for k in sorted_keys)

# Write performance data to JSON in sorted order
with open('perf_data/text_search_query.json', 'w') as file:
    json.dump(sorted_results, file, indent=4)

print("Performance data has been recorded successfully.")

horrible_2009: {
    "__": {
        "avg": 2.0068,
        "min": 1.301,
        "max": 4.7836,
        "std": 0.5851,
        "count": 50,
        "timestamp": "2024-05-23 22:05:35"
    },
    "__comments_tsv_in_reviews__": {
        "avg": 0.0011,
        "min": 0.0003,
        "max": 0.0336,
        "std": 0.0047,
        "count": 50,
        "timestamp": "2024-05-23 22:05:35"
    },
    "__datetime_in_reviews__": {
        "avg": 1.6343,
        "min": 1.2567,
        "max": 2.7094,
        "std": 0.3412,
        "count": 50,
        "timestamp": "2024-05-23 22:06:57"
    },
    "__datetime_in_reviews__comments_tsv_in_reviews__": {
        "avg": 0.0006,
        "min": 0.0005,
        "max": 0.0019,
        "std": 0.0002,
        "count": 50,
        "timestamp": "2024-05-23 22:06:57"
    }
}
awesome_2009: {
    "__": {
        "avg": 2.2477,
        "min": 1.6357,
        "max": 4.4618,
        "std": 0.5297,
        "count": 50,
        "timestamp": "2024-05-23 22:08:50"
    },
