In [11]:
import json
import os
import time
from collections import OrderedDict
from datetime import datetime

import numpy as np
from sqlalchemy import create_engine, text as sql_text

# Database configuration
# The connection URL may be supplied through the AIRBNB_DB_URL environment
# variable so that credentials do not have to be edited into the notebook; when
# the variable is unset the original local URL is used unchanged.
# The 'options' connect argument sets search_path to the new_york_city schema
# for every connection created by this engine.
db_eng = create_engine(
    os.environ.get('AIRBNB_DB_URL',
                   'postgresql+psycopg2://postgres:postgres@localhost:5432/airbnb'),
    connect_args={'options': '-csearch_path=new_york_city'},
    echo=False,
)

In [24]:
# Prepare the full-text search column and its index on the reviews table.
with db_eng.connect() as conn:
    for ddl in (
        # Add the tsvector column unless it is already present
        """
    ALTER TABLE reviews ADD COLUMN IF NOT EXISTS comments_tsv tsvector;
    """,
        # Populate the column from the comments text
        """
    UPDATE reviews SET comments_tsv = to_tsvector(comments);
    """,
        # Build the GIN index unless it is already present
        """
    CREATE INDEX IF NOT EXISTS comments_tsv_in_reviews ON reviews USING GIN (comments_tsv);
    """,
    ):
        conn.execute(sql_text(ddl))
    # A single commit persists all three statements together
    conn.commit()


In [22]:
# Function to run a query multiple times and collect performance data
def run_query_and_collect_data(query, conn, count=50):
    """Execute ``query`` ``count`` times on ``conn`` and summarise the timings.

    Args:
        query: SQL string; it is wrapped with ``sql_text`` before execution.
        conn: Object exposing ``execute(statement)`` (the notebook passes a
            connection obtained from ``db_eng.connect()``).
        count: Number of timed executions; must be at least 1.

    Returns:
        dict with the keys ``avg``, ``min``, ``max`` and ``std`` (seconds as
        plain Python floats rounded to 4 decimals; ``std`` is the population
        standard deviation), ``exec_count`` (the ``count`` argument) and
        ``timestamp`` (local time at which the summary was built, formatted
        ``YYYY-MM-DD HH:MM:SS``).

    Raises:
        ValueError: if ``count`` is smaller than 1, since no statistics can be
            computed from zero samples.
    """
    if count < 1:
        raise ValueError(f"count must be at least 1, got {count}")

    # Build the statement once so that only the execution itself is timed.
    statement = sql_text(query)

    times = []
    for _ in range(count):
        # perf_counter is monotonic and high-resolution; wall-clock readings
        # can jump when the system clock is adjusted.
        start = time.perf_counter()
        conn.execute(statement)
        times.append(time.perf_counter() - start)

    # float() turns the numpy scalars into plain floats for JSON serialisation.
    return {
        'avg': round(float(np.mean(times)), 4),
        'min': round(float(np.min(times)), 4),
        'max': round(float(np.max(times)), 4),
        'std': round(float(np.std(times)), 4),
        'exec_count': count,
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }


In [25]:
results = {}

# Make sure the output directory exists before the benchmark starts: the loop
# below is long-running, and a missing directory would otherwise only surface
# at the final write, after every measurement had already been taken.
output_path = 'perf_data/text_search_query.json'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Main execution loop: one entry per (word, year) pair. Each entry holds four
# timing summaries keyed by which indexes are in play:
#   '__'                                                -> ILIKE query, datetime index dropped
#   '__comments_tsv_in_reviews__'                       -> tsvector query, datetime index dropped
#   '__datetime_in_reviews__'                           -> ILIKE query, datetime index created
#   '__datetime_in_reviews__comments_tsv_in_reviews__'  -> tsvector query, datetime index created
for year in [2009, 2010, 2011, 2012, 2013, 2014, 2017, 2019, 2023]:
    for word in ['apartment', 'awesome', 'horrible']:
        query_key = f"{word}_{year}"
        results[query_key] = {}
        with db_eng.connect() as conn:
            # The two competing formulations of the same search: substring match
            # on the raw text versus a full-text match on the tsvector column.
            # NOTE(review): if `datetime` is a timestamp column, the upper bound
            # '{year}-12-31' excludes rows later than midnight on Dec 31 — confirm.
            like_query = f"SELECT count(*) FROM reviews WHERE comments ILIKE '%%{word}%%' AND datetime BETWEEN '{year}-01-01' AND '{year}-12-31';"
            ts_query = f"SELECT count(*) FROM reviews WHERE comments_tsv @@ to_tsquery('{word}') AND datetime BETWEEN '{year}-01-01' AND '{year}-12-31';"

            # Phase 1: both queries with the datetime index dropped.
            # NOTE(review): this connection is never committed, so whether the
            # DROP/CREATE INDEX persist after the block depends on the engine's
            # transaction settings — confirm this is intended.
            conn.execute(sql_text("DROP INDEX IF EXISTS idx_datetime_reviews;"))
            results[query_key]['__'] = run_query_and_collect_data(like_query, conn)
            results[query_key]['__comments_tsv_in_reviews__'] = run_query_and_collect_data(ts_query, conn)

            # Phase 2: both queries again with the datetime index in place.
            conn.execute(sql_text("CREATE INDEX idx_datetime_reviews ON reviews (datetime);"))
            results[query_key]['__datetime_in_reviews__'] = run_query_and_collect_data(like_query, conn)
            results[query_key]['__datetime_in_reviews__comments_tsv_in_reviews__'] = run_query_and_collect_data(ts_query, conn)
            print(f"{query_key}: {json.dumps(results[query_key], indent=4)}")


# Sorting the dictionary by keys in the order of words and then years.
# rsplit on the last underscore keeps this correct even if a search word
# itself contains an underscore.
sorted_keys = sorted(results.keys(), key=lambda x: (x.rsplit('_', 1)[0], int(x.rsplit('_', 1)[1])))
sorted_results = OrderedDict((k, results[k]) for k in sorted_keys)

# Write performance data to JSON in sorted order
with open(output_path, 'w') as file:
    json.dump(sorted_results, file, indent=4)

print("Performance data recorded successfully.")

apartment_2009: {
    "__": {
        "avg": 10.231,
        "min": 10.231,
        "max": 10.231,
        "std": 0.0,
        "exec_count": 1,
        "timestamp": "2024-05-23 00:03:57"
    },
    "__comments_tsv_in_reviews__": {
        "avg": 3.2356,
        "min": 3.2356,
        "max": 3.2356,
        "std": 0.0,
        "exec_count": 1,
        "timestamp": "2024-05-23 00:04:00"
    },
    "__datetime_in_reviews__": {
        "avg": 0.0067,
        "min": 0.0067,
        "max": 0.0067,
        "std": 0.0,
        "exec_count": 1,
        "timestamp": "2024-05-23 00:04:07"
    },
    "__datetime_in_reviews__comments_tsv_in_reviews__": {
        "avg": 0.0033,
        "min": 0.0033,
        "max": 0.0033,
        "std": 0.0,
        "exec_count": 1,
        "timestamp": "2024-05-23 00:04:07"
    }
}
awesome_2009: {
    "__": {
        "avg": 5.6622,
        "min": 5.6622,
        "max": 5.6622,
        "std": 0.0,
        "exec_count": 1,
        "timestamp": "2024-05-23 00:04:13"
