#### SQL Tuning Exploration

In [31]:
# Packages
import logging
import os
import re
from urllib.parse import quote

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy import text
from sqlalchemy.orm import sessionmaker

In [139]:
# Constants
# Connection details are read from the project's .env file so that no
# credentials are hardcoded in the notebook.
load_dotenv("../src/.env")
DB_USER = os.getenv("DB_USER")
DB_NAME = "dblp"
DB_PASSWORD = os.getenv("DB_PASSWORD")
HOST = os.getenv("HOST")
PORT = os.getenv("PORT")

# Fail fast with a readable message: an unset variable would otherwise be
# formatted into the URL as the literal text "None" and only surface later as
# a confusing connection error.
_missing_settings = [
    name
    for name, value in (
        ("DB_USER", DB_USER),
        ("DB_PASSWORD", DB_PASSWORD),
        ("HOST", HOST),
        ("PORT", PORT),
    )
    if value is None
]
if _missing_settings:
    raise RuntimeError(
        "Missing database settings (expected in ../src/.env or the environment): "
        + ", ".join(_missing_settings)
    )

# Percent-encode the credentials so characters such as '@', ':' or '/' in a
# user name or password cannot be mistaken for URL delimiters. Values made of
# letters, digits and "_.-~" are left unchanged.
CONNECT_STRING = (
    f"postgresql://{quote(DB_USER, safe='')}:{quote(DB_PASSWORD, safe='')}"
    f"@{HOST}:{PORT}/{DB_NAME}"
)
SQL_CODE = "../src/sql_tuning_testing.sql"

# Settings
# Show full cell contents so long QUERY PLAN lines are not truncated.
pd.set_option('display.max_colwidth', None)

# Engine
# One engine for pandas reads, plus a session factory for DDL statements.
engine = create_engine(CONNECT_STRING)
Session = sessionmaker(engine)

In [132]:
# Build parsed_query: {"P1": {...}, "P2": {...}, ...}, one inner dictionary per
# question. Each inner dictionary maps a section label to that section's text;
# indexing_experiment reads the labels Question, Query, Indexing and Cleanup.
# NOTE(review): `test` is not defined in any cell visible here -- presumably it
# holds the contents of SQL_CODE split into one list of "/*Label: ...*/"
# sections per question. TODO: define it in this notebook so that
# Restart & Run All works on a fresh kernel.
def _parse_sections(sections):
    """Map each section's label to its text with the comment markers removed."""
    fields = {}
    for section in sections:
        # The label is the word between the opening "/*" and the first ":".
        label = re.sub(r"^/\*(\w+):[\S\s]*", r"\1", section)
        # Drop the leading "...:*/" line (or the leading "...: " prefix) ...
        body = re.sub(r"^.*:\*/\n|^.*: ", "", section)
        # ... and any remaining closing comment markers.
        fields[label] = re.sub(r"\*/", "", body)
    return fields


parsed_query = {
    f"P{position}": _parse_sections(sections)
    for position, sections in enumerate(test, start=1)
}

In [146]:
def indexing_experiment(question_num):
    """Run one indexing experiment and display its results in the notebook.

    Steps, in order:
      1. Print the question text.
      2. Run the query and display the first five rows.
      3. Display EXPLAIN (ANALYZE, BUFFERS) output before any indexes exist.
      4. Execute the 'Indexing' SQL and commit it.
      5. Display EXPLAIN (ANALYZE, BUFFERS) output again, now with the indexes.
      6. Execute the 'Cleanup' SQL and commit it.

    Step 6 runs even if step 5 raises, so a failed experiment does not leave
    its indexes behind to skew later experiments.

    Note that EXPLAIN ANALYZE actually executes the statement, so the query is
    run three times in total.

    Parameters
    ----------
    question_num : str
        Key into the module-level ``parsed_query`` dictionary, e.g. ``"P1"``.
        The entry must provide the keys 'Question', 'Query', 'Indexing' and
        'Cleanup'.

    Returns
    -------
    None
        All results are printed or displayed as a side effect.

    Raises
    ------
    KeyError
        If ``question_num`` or one of the required keys is missing.
    """
    context = parsed_query[question_num]
    explain_sql = "EXPLAIN (ANALYZE,BUFFERS) " + context['Query']

    # Show Question
    print(context['Question'])

    # Show query result
    print("Query Result (First 5 rows)")
    display(pd.read_sql(context['Query'], engine).head())

    # Explain results before indexing
    print('EXPLAIN results')
    display(pd.read_sql(explain_sql, engine))

    # Run indexing
    with Session() as session:
        session.execute(text(context['Indexing']))
        session.commit()

    # Cleanup is guarded only from this point on: if the indexing step itself
    # fails, nothing was committed and there is nothing to clean up.
    try:
        # Re-run Explain
        print('EXPLAIN results')
        display(pd.read_sql(explain_sql, engine))
    finally:
        # Cleanup indexes
        with Session() as session:
            session.execute(text(context['Cleanup']))
            session.commit()

##### Question 1:
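We can see that after the indexing step the plan changes from a hash-based aggregation (`HashAggregate`) to a parallel, sort-based one (`Gather Merge` feeding a `Finalize GroupAggregate`), and the top-level node's actual time drops from roughly 1,527 ms to roughly 956 ms. Note that the second run also reports more shared-buffer hits (135,348 vs. 31,928), so part of the speed-up may come from a warmer cache rather than from the indexes alone.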

In [147]:
indexing_experiment("P1")

Write a SQL Query to find all the conferences held in 2018 that have published at least 200 papers in a single decade.


Query Result (First 5 rows)


Unnamed: 0,booktitle
0,COLINS
1,ICL (2)
2,CBI (1)
3,ICSOC
4,FG


EXPLAIN results


Unnamed: 0,QUERY PLAN
0,HashAggregate (cost=609916.11..609918.11 rows=200 width=8) (actual time=1527.338..1527.457 rows=1456 loops=1)
1,Group Key: inp.booktitle
2,Batches: 1 Memory Usage: 145kB
3,Buffers: shared hit=31928 read=133286
4,-> HashAggregate (cost=549876.72..608476.32 rows=115183 width=48) (actual time=1526.077..1526.970 rows=2691 loops=1)
5,"Group Key: inp.booktitle, substr(inp.year, 3, 1)"
6,Filter: (count(*) >= 200)
7,Planned Partitions: 8 Batches: 1 Memory Usage: 2065kB
8,Rows Removed by Filter: 3890
9,Buffers: shared hit=31928 read=133286


EXPLAIN results


Unnamed: 0,QUERY PLAN
0,Unique (cost=416268.58..508132.91 rows=200 width=8) (actual time=944.256..956.138 rows=1456 loops=1)
1,Buffers: shared hit=135348 read=125840 written=12
2,-> Subquery Scan on paper_counts (cost=416268.58..507844.95 rows=115183 width=8) (actual time=944.255..955.860 rows=2691 loops=1)
3,Buffers: shared hit=135348 read=125840 written=12
4,-> Finalize GroupAggregate (cost=416268.58..506693.12 rows=115183 width=48) (actual time=944.254..955.725 rows=2691 loops=1)
5,"Group Key: inp.booktitle, (substr(inp.year, 3, 1))"
6,Filter: (count(*) >= 200)
7,Rows Removed by Filter: 3890
8,Buffers: shared hit=135348 read=125840 written=12
9,-> Gather Merge (cost=416268.58..496902.54 rows=691100 width=48) (actual time=944.243..953.757 rows=11304 loops=1)
