In [1]:
import pandas as pd
from sql_metadata import Parser
from collections import defaultdict

# Show full cell contents (no truncation) so long SQL statements stay readable in DataFrame output.
pd.options.display.max_colwidth = None

In [13]:
# Exploratory check: list the files in the local PostgreSQL log directory.
# NOTE(review): this is a machine-specific absolute path; the cell will not work on other hosts.
!ls /usr/local/var/postgres/log

postgresql-2022-02-10_211004.csv postgresql-2022-02-10_211004.log


In [30]:
# NOTE(review): despite its name, LOG_DIRECTORY holds the path of a single CSV file,
# not a directory. The name is kept because other cells may reference it.
LOG_DIRECTORY = 'epinions_workload.csv'

# The file has no header row. Only the fields at zero-based positions 7 and 13 are
# loaded and labelled query_type / query_text (presumably the command tag and
# message fields of a PostgreSQL csvlog -- TODO confirm against the log format).
df = pd.read_csv(
  LOG_DIRECTORY,
  header=None,
  usecols=[7, 13],
  names=["query_type", "query_text"],
)

# Preview a small slice of the log rather than the whole frame.
df.iloc[70:80]

Unnamed: 0,query_type,query_text
70,COMMIT,execute S_1: COMMIT
71,BEGIN,execute <unnamed>: BEGIN
72,UPDATE,execute <unnamed>: UPDATE review SET rating = $1 WHERE i_id=$2 AND u_id=$3
73,COMMIT,execute S_1: COMMIT
74,BEGIN,execute <unnamed>: BEGIN
75,UPDATE,execute <unnamed>: UPDATE trust SET trust = $1 WHERE source_u_id=$2 AND target_u_id=$3
76,COMMIT,execute S_1: COMMIT
77,BEGIN,execute <unnamed>: BEGIN
78,UPDATE,execute <unnamed>: UPDATE item SET title = $1 WHERE i_id=$2
79,COMMIT,execute S_1: COMMIT


In [16]:
# Maps each Epinions table name to the set of column names attributed to it.
# Used to resolve unqualified column names in logged queries back to a table.
# NOTE(review): the captured workload contains
# "SELECT * FROM review r ... ORDER BY creation_date", yet "creation_date" is not
# listed under "review" here -- confirm whether the omission is intentional.
table_to_columns_epinions = {
  "item": {"i_id", "title"},
  "useracct": {"u_id", "name"},
  "review": {"a_id", "u_id", "i_id", "rating", "rank"},
  "trust": {"source_u_id", "target_u_id", "trust", "creation_date"},
  "review_rating": {
    "u_id",
    "a_id",
    "rating",
    "status",
    "creation_date",
    "last_mod_date",
    "type",
    "vertical_id",
  },
}

In [28]:
# NOTE(review): `qs` is not referenced by any cell visible here; it is kept in case
# a later cell relies on it.
qs = set()
def parse_query(query_str : str, ttc_mapping : dict):
  """Filter one logged statement, returning its SQL text if it is worth keeping.

  Args:
    query_str: Raw log message, e.g. "execute <unnamed>: UPDATE item SET ...".
      Non-string values (such as the NaN pandas uses for an empty field) are
      rejected.
    ttc_mapping: Maps a table name to the collection of its column names.
      Tables missing from the mapping are treated as having no known columns.

  Returns:
    The statement text after the first ':' (leading whitespace is preserved)
    when the statement references at least one table and at least one column
    that is either table-qualified ("t.col") or listed in `ttc_mapping` for one
    of the statement's tables. Returns None for everything else.
  """
  # A missing log field arrives as NaN (a float); substring checks on it would raise.
  if not isinstance(query_str, str):
    return None

  # Skip anything mentioning "pg_" (presumably PostgreSQL system-catalog lookups).
  if "pg_" in query_str:
    return None

  # Remove the "execute <unnamed>:" style prefix. find() returns -1 when there is
  # no ':', in which case the slice starts at 0 and the string is kept whole.
  query_str = query_str[query_str.find(':')+1:]

  # Ignore transaction-control statements.
  if "BEGIN" in query_str or "COMMIT" in query_str:
    return None

  try:
    p = Parser(query_str)
    tables = p.tables
    columns = p.columns
  except Exception:
    # Statement the parser cannot handle. Catch Exception rather than using a
    # bare except so KeyboardInterrupt/SystemExit still propagate.
    return None

  # Skip statements for which the parser found no tables or no columns.
  if len(tables) == 0 or len(columns) == 0:
    return None

  # Keep the statement as soon as one column can be tied to a table: either it
  # is already qualified ("table.column") or a referenced table lists it.
  for column in columns:
    if "." in column:
      return query_str
    # .get() so a table absent from the mapping does not raise KeyError.
    if any(column in ttc_mapping.get(table, ()) for table in tables):
      return query_str

  return None

In [29]:
# Only rows that carry a query type are considered; each remaining message is
# run through parse_query and the ones it rejects (None) are dropped.
logged_statements = df.loc[df["query_type"].notna(), "query_text"]
queries = [
  kept
  for kept in (parse_query(text, table_to_columns_epinions) for text in logged_statements)
  if kept is not None
]

# Display the distinct statements that survived filtering.
set(queries)


Not supported query type:  SET extra_float_digits = 3
Not supported query type:  SET application_name = 'PostgreSQL JDBC Driver'
Not supported query type:  SET extra_float_digits = 3
Not supported query type:  SET application_name = 'PostgreSQL JDBC Driver'
Not supported query type:  SET extra_float_digits = 3
Not supported query type:  SET application_name = 'PostgreSQL JDBC Driver'
Not supported query type:  SET SESSION CHARACTERISTICS AS TRANSACTION ISOLATION LEVEL SERIALIZABLE
Not supported query type:  SET extra_float_digits = 3
Not supported query type:  SET application_name = 'PostgreSQL JDBC Driver'
Not supported query type:  SHOW ALL
Not supported query type:  ALTER SYSTEM SET log_destination='stderr'
Not supported query type:  ALTER SYSTEM SET logging_collector='off'
Not supported query type:  ALTER SYSTEM SET log_statement='none'


{' SELECT * FROM review r WHERE r.i_id=$1 ORDER BY creation_date DESC',
 ' SELECT * FROM review r, item i WHERE i.i_id = r.i_id and r.i_id=$1 ORDER BY rating DESC, r.creation_date DESC LIMIT 10',
 ' SELECT * FROM review r, useracct u WHERE u.u_id = r.u_id AND r.u_id=$1 ORDER BY rating DESC, r.creation_date DESC LIMIT 10',
 ' SELECT * FROM trust t WHERE t.source_u_id=$1',
 ' SELECT avg(rating) FROM review r WHERE r.i_id=$1',
 ' SELECT avg(rating) FROM review r, trust t WHERE r.u_id=t.target_u_id AND r.i_id=$1 AND t.source_u_id=$2',
 ' SELECT i_id FROM item',
 ' SELECT u_id FROM useracct',
 ' UPDATE item SET title = $1 WHERE i_id=$2',
 ' UPDATE review SET rating = $1 WHERE i_id=$2 AND u_id=$3',
 ' UPDATE trust SET trust = $1 WHERE source_u_id=$2 AND target_u_id=$3',
 ' UPDATE useracct SET name = $1 WHERE u_id=$2'}

In [22]:
def build_actions_str(indexes : list, filename : str) -> list:
  """Build shell `echo` commands that write CREATE INDEX statements to a file.

  Args:
    indexes: Strings of the form "table.column"; each must contain exactly one
      "." or unpacking the split result raises ValueError.
    filename: Target file name, interpolated verbatim into every command.

  Returns:
    A list of command strings. The first is always a progress message. It is
    followed by one command per index: the first index uses ">" (overwrite),
    every later one uses ">>" (append).
  """
  commands = [f'echo "Generating sql commands for {filename}"']

  redirect = ">"  # the first statement truncates the file, later ones append
  for entry in indexes:
    table, column = entry.split(".")
    commands.append(
      f'echo "CREATE INDEX idx_{table}_{column} ON {table}({column});" {redirect} {filename}'
    )
    redirect = ">>"

  return commands


def build_drop_idx_cmd(indexes : list) -> str:
  """Build a single DROP INDEX statement covering every given index.

  Args:
    indexes: Strings of the form "table.column"; each must contain exactly one
      "." or unpacking the split result raises ValueError.

  Returns:
    "DROP INDEX idx_<table>_<column>, ...;" with names in input order. For an
    empty list the result is the bare text "DROP INDEX" with no terminator.
  """
  index_names = []
  for entry in indexes:
    table, column = entry.split(".")
    index_names.append(f"idx_{table}_{column}")

  # With nothing to drop, no name list and no ';' are appended.
  if not index_names:
    return "DROP INDEX"

  return "DROP INDEX " + ", ".join(index_names) + ";"



  
  
      
