In [12]:
import pandas as pd
from sql_metadata import Parser
from collections import defaultdict

# Disable column-width truncation so full query strings are visible when frames are displayed.
pd.set_option('display.max_colwidth', None)

In [13]:
# List the files present in the local PostgreSQL log directory.
!ls /usr/local/var/postgres/log

postgresql-2022-02-10_211004.csv postgresql-2022-02-10_211004.log


In [35]:
# NOTE: despite its name, LOG_DIRECTORY holds the path of a single CSV log file.
LOG_DIRECTORY = '/usr/local/var/postgres/log/postgresql-2022-02-10_211004.csv'

# The log has no header row; keep only three positional columns and name them here.
# NOTE(review): positions 7/13/14 presumably are csvlog's command_tag/message/detail -- confirm.
df = pd.read_csv(
  LOG_DIRECTORY,
  header=None,
  usecols=[7, 13, 14],
  names=["query_type", "query_text", "params"],
)
df.head(10)

Unnamed: 0,query_type,query_text,params
0,,ending log output to stderr,
1,,"starting PostgreSQL 14.1 on x86_64-apple-darwin21.1.0, compiled by Apple clang version 13.0.0 (clang-1300.0.29.3), 64-bit",
2,,"listening on IPv6 address ""::1"", port 5432",
3,,"listening on IPv4 address ""127.0.0.1"", port 5432",
4,,"listening on Unix socket ""/tmp/.s.PGSQL.5432""",
5,,database system was shut down at 2022-02-10 21:10:02 EST,
6,,database system is ready to accept connections,
7,BEGIN,execute <unnamed>: BEGIN,
8,SET,execute <unnamed>: SET extra_float_digits = 3,
9,SET,execute <unnamed>: SET application_name = 'PostgreSQL JDBC Driver',


In [17]:
# Table name -> set of that table's column names; used to attribute an
# unqualified column in a query to the table it belongs to.
table_to_columns_epinions = {
  "item": {"i_id", "title"},
  "useracct": {"u_id", "name"},
  "review": {"a_id", "u_id", "i_id", "rating", "rank"},
  "trust": {"source_u_id", "target_u_id", "trust", "creation_date"},
  "review_rating": {
    "u_id",
    "a_id",
    "rating",
    "status",
    "creation_date",
    "last_mod_date",
    "type",
    "vertical_id",
  },
}

In [61]:
def parse_query(query_str : str, ttc_mapping : dict):
  """Parse one logged statement into ``(query_text, tables, columns)``.

  The caller collects every non-None result, so this function returns the
  parsed pieces instead of updating a module-level counter. (The previous body
  incremented a global ``column_count`` that is not defined before this cell
  and never returned anything.)

  Args:
    query_str: Raw log message, e.g. ``"execute <unnamed>: SELECT ..."``.
    ttc_mapping: Table name -> set of column names. Kept only so the existing
      two-argument call keeps working; it is not consulted here.

  Returns:
    ``(query_text, tables, columns)`` where ``query_text`` is the message with
    everything up to the first ``:`` removed, and ``tables`` / ``columns`` are
    the values reported by ``Parser``. Returns ``None`` when the statement is
    skipped: it mentions ``pg_``, contains BEGIN/COMMIT, cannot be parsed, or
    yields no tables or no columns.
  """
  if "pg_" in query_str:
    return None

  # Drop everything up to and including the first ':' (the "execute <unnamed>:" prefix).
  query_str = query_str[query_str.find(':')+1:]

  # Transaction-control statements carry no table/column information.
  if "BEGIN" in query_str or "COMMIT" in query_str:
    return None

  try:
    p = Parser(query_str)
    tables = p.tables
    columns = p.columns
  except Exception:
    # Statement types the parser cannot handle (e.g. SET, SHOW) are skipped.
    return None

  # Nothing useful to report when no table or no column was extracted.
  if len(tables) == 0 or len(columns) == 0:
    return None

  return query_str, tables, columns

In [62]:
# Parse every log row that has a query type and keep the ones parse_query
# returns a result for (it returns None for statements it skips).
parsed_candidates = (
  parse_query(query_text, table_to_columns_epinions)
  for query_type, query_text in zip(df["query_type"], df["query_text"])
  if not pd.isna(query_type)
)
queries = [parsed for parsed in parsed_candidates if parsed is not None]


Not supported query type:  SET extra_float_digits = 3
Not supported query type:  SET application_name = 'PostgreSQL JDBC Driver'
Not supported query type:  SET extra_float_digits = 3
Not supported query type:  SET application_name = 'PostgreSQL JDBC Driver'
Not supported query type:  SET extra_float_digits = 3
Not supported query type:  SET application_name = 'PostgreSQL JDBC Driver'
Not supported query type:  SET SESSION CHARACTERISTICS AS TRANSACTION ISOLATION LEVEL SERIALIZABLE
Not supported query type:  SET extra_float_digits = 3
Not supported query type:  SET application_name = 'PostgreSQL JDBC Driver'
Not supported query type:  SHOW ALL
Not supported query type:  ALTER SYSTEM SET log_destination='stderr'
Not supported query type:  ALTER SYSTEM SET logging_collector='off'
Not supported query type:  ALTER SYSTEM SET log_statement='none'


In [63]:
# Show the last parsed entry as a one-element list.
queries[-1:]

[(' UPDATE review SET rating = $1 WHERE i_id=$2 AND u_id=$3',
  ['review'],
  ['rating', 'i_id', 'u_id'])]

In [None]:
def parse_query(query_str : str, ttc_mapping : dict, column_count : dict) -> None:
  """Count the columns referenced by one logged statement.

  Each column reported by ``Parser`` is recorded in ``column_count`` under a
  ``"table.column"`` key. Already-qualified names (containing ``.``) are used
  as-is; an unqualified name is attributed to the first table of the query
  whose entry in ``ttc_mapping`` contains it, and is ignored if none does.

  Args:
    query_str: Raw log message, e.g. ``"execute <unnamed>: SELECT ..."``.
    ttc_mapping: Table name -> collection of that table's column names.
      Tables missing from the mapping are treated as having no known columns.
    column_count: Mapping of ``"table.column"`` -> count, updated in place.
      A plain ``dict`` and a ``defaultdict(int)`` both work.

  Returns:
    None. Prints ``Invalid Query: ...`` when the statement cannot be parsed or
    when none of its columns could be counted.
  """
  if "pg_" in query_str:
    return

  # Drop everything up to and including the first ':' (the "execute <unnamed>:" prefix).
  query_str = query_str[query_str.find(':')+1:]

  # Transaction-control statements carry no table/column information.
  if "BEGIN" in query_str or "COMMIT" in query_str:
    return

  try:
    p = Parser(query_str)
    tables = p.tables
    columns = p.columns
  except Exception:
    print("Invalid Query:", query_str)
    return

  added = False
  for column in columns:
    if "." in column:
      qualified = column
    else:
      # First table in the query that is known to own this column, if any.
      # .get() keeps an unmapped table from raising KeyError.
      qualified = next(
        (f"{table}.{column}" for table in tables if column in ttc_mapping.get(table, ())),
        None,
      )
    if qualified is None:
      continue
    column_count[qualified] = column_count.get(qualified, 0) + 1
    added = True

  if not added:
    print("Invalid Query:", query_str)


def build_actions_str(indexes : list, filename : str) -> list:
  """Build the shell commands that write CREATE INDEX statements to a file.

  Args:
    indexes: ``"table.column"`` strings, one per index to create.
    filename: File name the generated ``echo`` commands redirect into.

  Returns:
    A list of command strings: an announcement ``echo`` first, then one
    ``echo "CREATE INDEX ..."`` per entry. The first statement uses ``>`` and
    every later one uses ``>>``.
  """
  def _echo_create(position, qualified_name):
    table, column = qualified_name.split(".")
    redirect = ">" if position == 0 else ">>"
    return f'echo "CREATE INDEX idx_{table}_{column} ON {table}({column});" {redirect} {filename}'

  announcement = f'echo "Generating sql commands for {filename}"'
  return [announcement] + [
    _echo_create(position, qualified_name)
    for position, qualified_name in enumerate(indexes)
  ]


def build_drop_idx_cmd(indexes : list) -> str:
  """Build one DROP INDEX statement covering every given index.

  Args:
    indexes: ``"table.column"`` strings; each maps to an index named
      ``idx_<table>_<column>``.

  Returns:
    ``"DROP INDEX idx_a_b, idx_c_d;"`` for a non-empty list. For an empty list
    the bare text ``"DROP INDEX"`` is returned (no names, no semicolon).
  """
  index_names = []
  for qualified_name in indexes:
    table, column = qualified_name.split(".")
    index_names.append(f"idx_{table}_{column}")

  if not index_names:
    return "DROP INDEX"
  return "DROP INDEX" + " " + ", ".join(index_names) + ";"



  
  
      


In [6]:
# Tally how often each "table.column" appears across the logged statements.
column_count = defaultdict(int)

# Iterate over the statement text of rows that carry a query type.
# (DataFrame.iteritems() walked the frame column by column, handing whole
# Series to parse_query, and no longer exists in pandas 2.x.)
logged_statements = df.loc[df["query_type"].notna(), "query_text"].dropna()
for query_text in logged_statements:
  parse_query(query_text, table_to_columns_epinions, column_count)

# The five most frequently referenced columns, most frequent first.
top_5_columns = sorted(column_count, key=column_count.get, reverse=True)[:5]

print("\nTop columns:")
for col in top_5_columns:
  print(col, column_count[col])

build_actions_str(top_5_columns, "t1_epinions_1.sql")

Not supported query type:  SET extra_float_digits = 3
Not supported query type:  SET application_name = 'PostgreSQL JDBC Driver'
Not supported query type:  SET SESSION CHARACTERISTICS AS TRANSACTION ISOLATION LEVEL SERIALIZABLE


Invalid Query:  SET extra_float_digits = 3
Invalid Query:  SET application_name = 'PostgreSQL JDBC Driver'
Invalid Query:  SET SESSION CHARACTERISTICS AS TRANSACTION ISOLATION LEVEL SERIALIZABLE

Top columns:
review.rating 297
review.i_id 296
trust.source_u_id 242
review.u_id 189
trust.target_u_id 178


['echo "Generating sql commands for t1_epinions_1.sql"',
 'echo "CREATE INDEX idx_review_rating ON review(rating);" > t1_epinions_1.sql',
 'echo "CREATE INDEX idx_review_i_id ON review(i_id);" >> t1_epinions_1.sql',
 'echo "CREATE INDEX idx_trust_source_u_id ON trust(source_u_id);" >> t1_epinions_1.sql',
 'echo "CREATE INDEX idx_review_u_id ON review(u_id);" >> t1_epinions_1.sql',
 'echo "CREATE INDEX idx_trust_target_u_id ON trust(target_u_id);" >> t1_epinions_1.sql']

In [11]:
# Build the matching DROP INDEX statement for the indexes generated above.
build_drop_idx_cmd(top_5_columns)

'DROP INDEX idx_review_rating, idx_review_i_id, idx_trust_source_u_id, idx_review_u_id, idx_trust_target_u_id;'