In [1]:
import pandas as pd
from sql_metadata import Parser
from collections import defaultdict

# Disable column-width truncation so long string values are displayed in full.
pd.set_option('display.max_colwidth', None)

In [2]:
# List the contents of the PostgreSQL log directory to locate the CSV log loaded below.
!ls /usr/local/var/postgres/log

postgresql-2022-02-10_211004.csv postgresql-2022-02-10_211004.log


In [3]:
# NOTE(review): despite its name, LOG_DIRECTORY holds the path of a single CSV log file, not a directory.
LOG_DIRECTORY = '/usr/local/var/postgres/log/postgresql-2022-02-10_211004.csv'
# The file is read without a header row; only positional rows 33..1909 of column 13 are kept (a Series).
# NOTE(review): column 13 is presumably the "message" field of PostgreSQL's csvlog format, and the row
# window presumably brackets the benchmark run inside this particular log file — confirm both.
df = pd.read_csv(LOG_DIRECTORY, header=None).iloc[33:1910, 13]

In [4]:
# Maps each table name to the set of its column names. parse_query() uses this
# mapping to decide which table an unqualified column belongs to.
table_to_columns_epinions = {
  "item": {"i_id", "title"},
  "useracct": {"u_id", "name"},
  "review": {"a_id", "u_id", "i_id", "rating", "rank"},
  "trust": {"source_u_id", "target_u_id", "trust", "creation_date"},
  "review_rating": {
    "u_id", "a_id", "rating", "status",
    "creation_date", "last_mod_date", "type", "vertical_id",
  },
}

In [10]:
def parse_query(query_str : str, ttc_mapping : dict, column_count : dict) -> None:
  """Tally the columns referenced by one logged SQL statement.

  Args:
    query_str: One log message. Everything up to and including the first ':'
      is discarded as the log prefix before the rest is parsed as SQL.
    ttc_mapping: Maps table name -> collection of that table's column names.
      Used to attribute unqualified column names to a table.
    column_count: Mutable counter (e.g. defaultdict(int)) keyed by
      "table.column"; incremented in place for every column found.

  Returns:
    None. Results are accumulated in column_count. Statements that cannot be
    parsed, or that yield no countable column, are reported on stdout as
    "Invalid Query".
  """
  # Non-string cells (None, or NaN for an empty field) carry no statement.
  if not isinstance(query_str, str):
    return

  query_str = query_str[query_str.find(':')+1:] # Remove "execute <unamed>:"

  # Ignore transaction-control statements. Only the start of the statement is
  # checked, so a real query that merely contains the text "BEGIN"/"COMMIT"
  # (e.g. inside a string literal) is still analysed.
  if query_str.strip().upper().startswith(("BEGIN", "COMMIT")):
    return

  try:
    p = Parser(query_str)
    tables = p.tables
    columns = p.columns
  except Exception:
    # Parser rejects statements it does not understand; report and move on.
    # KeyboardInterrupt/SystemExit are deliberately not swallowed.
    print("Invalid Query:", query_str)
    return

  # Build string "table.column_name" for each column
  added = False
  for column in columns:
    if "." in column:
      # Already qualified by the parser.
      added = True
      column_count[column] += 1
    else:
      # Find which table this column corresponds to; the first matching table
      # wins. Tables missing from the mapping are skipped instead of raising.
      for table in tables:
        if column in ttc_mapping.get(table, ()):
          added = True
          column_count[f"{table}.{column}"] += 1
          break

  if not added:
    print("Invalid Query:", query_str)


def build_actions_str(indexes : list, filename : str) -> list:
  """Build shell `echo` commands that write CREATE INDEX statements to a file.

  Args:
    indexes: Column identifiers of the form "table.column".
    filename: Target file named in the generated redirections.

  Returns:
    A list of command strings: an announcement echo first, then one echo per
    index. The first index command truncates the file (">"), the rest
    append to it (">>").
  """
  actions = [f'echo "Generating sql commands for {filename}"']
  for position, qualified_name in enumerate(indexes):
    tbl, col = qualified_name.split(".")
    # Only the first statement starts the file afresh.
    redirect = ">" if position == 0 else ">>"
    statement = f"CREATE INDEX idx_{tbl}_{col} ON {tbl}({col});"
    actions.append(f'echo "{statement}" {redirect} {filename}')

  return actions


def build_drop_idx_cmd(indexes : list) -> str:
  """Build one DROP INDEX statement covering every given column index.

  Args:
    indexes: Column identifiers of the form "table.column". Each maps to
      the index name "idx_<table>_<column>".

  Returns:
    "DROP INDEX idx_a_b, idx_c_d;" for a non-empty list, or an empty string
    when there is nothing to drop.
  """
  # A bare "DROP INDEX" with no names is not valid SQL, so emit no statement.
  if not indexes:
    return ""

  index_names = []
  for index in indexes:
    table, column = index.split(".")
    index_names.append(f"idx_{table}_{column}")

  return "DROP INDEX " + ", ".join(index_names) + ";"



  
  
      


In [6]:
# Count how often each "table.column" is referenced across the logged statements.
column_count = defaultdict(int)
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for index, item in df.items():
  parse_query(item, table_to_columns_epinions, column_count)

# Rank columns by reference count (highest first) and keep the five most used.
top_5_columns = sorted(column_count, key=column_count.get, reverse=True)[:5]

print("\nTop columns:")
for col in top_5_columns:
  print(col, column_count[col])

# Displayed as the cell result: shell commands that write the CREATE INDEX script.
build_actions_str(top_5_columns, "t1_epinions_1.sql")

Not supported query type:  SET extra_float_digits = 3
Not supported query type:  SET application_name = 'PostgreSQL JDBC Driver'
Not supported query type:  SET SESSION CHARACTERISTICS AS TRANSACTION ISOLATION LEVEL SERIALIZABLE


Invalid Query:  SET extra_float_digits = 3
Invalid Query:  SET application_name = 'PostgreSQL JDBC Driver'
Invalid Query:  SET SESSION CHARACTERISTICS AS TRANSACTION ISOLATION LEVEL SERIALIZABLE

Top columns:
review.rating 297
review.i_id 296
trust.source_u_id 242
review.u_id 189
trust.target_u_id 178


['echo "Generating sql commands for t1_epinions_1.sql"',
 'echo "CREATE INDEX idx_review_rating ON review(rating);" > t1_epinions_1.sql',
 'echo "CREATE INDEX idx_review_i_id ON review(i_id);" >> t1_epinions_1.sql',
 'echo "CREATE INDEX idx_trust_source_u_id ON trust(source_u_id);" >> t1_epinions_1.sql',
 'echo "CREATE INDEX idx_review_u_id ON review(u_id);" >> t1_epinions_1.sql',
 'echo "CREATE INDEX idx_trust_target_u_id ON trust(target_u_id);" >> t1_epinions_1.sql']

In [11]:
# Show the DROP INDEX statement for the same top-5 columns, i.e. the inverse of the CREATE INDEX script.
build_drop_idx_cmd(top_5_columns)

'DROP INDEX idx_review_rating, idx_review_i_id, idx_trust_source_u_id, idx_review_u_id, idx_trust_target_u_id;'