In [None]:
import numpy as np
import pandas as pd
from pace.missingness import *
from pace.history import *
from pace.plots import PlotSession

from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, tools, CustomJS

# Send Bokeh output to the notebook itself; hide_banner=True suppresses the
# "BokehJS loaded" banner that would otherwise be printed.
output_notebook(hide_banner=True)

In [None]:
# PostgreSQL dependencies
import psycopg2
from psycopg2.extensions import connection
from psycopg2 import sql

## Database version

In [None]:
# Connection settings for the local PostgreSQL server.
DBNAME = "postgres"
USERNAME = "pwochner"

# One connection and one cursor are reused by the query cells further down.
conn = psycopg2.connect(user=USERNAME, database=DBNAME, host="localhost")
curs = conn.cursor()

### List of column names in a relation

In [None]:
# Name the target table once, as quoted identifiers, so that the probe query
# below and the queries composed from `schema`/`relation` later on all refer
# to the same relation.
schema = sql.Identifier("diag_example")
relation = sql.Identifier("patients")

# LIMIT 0 returns no rows but still populates curs.description with the
# column metadata, which is all that is needed here.
curs.execute(sql.SQL("SELECT * FROM {}.{} LIMIT 0").format(schema, relation))
column_names = [c.name for c in curs.description]

# The first column is used as the record key; the remaining columns are the
# fields whose missingness is analysed.
key = sql.Identifier(column_names[0])
column_identifiers = [sql.Identifier(c) for c in column_names[1:]]

### Combination id to records

In [None]:
# Per-record missingness: the key column as-is, plus one boolean per remaining
# column that is true where the value is NULL (aliased to the column's own name).
selection_list = [sql.SQL("{col}").format(col=key)]
selection_list += [sql.SQL("{col} IS NULL AS {col}").format(col=x) for x in column_identifiers]

# All non-key fields as a comma-separated identifier list.
fields = sql.SQL(",").join(column_identifiers)

# Missingness flags for every record of the relation.
query_missing = sql.SQL("select {0} from {1}.{2} ").format(
    sql.SQL(', ').join(selection_list), schema, relation
)

# Number the missingness patterns: records with identical flags share the same
# dense_rank. The template only references {0} and {1}, so exactly two
# arguments are passed.
query_patterns = sql.SQL(
    "select *, dense_rank() over(\n"
    "order by ({0}) asc\n"
    ") from ({1}) AS query_missing"
).format(fields, query_missing)

# Reduce to (pattern rank, record key) pairs.
query_df = sql.SQL(""" select "dense_rank",{0} from ({1}) AS query_pattern """).format(key, query_patterns)


In [None]:
# Run the (pattern rank, record key) query; the rows are fetched in the next cell.
curs.execute(query_df)

In [None]:
# Each fetched row is a (dense_rank, key) pair.
df = pd.DataFrame(curs.fetchall(), columns=["combination_id", "_record_id"])
# Subtract one from BOTH columns (dense_rank is 1-based; presumably the record
# key is too — verify against the table), then index by combination id.
df = (df - 1).set_index("combination_id")

In [None]:
# Keep the PostgreSQL result under its own name: `df` is rebound to the CSV
# data in the pandas section below.
pg_combination_id_to_records = df

### Combination id to columns

In [None]:
# One row per distinct pattern: DISTINCT ON keeps a single row for each
# dense_rank value, carrying that pattern's per-column NULL flags.
query_distinct_combinations = sql.SQL(
    'select distinct on("dense_rank") "dense_rank", {0} \n'
    'from ({1}) AS query_patterns\n'
    'order by "dense_rank"\n'
).format(fields, query_patterns)

In [None]:
# Run the distinct-pattern query; the rows are fetched in the next cell.
curs.execute(query_distinct_combinations)

In [None]:
# Column layout of the result: the pattern rank followed by every non-key column.
pattern_columns = ["combination_id"] + column_names[1:]
df_comb = pd.DataFrame(curs.fetchall(), columns=pattern_columns)
# dense_rank starts at 1; shift the ids to start at 0 before indexing by them.
df_comb = df_comb.assign(combination_id=df_comb["combination_id"] - 1).set_index("combination_id")

In [None]:
# Display the PostgreSQL-side pattern table, indexed by combination id.
df_comb

## Pandas version

In [None]:
# Alternative data set: "../../data/Synthetic_APC_DIAG_Fields.csv"
csv_path = "../../data/test_data_merged_10000.csv"
# NOTE: this rebinds `df`, which previously held the PostgreSQL query result.
df = pd.read_csv(csv_path, low_memory=False)
m = Missingness.from_data_frame(df)

In [None]:
# Take the two lookup tables straight from the Missingness object. The leading
# underscore marks them as private attributes, so this relies on the class's
# internals rather than a public API.
combination_id_to_records = m._combination_id_to_records
combination_id_to_columns = m._combination_id_to_columns
# Display the pandas-side pattern table.
combination_id_to_columns

## Compare the two methods

In [None]:
# Displays whether the pandas-side record ids are in non-increasing order.
combination_id_to_records["_record_id"].is_monotonic_decreasing

In [None]:
# Spot check: the records that the PostgreSQL query assigned to one combination id.
comb = 45  # example id — NOTE(review): .loc raises KeyError if this id is not in the index
pg_combination_id_to_records.loc[comb]

In [None]:
# The same combination id (`comb`, set in the previous cell) looked up on the
# pandas side, for visual comparison with the PostgreSQL result.
# idx = combination_id_to_records.index.max() - comb
combination_id_to_records.loc[comb]

In [None]:
# For every combination id found by PostgreSQL, check that both sides hold the
# same number of records. The list key (.loc[[x]]) always returns a DataFrame,
# so len() counts rows; with a scalar key (.loc[x]) a combination that has a
# single record comes back as a Series and len() would count columns instead.
same_length = [
    len(pg_combination_id_to_records.loc[[x]]) == len(combination_id_to_records.loc[[x]])
    for x in pg_combination_id_to_records.index.unique()
]
# Number of combination ids whose record counts agree.
sum(same_length)

In [None]:
def intersection_records(df1, df2):
    """
    Check, per combination id, that the record ids of ``df1`` also occur in ``df2``.

    Both frames must be indexed by combination id and have a ``_record_id``
    column. For every unique index value of ``df1`` (in order of first
    appearance) the record ids of the two frames are intersected, and the
    entry is True when the intersection has as many elements as ``df1`` has
    rows for that combination.

    The check is one-directional: extra record ids in ``df2`` do not make an
    entry False. Because the intersection holds unique values, duplicated
    record ids within one combination of ``df1`` do make it False.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames indexed by combination id with a ``_record_id`` column.

    Returns
    -------
    list of bool
        One entry per unique combination id of ``df1``. A combination that is
        absent from ``df2`` yields False.
    """
    col = "_record_id"
    records_1 = df1[col]
    records_2 = df2[col]

    intersection = []
    for x in df1.index.unique():
        # A list key always returns a Series, even when the combination has a
        # single record; a scalar key would return a bare value (or, on the
        # whole frame, a row whose length is the number of columns).
        ids_1 = records_1.loc[[x]].to_numpy()
        if x not in records_2.index:
            # Nothing to intersect with: report a mismatch instead of raising KeyError.
            intersection.append(False)
            continue
        ids_2 = records_2.loc[[x]].to_numpy()
        shared = np.intersect1d(ids_1, ids_2)
        intersection.append(bool(shared.shape[0] == ids_1.shape[0]))
    return intersection

In [None]:
# Check, per combination id, that the record ids found by PostgreSQL also occur
# on the pandas side; the sum is the number of combinations that pass.
intersections = intersection_records(pg_combination_id_to_records, combination_id_to_records)
sum(intersections)

In [None]:
# Compare the unique combination ids of both sides position by position (in
# order of first appearance); the sum is the number of positions that agree.
# NOTE(review): the element-wise == presumably requires both indexes to have
# the same length — confirm before relying on this with differing data.
sum(pg_combination_id_to_records.index.unique() == combination_id_to_records.index.unique())

In [None]:
# Compare the missingness pattern of every combination id between the pandas
# table and the PostgreSQL table (df_comb). "Key" is dropped from the pandas
# row first — presumably because df_comb contains no key column; verify that
# "Key" is the key column's name in the CSV.
compare_patterns = [combination_id_to_columns.loc[x].drop("Key").equals(df_comb.loc[x]) for x in df_comb.index]
# Number of combination ids whose patterns are identical on both sides.
sum(compare_patterns)

In [None]:
# m._combination_id_to_columns.head() # id and missingness pattern for each unique combination 

In [None]:
# session = PlotSession(df, "session_151021.json")
# session = PlotSession(df)

In [None]:
# session.add_plot("a")

In [None]:
# session.add_plot(name="b", based_on="a")

In [None]:
# session.add_plot(name="c", based_on="b")

In [None]:
# session.save("session_151021.json")