In [31]:
# Imports
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy import text
from sqlalchemy.engine import result
from sqlalchemy import inspect
import pandas as pd
import plotly.io as pio
pio.renderers.default = 'notebook'
pio.templates.default = "plotly_white"
import plotly.offline as pyo
pyo.init_notebook_mode(connected=True)
import plotly.express as px
import plotly.graph_objects as go
import datetime
from datetime import datetime
import numpy as np
from dotenv import dotenv_values


In [32]:
# Grab variables from .env and create engine for accessing the database

config = dotenv_values(".env")

user = config['DB_USER']
password = config['DB_PASSWORD']
host = config['HOST']
port = config['PORT']
database = config['DATABASE']
remove_ids = config['REMOVE_IDS']
 

engine = create_engine(f"oracle+cx_oracle://{user}:{password}@{host}:{port}/?service_name={database}&encoding=UTF-8&nencoding=UTF-8", echo=False)

today_date = datetime.today().strftime('%m-%d-%Y')

## How many visualizations are used in each analysis?

The following gives the current number of visualizations saved with each analysis

In [33]:
# Look at the current number of visualizations per analysis
query = f"""
    select 
        count(*) as number_analyses
        , num_visualizations 
    from edauserce.analysis@ceprods.login_comment
    where user_id not in ({remove_ids})
    group by num_visualizations 
    order by num_visualizations
"""

sql = text(query)
with engine.connect() as conn:
    result = conn.execute(sql)
    df = pd.DataFrame(result)

df.head()

Unnamed: 0,number_analyses,num_visualizations
0,1928,0
1,1770,1
2,352,2
3,202,3
4,255,4


In [34]:
# Plot!
fig = px.bar(df,
              x='num_visualizations',
              y='number_analyses',
              title=f"""ClinepiDB: Number of visualizations per analysis 12-01-2023 - {today_date}""",
)
fig.update_layout(xaxis=dict(
                      title="Number of visualizations"
                  ),
                  yaxis=dict(
                      title="Number of analyses"
                  )
)
fig.show()

In [29]:
# Save plot
fig.write_image("visualizations_per_analysis.pdf", format='pdf')

## How many specific types of vizs get created?

Are some visualizaion types more popular than others? Note these types are across all computations.

In [35]:
# Get data

query = f"""
with analysis_descriptions as (
    select eda.analysis_id
        , to_char(eda.creation_time, 'yyyy-MM-DD') as creation_day
        , analysis_descriptor_table.*
        , eda.analysis_descriptor as analysis_descriptor_json
    from edauserce.analysis@ceprods.login_comment eda,
         json_table(eda.analysis_descriptor, '$'
           columns (
             nested path '$.computations[*]' columns (
                computation_type varchar2(50) path '$.descriptor.type',
                nested path '$.visualizations[*]' columns (
                    visualization_type varchar2(50) path '$.descriptor.type'
                )
             )
           )
         ) analysis_descriptor_table
    where eda.user_id not in ({remove_ids})
    order by creation_day, eda.analysis_id
), 
grouped_visualizations as (
    select creation_day
        , visualization_type
        , count(*) as number_of_visualizations
    from analysis_descriptions
    group by creation_day, visualization_type
)
select 
    creation_day
    , visualization_type
    , sum(number_of_visualizations) over (partition by visualization_type order by creation_day) as cum_sum_visualizations
    , number_of_visualizations
from grouped_visualizations
order by creation_day
"""

sql = text(query)
with engine.connect() as conn:
    result = conn.execute(sql)
    df_vizs = pd.DataFrame(result)

df_vizs.head()

Unnamed: 0,creation_day,visualization_type,cum_sum_visualizations,number_of_visualizations
0,2021-09-20,,3,3
1,2021-09-21,,24,21
2,2021-09-21,scatterplot,2,2
3,2021-09-22,barplot,4,4
4,2021-09-22,,38,14


In [36]:
# Plot
# Plot by visualization type
fig = px.line(
  df_vizs,
  x='creation_day',
  y='cum_sum_visualizations',
  color='visualization_type'
)

fig.update_layout(
  title="ClinepiDB: Cumulative number of visualizations",
  xaxis=dict(
    title="Creation day"
  ),
  yaxis=dict(
    title="Cumulative number of visualizations"
  )
)

fig

In [35]:
fig.write_image("cumulative_plots.pdf", format='pdf')

## Usage by study

How many analyses are created per study?

In [37]:
query = f"""
with study_analyses_by_day as (
    select name
        , study_id
        , to_char(creation_time, 'YYYY-MM-DD') as creation_day
    from edauserce.analysis@ceprods.login_comment eda_analysis
    inner join apidbtuning.datasetpresenter dp
    on eda_analysis.study_id = dp.dataset_presenter_id
    where study_id like 'DS_%'
    and user_id not in ({remove_ids})
)
, aggregation as (
    select name
        , creation_day
        , count(*) as count_analyses_created
    from study_analyses_by_day
    group by name, creation_day
)
select name
    , creation_day
    , count_analyses_created
    , sum(count_analyses_created) over (partition by name order by creation_day) as cum_sum_analyses_created
from aggregation
order by creation_day
"""

sql = text(query)
with engine.connect() as conn:
    result = conn.execute(sql)
    df_study_analyses = pd.DataFrame(result)

df_study_analyses.head()

Unnamed: 0,name,creation_day,count_analyses_created,cum_sum_analyses_created
0,ISASimple_Gates_GEMS_gems1_case_control_RSRC,2021-09-20,3,3
1,ISASimple_Gates_GEMS_gems1_case_control_RSRC,2021-09-21,2,5
2,ISASimple_ICEMR_PRISM_cohort_RSRC,2021-09-21,2,2
3,ISASimple_ICEMR_South_Asia_surveillance_RSRC,2021-09-21,19,19
4,ISASimple_ICEMR_South_Asia_surveillance_RSRC,2021-09-22,21,40


In [38]:
# Plot! We want a line plot with all in gray except the HMP studies.

# Approach - plot all in gray, then add the trace lines for hmp studies after
study_names = pd.unique(df_study_analyses.name)

fig = px.line(
  df_study_analyses,
  x='creation_day',
  y='cum_sum_analyses_created',
  color='name'
)

fig.update_layout(
  title="ClinepiDB: Analyses of curated studies",
  xaxis_title="Creation day",
  yaxis_title="Cumulative number of analyses"
)

fig.show()

In [44]:
fig.write_image("analyses_by_study.pdf", format='pdf')