In [85]:
import pandas as pd
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
import plotly.colors
import numpy as np
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans


In [86]:
# Directory holding the parquet inputs, relative to the notebook's working directory.
DATA_DIR = Path("../data")

# Feature tables read from the corpus_f4 / corpus_f5 parquet files.
TFIDF = pd.read_parquet(DATA_DIR / "corpus_f4_tfidf.parquet")
BIG_PCA = pd.read_parquet(DATA_DIR / "corpus_f5_pca.parquet")
SMALL_PCA = pd.read_parquet(DATA_DIR / "corpus_f5_small_pca.parquet")

# Article metadata joined with citation counts on the shared "pmcid" column.
# merge() defaults to an inner join, so only pmcids present in both tables remain.
CITATIONS = pd.read_parquet(DATA_DIR / "citation_count.parquet")
ARTICLES = pd.read_parquet(DATA_DIR / "all_articles.parquet").merge(
    CITATIONS, on="pmcid"
)

# PHI is later melted by "topic_id"; THETA is later sliced by integer columns 0-9.
# NOTE(review): presumably topic-term and document-topic matrices — confirm.
PHI = pd.read_parquet(DATA_DIR / "corpus_f5_phi.parquet")

# Attach the article metadata to THETA by aligning THETA's index with pmcid
# (inner join on the two indexes).
THETA = pd.merge(
    pd.read_parquet(DATA_DIR / "corpus_f5_theta.parquet"),
    ARTICLES.set_index("pmcid"),
    left_index=True,
    right_index=True,
)

In [87]:
# Label THETA's index as "pmcid" (the index was aligned with pmcid when the
# article metadata was merged in).
THETA.index = THETA.index.rename("pmcid")

In [88]:
# Long-format view of PHI: one row per (topic_id, term_str) pair with its weight w.
phi_long = PHI.reset_index().melt(
    id_vars="topic_id", var_name="term_str", value_name="w"
)

# Keep the ten highest-weight terms of every topic.
top_terms = (
    phi_long.groupby("topic_id")[["term_str", "w"]]
    .apply(lambda grp: grp.nlargest(10, "w"))
    .reset_index()
)

# Collapse each topic's terms into one space-separated string and widen the
# column so the styled table stays readable.
(
    top_terms.groupby("topic_id")["term_str"]
    .apply(lambda terms: " ".join(terms))
    .to_frame()
    .style.set_properties(subset=["term_str"], **{"width": "300px"})
)

Unnamed: 0_level_0,term_str
topic_id,Unnamed: 1_level_1
0,studies review articles study used included clinical reported methods trials
1,patient patients medication study care alert alerts hospital use results
2,data clinical patient patients care health information quality based study
3,data research network using used clinical database source information use
4,health data social population patients gender opioid risk race age
5,study participants use time health patients care using used studies
6,data health research information use clinical care healthcare support patient
7,patients study drug clinical data using treatment patient results time
8,text used clinical performance model based using information concepts terms
9,model data models patients using prediction performance risk used time


In [89]:
# THETA columns 0-9 are the inputs to the embedding.
topic_columns = list(range(10))

# Two-dimensional t-SNE embedding with a fixed seed for repeatable output.
tsne = TSNE(random_state=42)
coords = tsne.fit_transform(THETA[topic_columns])

# Store the two embedding dimensions next to the article metadata.
THETA["x"], THETA["y"] = coords[:, 0], coords[:, 1]

In [90]:
# Marker colour is log(1 + citation_count); log1p compresses the citation range.
citation_marker = dict(
    size=3,
    color=np.log1p(THETA["citation_count"]),
    colorscale="Bluered",
    showscale=False,
)

# Scatter of the t-SNE coordinates with the article title as hover text.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=THETA["x"],
        y=THETA["y"],
        mode="markers",
        marker=citation_marker,
        text=THETA["title"],
    )
)

tsne_layout = dict(
    width=800,
    height=800,
    template="simple_white",
    title="t-SNE of topic distribution",
    xaxis_title="t-SNE Dimension 1",
    yaxis_title="t-SNE Dimension 2",
)
fig.update_layout(**tsne_layout)

# Render inline, then export a static copy next to the notebook.
fig.show()
fig.write_image("topic_tsne.png")

In [91]:
# Partition the articles into eight k-means clusters using THETA columns 0-9,
# with a fixed seed; fit() returns the fitted estimator itself.
topic_columns = list(range(10))
kmeans = KMeans(n_clusters=8, random_state=42).fit(THETA[topic_columns])

# Record each article's cluster label alongside its other columns.
THETA["cluster"] = kmeans.labels_

In [92]:
def map_sequence(s, colormap=None):
    """Map integer labels to colours from a qualitative palette.

    Parameters
    ----------
    s : iterable of int
        Labels (for example k-means cluster ids) to translate into colours.
    colormap : sequence of str, optional
        Palette to draw colours from. Defaults to
        ``plotly.colors.qualitative.Plotly``.

    Returns
    -------
    list of str
        One colour per element of ``s``, in the same order. Labels wrap around
        the palette, so a label equal to or larger than the palette size reuses
        an earlier colour instead of raising ``IndexError``.
    """
    if colormap is None:
        colormap = plotly.colors.qualitative.Plotly
    palette_size = len(colormap)
    # int() accepts numpy integer labels; the modulo keeps every index in range.
    return [colormap[int(x) % palette_size] for x in s]

# One colour string per article, looked up from its cluster label.
cluster_colors = map_sequence(THETA["cluster"])

# Same t-SNE scatter as before, now coloured by cluster membership.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=THETA["x"],
        y=THETA["y"],
        mode="markers",
        marker=dict(size=3, color=cluster_colors, showscale=False),
        text=THETA["title"],
    )
)

clustered_layout = dict(
    width=800,
    height=800,
    template="simple_white",
    title="t-SNE of topic distribution",
    xaxis_title="t-SNE Dimension 1",
    yaxis_title="t-SNE Dimension 2",
)
fig.update_layout(**clustered_layout)

# Render inline, then export a static copy next to the notebook.
fig.show()
fig.write_image("topic_tsne_clustered.png")

In [93]:
# Ten most-cited articles within every cluster.
top_cited = THETA.groupby("cluster", as_index=False).apply(
    lambda grp: grp.nlargest(10, "citation_count"), include_groups=False
)

# Move index level 0 into a column named "cluster" and discard the rest of the index.
# NOTE(review): with as_index=False, level 0 appears to be the group's ordinal
# position rather than the cluster label itself; the two coincide while labels
# are 0..k-1 — confirm if the labelling ever changes.
top_cited = (
    top_cited[["title", "citation_count"]]
    .reset_index(level=0)
    .rename(columns={"level_0": "cluster"})
    .reset_index(drop=True)
)

# Widen the title column so long titles stay readable in the styled table.
top_cited.style.set_properties(subset=["title"], **{"width": "300px"})

Unnamed: 0,cluster,title,citation_count
0,0,A review of approaches to identifying patient phenotype cohorts using electronic health records,365
1,0,Pharmacovigilance from social media: mining adverse drug reaction mentions using sequence labeling with word embedding cluster features,354
2,0,CLAMP – a toolkit for efficiently building customized clinical natural language processing pipelines,225
3,0,Machine-learned solutions for three stages of clinical information extraction: the state of the art at i2b2 2010,160
4,0,RobotReviewer: evaluation of a system for automatically assessing bias in clinical trials,147
5,0,Automated methods for the summarization of electronic health records,129
6,0,Diagnosis code assignment: models and evaluation metrics,125
7,0,Recommending MeSH terms for annotating biomedical articles,103
8,0,Evaluating the state of the art in disorder recognition and normalization of the clinical narrative,95
9,0,Self-reported COVID-19 symptoms on Twitter: an analysis and a research resource,91


In [96]:
# Number of non-null PMIDs per (cluster, pubyear) pair.
article_counts = THETA.groupby(["cluster", "pubyear"])["PMID"].count().to_frame()

# Unstack/stack round trip: combinations with no articles come back as explicit 0 rows.
figure_data = (
    article_counts.unstack()
    .fillna(0)
    .stack(future_stack=True)
    .reset_index()
)
figure_data.columns = ["cluster", "pubyear", "count"]

# Each cluster's share of that year's articles, as a fraction of the yearly total.
yearly_totals = figure_data.groupby("pubyear")["count"].transform("sum")
figure_data["pct_count"] = figure_data["count"] / yearly_totals

# Keep publication years strictly between 2010 and 2024.
in_window = figure_data["pubyear"].gt(2010) & figure_data["pubyear"].lt(2024)
figure_data = figure_data[in_window]

# One line chart per cluster (four facets per row) showing the cluster's share
# of each publication year's articles.
fig = px.line(
    figure_data,
    x="pubyear",
    y="pct_count",
    facet_col="cluster",
    facet_col_wrap=4,
)

# Blank the per-facet axis titles; shared labels are added as annotations below.
fig.for_each_yaxis(lambda axis: axis.update(title=""))
fig.for_each_xaxis(lambda axis: axis.update(title=""))

# pct_count holds fractions (count / yearly total), so format the ticks as
# percentages to agree with the "Percent of Total Articles" label.
fig.update_yaxes(tickformat=".0%")

# Shared y-axis label, rotated and placed left of the facet grid in paper coordinates.
fig.add_annotation(
    x=-0.1,
    y=0.5,
    text="Percent of Total Articles",
    textangle=-90,
    xref="paper",
    yref="paper",
    showarrow=False,
)

# Shared x-axis label, centred below the facet grid.
fig.add_annotation(
    x=0.5,
    y=-0.1,
    text="Publication Year",
    xref="paper",
    yref="paper",
    showarrow=False,
)

fig.update_layout(
    width=800,
    height=800,
    template="simple_white",
    title="Cluster distribution over time",
)

# Render inline, then export a static copy next to the notebook.
fig.show()
fig.write_image("cluster_distribution.png")