# Exploration: Document-Level TFIDF and PCA

Andrew Barros  
DS5001  
02 May 2024  


In [13]:
import pandas as pd
from pathlib import Path
import plotly.graph_objects as go
from sklearn.manifold import TSNE

In [14]:
# Every input table is read from one directory, given relative to this notebook.
DATA_DIR = Path("../data")

# TF-IDF table and the two PCA tables. The PCA tables are later merged with
# ARTICLES on their index, so they are expected to be indexed by pmcid.
TFIDF = pd.read_parquet(DATA_DIR.joinpath("corpus_f4_tfidf.parquet"))
BIG_PCA = pd.read_parquet(DATA_DIR.joinpath("corpus_f5_pca.parquet"))
SMALL_PCA = pd.read_parquet(DATA_DIR.joinpath("corpus_f5_small_pca.parquet"))

# Article metadata; later cells read its pmcid, sortdate and title columns.
ARTICLES = pd.read_parquet(DATA_DIR.joinpath("all_articles.parquet"))

In [15]:
# Attach article metadata to each PCA table. The PCA frames are matched on
# their index against ARTICLES re-indexed by pmcid; how="inner" is the pandas
# default, spelled out here: only documents present in both tables are kept.
# NOTE(review): this cell rebinds SMALL_PCA/BIG_PCA to the merged result, so it
# should run once per fresh load -- re-running it merges the metadata again.
articles_by_pmcid = ARTICLES.set_index("pmcid")
SMALL_PCA = SMALL_PCA.merge(articles_by_pmcid, how="inner", left_index=True, right_index=True)
BIG_PCA = BIG_PCA.merge(articles_by_pmcid, how="inner", left_index=True, right_index=True)

In [16]:
# Add a "year" column holding the calendar year of each row's sortdate
# timestamp, on both PCA tables.
for pca_frame in (SMALL_PCA, BIG_PCA):
    pca_frame["year"] = pca_frame["sortdate"].dt.year

# Visualize all PCA Dimensions

In [17]:
# Scatter-plot matrix of the first six principal components. The component
# columns of SMALL_PCA are labelled with integers starting at 0, so column i
# is displayed as "PC{i + 1}".
splom_dimensions = [
    dict(label=f"PC{component + 1}", values=SMALL_PCA[component])
    for component in range(6)
]
splom_trace = go.Splom(
    dimensions=splom_dimensions,
    showupperhalf=False,  # draw only the lower triangle of the matrix
    diagonal_visible=False,  # hide the panels on the diagonal
    marker=dict(size=1),
)

fig = go.Figure(data=splom_trace)
fig.update_layout(
    title="PCA of TF-IDF vectors",
    width=800,
    height=800,
    template="simple_white",
)

fig.show()
# Static export, written relative to the current working directory.
fig.write_image("tfidf_pca.png")

Here we present the first six PCA dimensions of the TF-IDF vectors.

Interpretation:
* There is not a lot of correlation between the first five PCA components
* There is less correlation between PC6 and the other visualized components.

# t-SNE of TF-IDF PCA Dimensions

In [20]:
# Embed the first ten principal components (integer-labelled columns 0-9) with
# t-SNE. random_state is fixed so the layout is repeatable.
tsne = TSNE(random_state=42)
pca_columns = list(range(10))
coords = tsne.fit_transform(SMALL_PCA[pca_columns])

# Store the two embedding axes next to the metadata for plotting.
SMALL_PCA["tsne_0"], SMALL_PCA["tsne_1"] = coords[:, 0], coords[:, 1]

In [22]:
# Scatter plot of the 2-D t-SNE embedding: one marker per row of SMALL_PCA,
# coloured by the "year" column, with the article title shown on hover.
tsne_scatter = go.Scatter(
    x=SMALL_PCA["tsne_0"],
    y=SMALL_PCA["tsne_1"],
    mode="markers",
    marker=dict(
        color=SMALL_PCA["year"],
        colorscale="Viridis",
        size=4,
        # Scatter markers hide the colour bar by default; show it so the year
        # encoding can actually be read from the figure and the exported PNG.
        showscale=True,
        colorbar=dict(title=dict(text="Year")),
    ),
    text=SMALL_PCA["title"],
    hoverinfo="text",  # hover shows only the title, not the coordinates
)
fig = go.Figure(data=tsne_scatter)

fig.update_layout(
    width=800,
    height=800,
    template="simple_white",
    xaxis_title="t-SNE Dimension 1",
    yaxis_title="t-SNE Dimension 2",
    title="t-SNE of TF-IDF vectors",
)
fig.show()
# Static export, written relative to the current working directory.
fig.write_image("tfidf_tsne.png")

Interpretation:

* There are several intriguing clusters that develop on this figure. For example:
    * The nine articles centered near (-18, -33) all relate to decentralized technology such as blockchain and federated learning
    * The 14 articles centered near (12, 37) all relate to predictive models for sepsis (a serious infection)
    * The larger cluster around (-44, -11) all relate to patient portal and mobile health application usage
* In the less well-defined clusters, articles with a similar meaning appear close together. For example:
    * Near (-20, 14) are several articles relating to electronic health record usage metrics ("A qualitative study of provider burnout: do medical scribes hinder or help?", "Associations of physician burnout with organizational electronic health record support and after-hours charting", "The influence of a sprint optimization and training intervention on time spent in the electronic health record (EHR)")
    * Near (37, -15) there are several articles about NLP of EHR data ("Discovering body site and severity modifiers in clinical texts", "Functional evaluation of out-of-the-box text-mining tools for data-mining tasks", and "Weakly supervised natural language processing for assessing patient-centered outcomes following prostate cancer treatment")
* Overall, this approach appears to have captured the article meaning with reasonable accuracy