In [25]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering
import plotly.express as px
import numpy as np
import plotly.io as pio

# Make Plotly's "notebook" renderer the default for figures displayed in this session.
pio.renderers.default = "notebook"

In [3]:
# Load the pretrained tokenizer and encoder from the same checkpoint name.
checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

# Inference only: put the module in eval mode, then move it to the GPU when one is
# available. `.to(...)` is the cell's last expression, so the module repr is displayed.
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [30]:
# Read the spreadsheet once; the full sheet stays available as `data`.
file_path = "./abstract_cat_wos.xls"
data = pd.read_excel(file_path)

# Extract just the abstracts and categories
columns_of_interest = ["Abstract", "WoS Categories"]
ab_wos_cat = data[columns_of_interest]
ab_wos_cat.head()

Unnamed: 0,Abstract,WoS Categories
0,ObjectiveSexual victimization experience is a ...,"Education & Educational Research; Public, Envi..."
1,Binary success/failure data is an important ty...,Statistics & Probability
2,Two studies examined relations of humor styles...,"Psychology, Social"
3,This paper examines the hitherto unexplored su...,Political Science
4,This study's objective is to examine the role ...,"Business, Finance"


In [13]:
# Drop rows missing either field, then reduce the ";"-separated category string to its
# first entry (whitespace-trimmed) and discard the original multi-category column.
cleaned_data = (
    ab_wos_cat.dropna()
    .assign(
        **{
            "Primary Category": lambda frame: frame["WoS Categories"].apply(
                lambda cats: cats.split(";")[0].strip()
            )
        }
    )
    .drop(columns="WoS Categories")
)
cleaned_data.head()

Unnamed: 0,Abstract,Primary Category
0,ObjectiveSexual victimization experience is a ...,Education & Educational Research
1,Binary success/failure data is an important ty...,Statistics & Probability
2,Two studies examined relations of humor styles...,"Psychology, Social"
3,This paper examines the hitherto unexplored su...,Political Science
4,This study's objective is to examine the role ...,"Business, Finance"


In [None]:
# Distinct primary categories: first in order of appearance, then sorted.
unique_cats = pd.unique(cleaned_data["Primary Category"])
sorted_categories = np.sort(unique_cats)

In [14]:
def encode_abstracts(abstracts, tokenizer, model, batch_size=8):
    """Embed texts in batches and return one pooled vector per text.

    Parameters
    ----------
    abstracts : iterable of str
        Texts to encode. The iterable is materialised into a list, so a pandas
        Series, tuple or generator is accepted as well as a list.
    tokenizer : callable
        Called as ``tokenizer(batch, return_tensors="pt", padding=True,
        truncation=True, max_length=512)``. The result must support
        ``.to(device)`` and ``**`` unpacking into ``model``.
    model : torch.nn.Module
        Encoder whose forward output exposes ``pooler_output``.
    batch_size : int, default 8
        Number of texts sent through the model per forward pass.

    Returns
    -------
    numpy.ndarray
        Pooled outputs stacked row-wise in input order. For empty input an
        array with zero rows is returned (its width is ``model.config.hidden_size``
        when that attribute exists, otherwise 0).

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Work on a plain list so slicing is positional and the tokenizer always
    # receives a list, whatever container the caller passed in.
    texts = list(abstracts)
    if not texts:
        # np.vstack([]) raises, so short-circuit with an empty result instead.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    # Send inputs to wherever the model's weights actually live, rather than
    # guessing from CUDA availability (the two can disagree).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start : start + batch_size]
        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
        # No gradients are needed for feature extraction.
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.pooler_output.cpu().numpy())
    return np.vstack(embeddings)

In [18]:
# Encode every cleaned abstract; rows of `embeddings` follow the row order of `cleaned_data`.
embeddings = encode_abstracts(cleaned_data["Abstract"].tolist(), tokenizer, model)

# Version 1: reusable plotting helper, points coloured by primary category

In [20]:
# Dimensionality reduction and clustering
# Two seeded t-SNE projections of the same embedding matrix: one 2-D, one 3-D.
tsne_2d, tsne_3d = (TSNE(n_components=n_dims, random_state=42) for n_dims in (2, 3))
embeddings_2d = tsne_2d.fit_transform(embeddings)
embeddings_3d = tsne_3d.fit_transform(embeddings)

# Agglomerative clustering on the original (un-projected) embeddings.
# NOTE(review): with n_clusters=None and distance_threshold=0 the tree is never cut,
# so every abstract appears to end up in its own cluster — confirm this is intended.
clustering = AgglomerativeClustering(distance_threshold=0, n_clusters=None)
clusters = clustering.fit(embeddings).labels_

In [36]:
def plot_interactive(
    embeddings,
    clusters,
    categories,
    dimensions=2,
    output_path="plot.html",
    mapping_path="category_color_mapping.txt",
):
    """Write an interactive scatter plot of projected embeddings to HTML.

    Points are coloured by category and show their cluster label on hover. A
    text file listing the colour assigned to each category is written as well.

    Parameters
    ----------
    embeddings : array-like
        One row per point with ``dimensions`` columns.
    clusters : sequence
        Cluster label per point, matched to ``embeddings`` by position.
    categories : sequence of str
        Category label per point, matched to ``embeddings`` by position.
    dimensions : int, default 2
        2 for a planar scatter, 3 for a 3-D scatter.
    output_path : str, default "plot.html"
        Destination of the HTML figure. Pass distinct paths when producing
        both a 2-D and a 3-D plot so one does not overwrite the other.
    mapping_path : str, default "category_color_mapping.txt"
        Destination of the ``category: colour`` listing.

    Raises
    ------
    ValueError
        If ``dimensions`` is neither 2 nor 3.
    """
    if dimensions not in (2, 3):
        raise ValueError(f"dimensions must be 2 or 3, got {dimensions!r}")

    axis_names = [f"Component {i + 1}" for i in range(dimensions)]
    df = pd.DataFrame(embeddings, columns=axis_names)
    # Plain lists keep the assignment positional even if a Series is passed in.
    df["Category"] = list(categories)
    df["Cluster"] = list(clusters)

    shared_kwargs = dict(
        x=axis_names[0],
        y=axis_names[1],
        color="Category",
        hover_data=["Cluster"],
    )
    if dimensions == 2:
        fig = px.scatter(df, **shared_kwargs)
    else:
        fig = px.scatter_3d(df, z=axis_names[2], **shared_kwargs)

    # Each trace is named after the category it draws, so read the name from the
    # trace itself. Traces are not emitted in sorted-category order, which is why
    # pairing them with a sorted category list attributes colours incorrectly.
    category_color_mapping = {trace.name: trace.marker.color for trace in fig.data}

    # Write category-color mapping to a text file (alphabetical for easy lookup)
    with open(mapping_path, "w", encoding="utf-8") as f:
        for category in sorted(category_color_mapping):
            f.write(f"{category}: {category_color_mapping[category]}\n")

    fig.write_html(output_path)


# generate plots
# Category labels are taken from `cleaned_data` as a plain list, so they are matched to
# the projected rows by position.
categories = cleaned_data["Primary Category"].tolist()
plot_interactive(embeddings_2d, clusters, categories, dimensions=2)
# The 3-D variant is left disabled. NOTE(review): called like this, both variants write
# to the same default file names, so enabling it would overwrite the 2-D output.
# plot_interactive(embeddings_3d, clusters, categories, dimensions=3)

# Version 2: inline variant that also maps the cluster label to the marker symbol

In [34]:
# 2-D t-SNE projection of the embeddings (seeded so the layout is repeatable).
tsne_embeddings = TSNE(n_components=2, random_state=42).fit_transform(embeddings)

# hierarchical clustering
# NOTE(review): with n_clusters=None and distance_threshold=0 the tree is never cut,
# so every abstract appears to get its own cluster label (and, below, its own
# symbol/trace) — confirm this is intended.
clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=0)
clusters = clustering_model.fit_predict(embeddings)

# Visualization
df = pd.DataFrame(tsne_embeddings, columns=["TSNE1", "TSNE2"])
df["Cluster"] = clusters

# `embeddings` was computed from `cleaned_data` (rows with missing values removed),
# so the labels must come from that same frame and be attached by position. Reading
# them from the raw `data` frame would align on the original index and mislabel or
# blank out points whenever a row was dropped, and would fail on a missing category.
assert len(cleaned_data) == len(df), "embeddings and cleaned_data are out of sync"
df["Category"] = cleaned_data["Primary Category"].to_numpy()

fig_2d = px.scatter(
    df,
    x="TSNE1",
    y="TSNE2",
    color="Category",
    symbol="Cluster",
    hover_data=["Category", "Cluster"],
)
fig_2d.update_traces(marker=dict(size=10, opacity=0.7), selector=dict(mode="markers"))
fig_2d.write_html("plot2.html")