In [8]:
import pandas as pd

# Read the Web of Science export (savedrecs) into a DataFrame.
# Every later cell derives its inputs from `data`.
file_path = './abstract_cat_wos.xls'
data = pd.read_excel(io=file_path)

In [None]:
# Keep only the two columns this analysis uses: the abstract text and its
# Web of Science category string.
wanted_columns = ['Abstract', 'WoS Categories']
ab_wos_cat = data.loc[:, wanted_columns]

ab_wos_cat.head()

In [None]:
# Discard every record that lacks an abstract or a category
# (a row is removed when ANY of its values is missing).
cleaned_data = ab_wos_cat.dropna(axis=0, how='any')
cleaned_data.head()

In [None]:
# Derive the primary category: the first ';'-separated entry of 'WoS Categories'.
#
# * `errors='ignore'` makes the drop of the scratch column 'cat' a no-op when
#   the column is absent. On a fresh kernel it never exists (the frame only
#   carries 'Abstract' and 'WoS Categories'), so the unconditional drop raised
#   KeyError.
# * `.assign` builds a new frame instead of writing into one derived from a
#   slice, which avoids SettingWithCopyWarning and keeps the cell idempotent.
# * The `.str` accessor is the vectorised equivalent of the per-row lambda.
cleaned_data = (
    cleaned_data
    .drop(columns='cat', errors='ignore')
    .assign(**{
        'Primary Category': lambda frame: (
            frame['WoS Categories'].str.split(';').str[0].str.strip()
        )
    })
)
cleaned_data.head()

In [None]:
# Raw visualization: how often each primary category occurs
import matplotlib.pyplot as plt
from collections import Counter

primary_cat_counts = Counter(cleaned_data['Primary Category'])

# Keep the 20 most frequent categories and split them into labels / counts
common_cats = primary_cat_counts.most_common(20)
primary_cats, primary_counts = zip(*common_cats)

# Horizontal bar chart drawn through the explicit Figure/Axes interface
fig, ax = plt.subplots(figsize=(20, 18))
ax.barh(primary_cats, primary_counts, color='blue')
ax.set_xlabel('Frequency')
ax.set_ylabel('Primary Category')
ax.set_title('20 Most Common Cats')
ax.invert_yaxis()  # most frequent category at the top rather than the bottom
plt.show()

In [27]:
# Scatter Plot
from sklearn.manifold import TSNE
# plt, pd
from transformers import BertModel, BertTokenizer
import torch
from torch.utils.data import TensorDataset, DataLoader
import numpy as np

In [40]:
# Assign each distinct primary category an integer id, numbered in order of
# first appearance in the frame (used later as the scatter-plot colour value).
unique_categories = cleaned_data['Primary Category'].unique()
category_to_int = dict(zip(unique_categories, range(len(unique_categories))))


In [41]:
# Step 1: Load the pretrained BERT tokenizer and encoder.
# Both are resolved from the same checkpoint name so they stay in sync.
model_name = 'bert-base-uncased'
tokenizer, model = (
    loader.from_pretrained(model_name) for loader in (BertTokenizer, BertModel)
)


In [42]:
# Encode texts with BERT
def encode_texts(texts, tokenizer, model, batch_size=8):
    """Embed each text as the final-layer hidden state of its first token.

    Texts are tokenized one batch at a time, so each batch is padded only to
    its own longest sequence. The whole corpus is never padded to one width or
    held as a single tensor.

    Args:
        texts: Iterable of strings to embed.
        tokenizer: Callable invoked as ``tokenizer(batch, padding=True,
            truncation=True, max_length=512, return_tensors="pt")``. Its
            result must expose ``input_ids`` and ``attention_mask`` tensors.
        model: Torch module called with ``input_ids`` and ``attention_mask``
            keyword arguments, returning an object with a
            ``last_hidden_state`` tensor. It is moved to CUDA when available
            and switched to eval mode as a side effect.
        batch_size: Number of texts per forward pass. Must be >= 1.

    Returns:
        ``numpy.ndarray`` with one row per input text, in input order. For
        empty input the array has zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    texts = list(texts)
    if not texts:
        # np.vstack([]) raises, so return an explicit empty matrix instead.
        # The width comes from model.config.hidden_size when that exists.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    embeddings = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            tokens = tokenizer(batch_texts, padding=True,
                               truncation=True, max_length=512,
                               return_tensors="pt")
            input_ids = tokens.input_ids.to(device)
            attention_mask = tokens.attention_mask.to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Position 0 of every sequence (the [CLS] token for BERT tokenizers)
            embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
    return np.vstack(embeddings)

In [43]:
# Step 2: Turn every cleaned abstract into a BERT embedding (one row per abstract)
texts = cleaned_data.loc[:, 'Abstract'].to_list()
embeddings = encode_texts(texts=texts, tokenizer=tokenizer, model=model)

In [39]:
# Step 3: Project the embeddings onto two dimensions with t-SNE.
# The seed is fixed so repeated runs use the same random initialisation.
tsne = TSNE(random_state=42, n_components=2)
reduced_embeddings = tsne.fit_transform(X=embeddings)

In [44]:
# Per-abstract colour value: the integer id of its primary category
primary_labels = cleaned_data.loc[:, 'Primary Category']
colors = primary_labels.map(arg=category_to_int)

In [None]:
# Step 4: Scatter the 2-D t-SNE coordinates, coloured by primary-category id
fig, ax = plt.subplots(figsize=(12, 10))
points = ax.scatter(
    reduced_embeddings[:, 0],
    reduced_embeddings[:, 1],
    c=colors,
    cmap='hsv',
    alpha=0.5,
)
fig.colorbar(points, ax=ax)
ax.set_title('t-SNE Visualization of Abstracts with BERT Embeddings')
ax.set_xlabel('t-SNE Dimension 1')
ax.set_ylabel('t-SNE Dimension 2')
plt.show()

In [48]:
def encode_abstracts(abstracts, batch_size=8):
    """Embed abstracts with the notebook-level ``tokenizer`` and ``model``.

    Each abstract is represented by the model's ``pooler_output``. Work is
    done in batches, with gradients disabled and the model in eval mode.
    Inputs are moved to whichever device the model's parameters live on, so
    this also works after the model has been moved to a GPU.

    Args:
        abstracts: Iterable of strings to embed.
        batch_size: Number of abstracts per forward pass. Must be >= 1.

    Returns:
        ``numpy.ndarray`` with one row per abstract, in input order. For empty
        input the array has zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    abstracts = list(abstracts)
    if not abstracts:
        # Nothing to encode: return an explicit empty matrix rather than
        # calling the tokenizer with an empty batch.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    # Follow the model's current device instead of assuming CPU.
    device = next(model.parameters()).device
    model.eval()

    pooled = []
    with torch.no_grad():  # inference only: no autograd graph is kept
        for start in range(0, len(abstracts), batch_size):
            inputs = tokenizer(abstracts[start:start + batch_size], padding=True,
                               truncation=True, max_length=512, return_tensors='pt')
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
            outputs = model(**inputs)
            # Use the pooled output for sentence-level representations
            pooled.append(outputs.pooler_output.cpu().numpy())
    return np.vstack(pooled)

In [49]:
def reduce_dimensions(embeddings):
    """Project ``embeddings`` onto two dimensions with t-SNE (seed fixed at 42).

    Returns whatever ``TSNE.fit_transform`` produces for the given input.
    """
    projector = TSNE(random_state=42, n_components=2)
    return projector.fit_transform(embeddings)

In [None]:
# Second pipeline: pooled-output embeddings coloured by the full WoS category string.
file_path2 = './abstract_cat_wos.xls'
data2 = pd.read_excel(file_path2)  # read the path declared for this pipeline

# Filter abstracts and categories TOGETHER so abstracts[i] and categories[i]
# always describe the same record. Filtering only the abstracts shifts the
# labels, and str() on a missing abstract yields the literal text 'nan'.
labelled2 = data2[['Abstract', 'WoS Categories']].dropna()
labelled2 = labelled2[labelled2['Abstract'].astype(str).str.strip() != '']
abstracts = labelled2['Abstract'].astype(str).tolist()
categories = labelled2['WoS Categories'].astype(str).tolist()

# Reduce THIS pipeline's embeddings, not the ones left over from the earlier cells
embeddings2 = encode_abstracts(abstracts)
reduced_embeddings2 = reduce_dimensions(embeddings2)

# Sorting makes the category -> colour assignment stable across runs
unique_categories2 = sorted(set(categories))
category2int = {category: i for i, category in enumerate(unique_categories2)}
clr = [category2int[category] for category in categories]

# Visualize
plt.figure(figsize=(12, 10))
plt.scatter(reduced_embeddings2[:, 0], reduced_embeddings2[:, 1], c=clr, cmap='viridis', alpha=0.5)
plt.colorbar(ticks=range(len(unique_categories2)))
# Centre each integer category id within its colour band
plt.clim(-0.5, len(unique_categories2) - 0.5)
plt.title('Visualization of Abstracts by Category')
plt.show()