In [1]:
# libraries
import joblib
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.cluster import KMeans
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AutoModel
import torch.nn as nn
import torch.nn.functional as F

Using the trained roBERTa embeddings as the initial embedding to pretrain the model using contrastive loss to learn the embeddings, create, train and run a Deep Clustering by Semantic Contrastive Learning model

In [2]:
# Load the full, now labelled data from the file
reddit = joblib.load(
    "/Users/seshat/Documents/GitHub/labor_sentiment_analysis/data/pickle/reddit_labelled.pkl"
)

# Load trained model and tokenizer
save_path = "/Users/seshat/Documents/GitHub/labor_sentiment_analysis/models/roBERTa"
roberta_model = RobertaForSequenceClassification.from_pretrained(save_path)
tokenizer = RobertaTokenizer.from_pretrained(save_path)

# Ensure the model is in evaluation mode
roberta_model.eval()

# Tokenize and encode
def tokenize_texts(texts, tokenizer, max_length=128):
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

tokenized_inputs = tokenize_texts(reddit['text_processed'].tolist(), tokenizer)

## DATA PREPROCESSING COMPLETE

Tips for Effective Implementation

    Batch Sampling: Ensure diverse positive and negative pairs in each batch for effective contrastive learning.
    Augmentations: Use augmentations (e.g., synonym replacement, backtranslation) to create robust positive pairs.
    Initialization: Use K-Means for an initial clustering of embeddings.
    Hyperparameters:
        Tune ττ (temperature) in contrastive loss.
        Experiment with the projection dimension in the projection head.
        Adjust λλ to balance losses.

In [3]:
# Custom Dataset class
# inputs fed in batches
class RedditDataset(Dataset):
    def __init__(self, tokenized_inputs):
        self.input_ids = tokenized_inputs["input_ids"]
        self.attention_mask = tokenized_inputs["attention_mask"]

    def __len__(self):
        return self.input_ids.size(0)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
        }

# Initialize dataset and loader
dataset = RedditDataset(tokenized_inputs)
dataloader = DataLoader(dataset, 
                        batch_size=32, 
                        shuffle=True)

Justify using ReLU()

In [4]:
# Deep Clustering by Semantic Constrastive Learning Model
class DCSCModel(nn.Module):
    def __init__(self, pretrained_model, num_clusters=10, projection_dim=128):
        super(DCSCModel, self).__init__()
        self.encoder = pretrained_model.roberta # RoBERTa encoder
        self.projection_head = nn.Sequential(
            nn.Linear(self.encoder.config.hidden_size, projection_dim),
            nn.ReLU(),
            nn.Linear(projection_dim, projection_dim),
        )
        self.cluster_head = nn.Linear(projection_dim, num_clusters)

    def forward(self, input_ids, attention_mask):
        # extract embeddings
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls_embeddings = outputs.last_hidden_state[:, 0, :]

        # pass embeddings through projection and clustering heads
        projections = self.projection_head(cls_embeddings)
        clusters = self.cluster_head(projections)
        return projections, clusters

# Contrastive Loss
# cosine similarity and temp parameter
def contrastive_loss(projections, temperature=0.1):
    # normalize projections
    projections = F.normalize(projections, dim=1)
    similarity_matrix = torch.mm(projections, projections.T) / temperature
    labels = torch.arange(len(projections)).to(projections.device)
    loss = F.cross_entropy(similarity_matrix, labels)
    return loss

# Clustering Loss
# kl divergence loss
def clustering_loss(cluster_logits, target_distribution, batch_indices):
    cluster_probs = F.softmax(cluster_logits, dim=1)
    target_dist_batch = target_distribution[batch_indices]
    return F.kl_div(cluster_probs.log(), target_dist_batch, reduction="batchmean")

# Combined Loss
# balances with lambda weight
def compute_loss(contrastive_loss, clustering_loss, lambda_weight=0.5):
    return lambda_weight * contrastive_loss + (1 - lambda_weight) * clustering_loss

In [5]:
# Training Loop
def train_model(model, dataloader, optimizer, num_clusters, epochs=10, lambda_weight=0.5):
    model.train()
    
    # initialize k-means for initial embeddings
    embeddings_list = []
    for batch in dataloader:
        with torch.no_grad():
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            projections, _ = model(input_ids, attention_mask)
            embeddings_list.append(projections)
    
    embeddings = torch.cat(embeddings_list)
    kmeans = KMeans(n_clusters=num_clusters).fit(embeddings.cpu().numpy())
    target_distribution = torch.tensor(kmeans.transform(embeddings.cpu().numpy())).to(embeddings.device)

    # training loop
    for epoch in range(epochs):
        for batch_idx, batch in enumerate(dataloader):
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]

            # forward
            projections, clusters = model(input_ids, attention_mask)

            # compute loss
            loss_contrastive = contrastive_loss(projections)

            batch_indices = torch.arange(batch_idx * len(batch['input_ids']), 
                                         (batch_idx + 1) * len(batch['input_ids'])).to(projections.device)
            lost_cluster = clustering_loss(clusters, target_distribution, batch_indices)
            loss = compute_loss(loss_contrastive, lost_cluster, lambda_weight)

            # backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # recompute target distro using kmeans
        if epoch % 2 == 0:
            embeddings_list = []
            for batch in dataloader:
                with torch.no_grad():
                    input_ids = batch["input_ids"]
                    attention_mask = batch["attention_mask"]
                    projections, _ = model(input_ids, attention_mask)
                    embeddings_list.append(projections)

            embeddings = torch.cat(embeddings_list)
            kmeans = KMeans(n_clusters=num_clusters).fit(embeddings.cpu().numpy())
            target_distribution = torch.tensor(kmeans.transform(embeddings.cpu().numpy())).to(embeddings.device)
        print(f"Epoch {epoch+1}, Loss: {loss.item()}")

In [6]:
# Initialize model and optimizer
num_clusters = 10
projection_dim = 128
model = DCSCModel(
    roberta_model, 
    num_clusters=num_clusters, 
    projection_dim=projection_dim
)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

In [None]:
# Train the model
train_model(model, dataloader, optimizer, num_clusters, epochs=10, lambda_weight=0.5)

Epoch 1, Loss: 15.159804344177246
Epoch 2, Loss: 13.22829532623291
Epoch 3, Loss: 13.227643966674805
Epoch 4, Loss: 9.851664543151855
Epoch 5, Loss: 9.850968360900879


In [None]:
import os
import joblib

# Define the save path
save_folder = "/Users/seshat/Documents/GitHub/labor_sentiment_analysis/models/DCSC"
os.makedirs(save_folder, exist_ok=True)

# Save the model state dictionary
model_save_path = os.path.join(save_folder, "dcsc_model.pth")
torch.save(model.state_dict(), model_save_path)

# Save the tokenizer
tokenizer_save_path = os.path.join(save_folder, "tokenizer.pkl")
joblib.dump(tokenizer, tokenizer_save_path)

print(f"Model and tokenizer saved to {save_folder}")

Evaluation and Comparison

    Compare results with K-Means on pre-trained embeddings and DeepCluster.
    Analyze cluster quality using NMI, ARI, and visualize with t-SNE/UMAP.
    Use Latent Dirichlet Allocation (LDA) or Dynamic Topic Modeling (DTM) to explore cluster themes.

defend softmax

In [None]:
# Cluster Assignments
def get_cluster_assignments(model, dataloader):
    model.eval()
    all_clusters = []
    all_projections = []
    all_inputs = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]

            projections, cluster_logits = model(input_ids, attention_mask)
            cluster_probs = F.softmax(cluster_logits, dim=1)
            cluster_assignments = torch.argmax(cluster_probs, dim=1)

            all_clusters.append(cluster_assignments.cpu())
            all_projections.append(projections.cpu())
            all_inputs.extend(input_ids.cpu())
    return torch.cat(all_clusters), torch.cat(all_projections), all_inputs

cluster_assignments, projections, inputs = get_cluster_assignments(model, dataloader)

In [None]:
# Visualize Clusters
from sklearn.decomposition import PCA

# reduce dimentions
pca = PCA(n_components=2)
reduced_projections = pca.fit_transform(projections.numpy())

# scatter plot
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    reduced_projections[:, 0], 
    reduced_projections[:, 1],
    c=cluster_assignments, 
    cmap="viridis",
    alpha=0.7
)
plt.colorbar(scatter, label="Cluster")
plt.title("Cluster Visualization")
plt.xlabel("PCA Dimension 1")
plt.ylabel("PCA Dimension 2")
plt.show()

In [None]:
# Examine each cluster
texts = reddict['text_processed'].tolist()
clusters = cluster_assignments.numpy()

# group by cluster
cluster_texts = {i: [] for i in range(num_clusters)}
for text, cluster in zip(texts, clusters):
    cluster_texts[cluster].append(text)

for cluster_id, texts in cluster_texts.items():
    print(f"Cluster {cluster_id}")
    print("Example texts:")
    print("\n".join(texts[:5]))

In [None]:
# Keyword Extraction
from collections import Counter
from nltk.tokenize import word_tokenize

cluster_keywords = {}
for cluster_id, texts in cluster_texts.items():
    all_words = [word for text in texts for word in word_tokenize(text)]
    cluster_keywords[cluster_id] = Counter(all_words).most_common(10)

for cluster_id, keywords in cluster_keywords.items():
    print(f"Cluster {cluster_id} Keywords:")
    print(", ".join([word for word, count in keywords]))

In [None]:
# Compare labels in clusters
import pandas as pd

labels_nb = reddit["label_nb"].tolist()
labels_rob = reddit["label_rob"].tolist()

data = pd.DataFrame(
    {"text": texts, 
     "label_nb": labels_nb, 
     "label_rob": labels_rob, 
     "cluster": clusters}
)
# Calculate mean sentiment for each cluster by model
cluster_sentiments = data.groupby("cluster")[["label_nb", "label_rob"]].mean()

# Count sentiment distribution (e.g., positive, neutral, negative) for each model
cluster_distribution = (
    data.groupby("cluster")[["label_nb", "label_rob"]]
    .value_counts(normalize=True)
    .unstack()
)

# Display results
print("Average Sentiments by Cluster:")
print(cluster_sentiments)

print("\nSentiment Distribution by Cluster:")
print(cluster_distribution)


# Plot mean sentiment scores for each model
cluster_sentiments.plot(kind="bar", figsize=(12, 6), alpha=0.8)
plt.title("Mean Sentiment Scores by Cluster")
plt.xlabel("Cluster")
plt.ylabel("Mean Sentiment")
plt.legend(["Label NB", "Label Rob"], loc="upper left")
plt.show()

# Plot sentiment distributions
for model in ["label_nb", "label_rob"]:
    distribution = cluster_distribution[model]
    distribution.plot(kind="bar", stacked=True, figsize=(12, 6), alpha=0.8)
    plt.title(f"Sentiment Distribution for {model} by Cluster")
    plt.xlabel("Cluster")
    plt.ylabel("Proportion")
    plt.legend(["Negative", "Neutral", "Positive"], loc="upper left")
    plt.show()

# Add 'agree' column
data["agreement"] = data["label_nb"] == data["label_rob"]

# Calculate agreement percentage in each cluster
agreement_by_cluster = data.groupby("cluster")["agreement"].mean() * 100

print("Agreement Between Models by Cluster (%):")
print(agreement_by_cluster)

# Visualize agreement
agreement_by_cluster.plot(kind="bar", figsize=(12, 6), color="skyblue", alpha=0.8)
plt.title("Agreement Between Models by Cluster (%)")
plt.xlabel("Cluster")
plt.ylabel("Agreement (%)")
plt.show()

# LDA

In [None]:
from gensim import corpora
from gensim.models import LdaModel
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string


# Preprocess the text data
def preprocess_texts(texts):
    stop_words = set(stopwords.words("english"))
    preprocessed_texts = []
    for text in texts:
        # Tokenize and remove punctuation
        tokens = [word.lower() for word in word_tokenize(text) if word.isalpha()]
        # Remove stop words
        filtered_tokens = [word for word in tokens if word not in stop_words]
        preprocessed_texts.append(filtered_tokens)
    return preprocessed_texts


# Preprocess texts
texts = reddit["text"].tolist()
preprocessed_texts = preprocess_texts(texts)

# Create a dictionary and corpus for LDA
dictionary = corpora.Dictionary(preprocessed_texts)
corpus = [dictionary.doc2bow(text) for text in preprocessed_texts]

# Fit LDA model
num_topics = 10  # Number of topics
lda_model = LdaModel(
    corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=10, random_state=42
)

# Print the topics
topics = lda_model.print_topics(num_words=10)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

In [None]:
import pyLDAvis.gensim_models

# Prepare the visualization
lda_vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

# Display the visualization
pyLDAvis.show(lda_vis)