In [7]:
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
import matplotlib.pyplot as plt

from gensim.models import LdaModel
from gensim.corpora import Dictionary

In [1]:
# CSV file loaded with pandas further down; it must contain a 'clean_content' column.
input_dataset = 'with_content_cleaned.csv'
# Folder that the coherence plot image is saved into.
img_folder = 'images'

In [3]:
# Topic-count sweep: the values tried are range(start_topics, limit_topics, step_topics),
# i.e. 10, 20, 30, 40 -- limit_topics itself is excluded.
start_topics = 10
limit_topics = 50
step_topics = 10

In [None]:
class LDAModel(nn.Module):
    """Toy topic model: maps every token id to a probability distribution over topics.

    Despite the name, this is not a Latent Dirichlet Allocation implementation.
    It is a single embedding table with one row of ``num_topics`` logits per
    vocabulary entry, softmax-normalised in ``forward``.
    """

    def __init__(self, vocab_size, num_topics):
        """Create the embedding table.

        Args:
            vocab_size: Number of distinct token ids (rows of the table).
            num_topics: Number of topics (columns of the table).
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, num_topics)
        # Normalise over the LAST axis (topics). The previous ``dim=1`` only
        # pointed at the topic axis for un-batched 1-D input; for a batched
        # (batch, seq_len) input it normalised across token positions instead,
        # so the result was not a distribution over topics.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, input):
        """Return per-token topic probabilities.

        Args:
            input: Integer tensor of token ids, any shape ``(*)``.

        Returns:
            Float tensor of shape ``(*, num_topics)`` whose last axis sums to 1.
        """
        topic_logits = self.embedding(input)
        return self.softmax(topic_logits)

In [4]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=10, step=10):
    """Return one placeholder score per candidate topic count.

    For every ``num_topics`` in ``range(start, limit, step)`` a freshly
    initialised ``LDAModel`` is run once over the whole padded corpus and the
    sum of its output tensor is recorded.

    NOTE: the recorded value is a stand-in, not a real topic-coherence metric;
    replace it with an actual coherence calculation before interpreting results.

    Args:
        dictionary: Sized vocabulary; ``len(dictionary)`` is the embedding size.
        corpus: Iterable of documents, each an iterable of entries whose first
            element is an integer token id (e.g. the ``(token_id, count)``
            pairs produced by ``Dictionary.doc2bow``). Counts are ignored.
        texts: Unused; kept so existing callers keep working.
        limit: Exclusive upper bound of the topic-count sweep.
        start: First topic count to try.
        step: Increment between topic counts.

    Returns:
        list[float]: one score per topic count, in sweep order.
    """
    # Keep only the token ids of each document, then right-pad to equal length.
    # pad_sequence pads with 0, which is also a valid token id, so padding
    # positions are embedded like token 0.
    doc_token_ids = [torch.LongTensor([entry[0] for entry in doc]) for doc in corpus]
    padded_corpus = pad_sequence(doc_token_ids, batch_first=True)

    # Device choice and the corpus transfer do not depend on num_topics, so do
    # them once instead of on every loop iteration.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_data = padded_corpus.to(device)

    coherence_values = []
    # Inference only: skip building the autograd graph for the forward passes.
    with torch.no_grad():
        for num_topics in range(start, limit, step):
            model = LDAModel(vocab_size=len(dictionary), num_topics=num_topics)
            model.to(device)

            topic_distribution = model(input_data)

            # Dummy value, replace with actual coherence calculation.
            coherence_values.append(torch.sum(topic_distribution).item())

    return coherence_values

In [5]:
# Load the cleaned dataset
df = pd.read_csv(input_dataset)

# 'clean_content' is assumed to hold the cleaned text. Empty cells are read by
# pandas as NaN (a float), which has no .split(), so drop them and coerce the
# rest to str before tokenizing.
cleaned_text = df['clean_content'].dropna().astype(str)

# Whitespace-tokenize the cleaned text
tokenized_text = [text.split() for text in cleaned_text]

del cleaned_text

# Create a dictionary representation of the documents
dictionary = Dictionary(tokenized_text)

# Create a bag-of-words corpus
corpus = [dictionary.doc2bow(doc) for doc in tokenized_text]

# Compute coherence values
coherence_values = compute_coherence_values(dictionary, corpus, tokenized_text, limit_topics, start=start_topics, step=step_topics)

# Plot the coherence values and save the plot
x = range(start_topics, limit_topics, step_topics)
# Label the line directly: legend(("coherence_values")) passed a plain string
# (the parentheses do not make a tuple), which matplotlib iterates character by
# character, so the legend entry read "c".
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')

# savefig does not create missing directories, so make sure the folder exists
os.makedirs(img_folder, exist_ok=True)
output_plot_file = os.path.join(img_folder, 'coherence_plot.png')
plt.savefig(output_plot_file)
plt.close()