In [None]:
"""
This code was written using CDC AI Chatbot. A variety of prompts were used, including questions and prompts to 
    correct bugs, memory issues(ie too little resources available), generate comments, etc.

maintenance: alan hamm(pqn7)
apr 2024
"""

In [1]:
import torch  # PyTorch library for deep learning and GPU acceleration
from torch.utils.data import DataLoader  # Provides an iterator over a dataset for efficient batch processing
from tqdm import tqdm  # Creates progress bars to visualize the progress of loops or tasks
from sklearn.feature_extraction.text import CountVectorizer  # Converts text documents into numerical representations
from sklearn.decomposition import LatentDirichletAllocation  # Implements Latent Dirichlet Allocation (LDA) for topic modeling
from gensim.models import LdaModel  # Implements LDA for topic modeling using the Gensim library
from gensim.corpora import Dictionary  # Represents a collection of text documents as a bag-of-words corpus
import os  # Provides functions for interacting with the operating system, such as creating directories
import pickle  # Allows objects to be serialized and deserialized to/from disk
import itertools  # Provides various functions for efficient iteration and combination of elements
import numpy as np  # Library for numerical computing in Python, used for array operations and calculations
from time import time  # Measures the execution time of code snippets or functions
import pprint as pp  # Pretty-printing library, used here to format output in a readable way
import multiprocessing
import pickle
#import dask.array as da
#from dask.diagnostics import ProgressBar
from tqdm.notebook import tqdm
from scipy.sparse import csr_matrix
from gensim.models import CoherenceModel
import gensim

In [22]:
# Define the range of number of topics for LDA and step size
start_topics = 50
end_topics = 150
step_size = 5

# Specify output directories for log file, model outputs, and images generated.
log_dir = "C:/_harvester/data/lda-models/2010s_html.json/"
model_dir = "C:/_harvester/data/lda-models/2010s_html.json/lda-models/"
image_dir = "C:/_harvester/data/lda-models/2010s_html.json/visuals/"

# Create directories if they don't exist.
os.makedirs(log_dir, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)
os.makedirs(image_dir, exist_ok=True)

In [23]:
# Check if CUDA is available
if torch.cuda.is_available():
    # Get the number of available GPUs
    num_gpus = torch.cuda.device_count()
    print(f"Number of available GPUs: {num_gpus}")
    
    for i in range(num_gpus):
        # Get the properties of each GPU device
        gpu_properties = torch.cuda.get_device_properties(i)
        
        print(f"\nGPU Device {i} Properties:")
        print(f"Device Name: {gpu_properties.name}")
        print(f"Total Memory: {gpu_properties.total_memory / 1024**3:.2f} GB")
        print(f"Multiprocessor Count: {gpu_properties.multi_processor_count}")
        print(f"CUDA Capability Major Version: {gpu_properties.major}")
        print(f"CUDA Capability Minor Version: {gpu_properties.minor}")
else:
    print("CUDA is not available.")

# Set device to GPU if available, otherwise use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# verify if CUDA is being used or the CPU
if device is not None:
    # Check if PyTorch is currently using the GPU
    if torch.backends.cudnn.enabled:
        print("PyTorch is using the GPU.")
    else:
        print("PyTorch is using the CPU.")
else:
    print("The device is neither using the GPU nor CPU. An error has ocurred.")

Number of available GPUs: 1

GPU Device 0 Properties:
Device Name: NVIDIA RTX A3000 12GB Laptop GPU
Total Memory: 12.00 GB
Multiprocessor Count: 32
CUDA Capability Major Version: 8
CUDA Capability Minor Version: 6
PyTorch is using the GPU.


In [24]:
cores = multiprocessing.cpu_count() - 1 # Count the number of cores in a computer

In [25]:
# The parameter `alpha` in Latent Dirichlet Allocation (LDA) represents the concentration parameter of the Dirichlet 
# prior distribution for the topic-document distribution.
# It controls the sparsity of the resulting document-topic distributions.

# A lower value of `alpha` leads to sparser distributions, meaning that each document is likely to be associated with fewer topics.
# Conversely, a higher value of `alpha` encourages documents to be associated with more topics, resulting in denser distributions.

# The choice of `alpha` affects the balance between topic diversity and document specificity in LDA modeling.
alpha_values = np.arange(0.01, 1, 0.3).tolist()

In [26]:
# In Latent Dirichlet Allocation (LDA) topic analysis, the beta parameter represents the concentration 
# parameter of the Dirichlet distribution used to model the topic-word distribution. It controls the 
# sparsity of topics by influencing how likely a given word is to be assigned to a particular topic.

# A higher value of beta encourages topics to have a more uniform distribution over words, resulting in more 
# general and diverse topics. Conversely, a lower value of beta promotes sparser topics with fewer dominant words.

# The choice of beta can impact the interpretability and granularity of the discovered topics in LDA.
beta_values = np.arange(0.01, 1, 0.3).tolist()

In [27]:
gamma_threshold_values = np.arange(0.001, 0.011, 0.001).tolist()

In [28]:
# Define your dataset as a list of a list of tokenized sentences or load data from a file
with open(r"C:\_harvester\data\lda-models\2010s_html.json\word2vec-2010s\tokenized-texts-out\tokenized_sents-w-bigrams.pkl", "rb") as fp:
    texts_out = pickle.load(fp)

# Convert tokenized sentences to text documents by joining tokens with space separator
documents = [' '.join(tokens) for tokens in texts_out]

In [29]:
# Convert text data to numerical representation using CountVectorizer
pp.pprint("Begin the vectorization...")
started = time()

# CountVectorizer is a class in scikit-learn used to convert text documents into numerical representations.
# It builds a vocabulary of known words from the text data and transforms each document into a sparse matrix
# representing the frequency of each word in the document.
vectorizer = CountVectorizer()


# The fit_transform() method of CountVectorizer performs two steps:
# 1. It learns the vocabulary from the input documents and assigns a unique integer index to each word in the vocabulary.
# 2. It transforms the input documents into a sparse matrix representation, where each row corresponds to a document,
#    and each column represents the count of a specific word in that document.
# The resulting X is a sparse matrix (specifically, a scipy.sparse.csr_matrix) where each row represents a document,
# and each column represents a unique word in the vocabulary. The values in X indicate the frequency of each word in
# the corresponding document.
# Note: The fit_transform() method both fits the model to the data (learns vocabulary) and transforms it (creates matrix).
X = vectorizer.fit_transform(documents) # Convert text documents into numerical representations using CountVectorizer
pp.pprint(f"Vectorization completed in {round((time() - started) / 60, 2) } minutes.")

'Begin the vectorization...'
'Vectorization completed in 0.08 minutes.'


### Create COO format sparse tensor from the sparse matrix and move it to the device

The following code creates a COO (Coordinate) format sparse tensor from a given sparse matrix, represented by X. 
This conversion is performed to optimize storage and computation efficiency when working with large-scale data, 
especially on devices like GPUs that offer parallel processing capabilities.

To create the COO format sparse tensor, several steps are involved:
1. Extracting the row and column indices of non-zero elements in the sparse matrix X using `nonzero()` method.
2. Stacking these row and column indices vertically to create a 2D numpy array where each column represents an index pair.
3. Converting this index pairs numpy array into a PyTorch tensor, ensuring that the data type is set as long integer.
4. Extracting the non-zero values from the sparse matrix X using `.data` attribute.
5. Converting these non-zero values numpy array into a PyTorch tensor, ensuring that the data type is set as float.
6. Obtaining the shape of the sparse matrix X as a tuple representing (number of rows, number of columns).
7. Creating a `torch.Size` object encapsulating the shape information for creating a COO format sparse tensor.

Finally, all these components - COO indices, values, and shape - are used to create a COO format sparse tensor 
using `torch.sparse_coo_tensor()`. The resulting tensor is then moved to the specified device (e.g., GPU) 
using `.to(device)` for efficient computation if available.

In [30]:
# Create COO format sparse tensor from the sparse matrix and move it to the device
pp.pprint("Begin creation of sparse tensor...")
started = time()

# The purpose of this code is to create a COO (Coordinate) format sparse tensor from a given sparse matrix, represented by X.
# In this specific line, we are extracting the indices of non-zero elements in the sparse matrix X using `nonzero()` method.
# The `nonzero()` method returns a tuple of arrays, where each array represents the indices of non-zero elements along a particular dimension.
# np.vstack() function vertically stacks these arrays obtained from `nonzero()` to create a 2D array where each column represents the row and 
# column indices of a non-zero element.
# torch.from_numpy() converts this 2D numpy array into a PyTorch tensor. We use `.long()` to ensure that the data type is set as long integer.
# Finally, `.to(device)` moves the resulting tensor to the specified device (e.g., GPU) for efficient computation if available.
# This conversion to COO format and moving it to the device is often necessary when working with large sparse matrices in deep learning or 
# other computations.
# It allows for efficient storage and computation on devices like GPUs, which can significantly speed up operations involving large-scale data.
coo_indices = torch.from_numpy(np.vstack((X.nonzero()[0], X.nonzero()[1]))).long().to(device)


# The purpose of this code is to create a COO (Coordinate) format sparse tensor from a given sparse matrix, represented by X, 
# specifically for the values of non-zero elements.
# In this specific line, we are extracting the non-zero values from the sparse matrix X using `.data` attribute.
# `X.data` returns an array containing only the non-zero values of X.
# torch.from_numpy() converts this numpy array into a PyTorch tensor. We use `.float()` to ensure that the data type is set as float.
# Finally, `.to(device)` moves the resulting tensor to the specified device (e.g., GPU) for efficient computation if available.
# This conversion to COO format and moving it to the device is often necessary when working with large sparse matrices in deep 
# learning or other computations.
# It allows for efficient storage and computation on devices like GPUs, which can significantly speed up operations involving large-scale data.
coo_values = torch.from_numpy(X.data).float().to(device)


# The purpose of this code is to create a `torch.Size` object representing the shape of a COO (Coordinate) format sparse tensor.
# In this specific line, we are creating a `torch.Size` object using `X.shape`.
# `X.shape` returns a tuple representing the shape of the sparse matrix X, where the first element is the number of rows and the second 
# element is the number of columns.
# The resulting `torch.Size` object, `coo_shape`, encapsulates this shape information.
# This step is needed when creating a COO format sparse tensor because it requires specifying the shape of the resulting tensor.
# By obtaining and storing this shape information in `coo_shape`, we can ensure that our COO format sparse tensor has the correct dimensions.
# Having accurate shape information is crucial for performing operations on tensors and ensuring compatibility with other tensors or operations 
# in subsequent steps.
coo_shape = torch.Size(X.shape)


# The purpose of this code is to create a COO (Coordinate) format sparse tensor from the given COO indices, values, and shape.
# It also moves the resulting tensor to the specified device (e.g., GPU) for efficient computation if available.
# In this specific line, we are using `torch.sparse_coo_tensor()` to create a sparse tensor in COO format.
# The function takes three arguments:
# - `coo_indices`: The indices of non-zero elements in the sparse tensor.
# - `coo_values`: The values corresponding to each index in `coo_indices`.
# - `coo_shape`: The shape of the resulting sparse tensor.
# Finally, `.to(device)` moves the resulting tensor to the specified device (e.g., GPU) for efficient computation if available.
# This step is needed when working with large sparse matrices in deep learning or other computations.
# Sparse tensors allow us to efficiently represent and operate on matrices that have a significant number of zero elements,
# which can be common in many real-world datasets. By creating a COO format sparse tensor and moving it to a device like GPU,
# we can perform computations more efficiently and take advantage of parallel processing capabilities offered by GPUs.
X_tensor = torch.sparse_coo_tensor(coo_indices, coo_values, coo_shape).to(device)
pp.pprint(f"Create COO format sparse tensor completed in {round((time() - started) / 60, 2) } minutes.")

'Begin creation of sparse tensor...'
'Create COO format sparse tensor completed in 0.0 minutes.'


In [31]:

import pyLDAvis
# Create a Gensim Dictionary from the tokenized sentences
dictionary = Dictionary(texts_out)

# here is a custom method to be used to avoid this runtime error...
# RuntimeError: Batches of sparse tensors are not currently supported by the default collate_fn; 
def custom_collate(batch):
    # Coalesce and concatenate the entire batch of sparse tensors
    coalesced_batch = torch.cat([b.coalesce().unsqueeze(0) for b in batch], dim=0)
    return coalesced_batch

def coherence_score(X, topics, metric='c_v', vectorizer=None):
    # Convert X to a list of documents
    documents = [list(doc) for doc in X]

    # Create a dictionary and corpus from the documents
    dictionary = gensim.corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]

    # Create a topic model using the given topics
    topic_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=len(set(topics)), random_state=42)
    
    # Compute the coherence score using the CoherenceModel
    coherence_model = CoherenceModel(model=topic_model, texts=documents, dictionary=dictionary, coherence=metric)
    
    return coherence_model.get_coherence()

for n_topics in range(start_topics, end_topics + 1, step_size):
    ################################
    #   MODEL WITH SCIKIT- LEARN   #
    ################################
    # Initialize LDA model using scikit-learn
    pp.pprint("Begin the sklearn LDA modeling...")
    started = time()

    # Convert text documents into a matrix of token counts using Dask array
    #X_dask = da.from_array(X.toarray(), chunks=(7500, X.shape[1]))# X is the vectorized documents

    # n_jobs is an integer, specifying the maximum number of concurrently running workers. If 1 is given, 
    # no joblib parallelism is used at all, which is useful for debugging. If set to -1, all CPUs are 
    # used. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. For example with n_jobs=-2, all CPUs but one are used.
    lda_model_sklearn = LatentDirichletAllocation(n_components=n_topics, learning_method='online', n_jobs=-2)
    pp.pprint(f"The sklearn LDA modelling completed in {round((time() - started) / 60, 2) } minutes.")

    # Transfer the trained model to GPU if available
    #pp.pprint("Begin transfer of trained model to GPU if available...")
    #if torch.cuda.is_available():
    #    lda_model_torch = lda_model_sklearn.transform(X_tensor.to_dense().to(device))
    #    pp.pprint("The trained model was successfully transferred to the GPU")
    #else:
    #    lda_model_torch = lda_model_sklearn.transform(X_tensor.to_dense())
    #    pp.pprint("The trained model was not transferred to the GPU")

    # Fit the model on your data
    pp.pprint("Fit the model on the data...")
    started = time()
    # Train scikit-learn LDA model with progress bar visualization
    dataloader_sklearn = DataLoader(X_tensor, batch_size=16, shuffle=True, collate_fn=custom_collate)

    with tqdm(total=len(dataloader_sklearn), desc=f"Training scikit-learn LDA with {n_topics} topics", leave=False) as pbar:
        for batch in dataloader_sklearn:
            # Perform partial_fit on each sparse tensor in the batch
            for b in batch:
                dense_b = b.to_dense()  # Convert sparse tensor to dense tensor
                dense_b_cpu = dense_b.cpu()  # Copy dense tensor from GPU to CPU
                    
                # Reshape the input array
                reshaped_array = dense_b_cpu.numpy().reshape(-1, 1)
                    
                lda_model_sklearn.partial_fit(reshaped_array)  # Pass the reshaped array to partial_fit

            pbar.update(1)
        
    # Save the output using pickle
    with open(f"C:/_harvester/data/lda-models/2010s_html.json/sklearn-fit-output/fit-output-{n_topics}.pkl", 'wb') as file:
        pickle.dump(lda_model_sklearn, file)
    pp.pprint(f"Model fitting completed in {round((time() - started) / 60, 2) } minutes.")

    # Set the topic-word distributions of the scikit-learn LDA model to random values
    pp.pprint("Begin set the topic-word distributions of the scikit-learn LDA model to random values")
    started = time()
    lda_model_sklearn.components_ = torch.randn((n_topics, X.shape[1]), device=device)
    pp.pprint(f"Set the topic-word distributions completed in {round((time() - started) / 60, 2) } minutes.")

    # Get topic-word distributions from trained scikit-learn LDA model
    pp.pprint("Begin get topic-word distributions from trained scikit-learn LDA model")
    started = time()
    topic_word_distributions_sklearn = lda_model_sklearn.components_
    pp.pprint(f"Get of topic-word distributions completed in {round((time() - started) / 60, 2) } minutes.")
                    
    # Save the generated scikit-learn LDA model
    model_filename = os.path.join(model_dir, f"lda_model_sklearn_{n_topics}_topics.model")
    started = time()
    torch.save(lda_model_sklearn, model_filename)
    pp.pprint(f"Model saving completed in {round(time() - started, 2)} seconds.")

    ###########################
    #   MODEL WITH GENSIM     #
    ###########################
    for alpha, beta, gamma_threshold in itertools.product(alpha_values, beta_values, gamma_threshold_values):


        # Train Gensim LDA model with progress bar visualization
        corpus = [dictionary.doc2bow(doc) for doc in texts_out]
        
        #pbar_gensim = tqdm(total=len(corpus), desc=f"Training Gensim LDA with {n_topics} topics", leave=False)
        
        started = time()
        lda_model_gensim = LdaModel(corpus=corpus, 
                                    id2word=dictionary, 
                                    num_topics=n_topics,
                                    alpha=alpha, 
                                    eta=beta, 
                                    random_state=75,
                                    passes=10,
                                    chunksize=int(len(corpus)/cores+1),
                                    gamma_threshold=gamma_threshold, 
                                    per_words_topics=True).to(device)
        
        # Get the total number of iterations
        total_iterations = lda_model_gensim.passes * len(corpus)

        # Create a progress bar with total iterations
        with tqdm(total=total_iterations, desc="Training LDA model", leave=False) as pbar:
            # Iterate over each pass and corpus
            for i in range(lda_model_gensim.passes):
                for doc in corpus:
                    # Train the LDA model on each document
                    lda_model_gensim.update([doc])
                    
                    # Update the progress bar
                    pbar.update(1)

        pp.pprint(f"Gensim LDA model for {n_topics} topics completed in {round((time() - started) / 60, 2) } minutes.")

        convergence_score = lda_model_gensim.bound(corpus)

        perplexity_score = lda_model_gensim.log_perplexity(corpus)

        #pbar_gensim.close()

        # Get topic-word distributions from trained Gensim LDA model
        pp.pprint("Begin get topic-word distributions from trained Gensim LDA model")
        started = time()
        topic_word_distributions_gensim = lda_model_gensim.get_topics()
        pp.pprint(f"Get of topic-word distributions completed in {round((time() - started) / 60, 2) } minutes.")

        # Compare metrics and update best model if necessary
        pp.pprint("Begin comparison of metrics and updating of model if necessary")
        c_v_score_sklearn = coherence_score(X=X.toarray(), topics=topic_word_distributions_sklearn.argmax(axis=1), metric='c_v', vectorizer=vectorizer)
        pp.pprint(f"Metric comparison completed in {round((time() - started) / 60, 2) } minutes.")
    
        c_v_score_gensim = 0
        pbar_coherence = tqdm(total=len(texts_out), desc="Calculating Coherence Value - Gensim")
        
        for doc in texts_out:
            bow = dictionary.doc2bow(doc)
            c_v_score_gensim += coherence_score(topics=lda_model_gensim.get_document_topics(bow), texts=[doc], dictionary=dictionary, coherence='c_v')
            pbar_coherence.update(1)
            
        pbar_coherence.close()
            
        c_v_score_gensim /= len(texts_out)

        print(f"Comparison of scikit-learn LDA and Gensim LDA with {n_topics}:")
        print(f"Coherence Value (c_v) - scikit-learn: {c_v_score_sklearn}")
        print(f"Coherence Value (c_v) - Gensim: {c_v_score_gensim}")

        # Save the best Gensim LDA model
        best_model_gensim_filename = os.path.join(model_dir, f"best_model_gensim_{n_topics}_topics.model")
        lda_model_gensim.save(best_model_gensim_filename)

        # Generate and save a visualization for the best Gensim LDA model
        vis_data = pyLDAvis.gensim.prepare(lda_model_gensim, corpus, dictionary)
        vis_html_filename = os.path.join(image_dir, f"lda_visualization_{n_topics}_topics.html")
        pyLDAvis.save_html(vis_data, vis_html_filename)

        # Log metrics to a file
        log_filename = os.path.join(log_dir, "lda_metrics.log")

        with open(log_filename, 'a') as log_file:
            log_file.write(f"Number of Topics: {n_topics}\n")
            log_file.write(f"Alpha: {alpha}\n")
            log_file.write(f"Beta: {beta}\n")
            log_file.write(f"Gamma Threshold: {gamma_threshold}\n")
            log_file.write(f"Coherence Value (c_v) - scikit-learn: {c_v_score_sklearn}\n")
            log_file.write(f"Coherence Value (c_v) - Gensim: {c_v_score_gensim}\n")
            log_file.write(f"Convergence Score - Gensim: {convergence_score}\n")
            log_file.write(f"Log Perplexity - Gensim: {perplexity_score}\n\n")

'Begin the sklearn LDA modeling...'
'The sklearn LDA modelling completed in 0.0 minutes.'
'Fit the model on the data...'


Training scikit-learn LDA with 50 topics:   0%|          | 0/26047 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Generate visualizations for each saved LDA model
for n_topics in range(start_topics, end_topics + 1, step_size):
    
        # Load the saved LDA model
    model_filename = os.path.join(model_dir, f"lda_model_{n_topics}_topics.pth")
    lda_model = torch.load(model_filename)

    # Generate pyLDAvis visualization
    vis_data = pyLDAvis.sklearn.prepare(lda_model, X.toarray(), vectorizer)
    
     # Save pyLDAvis visualization as HTML file
     vis_html_filename = os.path.join(model_dir, f"lda_visualization_{n_topics}_topics.html")
     pyLDAvis.save_html(vis_data, vis_html_filename)

     # Generate t-SNE plot for topic-word distributions
     tsne_plot(lda_model.components_, vectorizer.get_feature_names(), n_topics)

     # Generate word cloud for each topic
     for topic_idx in range(n_topics):
         generate_word_cloud(lda_model.components_[topic_idx], vectorizer.get_feature_names(), topic_idx)


In [None]:
def tsne_plot(topic_word_distributions, feature_names, n_topics):
    """
    Generates a t-SNE plot for the given topic-word distributions.
    
    Args:
        topic_word_distributions (ndarray): Topic-word distributions from LDA model.
        feature_names (list): List of feature names from CountVectorizer.
        n_topics (int): Number of topics in LDA model.
    """
    tsne = TSNE(n_components=2)
    tsne_results = tsne.fit_transform(topic_word_distributions.T)
    
    plt.figure(figsize=(10, 6))
    
    for i in range(n_topics):
        plt.scatter(tsne_results[:, 0], tsne_results[:, 1], label=f"Topic {i+1}")
        
        for j, txt in enumerate(feature_names):
            plt.annotate(txt, (tsne_results[j, 0], tsne_results[j, 1]))
            
    plt.title("t-SNE Plot of Topic-Word Distributions")
    plt.legend()
    plt.show()

In [None]:
def generate_word_cloud(topic_distribution, feature_names, topic_idx):
   """
   Generates a word cloud based on the given topic distribution and feature names.

   Args:
       topic_distribution (ndarray): Topic distribution from LDA model.
       feature_names (list): List of feature names from CountVectorizer.
       topic_idx (int): Index of the topic.
   """
   # Create a dictionary of words and their corresponding weights in the topic distribution
   word_weights = {feature_names[i]: weight for i, weight in enumerate(topic_distribution)}

   # Generate word cloud visualization
   wc = WordCloud(background_color='white')
   wc.generate_from_frequencies(word_weights)

   # Plot the word cloud
   plt.figure(figsize=(8, 6))
   plt.imshow(wc, interpolation='bilinear')
   plt.axis('off')
   plt.title(f"Word Cloud for Topic {topic_idx + 1}")
   plt.show()

In [None]:
import torch
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim.models import LdaModel
from gensim.corpora import Dictionary

def compare_models_sklearn_gensim(sklearn_models, gensim_models, data):
    """
    Compares scikit-learn's LatentDirichletAllocation (LDA) models with gensim's LdaModel.
    
    Args:
        sklearn_models (list): List of scikit-learn LDA models.
        gensim_models (list): List of gensim LdaModel.
        data (list): List of tokenized sentences.
    """
    # Convert tokenized sentences to text documents by joining tokens with space separator
    documents = [' '.join(tokens) for tokens in data]

    # Convert text data to numerical representation using CountVectorizer
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(documents)

    # Create a PyTorch tensor from the sparse matrix and move it to the device
    X_tensor = torch.from_numpy(X.toarray()).float()

    # Create a Gensim Dictionary from the tokenized sentences
    dictionary = Dictionary(data)
    
    for i, (sk_model, gs_model) in enumerate(zip(sklearn_models, gensim_models)):
        print(f"Comparison for Model {i+1}:")
        
        # Compare coherence values using Gensim's CoherenceModel
        coherence_sk = sk_model.score(X)
        
        pbar = tqdm(total=len(data), desc="Calculating Coherence Value - Gensim")
        coherence_gs = 0
        
        for doc in data:
            bow = dictionary.doc2bow(doc)
            coherence_gs += gs_model.log_perplexity([bow])
            pbar.update(1)
        
        pbar.close()
        
        coherence_gs /= len(data)
        
        print(f"Coherence Value - scikit-learn: {coherence_sk}")
        print(f"Coherence Value - Gensim: {coherence_gs}\n")

# Example usage:
sklearn_models = [lda_model_100_topics, lda_model_200_topics]
gensim_models = [lda_gensim_100_topics, lda_gensim