In [None]:
"""
This code was written using the CDC AI Chatbot. A variety of prompts were used, including questions and prompts to
    correct bugs, resolve memory issues (i.e., too few resources available), generate comments, etc.

maintenance: alan hamm(pqn7)
apr 2024
"""

In [None]:
import torch  # PyTorch library for deep learning and GPU acceleration
from torch.utils.data import DataLoader  # Provides an iterator over a dataset for efficient batch processing
from tqdm import tqdm  # Creates progress bars to visualize the progress of loops or tasks
from sklearn.feature_extraction.text import CountVectorizer  # Converts text documents into numerical representations
from sklearn.decomposition import LatentDirichletAllocation  # Implements Latent Dirichlet Allocation (LDA) for topic modeling
from gensim.models import LdaModel  # Implements LDA for topic modeling using the Gensim library
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary  # Represents a collection of text documents as a bag-of-words corpus
from gensim.models import CoherenceModel
import gensim
import json

import os  # Provides functions for interacting with the operating system, such as creating directories
import pickle  # Allows objects to be serialized and deserialized to/from disk
import itertools  # Provides various functions for efficient iteration and combination of elements
import numpy as np  # Library for numerical computing in Python, used for array operations and calculations
from time import time  # Measures the execution time of code snippets or functions
import pprint as pp  # Pretty-printing library, used here to format output in a readable way
import multiprocessing
import pandas as pd

from tqdm.notebook import tqdm
from scipy.sparse import csr_matrix
#from scipy.sparse.linalg import triu

import pyLDAvis

import dask
import dask
from dask.distributed import Client, LocalCluster #, LocalCUDACluster
from dask.diagnostics import ProgressBar
import dask.bag as db
import torch
import pickle
import itertools
from gensim.models import Word2Vec
import cupy as cp
import webbrowser
from torchtext.vocab import GloVe
from gensim.models import KeyedVectors
import torchtext.vocab as vocab
import logging
from gensim.models.callbacks import PerplexityMetric, ConvergenceMetric, CoherenceMetric

In [None]:
# Flush and close every handler registered with the logging module.
# Presumably this releases handlers left open by an earlier run of the notebook — TODO confirm.
logging.shutdown()

In [None]:
# The Dask dashboard throws deprecation warnings w.r.t. Bokeh; silence that one
# warning category so it does not flood the notebook output.
import warnings
from bokeh.util.deprecation import BokehDeprecationWarning

# Disable Bokeh deprecation warnings (only this category is filtered).
warnings.filterwarnings("ignore", category=BokehDeprecationWarning)

# Examples of the messages being suppressed:
#BokehDeprecationWarning: 'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead.
#BokehDeprecationWarning: 'square() method' was deprecated in Bokeh 3.4.0 and will be removed, use "scatter(marker='square', ...) instead" instead.

In [None]:
# Topic-count grid for the LDA sweep: first value, last value (inclusive in the
# sweep loop, which uses end_topics + 1) and the increment between values.
start_topics, end_topics, step_size = 74, 102, 2

# Year bounds used when loading the per-year tokenized files.
MIN_YEAR, MAX_YEAR = 2010, 2020

# Output locations: metric logs live in the root, models and images in sub-folders.
log_dir = "C:/_harvester/data/lda-models/2010s_html.json/"
model_dir = log_dir + "lda-models/"
image_dir = log_dir + "visuals/"

# Make sure all three output folders exist before anything is written to them.
os.makedirs(log_dir, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)
os.makedirs(image_dir, exist_ok=True)

In [None]:
# Report every CUDA device PyTorch can see, then pick the device for this notebook.
if torch.cuda.is_available():
    # Get the number of available GPUs
    num_gpus = torch.cuda.device_count()
    print(f"Number of available GPUs: {num_gpus}")

    for i in range(num_gpus):
        # Get the properties of each GPU device
        gpu_properties = torch.cuda.get_device_properties(i)

        print(f"\nGPU Device {i} Properties:")
        print(f"Device Name: {gpu_properties.name}")
        print(f"Total Memory: {gpu_properties.total_memory / 1024**3:.2f} GB")
        print(f"Multiprocessor Count: {gpu_properties.multi_processor_count}")
        print(f"CUDA Capability Major Version: {gpu_properties.major}")
        print(f"CUDA Capability Minor Version: {gpu_properties.minor}")
else:
    print("CUDA is not available.")

# Set device to GPU if available, otherwise use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Report which device was selected. The decision is based on the device itself:
# torch.backends.cudnn.enabled is a configuration switch and does not show
# whether a GPU is actually present, so it must not be used for this message.
if device.type == 'cuda':
    print("PyTorch is using the GPU.")
    cuda_version = torch.version.cuda
    print("CUDA Version:", cuda_version)
else:
    print("PyTorch is using the CPU.")

In [None]:
# Number of CPU cores to use: all but one (left free for the OS / notebook),
# but never fewer than one so single-core machines still get a usable value.
cores = max(1, multiprocessing.cpu_count() - 1)

In [None]:
# `alpha` is the concentration parameter of the Dirichlet prior over each
# document's topic mixture in LDA.
#   * small alpha -> sparse mixtures (a document is explained by few topics)
#   * large alpha -> dense mixtures (a document spreads over many topics)
#
# Candidates swept below: an evenly spaced numeric grid built with np.arange
# (start 0.01, step 0.3, stop before 1) followed by the two string presets.
alpha_values = [*np.arange(0.01, 1, 0.3).tolist(), 'symmetric', 'asymmetric']

In [None]:
# `beta` is the concentration parameter of the Dirichlet prior over each
# topic's word distribution in LDA.
#   * large beta -> topics spread more evenly over the vocabulary (broader topics)
#   * small beta -> topics dominated by a handful of words (sparser topics)
#
# Candidates swept below: an evenly spaced numeric grid built with np.arange
# (start 0.01, step 0.3, stop before 1) followed by the 'symmetric' preset.
beta_values = [*np.arange(0.01, 1, 0.3).tolist(), 'symmetric']

In [None]:
# Define your dataset as a list of a list of tokenized sentences or load data from a file
def get_texts_out(year):
    """
    Load the pickled tokenized sentences (with bigrams) for a single year.

    Args:
        year: Year to load; it is converted with int() before being placed in the file name.

    Returns:
        The object stored in that year's pickle file (its length is printed before returning).
    """
    pickle_path = f"C:/_harvester/data/tokenized-sentences/10s/{int(year)}-tokenized_sents-w-bigrams.pkl"
    with open(pickle_path, "rb") as handle:
        loaded_texts = pickle.load(handle)

    print(f"This is the get_texts_out() function. The size of the return is {len(loaded_texts)}")
    return loaded_texts

#pp.pprint(get_texts_out(2010))


In [None]:

from typing import List, Optional
def coherence_score(X: List[List[str]], n_topics: int, metric: str = 'c_v', vectorizer: Optional[str] = None, glove: Optional[GloVe] = None) -> float:
    """
    Train a fresh LDA model on ``X`` and return its topic-coherence score.

    Note that the model scored here is trained inside this function (default
    LdaModel settings, ``random_state=42``); it is not a model passed in by the caller.

    Args:
        X (list): Tokenized documents, i.e. a list of token lists.
        n_topics (int): Number of topics for the LDA model trained here.
        metric (str, optional): Coherence measure handed to CoherenceModel. Defaults to 'c_v'.
        vectorizer (str, optional): When 'glove', a short device diagnostic is printed for a few
            word vectors. The embeddings do not influence the returned score. Defaults to None.
        glove (optional): Embedding lookup indexed by word (``glove[word]`` must return a tensor).
            Required when ``vectorizer == 'glove'``.

    Returns:
        float: Coherence score.

    Raises:
        ValueError: If ``X`` is empty, or if ``vectorizer == 'glove'`` but ``glove`` is None.
    """
    if not X:
        raise ValueError("X must contain at least one tokenized document.")

    if vectorizer == 'glove':
        if glove is None:
            raise ValueError("vectorizer='glove' requires the `glove` embeddings argument.")

        # Move the embeddings to the GPU device if available
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        if device.type == "cuda":
            print("CUDA is being used by GloVe.")
            print("Number of GPUs available:", torch.cuda.device_count())
            print("Current GPU:", torch.cuda.get_device_name(0))
        else:
            print("CUDA is not being used by GloVe. Using CPU instead.")

        # Diagnostic only: look up a handful of word vectors and show where they live.
        # Vectors for the whole corpus are deliberately NOT materialised: nothing
        # below consumes them, and one tensor per token exhausts memory on large corpora.
        num_vectors_to_print = 5  # Number of vectors to print
        first_words = itertools.islice(
            (word for doc in X for word in doc), num_vectors_to_print
        )
        for word in first_words:
            vec = glove[word].to(device)
            print("The vector is on the GPU:", vec.device)

    else:
        print("Vectorizer is not set to 'glove'.")

    # Create a dictionary and corpus from the documents
    dictionary = Dictionary(X)
    corpus = [dictionary.doc2bow(doc) for doc in X]

    # Train the topic model that will be scored
    topic_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=n_topics, random_state=42)

    # Compute the coherence score using the CoherenceModel
    coherence_model = CoherenceModel(model=topic_model, texts=X, dictionary=dictionary, coherence=metric)

    return coherence_model.get_coherence()

In [None]:
import socket

def check_port_in_use(port):
    """
    Report whether something accepts TCP connections on localhost:``port``.

    Args:
        port (int): TCP port number to probe.

    Returns:
        bool: True if a connection could be established within one second,
        False if the connection was refused, timed out, or failed with any
        other socket-level error.
    """
    # The context manager closes the socket on every path, including failures.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.settimeout(1)  # Set a timeout for the connection attempt
        try:
            sock.connect(('localhost', port))  # Connect to the specified port
        except OSError:
            # Covers ConnectionRefusedError as well as timeouts and unreachable hosts.
            return False  # Port is not in use or closed
        return True  # Port is in use

def close_port(port):
    """
    Probe localhost:``port`` and print the outcome.

    If the port accepts connections, a client socket is connected to it and
    closed again, and a message is printed.

    NOTE(review): only this function's own client socket is closed. Opening and
    closing a client connection does not appear to stop whatever process is
    listening on the port, so "is now closed" reports a successful probe rather
    than a terminated listener — confirm the intended behaviour.

    Args:
        port (int): TCP port number to probe.
    """
    if not check_port_in_use(port):
        print(f"Port {port} is already closed or not in use.")
        return

    # The context manager closes the socket even when connect() fails.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.settimeout(1)  # Set a timeout for the connection attempt
        try:
            sock.connect(('localhost', port))  # Connect to the specified port
        except OSError:
            # Refused, timed out, or otherwise failed between the check and now.
            print(f"Port {port} could not be closed.")
            return

    print(f"Port {port} is now closed.")

In [None]:
from gensim import corpora

def create_lda_batches(filename, batch_size):
    """
    Read tokenized documents from a JSON file and return them as bag-of-words batches.

    A single gensim dictionary is built from all documents; the documents are
    then split, in order, into consecutive batches of ``batch_size`` documents,
    with any leftover documents forming one final, shorter batch.

    Args:
        filename (str): Path of the JSON file holding the tokenized documents.
        batch_size (int): Number of documents per full batch.

    Returns:
        list: One list per batch, each containing the ``doc2bow`` result of every document in it.
    """
    with open(filename, 'r') as handle:
        documents = json.load(handle)

    # One vocabulary shared by every batch.
    token_dictionary = corpora.Dictionary(documents)

    full_batches, leftover = divmod(len(documents), batch_size)

    def to_bow(chunk):
        # Bag-of-words representation of each document in the chunk.
        return [token_dictionary.doc2bow(tokens) for tokens in chunk]

    bow_batches = [
        to_bow(documents[index * batch_size:(index + 1) * batch_size])
        for index in range(full_batches)
    ]

    # Documents that did not fill a complete batch go into a trailing batch.
    if leftover > 0:
        bow_batches.append(to_bow(documents[-leftover:]))

    return bow_batches

In [None]:
def create_tokenized_sentences(batches):
    """
    Keep only the first element of every entry in every document of every batch.

    Args:
        batches (list): Batches of documents, where each document is a sequence of
            indexable entries (e.g. the (id, count) pairs of a bag-of-words document).

    Returns:
        list: The same batch/document nesting, with each entry replaced by its first element.
    """
    return [
        [[entry[0] for entry in document] for document in batch]
        for batch in batches
    ]

In [None]:
import dask.delayed
import logging
#logging.basicConfig(filename=f"C:/_harvester/data/lda-models/2010s_html.json/logs/model_callback.log",
#                                format="%(asctime)s:%(levelname)s:%(message)s",
#                                level=logging.NOTSET)

    
if __name__=="__main__":
    # Driver section, part 1: close any Dask client left in the namespace, load
    # the GloVe vectors, and start a local Dask cluster plus client.

    # Create a multiprocessing context using the "spawn" method
    # This method is recommended for certain platforms, such as Windows or Jupyter Notebook, to avoid conflicts
    #ctx = multiprocessing.get_context("spawn")

    # Create a Pool of worker processes using the multiprocessing context
    # The number of worker processes is cores - 1
    # This ensures that one CPU core is left available for other tasks or system operations
    #pool = ctx.Pool(cores - 1)

    # If a `client` object already exists (e.g. from an earlier run of this cell),
    # close it before a new cluster is created. When `client` is not defined the
    # resulting exception is caught and reported below.
    try:
        # Check if the Dask client is connected to a scheduler
        if client.status == "running":
            # Close the Dask client
            client.close()
            print("Dask client closed preemptively.")
        else:
            print("Dask client is not connected to a scheduler.")
    except Exception as e:
        print(f"The Dask client was not connected: {e}")

    # Load the saved embedding vectors from TorchText GloVe library
    glove = vocab.Vectors('glove.840B.300d.txt', 'C:/_harvester/GloVe/')

    # Get the embedding vectors and vocabulary from TorchText GloVe library
    embedding_vectors = glove.vectors

    # Move the embeddings to the GPU device if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    embedding_vectors = embedding_vectors.to(device)

    # Verify if CUDA is being used by checking the device type
    if device.type == "cuda":
        print("CUDA is being used by GloVe.")
    else:
        print("CUDA is not being used by GloVe. Using CPU instead.")

    # Convert embedding vectors to a NumPy array (on CPU)
    # NOTE(review): embedding_array is not read anywhere in the visible part of
    # this notebook — confirm whether the GPU round trip above is still needed.
    embedding_array = embedding_vectors.cpu().numpy()

    # Dictionary to hold the metrics that are generated
    # (one list per column; every list must end up with the same length before
    # the dictionary is turned into a DataFrame)
    metrics_csv = {
        'n_topics': [],
        'alpha': [],
        'beta': [],
        'cv_score': [],
        'convergence_score': [],
        'log_perplexity': [],
        'time_to_complete': []
    }

    # close the port if it's open
    #close_port(8787)
    
    # Specify the local directory path (passed to LocalCluster as local_directory below)
    DASK_DIR = '/_harvester/tmp-dask-out'

    # specify Dask dashboard port
    #DASHBOARD_PORT = "60481"
    """
    # Set the GPU memory limit
    gpu_memory_limit = "10GB"
    # Set the CUDA_VISIBLE_DEVICES environment variable to specify which GPUs to use
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # Specify GPU device IDs
    # Create a Dask local cluster with the specified local directory and GPU memory limit
    #cluster = LocalCluster(local_directory=DASK_DIR, device_memory_limit=gpu_memory_limit)
    cluster = LocalCluster(local_directory=DASK_DIR)
    client = Client(cluster)
    """
    # Deploy a Single-Machine Multi-GPU Cluster
    # https://medium.com/@aryan.gupta18/end-to-end-recommender-systems-with-merlin-part-1-89fabe2fa05b
    # NOTE(review): of the settings below, only ram_memory_limit and DASK_DIR are
    # passed to LocalCluster in this cell; protocol, num_gpus, NUM_GPUS,
    # visible_devices and the device_* / part_* values are computed but not used here.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Specify GPU device IDs
    protocol = "tcp"  # "tcp" or "ucx"
    num_gpus = 1
    NUM_GPUS=[0]
    cores = multiprocessing.cpu_count() - 1 # All CPU cores except one
    visible_devices = ",".join([str(n) for n in NUM_GPUS])  # Select devices to place workers
    device_limit_frac = 0.7  # Spill GPU-Worker memory to host at this limit.
    device_pool_frac = 0.8
    part_mem_frac = 0.15

    # Manually specify the total device memory size (in bytes)
    device_size = 10 * 1024 * 1024 * 1024  # GPU has 12GB but setting at 10GB
            
    ram_memory_limit = "75GB" # Set the RAM memory limit (per worker)
    device_limit = int(device_limit_frac * device_size)
    device_pool_size = int(device_pool_frac * device_size)
    part_size = int(part_mem_frac * device_size)

    # Local cluster: one worker per CPU core minus two, two threads per worker,
    # dashboard served on port 8787.
    cluster = LocalCluster(
            n_workers=(multiprocessing.cpu_count()-2),
            threads_per_worker=2,
            #processes=False,
            memory_limit=ram_memory_limit,
            local_directory=DASK_DIR,
            dashboard_address=":8787",
            protocol="tcp",
    )


    # Create the distributed client
    client = Client(cluster)

    # Get information about workers from scheduler
    workers_info = client.scheduler_info()["workers"]

    # Iterate over workers and set their memory limits
    # NOTE(review): this assigns into the dictionary returned by scheduler_info();
    # it does not appear to reconfigure the running workers (their limit was
    # already given via memory_limit above) — confirm whether this loop is needed.
    for worker_id, worker_info in workers_info.items():
        worker_info["memory_limit"] = ram_memory_limit

    # Verify that memory limits have been set correctly
    #for worker_id, worker_info in workers_info.items():
    #    print(f"Worker {worker_id}: Memory Limit - {worker_info['memory_limit']}")

    # verify that Dask is being used in your code, you can check the following:
    # Check if the Dask client is connected to a scheduler:
    if client.status == "running":
        print("Dask client is connected to a scheduler.")
        # Scatter the embedding vectors across Dask workers
    else:
        print("Dask client is not connected to a scheduler.")

    # Check if Dask workers are running:
    if len(client.scheduler_info()["workers"]) > 0:
        print("Dask workers are running.")
    else:
        print("No Dask workers are running.")
    #@dask.delayed
    def train_model(n_topics, alpha, beta):
        """
        Train one Gensim LDA model incrementally over the per-year corpora.

        Every year in ``range(MIN_YEAR, MAX_YEAR)`` (MAX_YEAR itself is excluded)
        is loaded from its JSON file. A single dictionary is built from all
        documents so that the model, every yearly corpus and the returned
        combined corpus share the same token ids. The model is created on the
        first year that contains tokens and updated with each following year.

        Args:
            n_topics (int): Number of topics.
            alpha: Document-topic prior passed to LdaModel as ``alpha``.
            beta: Topic-word prior passed to LdaModel as ``eta``.

        Returns:
            tuple: (lda_model, combined_corpus, combined_text, dictionary) where
            combined_corpus is the bag-of-words form of combined_text under dictionary.

        Raises:
            ValueError: If none of the yearly files contains any token.
        """
        passes = 11  # Number of passes

        # Load all years up front; years without a single token are skipped,
        # as they would contribute nothing to the dictionary or the corpus.
        yearly_texts = []
        for year in range(MIN_YEAR, MAX_YEAR):
            with open(f"C:/_harvester/data/tokenized-sentences/10s/{year}-tokenized_sents-w-bigrams.json", "r") as fp:
                texts_out = json.load(fp)
            if any(texts_out):
                yearly_texts.append(texts_out)

        combined_text = [doc for texts_out in yearly_texts for doc in texts_out]
        if not combined_text:
            raise ValueError("No tokenized documents were found for the requested years.")

        # One shared vocabulary: the model's id2word, the corpora handed to
        # update() and the returned combined corpus must all use the same ids.
        dictionary = Dictionary(combined_text)

        lda_model_gensim = None
        combined_corpus = []  # Bag-of-words documents of all years, in load order

        for texts_out in tqdm(yearly_texts, desc="Training LDA models"):
            corpus = [dictionary.doc2bow(doc) for doc in texts_out]

            if lda_model_gensim is None:
                # First non-empty year: create the model.
                lda_model_gensim = LdaModel(corpus=corpus,
                                            id2word=dictionary,
                                            num_topics=n_topics,
                                            alpha=alpha,
                                            eta=beta,
                                            random_state=75,
                                            passes=passes,
                                            chunksize=4000,
                                            per_word_topics=True,
                                            )
            else:
                # Later years: continue training the same model.
                lda_model_gensim.update(corpus)

            combined_corpus.extend(corpus)

        return lda_model_gensim, combined_corpus, combined_text, dictionary
    
        
    # Driver section, part 2: train one model per (n_topics, alpha, beta)
    # combination on the Dask cluster, then score, save and log every model.

    def _timed_train(n_topics, alpha, beta):
        """Run train_model and also return how long it took, in seconds."""
        started = time()
        trained = train_model(n_topics, alpha, beta)
        return trained, time() - started

    # Full parameter grid in submission order. The same list is used to label
    # the results, so every model is paired with the parameters it was trained with.
    param_grid = [
        (n_topics, alpha, beta)
        for n_topics in range(start_topics, end_topics + 1, step_size)
        for alpha, beta in itertools.product(alpha_values, beta_values)
    ]

    corpus_output = []

    # Total number of models to train
    total_iterations = len(param_grid)

    print("Training LDA models.")
    # One delayed task per parameter combination
    results = [
        dask.delayed(_timed_train)(n_topics, alpha, beta)
        for n_topics, alpha, beta in param_grid
    ]

    # Metrics are appended to this text log, one line per model
    log_filename_txt = os.path.join(log_dir, "lda_metrics.txt")

    try:
        # Compute lda_models using Dask. A failure propagates (after the
        # cleanup in `finally`) instead of leaving lda_models undefined.
        with dask.config.set(scheduler='distributed'):
            lda_models = dask.compute(*results, progressbar=True)

        progress_bar = tqdm(total=len(lda_models), desc="Calculating metrics")
        for ((lda_model_gensim, combined_corpus, combined_text, dictionary), time_to_complete), \
                (n_topics, alpha, beta) in zip(lda_models, param_grid):
            # Compute convergence score
            convergence_score = lda_model_gensim.bound(combined_corpus)

            # Compute perplexity score
            perplexity_score = lda_model_gensim.log_perplexity(combined_corpus)

            # Compute the c_v coherence of the model that was just trained, using
            # the model's own vocabulary so token ids are guaranteed to match.
            c_v_score = CoherenceModel(model=lda_model_gensim,
                                       texts=combined_text,
                                       dictionary=lda_model_gensim.id2word,
                                       coherence='c_v').get_coherence()

            # Save the Gensim LDA model
            print("Saving the Gensim LDA model.")
            best_model_gensim_filename = os.path.join(model_dir, f"gensim_topics({n_topics})_alpha({alpha})_beta({beta}).model")
            lda_model_gensim.save(best_model_gensim_filename)

            # Add metrics to dictionary (every column gets exactly one value per model)
            metrics_csv['n_topics'].append(n_topics)
            metrics_csv['alpha'].append(alpha)
            metrics_csv['beta'].append(beta)
            metrics_csv['cv_score'].append(c_v_score)
            metrics_csv['convergence_score'].append(convergence_score)
            metrics_csv['log_perplexity'].append(perplexity_score)
            metrics_csv['time_to_complete'].append(time_to_complete)

            # Log metrics to a file
            with open(log_filename_txt, 'a') as log_file:
                log_file.write(f"Number of Topics: {n_topics}  |  ")
                log_file.write(f"Alpha: {alpha}  |  ")
                log_file.write(f"Beta: {beta}  |  ")
                log_file.write(f"Coherence Value (c_v) - Gensim: {c_v_score}  |  ")
                log_file.write(f"Convergence Score - Gensim: {convergence_score}  |  ")
                log_file.write(f"Log Perplexity - Gensim: {perplexity_score}\n")

            progress_bar.update(1)
        progress_bar.close()

        metrics_frame = pd.DataFrame(metrics_csv)
        metrics_frame.to_pickle('C:/_harvester/data/lda-models/2010s_html.json/2010s-lda_tuning_results.pkl')
        metrics_frame.to_csv('C:/_harvester/data/lda-models/2010s_html.json/2010s-lda_tuning_results.csv', index=False)
    finally:
        # Close the Dask client and cluster when done, also after a failure
        client.close()
        #cluster.close(timeout=60)
        cluster.close()
        logging.shutdown()


In [None]:
# Close the Dask client and cluster when done.
# NOTE(review): the training cell above already calls client.close() and
# cluster.close() at its end, so on a straight run-through this cell closes
# them a second time — confirm it is still needed.
client.close()
cluster.close(timeout=60)


In [None]:
def tsne_plot(topic_word_distributions, feature_names, n_topics):
    """
    Generates a t-SNE plot of the vocabulary, colored by each word's dominant topic.

    The topic-word matrix is transposed before t-SNE, so every plotted point is
    one word (one column of the matrix). Each word is drawn once, in the color
    of the topic that gives it the highest weight, and annotated once.

    Args:
        topic_word_distributions (ndarray): Topic-word distributions from LDA model;
            rows are topics, columns are words (aligned with feature_names).
        feature_names (list): List of feature names from CountVectorizer.
        n_topics (int): Number of topics in LDA model.

    NOTE(review): `plt` (matplotlib.pyplot) is not imported in the visible cells
    of this notebook; make sure it is imported before this function is called.
    """
    # Local import: TSNE is not imported in the visible cells of this notebook.
    from sklearn.manifold import TSNE

    topic_word = np.asarray(topic_word_distributions)

    tsne = TSNE(n_components=2)
    tsne_results = tsne.fit_transform(topic_word.T)

    # Index of the highest-weight topic for every word (column).
    dominant_topic = topic_word.argmax(axis=0)

    plt.figure(figsize=(10, 6))

    # One scatter call per topic so the legend entries match the colors.
    for i in range(n_topics):
        in_topic = dominant_topic == i
        if not in_topic.any():
            continue  # no word has this topic as its dominant topic
        plt.scatter(tsne_results[in_topic, 0], tsne_results[in_topic, 1], label=f"Topic {i+1}")

    # Annotate every word exactly once.
    for j, txt in enumerate(feature_names):
        plt.annotate(txt, (tsne_results[j, 0], tsne_results[j, 1]))

    plt.title("t-SNE Plot of Topic-Word Distributions")
    plt.legend()
    plt.show()

In [None]:
def generate_word_cloud(topic_distribution, feature_names, topic_idx):
    """
    Draw a word cloud for one topic.

    Args:
        topic_distribution (ndarray): Per-word weights of the topic; position i
            holds the weight of feature_names[i].
        feature_names (list): List of feature names from CountVectorizer.
        topic_idx (int): Zero-based index of the topic; the title shows topic_idx + 1.
    """
    # Pair each word with its weight in this topic.
    frequencies = {}
    for position, weight in enumerate(topic_distribution):
        frequencies[feature_names[position]] = weight

    # Build the cloud from those weights.
    cloud = WordCloud(background_color='white')
    cloud.generate_from_frequencies(frequencies)

    # Render it without axes and with a title naming the topic.
    plt.figure(figsize=(8, 6))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"Word Cloud for Topic {topic_idx + 1}")
    plt.show()

In [None]:
import torch
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim.models import LdaModel
from gensim.corpora import Dictionary

def compare_models_sklearn_gensim(sklearn_models, gensim_models, data):
    """
    Compares scikit-learn's LatentDirichletAllocation (LDA) models with gensim's LdaModel.

    For each (scikit-learn, gensim) pair this prints:
      * the value of ``sk_model.score`` on a CountVectorizer matrix built from ``data``;
      * the mean of ``gs_model.log_perplexity`` evaluated on each document separately.

    Args:
        sklearn_models (list): List of scikit-learn LDA models.
        gensim_models (list): List of gensim LdaModel.
        data (list): List of tokenized sentences.

    Raises:
        ValueError: If ``data`` is empty.

    NOTE(review): the CountVectorizer is fitted here, so its vocabulary may differ
    from the one the scikit-learn models were trained with — confirm against the caller.
    """
    if not data:
        raise ValueError("`data` must contain at least one tokenized document.")

    # Convert tokenized sentences to text documents by joining tokens with space separator
    documents = [' '.join(tokens) for tokens in data]

    # Convert text data to numerical representation using CountVectorizer
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(documents)

    # Fallback vocabulary for gensim models that carry no usable id2word dictionary
    fallback_dictionary = Dictionary(data)

    for i, (sk_model, gs_model) in enumerate(zip(sklearn_models, gensim_models)):
        print(f"Comparison for Model {i+1}:")

        # scikit-learn side: score of the model on the document-term matrix
        loglik_sk = sk_model.score(X)

        # gensim side: bag-of-words ids must come from the model's own vocabulary,
        # otherwise the ids would refer to different words than the model expects.
        model_dictionary = getattr(gs_model, "id2word", None)
        if not hasattr(model_dictionary, "doc2bow"):
            model_dictionary = fallback_dictionary

        perplexity_total = 0
        for doc in tqdm(data, desc="Calculating log perplexity - Gensim"):
            bow = model_dictionary.doc2bow(doc)
            perplexity_total += gs_model.log_perplexity([bow])

        mean_log_perplexity_gs = perplexity_total / len(data)

        print(f"Log-likelihood score - scikit-learn: {loglik_sk}")
        print(f"Mean per-document log perplexity - Gensim: {mean_log_perplexity_gs}\n")

# Example usage:
sklearn_models = [lda_model_100_topics, lda_model_200_topics]
gensim_models = [lda_gensim_100_topics, lda_gensim