In [1]:
import warnings
# Suppress ResourceWarning messages for the rest of the session.
# NOTE(review): presumably silences unclosed-resource noise from a library used below — confirm the source.
warnings.filterwarnings("ignore", category=ResourceWarning)

In [12]:
import pandas as pd
from gensim import corpora
from gensim.models import LdaModel
import numpy as np
import os
import itertools
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
# Output directory setting; presumably where trained models/results are written — confirm against later cells.
save_dir = './saved_models'

# Load the preprocessed frame; only its 'Tokens' column is used in this cell.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files you produced yourself.
df = pd.read_pickle('preprocessed_df.pkl')

# Tokenised documents: one entry per row of the frame, reused for both structures below.
texts = df['Tokens']

# Gensim dictionary built from every document's tokens.
dictionary = corpora.Dictionary(texts)

# Bag-of-words encoding of each document, in the same order as `texts`.
corpus = list(map(dictionary.doc2bow, texts))

In [3]:
# Candidate hyperparameter values for an LDA grid search.
# NOTE(review): the evaluate_lda_models(...) call visible in this notebook passes its own
# literal lists rather than these names — confirm whether any other cell still reads them.
num_topics_list = [2, 3, 4]
alpha_list = ['symmetric']
beta_list = ['symmetric']
passes_list = [10, 20]
iterations_list = [50, 100]
# NOTE(review): evaluate_lda_models builds its own local `results` list; no visible code reads this one.
results = []

In [16]:
def _topic_diversity(topic_words):
    """
    Compute the fraction of unique words among the top words of all topics.

    Parameters:
    - topic_words: Iterable of (topic_id, [(word, weight), ...]) pairs, i.e. the structure
      returned by LdaModel.show_topics(..., formatted=False).

    Returns:
    - Number of distinct words divided by the total number of words listed, or 0 when no
      words are listed at all.
    """
    all_words = [word for _, word_weights in topic_words for word, _ in word_weights]
    if not all_words:
        return 0
    return len(set(all_words)) / len(all_words)


def evaluate_lda_models(corpus, dictionary, texts, num_topics_list, alpha_list, beta_list, passes_list, iterations_list, metrics=('coherence', 'diversity'), diversity_top_n=10):
    """
    Train one LDA model per hyperparameter combination and collect its parameters and evaluation metrics in a DataFrame.

    Parameters:
    - corpus: The BoW corpus.
    - dictionary: The Gensim dictionary.
    - texts: The tokenized texts (needed for the 'c_v' coherence score).
    - num_topics_list: List of values for the number of topics to try.
    - alpha_list: List of values for the alpha hyperparameter.
    - beta_list: List of values for the beta hyperparameter (passed to LdaModel as `eta`).
    - passes_list: List of values for the number of passes during training.
    - iterations_list: List of values for the number of iterations during training.
    - metrics: Tuple of metrics to evaluate ('coherence', 'diversity', or both).
    - diversity_top_n: Number of top words per topic used for the diversity score (default 10).

    Returns:
    - results_df: A DataFrame with one row per successfully evaluated model, holding the
      model parameters and the requested metrics. When 'coherence' was requested and at
      least one model succeeded, rows are sorted by 'coherence_score' in descending order.
      If the grid is empty or every combination failed, an empty DataFrame is returned.

    Notes:
    - Any exception raised while training or scoring a combination is printed and that
      combination is skipped, so a single bad setting does not abort the whole search.
    """
    results = []

    # Cartesian product of all hyperparameter values; empty if any input list is empty.
    param_grid = list(itertools.product(num_topics_list, alpha_list, beta_list, passes_list, iterations_list))

    for idx, (num_topics, alpha, beta, passes, iterations) in enumerate(param_grid):
        try:
            print(f"Training model {idx+1}/{len(param_grid)} with num_topics={num_topics}, alpha={alpha}, beta={beta}, passes={passes}, iterations={iterations}")

            # Fixed random_state so repeated runs of the same combination are comparable.
            lda_model = LdaModel(
                corpus=corpus,
                id2word=dictionary,
                num_topics=num_topics,
                alpha=alpha,
                eta=beta,
                passes=passes,
                iterations=iterations,
                random_state=42
            )

            # model_id is the 1-based position in the grid, matching the progress message.
            result = {
                'model_id': idx + 1,
                'num_topics': num_topics,
                'alpha': alpha,
                'beta': beta,
                'passes': passes,
                'iterations': iterations,
            }

            if 'coherence' in metrics:
                coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
                result['coherence_score'] = coherence_model_lda.get_coherence()

            if 'diversity' in metrics:
                topic_words = lda_model.show_topics(num_topics=num_topics, num_words=diversity_top_n, formatted=False)
                result['topic_diversity'] = _topic_diversity(topic_words)

            # Only fully evaluated models are recorded; a failure above skips this append.
            results.append(result)

        except Exception as e:
            # Deliberate best-effort: report the failure and move on to the next combination.
            print(f"An error occurred while training model {idx+1}: {e}")

    results_df = pd.DataFrame(results)

    # With no successful models the frame has no columns, so sorting by
    # 'coherence_score' would raise KeyError; only sort when the column exists.
    if 'coherence' in metrics and 'coherence_score' in results_df.columns:
        results_df = results_df.sort_values(by='coherence_score', ascending=False)

    return results_df

# Example usage: grid-search the LDA hyperparameters.
# The grid below has 3 * 3 * 3 * 2 * 2 = 108 combinations and trains one model per
# combination, so this cell is expensive to re-run.
results_df = evaluate_lda_models(
    corpus=corpus,
    dictionary=dictionary,
    texts=texts,
    num_topics_list=[2, 3, 4],
    alpha_list=[0.01, 0.1, 'auto'],
    beta_list=[0.01, 0.1, 'auto'],
    passes_list=[10, 20],
    iterations_list=[100, 200],
    metrics=('coherence', 'diversity')
)

# Display the top 5 results (rows are sorted by coherence score, best first)
print(results_df.head(5))

# Save the results DataFrame for future reference.
# Reuse the notebook-level `save_dir` setting instead of repeating the directory name.
os.makedirs(save_dir, exist_ok=True)
results_df_path = os.path.join(save_dir, 'lda_model_results_summary.csv')
results_df.to_csv(results_df_path, index=False)

Training model 1/144 with num_topics=2, alpha=0.01, beta=0.01, passes=10, iterations=100
Training model 2/144 with num_topics=2, alpha=0.01, beta=0.01, passes=10, iterations=200
Training model 3/144 with num_topics=2, alpha=0.01, beta=0.01, passes=20, iterations=100
Training model 4/144 with num_topics=2, alpha=0.01, beta=0.01, passes=20, iterations=200
Training model 5/144 with num_topics=2, alpha=0.01, beta=0.1, passes=10, iterations=100
Training model 6/144 with num_topics=2, alpha=0.01, beta=0.1, passes=10, iterations=200
Training model 7/144 with num_topics=2, alpha=0.01, beta=0.1, passes=20, iterations=100
Training model 8/144 with num_topics=2, alpha=0.01, beta=0.1, passes=20, iterations=200
Training model 9/144 with num_topics=2, alpha=0.01, beta=auto, passes=10, iterations=100
Training model 10/144 with num_topics=2, alpha=0.01, beta=auto, passes=10, iterations=200
Training model 11/144 with num_topics=2, alpha=0.01, beta=auto, passes=20, iterations=100
Training model 12/144 w

In [17]:
# Show the results table as this cell's output (long frames are truncated in the repr).
results_df

Unnamed: 0,model_id,num_topics,alpha,beta,passes,iterations,coherence_score,topic_diversity
111,112,5,0.01,0.01,20,200,0.213740,0.6
123,124,5,0.1,0.01,20,200,0.213740,0.6
127,128,5,0.1,0.1,20,200,0.213740,0.6
131,132,5,0.1,auto,20,200,0.213740,0.6
135,136,5,auto,0.01,20,200,0.213358,0.6
...,...,...,...,...,...,...,...,...
28,29,2,auto,0.1,10,100,0.182687,0.8
29,30,2,auto,0.1,10,200,0.182687,0.8
32,33,2,auto,auto,10,100,0.182687,0.8
33,34,2,auto,auto,10,200,0.182687,0.8
