In [77]:
import string
import spacy
spacy_nlp = spacy.load("en_core_web_sm")
from spacy.lang.en.stop_words import STOP_WORDS

import utility_functions as utils
import importlib
import pandas as pd
from itertools import chain
import numpy as np
from tqdm import tqdm
import json
from scipy.sparse import dok_matrix
import matplotlib.pyplot as plt

from contextualized_topic_models.models.ctm import ZeroShotTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation

# Octis is the library which can use different implemented topic modelling techniques
from octis.preprocessing.preprocessing import Preprocessing
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.models.LDA import LDA
from octis.models.CTM import CTM
from octis.models.ETM import ETM
from octis.models.NeuralLDA import NeuralLDA
from octis.models.NMF import NMF

# Re-import utility_functions so edits made to the module on disk are picked up
# without restarting the kernel.
importlib.reload(utils)
# Path of the pickled DataFrame that the next cell loads with pd.read_pickle.
data = './preprocessed_df.pkl'

In [None]:
# Load the preprocessed DataFrame and preview a subset of its columns.
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
df = pd.read_pickle(data)
preview_columns = ['Artist', 'Song', 'Tokens', 'Lyrics', 'Coast']
df[preview_columns].head()

In [None]:
# Create the corpus required by OCTIS to build up the dataset.
# The file is consumed one document per line, so every song has to occupy
# exactly one line: embedded newlines (and any other whitespace runs) inside the
# lyrics are collapsed to single spaces before writing. Without this, a song
# whose lyrics contain line breaks would be split into several documents.
# NOTE(review): rows with missing lyrics are skipped, so line numbers in
# corpus.tsv are not guaranteed to match row positions in df - confirm this is
# acceptable wherever the original indexes are used.
with open('corpus.tsv', 'w', encoding='utf-8') as file:
    for lyrics in df['Lyrics']:
        if pd.notna(lyrics):
            single_line_lyrics = ' '.join(str(lyrics).split())
            file.write(single_line_lyrics + '\n')

In [None]:
# Flatten all tokens into the collection of distinct words.
# The vocabulary is kept as a *sorted list* rather than a bare set: set iteration
# order for strings changes between interpreter sessions, which would make the
# word order in vocabulary.json (and any index derived from `vocab`) differ on
# every kernel restart. `len(vocab)` and `enumerate(vocab)` behave as before.
# NOTE(review): assumes every token is a string (mutually orderable) - confirm.
vocab = sorted(set(chain.from_iterable(df['Tokens'])))

# Save as vocabulary.json
with open("./vocabulary.json", 'w', encoding='utf-8') as f:
    json.dump(vocab, f)

In [None]:
# Map every vocabulary word to its column, following the iteration order of `vocab`
token_to_index = {word: idx for idx, word in enumerate(vocab)}

# Empty document-term count matrix: one row per row of df, one column per word
num_docs, num_terms = len(df), len(vocab)
doc_term_matrix = dok_matrix((num_docs, num_terms), dtype=np.int32)

# Count token occurrences per document
for doc_idx, tokens in enumerate(df['Tokens']):
    for token in tokens:
        word_idx = token_to_index.get(token)
        if word_idx is None:
            # Token is not part of the vocabulary; nothing to count
            continue
        doc_term_matrix[doc_idx, word_idx] += 1

# Flatten the stored entries into JSON-friendly [doc, word, frequency] triples
sparse_matrix = [
    [row, col, int(count)]
    for (row, col), count in doc_term_matrix.items()
]

# Save as doc_term_matrix.json
with open("./doc_term_matrix.json", 'w') as f:
    json.dump(sparse_matrix, f)

In [None]:
# Options for the OCTIS preprocessing pipeline, gathered in one place so they
# are easy to review and tweak
preprocessing_options = {
    'vocabulary': None,
    'max_features': None,
    'remove_punctuation': True,
    'punctuation': string.punctuation,
    'lemmatize': True,
    'min_chars': 2,
    'min_words_docs': 0,
    'save_original_indexes': True,
    'min_df': 0.05,
    'max_df': 0.8,
    'split': True,
}
preprocessor = Preprocessing(**preprocessing_options)

# Build the OCTIS dataset from the corpus file written earlier in the notebook
dataset = preprocessor.preprocess_dataset(documents_path="./corpus.tsv")

In [67]:
# Initialize the coherence and diversity metrics.
# (Coherence and TopicDiversity are already imported in the notebook's first cell.)
# Coherence must be given the corpus the models are trained on: when `texts` is
# omitted OCTIS falls back to a built-in default reference corpus, so the scores
# would not describe the lyrics dataset at all.
reference_texts = dataset.get_corpus()
coherence_cv = Coherence(texts=reference_texts, topk=10, measure='c_v')
coherence_umass = Coherence(texts=reference_texts, topk=10, measure='u_mass')
topic_diversity = TopicDiversity(topk=10)

In [84]:
from itertools import product

# Topic counts tried for every model; each grid receives its own list copy so
# the grids stay independent of one another
_topic_counts = (2, 3, 4, 5)

# Hyperparameter grid per model: parameter name -> candidate values
param_grids = {
    'LDA': dict(
        num_topics=list(_topic_counts),
        iterations=[500, 1000],
        random_state=[42],
    ),
    'CTM': dict(
        num_topics=list(_topic_counts),
        num_epochs=[5, 10],
    ),
    'ETM': dict(
        num_topics=list(_topic_counts),
        num_epochs=[50, 100],
    ),
    'NeuralLDA': dict(
        num_topics=list(_topic_counts),
        num_epochs=[50, 100],
        lr=[2e-3, 1e-3],
    ),
    'NMF': dict(
        num_topics=list(_topic_counts),
        random_state=[42],
    ),
}

def evaluate_coherence(model_output, texts=None):
    """Score a trained model's topics with the C_V coherence measure (top 10 words).

    Parameters
    ----------
    model_output
        The object returned by a model's ``train_model`` call; it is handed to
        ``Coherence.score`` unchanged.
    texts : optional
        Reference corpus forwarded to ``Coherence`` (for an OCTIS dataset this is
        ``dataset.get_corpus()``). When left as ``None`` the metric is built
        exactly as before, i.e. OCTIS uses its own default reference corpus
        instead of the data the model was trained on.

    Returns
    -------
    The coherence score produced by ``Coherence.score``.
    """
    coherence_cv = Coherence(texts=texts, topk=10, measure='c_v')
    return coherence_cv.score(model_output)


def parameter_search(model_name, dataset, param_grid):
    """Grid-search ``param_grid`` for one model and keep the most coherent run.

    Every combination of the candidate values in ``param_grid`` is used to build
    and train the model on ``dataset``; each run is scored with
    ``evaluate_coherence`` against the dataset's own corpus.

    Parameters
    ----------
    model_name : str
        One of ``'LDA'``, ``'CTM'``, ``'ETM'``, ``'NeuralLDA'`` or ``'NMF'``.
    dataset
        Dataset passed to ``train_model``; it must also provide ``get_corpus()``,
        which supplies the reference texts for the coherence metric.
    param_grid : dict
        Maps constructor keyword names to lists of candidate values.

    Returns
    -------
    tuple
        ``(best_model_output, best_params, best_score)``. If no run scores above
        negative infinity the first two entries are ``None``.

    Raises
    ------
    ValueError
        If ``model_name`` is not a supported model. Raised before any training,
        instead of failing later on an unbound ``model`` variable.
    """
    model_classes = {
        'LDA': LDA,
        'CTM': CTM,
        'ETM': ETM,
        'NeuralLDA': NeuralLDA,
        'NMF': NMF,
    }
    if model_name not in model_classes:
        raise ValueError(
            f"Unknown model name {model_name!r}; expected one of {sorted(model_classes)}"
        )
    model_class = model_classes[model_name]

    # Coherence has to be measured on the corpus the models are trained on
    reference_texts = dataset.get_corpus()

    param_names = list(param_grid.keys())
    param_combinations = list(product(*param_grid.values()))
    best_score = -float('inf')
    best_params = None
    best_model_output = None

    # Use tqdm to track progress in parameter search
    for params in tqdm(param_combinations, desc=f"Searching {model_name} Params"):
        # Pair each parameter name with the value chosen for this combination
        param_dict = dict(zip(param_names, params))

        # Build and train the model with this combination
        model = model_class(**param_dict)
        model_output = model.train_model(dataset)

        # Keep the run with the highest coherence seen so far
        score = evaluate_coherence(model_output, texts=reference_texts)
        if score > best_score:
            best_score = score
            best_params = param_dict
            best_model_output = model_output

    return best_model_output, best_params, best_score

In [85]:
# Run the hyperparameter search for every configured model (with progress bars)
# and remember the winning run of each
best_models = {}

for model_name, param_grid in tqdm(param_grids.items(), desc="Overall Model Parameter Search"):
    search_result = parameter_search(model_name, dataset, param_grid)
    best_output, best_params, best_score = search_result
    best_models[model_name] = dict(output=best_output, params=best_params, score=best_score)
    print(f"Best {model_name} Params: {best_params} with Score: {best_score}")

In [86]:
# Summarize results: one row per model with its best parameters and score
import pandas as pd

summary = [
    {
        'Model': name,
        'Best_Params': info['params'],
        'Best_Score': info['score'],
    }
    for name, info in best_models.items()
]

summary_df = pd.DataFrame(summary)
print(summary_df)

In [95]:
# Inspect the best LDA run: a dict with 'output', 'params' and 'score' entries
best_models['LDA']

In [96]:
# Inspect the best CTM run: a dict with 'output', 'params' and 'score' entries
best_models['CTM']