In [1]:
import pandas as pd
import itertools
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.models.ctm import CombinedTM
from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora
import matplotlib.pyplot as plt
import os
import importlib
import utility_functions as utils
importlib.reload(utils)

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
num_topics_list = [2, 3, 4]
ctm_epochs_list = [5, 10, 20]
ctm_learning_rates = [2e-3, 5e-3, 1e-2]
ctm_batch_sizes = [64, 128]

# Create all combinations of parameters
ctm_param_grid = list(itertools.product(num_topics_list, ctm_epochs_list, ctm_learning_rates, ctm_batch_sizes))

In [3]:
# Load your dataframe
df = pd.read_pickle('./preprocessed_df.pkl')

if isinstance(df['Tokens'].iloc[0], str):
    import ast
    df['Tokens'] = df['Tokens'].apply(ast.literal_eval)

texts = df['Tokens']
texts_bow = [' '.join(tokens) for tokens in df['Tokens']]
df['Lyrics'] = df['Lyrics'].apply(utils.light_preprocessing)
documents = df['Lyrics']

In [4]:
df

Unnamed: 0,Artist,Album,Song,Coast,Release Year,Tempo1,Tempo2,Duration (s),Sample Rate (Hz),Path,Lyrics,Tokens,Processed_Lyrics
0,Big L,Lifestylez Ov Da Poor and Dangerous,8 Iz Enuff.mp3,east_coast,1995,96.774194,48.000000,298.840000,48000,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,yo my crew is in the house terra herb mcgruff ...,"[crew, house, bless, big, mike, imma, set, fol...",crew house terra herb mcgruff buddah bless big...
1,Big L,Lifestylez Ov Da Poor and Dangerous,Da Graveyard.mp3,east_coast,1995,93.750000,46.511628,323.760000,48000,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,it's the number one crew in the area big l be ...,"[number, one, crew, big, nigga, men, win, kill...",number one crew area big lightin nigga incense...
2,Big L,Lifestylez Ov Da Poor and Dangerous,I Don't Understand It.mp3,east_coast,1995,93.750000,47.244094,260.226667,48000,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,there are too many mc's who are overrated you ...,"[many, mcs, ask, even, supposed, make, rap, kn...",many mcs overrated ask even supposed make rap ...
3,Big L,Lifestylez Ov Da Poor and Dangerous,"No Endz, No Skinz.mp3",east_coast,1995,100.000000,50.420168,208.733333,48000,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,let me get to the point real quick when ya poc...,"[let, point, real, quick, pocket, thick, mad, ...",let get point real quick pocket thick mad chic...
4,Big L,Lifestylez Ov Da Poor and Dangerous,MVP.mp3,east_coast,1995,86.956522,43.478261,218.866667,48000,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,a yo spark up the phillies and pass the stout ...,"[pass, make, quick, money, grip, ass, street, ...",spark phillies pass stout make quick money gri...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1363,Dr.Dre,The Chronic,Dr. Dre - The Day the Niggaz Took Over (feat. ...,west_coast,1992,93.750000,46.875000,273.206000,48000,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,i'ma say this and i'ma end mine if you ain't d...,"[say, end, mine, point, one, south, shit, need...",say end mine africans united states period poi...
1364,Dr.Dre,The Chronic,"Dr. Dre - Bitches Ain't Shit (feat. Jewell, Sn...",west_coast,1992,92.307692,46.153846,287.207625,48000,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,bitches ain't shit but hoes and tricks bitches...,"[bitch, shit, hoe, trick, bitch, shit, hoe, tr...",bitch shit hoe trick bitch shit hoe trick lick...
1365,Dr.Dre,The Chronic,Dr. Dre - Stranded On Death Row (feat. Bushwic...,west_coast,1992,90.909091,45.801527,287.335333,48000,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,"yes it is i "" says me and all who agree are mo...","[yes, say, three, yes, house, sure, want, talk...",yes say agree three yes house sure want talk h...
1366,Dr.Dre,The Chronic,Dr. Dre - Nuthin' but a ＂G＂ Thang (feat. Snoop...,west_coast,1992,95.238095,47.244094,238.677917,48000,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,one two three and to the four snoop doggy dogg...,"[one, two, three, four, dog, dr, dre, door, re...",one two three four snoop doggy dog dr dre door...


In [5]:
tp = TopicModelDataPreparation("all-mpnet-base-v2", max_seq_length=512)
training_dataset = tp.fit(text_for_contextual=documents, text_for_bow=texts_bow)



Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [6]:
dictionary = corpora.Dictionary(texts)

In [7]:
# Function to evaluate CTM models and save results into a DataFrame
def evaluate_ctm_models(training_dataset, tp, texts, dictionary, ctm_param_grid, metrics=('coherence',), save_dir='saved_models'):
    """
    Evaluate CTM models with a given set of hyperparameters and metrics, and save results to a DataFrame.

    Parameters:
    - training_dataset: The training dataset prepared by TopicModelDataPreparation.
    - tp: The TopicModelDataPreparation object with vocabulary info.
    - texts: The list of tokenized texts.
    - dictionary: The Gensim dictionary.
    - ctm_param_grid: List of tuples for hyperparameters (num_topics, epochs, learning_rate, batch_size).
    - metrics: Tuple of metrics to evaluate ('coherence', 'diversity', or both).
    - save_dir: Directory to save the results DataFrame.

    Returns:
    - results_df: A DataFrame containing the evaluation results.
    """
    ctm_results = []

    # Create directory if it doesn't exist
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Iterate through each parameter combination
    for idx, (num_topics, epochs, learning_rate, batch_size) in enumerate(ctm_param_grid):
        try:
            print(f"Training CTM model {idx+1}/{len(ctm_param_grid)} with num_topics={num_topics}, epochs={epochs}, learning_rate={learning_rate}, batch_size={batch_size}")

            # Initialize the model
            ctm_model = CombinedTM(
                bow_size=len(tp.vocab),
                contextual_size=768,
                n_components=num_topics,
                num_epochs=epochs,
                batch_size=batch_size,
                activation='softplus',
                dropout=0.2,
                solver='adam',
                num_data_loader_workers=0,
            )

            # Train the model
            ctm_model.fit(training_dataset)

            # Get topics
            ctm_topics = ctm_model.get_topic_lists(10)

            # Initialize result dictionary
            result = {
                'model_id': idx + 1,
                'num_topics': num_topics,
                'epochs': epochs,
                'learning_rate': learning_rate,
                'batch_size': batch_size
            }

            # Evaluate the model based on specified metrics
            if 'coherence' in metrics:
                coherence_model_ctm = CoherenceModel(
                    topics=ctm_topics,
                    texts=texts,
                    dictionary=dictionary,
                    coherence='c_v'
                )
                coherence_ctm = coherence_model_ctm.get_coherence()
                result['coherence_score'] = coherence_ctm

            if 'diversity' in metrics:
                unique_words = set()
                total_words = 0

                for topic in ctm_topics:
                    unique_words.update(topic)
                    total_words += len(topic)

                topic_diversity = len(unique_words) / total_words if total_words > 0 else 0
                result['topic_diversity'] = topic_diversity

            # Save the result
            ctm_results.append(result)

        except Exception as e:
            print(f"An error occurred while training model {idx+1}: {e}")
            continue  # Skip this iteration if there's an error

    # Create a DataFrame from the results
    results_df = pd.DataFrame(ctm_results)

    # Sort by coherence_score if it is one of the metrics
    if 'coherence' in metrics:
        results_df = results_df.sort_values(by='coherence_score', ascending=False)

    # Save results DataFrame for future reference
    results_df_path = os.path.join(save_dir, 'ctm_model_results_summary.csv')
    results_df.to_csv(results_df_path, index=False)

    return results_df

# Example Usage
results_df = evaluate_ctm_models(
    training_dataset=training_dataset,
    tp=tp,
    texts=texts,
    dictionary=dictionary,
    ctm_param_grid=ctm_param_grid,
    metrics=('coherence', 'diversity'),
    save_dir='saved_models'
)

Training CTM model 1/54 with num_topics=2, epochs=5, learning_rate=0.002, batch_size=64


Epoch: [5/5]	 Seen Samples: [6720/6840]	Train Loss: 1250.9378662109375	Time: 0:00:00.210703: : 5it [00:01,  3.54it/s]
100%|██████████| 22/22 [00:00<00:00, 75.52it/s]


Training CTM model 2/54 with num_topics=2, epochs=5, learning_rate=0.002, batch_size=128


Epoch: [5/5]	 Seen Samples: [6400/6840]	Train Loss: 1245.0567749023437	Time: 0:00:00.159747: : 5it [00:00,  6.06it/s]
100%|██████████| 11/11 [00:00<00:00, 90.40it/s]


Training CTM model 3/54 with num_topics=2, epochs=5, learning_rate=0.005, batch_size=64


Epoch: [5/5]	 Seen Samples: [6720/6840]	Train Loss: 1249.2088855561756	Time: 0:00:00.197293: : 5it [00:01,  4.76it/s]
100%|██████████| 22/22 [00:00<00:00, 165.66it/s]


Training CTM model 4/54 with num_topics=2, epochs=5, learning_rate=0.005, batch_size=128


Epoch: [5/5]	 Seen Samples: [6400/6840]	Train Loss: 1264.5521728515625	Time: 0:00:00.147815: : 5it [00:00,  6.54it/s]
100%|██████████| 11/11 [00:00<00:00, 96.94it/s]


Training CTM model 5/54 with num_topics=2, epochs=5, learning_rate=0.01, batch_size=64


Epoch: [5/5]	 Seen Samples: [6720/6840]	Train Loss: 1247.5923025948662	Time: 0:00:00.214302: : 5it [00:01,  4.48it/s]
100%|██████████| 22/22 [00:00<00:00, 180.85it/s]


Training CTM model 6/54 with num_topics=2, epochs=5, learning_rate=0.01, batch_size=128


Epoch: [5/5]	 Seen Samples: [6400/6840]	Train Loss: 1249.9501831054688	Time: 0:00:00.145440: : 5it [00:00,  6.22it/s]
100%|██████████| 11/11 [00:00<00:00, 97.99it/s]


Training CTM model 7/54 with num_topics=2, epochs=10, learning_rate=0.002, batch_size=64


Epoch: [10/10]	 Seen Samples: [13440/13680]	Train Loss: 1210.1102236793154	Time: 0:00:00.197613: : 10it [00:01,  5.20it/s]
100%|██████████| 22/22 [00:00<00:00, 175.66it/s]


Training CTM model 8/54 with num_topics=2, epochs=10, learning_rate=0.002, batch_size=128


Epoch: [10/10]	 Seen Samples: [12800/13680]	Train Loss: 1226.56953125	Time: 0:00:00.152233: : 10it [00:01,  6.61it/s]    
100%|██████████| 11/11 [00:00<00:00, 95.93it/s]


Training CTM model 9/54 with num_topics=2, epochs=10, learning_rate=0.005, batch_size=64


Epoch: [10/10]	 Seen Samples: [13440/13680]	Train Loss: 1209.660900297619	Time: 0:00:00.212532: : 10it [00:01,  5.29it/s]
100%|██████████| 22/22 [00:00<00:00, 184.11it/s]


Training CTM model 10/54 with num_topics=2, epochs=10, learning_rate=0.005, batch_size=128


Epoch: [10/10]	 Seen Samples: [12800/13680]	Train Loss: 1227.9771484375	Time: 0:00:00.147343: : 10it [00:01,  6.76it/s]  
100%|██████████| 11/11 [00:00<00:00, 97.36it/s]


Training CTM model 11/54 with num_topics=2, epochs=10, learning_rate=0.01, batch_size=64


Epoch: [10/10]	 Seen Samples: [13440/13680]	Train Loss: 1207.6840239025298	Time: 0:00:00.215175: : 10it [00:02,  5.00it/s]
100%|██████████| 22/22 [00:00<00:00, 172.40it/s]


Training CTM model 12/54 with num_topics=2, epochs=10, learning_rate=0.01, batch_size=128


Epoch: [10/10]	 Seen Samples: [12800/13680]	Train Loss: 1222.3059814453125	Time: 0:00:00.143861: : 10it [00:01,  6.61it/s]
100%|██████████| 11/11 [00:00<00:00, 96.93it/s]


Training CTM model 13/54 with num_topics=2, epochs=20, learning_rate=0.002, batch_size=64


Epoch: [20/20]	 Seen Samples: [26880/27360]	Train Loss: 1200.9877232142858	Time: 0:00:00.199593: : 20it [00:03,  5.32it/s]
100%|██████████| 22/22 [00:00<00:00, 186.96it/s]


Training CTM model 14/54 with num_topics=2, epochs=20, learning_rate=0.002, batch_size=128


Epoch: [20/20]	 Seen Samples: [25600/27360]	Train Loss: 1210.2033447265626	Time: 0:00:00.141540: : 20it [00:02,  7.10it/s]
100%|██████████| 11/11 [00:00<00:00, 100.10it/s]


Training CTM model 15/54 with num_topics=2, epochs=20, learning_rate=0.005, batch_size=64


Epoch: [20/20]	 Seen Samples: [26880/27360]	Train Loss: 1200.9680466424852	Time: 0:00:00.184613: : 20it [00:03,  5.16it/s]
100%|██████████| 22/22 [00:00<00:00, 190.54it/s]


Training CTM model 16/54 with num_topics=2, epochs=20, learning_rate=0.005, batch_size=128


Epoch: [20/20]	 Seen Samples: [25600/27360]	Train Loss: 1204.1236206054687	Time: 0:00:00.129817: : 20it [00:02,  7.14it/s]
100%|██████████| 11/11 [00:00<00:00, 103.70it/s]


Training CTM model 17/54 with num_topics=2, epochs=20, learning_rate=0.01, batch_size=64


Epoch: [20/20]	 Seen Samples: [26880/27360]	Train Loss: 1199.5248151506696	Time: 0:00:00.166265: : 20it [00:03,  5.57it/s]
100%|██████████| 22/22 [00:00<00:00, 193.01it/s]


Training CTM model 18/54 with num_topics=2, epochs=20, learning_rate=0.01, batch_size=128


Epoch: [20/20]	 Seen Samples: [25600/27360]	Train Loss: 1220.3905639648438	Time: 0:00:00.144550: : 20it [00:02,  6.76it/s]
100%|██████████| 11/11 [00:00<00:00, 103.19it/s]


Training CTM model 19/54 with num_topics=3, epochs=5, learning_rate=0.002, batch_size=64


Epoch: [5/5]	 Seen Samples: [6720/6840]	Train Loss: 1237.635463169643	Time: 0:00:00.250372: : 5it [00:01,  4.27it/s] 
100%|██████████| 22/22 [00:00<00:00, 152.31it/s]


Training CTM model 20/54 with num_topics=3, epochs=5, learning_rate=0.002, batch_size=128


Epoch: [5/5]	 Seen Samples: [6400/6840]	Train Loss: 1250.7444091796874	Time: 0:00:00.149228: : 5it [00:00,  6.06it/s]
100%|██████████| 11/11 [00:00<00:00, 99.53it/s]


Training CTM model 21/54 with num_topics=3, epochs=5, learning_rate=0.005, batch_size=64


Epoch: [5/5]	 Seen Samples: [6720/6840]	Train Loss: 1245.0137125651042	Time: 0:00:00.180328: : 5it [00:00,  5.35it/s]
100%|██████████| 22/22 [00:00<00:00, 195.76it/s]


Training CTM model 22/54 with num_topics=3, epochs=5, learning_rate=0.005, batch_size=128


Epoch: [5/5]	 Seen Samples: [6400/6840]	Train Loss: 1263.6735107421875	Time: 0:00:00.141100: : 5it [00:00,  6.75it/s]
100%|██████████| 11/11 [00:00<00:00, 102.60it/s]


Training CTM model 23/54 with num_topics=3, epochs=5, learning_rate=0.01, batch_size=64


Epoch: [5/5]	 Seen Samples: [6720/6840]	Train Loss: 1243.4447719029017	Time: 0:00:00.173802: : 5it [00:00,  5.45it/s]
100%|██████████| 22/22 [00:00<00:00, 190.25it/s]


Training CTM model 24/54 with num_topics=3, epochs=5, learning_rate=0.01, batch_size=128


Epoch: [5/5]	 Seen Samples: [6400/6840]	Train Loss: 1254.5193603515625	Time: 0:00:00.135826: : 5it [00:00,  7.04it/s]
100%|██████████| 11/11 [00:00<00:00, 101.45it/s]


Training CTM model 25/54 with num_topics=3, epochs=10, learning_rate=0.002, batch_size=64


Epoch: [10/10]	 Seen Samples: [13440/13680]	Train Loss: 1204.8066871279761	Time: 0:00:00.165760: : 10it [00:01,  5.54it/s]
100%|██████████| 22/22 [00:00<00:00, 191.84it/s]


Training CTM model 26/54 with num_topics=3, epochs=10, learning_rate=0.002, batch_size=128


Epoch: [10/10]	 Seen Samples: [12800/13680]	Train Loss: 1225.8701416015624	Time: 0:00:00.133110: : 10it [00:01,  6.92it/s]
100%|██████████| 11/11 [00:00<00:00, 100.83it/s]


Training CTM model 27/54 with num_topics=3, epochs=10, learning_rate=0.005, batch_size=64


Epoch: [10/10]	 Seen Samples: [13440/13680]	Train Loss: 1205.183349609375	Time: 0:00:00.176496: : 10it [00:01,  5.51it/s]
100%|██████████| 22/22 [00:00<00:00, 191.25it/s]


Training CTM model 28/54 with num_topics=3, epochs=10, learning_rate=0.005, batch_size=128


Epoch: [10/10]	 Seen Samples: [12800/13680]	Train Loss: 1227.7889526367187	Time: 0:00:00.143218: : 10it [00:01,  7.01it/s]
100%|██████████| 11/11 [00:00<00:00, 103.92it/s]


Training CTM model 29/54 with num_topics=3, epochs=10, learning_rate=0.01, batch_size=64


Epoch: [10/10]	 Seen Samples: [13440/13680]	Train Loss: 1214.873302641369	Time: 0:00:00.174520: : 10it [00:01,  5.44it/s]
100%|██████████| 22/22 [00:00<00:00, 178.23it/s]


Training CTM model 30/54 with num_topics=3, epochs=10, learning_rate=0.01, batch_size=128


Epoch: [10/10]	 Seen Samples: [12800/13680]	Train Loss: 1214.6564453125	Time: 0:00:00.141243: : 10it [00:01,  7.02it/s]  
100%|██████████| 11/11 [00:00<00:00, 98.73it/s]


Training CTM model 31/54 with num_topics=3, epochs=20, learning_rate=0.002, batch_size=64


Epoch: [20/20]	 Seen Samples: [26880/27360]	Train Loss: 1196.177001953125	Time: 0:00:00.166926: : 20it [00:03,  5.57it/s] 
100%|██████████| 22/22 [00:00<00:00, 190.24it/s]


Training CTM model 32/54 with num_topics=3, epochs=20, learning_rate=0.002, batch_size=128


Epoch: [20/20]	 Seen Samples: [25600/27360]	Train Loss: 1208.958154296875	Time: 0:00:00.137808: : 20it [00:02,  7.04it/s] 
100%|██████████| 11/11 [00:00<00:00, 99.09it/s]


Training CTM model 33/54 with num_topics=3, epochs=20, learning_rate=0.005, batch_size=64


Epoch: [20/20]	 Seen Samples: [26880/27360]	Train Loss: 1191.4795968191963	Time: 0:00:00.213732: : 20it [00:03,  5.43it/s]
100%|██████████| 22/22 [00:00<00:00, 192.71it/s]


Training CTM model 34/54 with num_topics=3, epochs=20, learning_rate=0.005, batch_size=128


Epoch: [20/20]	 Seen Samples: [25600/27360]	Train Loss: 1196.7789428710937	Time: 0:00:00.139709: : 20it [00:02,  7.14it/s]
100%|██████████| 11/11 [00:00<00:00, 97.96it/s]


Training CTM model 35/54 with num_topics=3, epochs=20, learning_rate=0.01, batch_size=64


Epoch: [20/20]	 Seen Samples: [26880/27360]	Train Loss: 1207.3134591238838	Time: 0:00:00.175164: : 20it [00:03,  5.59it/s]
100%|██████████| 22/22 [00:00<00:00, 188.56it/s]


Training CTM model 36/54 with num_topics=3, epochs=20, learning_rate=0.01, batch_size=128


Epoch: [20/20]	 Seen Samples: [25600/27360]	Train Loss: 1223.1539306640625	Time: 0:00:00.133257: : 20it [00:02,  7.08it/s]
100%|██████████| 11/11 [00:00<00:00, 100.78it/s]


Training CTM model 37/54 with num_topics=4, epochs=5, learning_rate=0.002, batch_size=64


Epoch: [5/5]	 Seen Samples: [6720/6840]	Train Loss: 1239.0535481770833	Time: 0:00:00.169442: : 5it [00:00,  5.56it/s]
100%|██████████| 22/22 [00:00<00:00, 197.73it/s]


Training CTM model 38/54 with num_topics=4, epochs=5, learning_rate=0.002, batch_size=128


Epoch: [5/5]	 Seen Samples: [6400/6840]	Train Loss: 1245.09658203125	Time: 0:00:00.142868: : 5it [00:00,  6.78it/s]  
100%|██████████| 11/11 [00:00<00:00, 101.34it/s]


Training CTM model 39/54 with num_topics=4, epochs=5, learning_rate=0.005, batch_size=64


Epoch: [5/5]	 Seen Samples: [6720/6840]	Train Loss: 1226.292712983631	Time: 0:00:00.182867: : 5it [00:00,  5.27it/s] 
100%|██████████| 22/22 [00:00<00:00, 197.46it/s]


Training CTM model 40/54 with num_topics=4, epochs=5, learning_rate=0.005, batch_size=128


Epoch: [5/5]	 Seen Samples: [6400/6840]	Train Loss: 1238.8992919921875	Time: 0:00:00.143268: : 5it [00:00,  5.93it/s]
100%|██████████| 11/11 [00:00<00:00, 95.81it/s]


Training CTM model 41/54 with num_topics=4, epochs=5, learning_rate=0.01, batch_size=64


Epoch: [5/5]	 Seen Samples: [6720/6840]	Train Loss: 1226.1017775762648	Time: 0:00:00.172764: : 5it [00:00,  5.50it/s]
100%|██████████| 22/22 [00:00<00:00, 199.91it/s]


Training CTM model 42/54 with num_topics=4, epochs=5, learning_rate=0.01, batch_size=128


Epoch: [5/5]	 Seen Samples: [6400/6840]	Train Loss: 1239.1460693359375	Time: 0:00:00.136872: : 5it [00:00,  6.94it/s]
100%|██████████| 11/11 [00:00<00:00, 100.74it/s]


Training CTM model 43/54 with num_topics=4, epochs=10, learning_rate=0.002, batch_size=64


Epoch: [10/10]	 Seen Samples: [13440/13680]	Train Loss: 1207.147914341518	Time: 0:00:00.179173: : 10it [00:01,  5.47it/s]
100%|██████████| 22/22 [00:00<00:00, 191.94it/s]


Training CTM model 44/54 with num_topics=4, epochs=10, learning_rate=0.002, batch_size=128


Epoch: [10/10]	 Seen Samples: [12800/13680]	Train Loss: 1232.5323974609375	Time: 0:00:00.141209: : 10it [00:01,  6.93it/s]
100%|██████████| 11/11 [00:00<00:00, 100.34it/s]


Training CTM model 45/54 with num_topics=4, epochs=10, learning_rate=0.005, batch_size=64


Epoch: [10/10]	 Seen Samples: [13440/13680]	Train Loss: 1200.844249906994	Time: 0:00:00.182443: : 10it [00:01,  5.37it/s]
100%|██████████| 22/22 [00:00<00:00, 198.77it/s]


Training CTM model 46/54 with num_topics=4, epochs=10, learning_rate=0.005, batch_size=128


Epoch: [10/10]	 Seen Samples: [12800/13680]	Train Loss: 1221.1052490234374	Time: 0:00:00.147468: : 10it [00:01,  6.54it/s]
100%|██████████| 11/11 [00:00<00:00, 100.49it/s]


Training CTM model 47/54 with num_topics=4, epochs=10, learning_rate=0.01, batch_size=64


Epoch: [10/10]	 Seen Samples: [13440/13680]	Train Loss: 1208.3815162295386	Time: 0:00:00.174089: : 10it [00:01,  5.48it/s]
100%|██████████| 22/22 [00:00<00:00, 194.20it/s]


Training CTM model 48/54 with num_topics=4, epochs=10, learning_rate=0.01, batch_size=128


Epoch: [10/10]	 Seen Samples: [12800/13680]	Train Loss: 1214.664453125	Time: 0:00:00.134001: : 10it [00:01,  7.04it/s]   
100%|██████████| 11/11 [00:00<00:00, 100.10it/s]


Training CTM model 49/54 with num_topics=4, epochs=20, learning_rate=0.002, batch_size=64


Epoch: [20/20]	 Seen Samples: [26880/27360]	Train Loss: 1201.345970517113	Time: 0:00:00.169996: : 20it [00:03,  5.54it/s] 
100%|██████████| 22/22 [00:00<00:00, 191.24it/s]


Training CTM model 50/54 with num_topics=4, epochs=20, learning_rate=0.002, batch_size=128


Epoch: [20/20]	 Seen Samples: [25600/27360]	Train Loss: 1198.191748046875	Time: 0:00:00.138559: : 20it [00:02,  7.09it/s] 
100%|██████████| 11/11 [00:00<00:00, 100.11it/s]


Training CTM model 51/54 with num_topics=4, epochs=20, learning_rate=0.005, batch_size=64


Epoch: [20/20]	 Seen Samples: [26880/27360]	Train Loss: 1195.790748232887	Time: 0:00:00.187376: : 20it [00:03,  5.36it/s] 
100%|██████████| 22/22 [00:00<00:00, 189.16it/s]


Training CTM model 52/54 with num_topics=4, epochs=20, learning_rate=0.005, batch_size=128


Epoch: [20/20]	 Seen Samples: [25600/27360]	Train Loss: 1194.63837890625	Time: 0:00:00.135661: : 20it [00:02,  7.09it/s]  
100%|██████████| 11/11 [00:00<00:00, 98.07it/s]


Training CTM model 53/54 with num_topics=4, epochs=20, learning_rate=0.01, batch_size=64


Epoch: [20/20]	 Seen Samples: [26880/27360]	Train Loss: 1194.6287086123511	Time: 0:00:00.186980: : 20it [00:03,  5.54it/s]
100%|██████████| 22/22 [00:00<00:00, 186.79it/s]


Training CTM model 54/54 with num_topics=4, epochs=20, learning_rate=0.01, batch_size=128


Epoch: [20/20]	 Seen Samples: [25600/27360]	Train Loss: 1204.0016235351563	Time: 0:00:00.139357: : 20it [00:02,  7.05it/s]
100%|██████████| 11/11 [00:00<00:00, 102.87it/s]


In [8]:
results_df

Unnamed: 0,model_id,num_topics,epochs,learning_rate,batch_size,coherence_score,topic_diversity
50,51,4,20,0.005,64,0.344316,1.0
34,35,3,20,0.01,64,0.324974,1.0
14,15,2,20,0.005,64,0.324572,1.0
8,9,2,10,0.005,64,0.321018,1.0
48,49,4,20,0.002,64,0.3191,0.875
17,18,2,20,0.01,128,0.316204,1.0
44,45,4,10,0.005,64,0.311485,0.875
39,40,4,5,0.005,128,0.311039,0.975
53,54,4,20,0.01,128,0.310354,0.975
49,50,4,20,0.002,128,0.306156,0.975


In [11]:
results_df = pd.read_csv('saved_models/ctm_model_results_summary.csv')
best_params = results_df.iloc[0]  # Assuming the first row has the best score after sorting

# Extract hyperparameters
best_num_topics = int(best_params['num_topics'])
best_epochs = int(best_params['epochs'])
best_learning_rate = best_params['learning_rate']
best_batch_size = int(best_params['batch_size'])

best_ctm_model = CombinedTM(
    bow_size=len(tp.vocab),
    contextual_size=768,
    n_components=best_num_topics,
    num_epochs=best_epochs,
    batch_size=best_batch_size,
    activation='softplus',
    dropout=0.2,
    solver='adam',
    num_data_loader_workers=0,
)

# Train the model using the original training dataset
best_ctm_model.fit(training_dataset)

Epoch: [20/20]	 Seen Samples: [26880/27360]	Train Loss: 1191.5256405784971	Time: 0:00:00.190655: : 20it [00:04,  4.84it/s]
100%|██████████| 22/22 [00:00<00:00, 181.66it/s]
