In [1]:
import pandas as pd
import itertools
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.models.ctm import CombinedTM
from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora
import os
import importlib
from PopMusicInformationRetrieval import utility_functions as utils

importlib.reload(utils)

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
num_topics_list = [2, 3, 4]
ctm_epochs_list = [5, 10, 20]
ctm_learning_rates = [2e-3, 5e-3, 1e-2]
ctm_batch_sizes = [64, 128]

# Create all combinations of parameters
ctm_param_grid = list(itertools.product(num_topics_list, ctm_epochs_list, ctm_learning_rates, ctm_batch_sizes))

In [3]:
# Load your dataframe
df = pd.read_pickle('../Data/dataframes/preprocessed_df.pkl')

if isinstance(df['Tokens'].iloc[0], str):
    import ast
    df['Tokens'] = df['Tokens'].apply(ast.literal_eval)

texts = df['Tokens']
texts_bow = [' '.join(tokens) for tokens in df['Tokens']]
df['Lyrics'] = df['Lyrics'].apply(utils.light_preprocessing)
documents = df['Lyrics']

In [4]:
df

Unnamed: 0,Artist,Album,Song,Coast,Release Year,Tempo1,Tempo2,Duration (s),Sample Rate (Hz),Path,Lyrics,Tokens,Processed_Lyrics
0,Big L,Lifestylez Ov Da Poor and Dangerous,8 Iz Enuff.mp3,east_coast,1995,96.774194,48.000000,298.840000,48000,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,yo my crew is in the house terra herb mcgruff ...,"[crew, house, bless, big, mike, imma, set, fol...",crew house terra herb mcgruff buddah bless big...
1,Big L,Lifestylez Ov Da Poor and Dangerous,Da Graveyard.mp3,east_coast,1995,93.750000,46.511628,323.760000,48000,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,it's the number one crew in the area big l be ...,"[number, one, crew, big, nigga, men, win, kill...",number one crew area big lightin nigga incense...
2,Big L,Lifestylez Ov Da Poor and Dangerous,I Don't Understand It.mp3,east_coast,1995,93.750000,47.244094,260.226667,48000,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,there are too many mc's who are overrated you ...,"[many, mcs, ask, even, supposed, make, rap, kn...",many mcs overrated ask even supposed make rap ...
3,Big L,Lifestylez Ov Da Poor and Dangerous,"No Endz, No Skinz.mp3",east_coast,1995,100.000000,50.420168,208.733333,48000,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,let me get to the point real quick when ya poc...,"[let, point, real, quick, pocket, thick, mad, ...",let get point real quick pocket thick mad chic...
4,Big L,Lifestylez Ov Da Poor and Dangerous,MVP.mp3,east_coast,1995,86.956522,43.478261,218.866667,48000,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,a yo spark up the phillies and pass the stout ...,"[pass, make, quick, money, grip, ass, street, ...",spark phillies pass stout make quick money gri...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1363,Dr.Dre,The Chronic,Dr. Dre - The Day the Niggaz Took Over (feat. ...,west_coast,1992,93.750000,46.875000,273.206000,48000,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,i'ma say this and i'ma end mine if you ain't d...,"[say, end, mine, point, one, south, shit, need...",say end mine africans united states period poi...
1364,Dr.Dre,The Chronic,"Dr. Dre - Bitches Ain't Shit (feat. Jewell, Sn...",west_coast,1992,92.307692,46.153846,287.207625,48000,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,bitches ain't shit but hoes and tricks bitches...,"[bitch, shit, hoe, trick, bitch, shit, hoe, tr...",bitch shit hoe trick bitch shit hoe trick lick...
1365,Dr.Dre,The Chronic,Dr. Dre - Stranded On Death Row (feat. Bushwic...,west_coast,1992,90.909091,45.801527,287.335333,48000,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,"yes it is i "" says me and all who agree are mo...","[yes, say, three, yes, house, sure, want, talk...",yes say agree three yes house sure want talk h...
1366,Dr.Dre,The Chronic,Dr. Dre - Nuthin' but a ＂G＂ Thang (feat. Snoop...,west_coast,1992,95.238095,47.244094,238.677917,48000,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,one two three and to the four snoop doggy dogg...,"[one, two, three, four, dog, dr, dre, door, re...",one two three four snoop doggy dog dr dre door...


In [5]:
tp = TopicModelDataPreparation("all-mpnet-base-v2", max_seq_length=512)
training_dataset = tp.fit(text_for_contextual=documents, text_for_bow=texts_bow)



Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [6]:
dictionary = corpora.Dictionary(texts)

In [7]:
# Function to evaluate CTM models and save results into a DataFrame
def evaluate_ctm_models(training_dataset, tp, texts, dictionary, ctm_param_grid, metrics=('coherence',), save_dir='saved_models'):
    """
    Evaluate CTM models with a given set of hyperparameters and metrics, and save results to a DataFrame.

    Parameters:
    - training_dataset: The training dataset prepared by TopicModelDataPreparation.
    - tp: The TopicModelDataPreparation object with vocabulary info.
    - texts: The list of tokenized texts.
    - dictionary: The Gensim dictionary.
    - ctm_param_grid: List of tuples for hyperparameters (num_topics, epochs, learning_rate, batch_size).
    - metrics: Tuple of metrics to evaluate ('coherence', 'diversity', or both).
    - save_dir: Directory to save the results DataFrame.

    Returns:
    - results_df: A DataFrame containing the evaluation results.
    """
    ctm_results = []

    # Create directory if it doesn't exist
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Iterate through each parameter combination
    for idx, (num_topics, epochs, learning_rate, batch_size) in enumerate(ctm_param_grid):
        try:
            print(f"Training CTM model {idx+1}/{len(ctm_param_grid)} with num_topics={num_topics}, epochs={epochs}, learning_rate={learning_rate}, batch_size={batch_size}")

            # Initialize the model
            ctm_model = CombinedTM(
                bow_size=len(tp.vocab),
                contextual_size=768,
                n_components=num_topics,
                num_epochs=epochs,
                batch_size=batch_size,
                activation='softplus',
                dropout=0.2,
                solver='adam',
                num_data_loader_workers=0,
            )

            # Train the model
            ctm_model.fit(training_dataset)

            # Get topics
            ctm_topics = ctm_model.get_topic_lists(10)

            # Initialize result dictionary
            result = {
                'model_id': idx + 1,
                'num_topics': num_topics,
                'epochs': epochs,
                'learning_rate': learning_rate,
                'batch_size': batch_size
            }

            # Evaluate the model based on specified metrics
            if 'coherence' in metrics:
                coherence_model_ctm = CoherenceModel(
                    topics=ctm_topics,
                    texts=texts,
                    dictionary=dictionary,
                    coherence='c_v'
                )
                coherence_ctm = coherence_model_ctm.get_coherence()
                result['coherence_score'] = coherence_ctm

            if 'diversity' in metrics:
                unique_words = set()
                total_words = 0

                for topic in ctm_topics:
                    unique_words.update(topic)
                    total_words += len(topic)

                topic_diversity = len(unique_words) / total_words if total_words > 0 else 0
                result['topic_diversity'] = topic_diversity

            # Save the result
            ctm_results.append(result)

        except Exception as e:
            print(f"An error occurred while training model {idx+1}: {e}")
            continue  # Skip this iteration if there's an error

    # Create a DataFrame from the results
    results_df = pd.DataFrame(ctm_results)

    # Sort by coherence_score if it is one of the metrics
    if 'coherence' in metrics:
        results_df = results_df.sort_values(by='coherence_score', ascending=False)

    # Save results DataFrame for future reference
    results_df_path = os.path.join(save_dir, 'ctm_model_results_summary.csv')
    results_df.to_csv(results_df_path, index=False)

    return results_df

# Example Usage
results_df = evaluate_ctm_models(
    training_dataset=training_dataset,
    tp=tp,
    texts=texts,
    dictionary=dictionary,
    ctm_param_grid=ctm_param_grid,
    metrics=('coherence', 'diversity'),
    save_dir='saved_models'
)

Training CTM model 1/54 with num_topics=2, epochs=5, learning_rate=0.002, batch_size=64


Epoch: [5/5]	 Seen Samples: [6720/6840]	Train Loss: 1241.2576788039435	Time: 0:00:00.200692: : 5it [00:01,  4.34it/s]
100%|██████████| 22/22 [00:00<00:00, 145.77it/s]


Training CTM model 2/54 with num_topics=2, epochs=5, learning_rate=0.002, batch_size=128


Epoch: [5/5]	 Seen Samples: [6400/6840]	Train Loss: 1261.4403564453125	Time: 0:00:00.152729: : 5it [00:00,  6.53it/s]
100%|██████████| 11/11 [00:00<00:00, 88.69it/s]


Training CTM model 3/54 with num_topics=2, epochs=5, learning_rate=0.005, batch_size=64


Epoch: [5/5]	 Seen Samples: [6720/6840]	Train Loss: 1248.3433779761904	Time: 0:00:00.246352: : 5it [00:01,  4.62it/s]
100%|██████████| 22/22 [00:00<00:00, 161.35it/s]


Training CTM model 4/54 with num_topics=2, epochs=5, learning_rate=0.005, batch_size=128


Epoch: [5/5]	 Seen Samples: [6400/6840]	Train Loss: 1258.0240844726563	Time: 0:00:00.142985: : 5it [00:00,  6.85it/s]
100%|██████████| 11/11 [00:00<00:00, 96.81it/s]


Training CTM model 5/54 with num_topics=2, epochs=5, learning_rate=0.01, batch_size=64


Epoch: [5/5]	 Seen Samples: [6720/6840]	Train Loss: 1244.5738641648065	Time: 0:00:00.200665: : 5it [00:01,  4.54it/s]
  0%|          | 0/22 [00:00<?, ?it/s]


KeyboardInterrupt: 

In [7]:
results_df

NameError: name 'results_df' is not defined

In [9]:
results_df = pd.read_csv('../saved_models/ctm_model_results_summary.csv')
best_params = results_df.iloc[0]  # Assuming the first row has the best score after sorting

# Extract hyperparameters
best_num_topics = int(best_params['num_topics'])
best_epochs = int(best_params['epochs'])
best_learning_rate = best_params['learning_rate']
best_batch_size = int(best_params['batch_size'])

best_ctm_model = CombinedTM(
    bow_size=len(tp.vocab),
    contextual_size=768,
    n_components=best_num_topics,
    num_epochs=best_epochs,
    batch_size=best_batch_size,
    activation='softplus',
    dropout=0.2,
    solver='adam',
    num_data_loader_workers=0,
)

# Train the model using the original training dataset
best_ctm_model.fit(training_dataset)

FileNotFoundError: [Errno 2] No such file or directory: '../saved_models/ctm_model_results_summary.csv'

In [9]:
best_ctm_model.get_topics()

NameError: name 'best_ctm_model' is not defined