In [2]:
import string
import spacy
spacy_nlp = spacy.load("en_core_web_sm")
import numpy as np
from spacy.lang.en.stop_words import STOP_WORDS
import utility_functions as utils
import importlib
import pandas as pd
import matplotlib.pyplot as plt

# Octis is the library which can use different implemented topic modelling techniques
from octis.preprocessing.preprocessing import Preprocessing
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.models.CTM import CTM

importlib.reload(utils)

data = '/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Audio/test.json'

# Start from spaCy's English stop words and append contraction fragments and
# filler tokens that are not worth keeping in the vocabulary.
custom_stop_words = [*STOP_WORDS, "ll", "ve", "'em", "em", "ho", "fo", "ah", "de"]

In [3]:
# Load the baseline data sheet into a DataFrame (read through the openpyxl engine).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_excel('/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Excel/baseline_data.xlsx', engine='openpyxl')

In [4]:
# Run the project helper `utils.cleanup` on every entry of the 'Lyrics' column.
# NOTE(review): the column is overwritten in place, so re-running this cell cleans
# already-cleaned text — confirm that `cleanup` is idempotent.
df['Lyrics'] = df['Lyrics'].apply(utils.cleanup)

In [5]:
# Write the corpus with exactly one document per line. Rows whose lyrics are missing
# are skipped, so line k of the file is the k-th row with non-null lyrics.
with open('corpus.tsv', 'w', encoding='utf-8') as file:
    for lyrics in df['Lyrics']:
        if pd.notna(lyrics):
            # Collapse embedded line breaks: a song must never spill over several lines,
            # otherwise line numbers no longer identify songs.
            file.write(' '.join(lyrics.splitlines()) + '\n')

In [6]:
# Configure the OCTIS preprocessing pipeline: punctuation removal, lemmatization,
# the custom stop-word list and document-frequency filtering of the vocabulary.
# `save_original_indexes=True` is what later yields the indexes.txt file on save.
preprocessor = Preprocessing(
    vocabulary=None,
    max_features=None,
    remove_punctuation=True,
    punctuation=string.punctuation,
    lemmatize=True,
    stopword_list=custom_stop_words,
    min_chars=2,
    min_words_docs=0,
    save_original_indexes=True,
    min_df=0.05, # a term must appear in at least 5% of the documents; otherwise, it will be discarded.
    max_df=0.8, # a term appearing in more than 80% of the documents will be discarded, as it might be too common and potentially less informative.
    split=False # We don't want train, validation and test split
)

# NOTE(review): the corpus is written to the relative path 'corpus.tsv' but read here from an
# absolute path — confirm both refer to the same file (i.e. the notebook runs from octis_dataset/).
dataset = preprocessor.preprocess_dataset(documents_path="/Users/borosabel/Documents/Uni/Thesis/PopMIR/Code/Lyrics/octis_dataset/corpus.tsv")

<b>As you can see there are 658 unique words in the vocabulary</b>

In [23]:
# Save the dataset because doing so exports a file called indexes.txt. Preprocessing can
# drop documents that do not have enough words left (for example intro tracks), so the
# processed corpus may be shorter than the original table.
# indexes.txt is loaded later as a list of indexes and applied to the original dataframe
# to avoid a size mismatch.
dataset.save('./')

In [16]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Grid search over the number of topics and the number of training epochs for CTM.
topic_nums = [2, 3, 4, 5, 10, 15]
epoch_nums = [100, 200, 300]

# Start below every attainable score so the first valid run always becomes the incumbent.
best_coherence = float('-inf')
best_diversity = float('-inf')
best_params = {}

results = []

# Neither metric depends on the trained model, so both are built once outside the loops.
coherence = Coherence(texts=dataset.get_corpus(), topk=10)
diversity = TopicDiversity(topk=10)

for num_topics in topic_nums:
    for num_epochs in epoch_nums:
        model = CTM(num_topics=num_topics, inference_type="combined", num_epochs=num_epochs, use_partitions=False, bert_model="bert-base-nli-mean-tokens")
        model_output = model.train_model(dataset)

        coherence_score = coherence.score(model_output)
        diversity_score = diversity.score(model_output)

        # Collect results for plotting and evaluation
        results.append({
            'num_topics': num_topics,
            'num_epochs': num_epochs,
            'coherence': coherence_score,
            'diversity': diversity_score
        })

        # Rank runs by coherence and break ties on diversity. Requiring BOTH metrics to
        # improve at once made the winner depend on the order in which runs were tried
        # and could reject a run with clearly better coherence.
        if (coherence_score, diversity_score) > (best_coherence, best_diversity):
            best_coherence = coherence_score
            best_diversity = diversity_score
            best_params = {'num_topics': num_topics, 'num_epochs': num_epochs}

print(f"Best Coherence: {best_coherence}")
print(f"Best Diversity: {best_diversity}")
print(f"Best Parameters: {best_params}")

# Plot each metric on its own axis: the two scores live on different scales and sharing
# one axis with the same epoch colours made the lines indistinguishable.
results_df = pd.DataFrame(results)
fig, (ax_coherence, ax_diversity) = plt.subplots(1, 2, figsize=(16, 6), sharex=True)
sns.lineplot(data=results_df, x='num_topics', y='coherence', hue='num_epochs', marker='o', ax=ax_coherence)
sns.lineplot(data=results_df, x='num_topics', y='diversity', hue='num_epochs', marker='o', ax=ax_diversity)
for metric_ax, metric_name in ((ax_coherence, 'Coherence'), (ax_diversity, 'Diversity')):
    metric_ax.set_title(f'{metric_name} by Topics and Epochs')
    metric_ax.set_xlabel('Number of Topics')
    metric_ax.set_ylabel(f'{metric_name} Score')
    metric_ax.legend(title='Epochs')
    metric_ax.grid(True)
fig.tight_layout()
plt.show()

In [17]:
# Show the raw grid-search records (one dict per topic/epoch combination).
results

In [10]:
# Retrain CTM with the configuration selected by the grid search.
model = CTM(
    num_topics=best_params['num_topics'],
    inference_type="combined",
    num_epochs=best_params['num_epochs'],
    use_partitions=False,
    bert_model="bert-base-nli-mean-tokens",
)
model_output = model.train_model(dataset)

In [28]:
# Train CTM with a hand-picked configuration (3 topics, 100 epochs).
# NOTE(review): this rebinds `model` / `model_output`, so when the notebook is run top to
# bottom the model trained from `best_params` is discarded — confirm which one is intended.
# NOTE(review): no random seed is set in this cell, so topic numbering may differ between runs.
model = CTM(num_topics=3, inference_type="combined", num_epochs=100, use_partitions=False, bert_model="bert-base-nli-mean-tokens")
model_output = model.train_model(dataset)

In [29]:
# Display the 'topics' entry of the model output as a NumPy array
# (presumably one row of top words per topic — TODO confirm).
np.array(model_output['topics'])

In [11]:
# Inspect the first topic; the trailing label is the author's manual interpretation.
model_output['topics'][0] # Vulgarity

In [12]:
# Inspect the second topic (no manual label assigned yet).
model_output['topics'][1]

In [13]:
# model_output['topics'][2] # Romance

In [14]:
# model_output['topics'][3] # Music

In [15]:
# model_output['topics'][4] # Day, Street-Life, Struggle

In [16]:
# Build both evaluation metrics on the top-10 words of each topic ('topk' can be adjusted).
topic_diversity = TopicDiversity(topk=10)
coherence = Coherence(texts=dataset.get_corpus(), topk=10)

# Score the currently trained model.
diversity_score = topic_diversity.score(model_output)
coherence_score = coherence.score(model_output)

for metric_label, metric_value in (
    ("Coherence Score:", coherence_score),
    ("Diversity Score:", diversity_score),
):
    print(metric_label, metric_value)

<b>Let's check which document belongs mostly to which topic</b>

In [17]:
model_output['topic-document-matrix']

In [18]:
# Each document is assigned the topic with the highest probability (arg-max over axis 0).
dominant_topic_indices = np.asarray(model_output['topic-document-matrix']).argmax(axis=0)

In [19]:
# One entry per document: the index of its most probable topic.
dominant_topic_indices

In [20]:
# Load the previously saved indexes file so the documents can be filtered by index:
# some documents may not be included in the analysis because of the preprocessing steps.
file_path = 'indexes.txt'

# Each line of the file holds one integer index.
with open(file_path, 'r') as file:
    indices = []
    for line in file:
        indices.append(int(line.strip()))

In [21]:
# The original dataset has 1368 rows (the row count is shown below the table).
df

In [22]:
# The saved indexes are line numbers of the corpus file, and the corpus only contains rows
# with non-null lyrics. Select positions among those rows so a missing lyric cannot shift
# every later song by one. Take a copy so columns can be added later without
# pandas' SettingWithCopyWarning.
filtered_df = df[df['Lyrics'].notna()].iloc[indices].copy()

In [23]:
# Display the rows that survived preprocessing.
filtered_df

In [24]:
# Sanity check: the filtered dataframe must contain the same number of songs as the
# preprocessed dataset (one dominant topic per song).
filtered_df.shape[0] == dominant_topic_indices.shape[0]

In [25]:
# Attach the dominant topic of every song. `assign` builds a new frame instead of writing
# into a slice of `df`, which avoids SettingWithCopyWarning and keeps the cell re-runnable.
filtered_df = filtered_df.assign(**{'Topic Model': dominant_topic_indices})

In [26]:
# Preview the first rows, now including the 'Topic Model' column.
filtered_df.head()

In [27]:
# Save the data:
# filtered_df.to_excel("baseline_data_w_topics.xlsx", sheet_name="Sheet1", index=False)

<b>Check The Topic Distribution</b>

In [28]:
# Count songs for every (dominant topic, coast) combination.
topic_coast_distribution = pd.crosstab(
    index=filtered_df['Topic Model'],
    columns=filtered_df['Coast'],
)
print(topic_coast_distribution)

In [29]:
# Global normalization: every cell becomes its share of all songs in the table.
total_songs = topic_coast_distribution.to_numpy().sum()
topic_coast_distribution_normalized = topic_coast_distribution.div(total_songs)

Here we can see that West Coast songs are the largest contributors to one of the topics.
There should also be a topic where the East Coast contribution is larger than the West Coast contribution.
The other 

In [30]:
# Grouped bar chart of the globally normalized topic/coast shares, in percent.
colors = ['blue', 'red']
ax = topic_coast_distribution_normalized.mul(100).plot(kind='bar', figsize=(10, 6), color=colors)
ax.set_title('Proportional Contribution of Each Coast to Topics (Global Normalization)')
ax.set_xlabel('Dominant Topic')
ax.set_ylabel('Percentage of Total Songs')
plt.xticks(rotation=0)
ax.legend(title='Coast')

# Label every non-empty bar with its percentage, centred halfway up the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    if not bar_height > 0:
        continue
    bar_x, bar_y = bar.get_xy()
    label_position = (bar_x + bar.get_width() / 2, bar_y + bar_height * 0.5)
    ax.annotate(f'{bar_height:.2f}%', label_position, ha='center')

plt.show()

<b>Let's check the characteristics of the two topics that show large differences between the coast distributions.</b>

In [41]:
# Split out the songs of the two topics under study.
# NOTE(review): the variable names say "2" and "4" but the selected topic ids are 0 and 1.
is_topic_0 = filtered_df['Topic Model'] == 0
is_topic_1 = filtered_df['Topic Model'] == 1
topic2_df = filtered_df.loc[is_topic_0]
topic4_df = filtered_df.loc[is_topic_1]

<b>Vulgarity/Bad Words Topic</b>

In [42]:
# Keep only the artist and coast columns for this topic's songs.
topic2_artists = topic2_df.loc[:, ['Artist', 'Coast']]

In [43]:
# Count this topic's songs per (artist, coast) pair.
topic2_artist_topic_distribution = pd.crosstab(
    index=topic2_artists['Artist'],
    columns=topic2_artists['Coast'],
)
print(topic2_artist_topic_distribution)

In [44]:
# Turn the per-artist counts into shares of all songs assigned to this topic.
total_songs_in_topic2 = topic2_artist_topic_distribution.to_numpy().sum()
topic2_coast_distribution_normalized = topic2_artist_topic_distribution.div(total_songs_in_topic2)

<b>Here we can see that the biggest contributing artists to this topic are Ice Cube, Too Short, and Eazy-E.</b>

In [45]:
# Bar chart of each artist's share (in percent) of the songs in this topic, split by coast.
colors = ['blue', 'red']
topic2_percentages = topic2_coast_distribution_normalized * 100
ax = topic2_percentages.plot(kind='bar', figsize=(16, 6), color=colors)
ax.set_title('Proportional Contribution of Artists to The Current Topic')
ax.set_xlabel('Artist')
ax.set_ylabel('Percentage of Contribution')
plt.xticks(rotation=90)
ax.legend(title='Coast')

# Write the percentage inside every bar that has a positive height.
for patch in ax.patches:
    patch_height = patch.get_height()
    if patch_height > 0:
        left, bottom = patch.get_xy()
        centre = (left + patch.get_width() / 2, bottom + patch_height * 0.5)
        ax.annotate(f'{patch_height:.2f}%', centre, ha='center')

plt.show()

In [46]:
# Keep only the artist and coast columns for the second topic's songs.
topic4_artists = topic4_df.loc[:, ['Artist', 'Coast']]

In [47]:
# Count the second topic's songs per (artist, coast) pair.
topic4_artist_topic_distribution = pd.crosstab(
    index=topic4_artists['Artist'],
    columns=topic4_artists['Coast'],
)
print(topic4_artist_topic_distribution)

In [48]:
# Turn the per-artist counts into shares of all songs assigned to the second topic.
total_songs_in_topic4 = topic4_artist_topic_distribution.to_numpy().sum()
topic4_coast_distribution_normalized = topic4_artist_topic_distribution.div(total_songs_in_topic4)

<b>In this topic the East Coast artists are distributed much more evenly. Most of the artists contribute between 6% and 10% of the topic.</b>

In [49]:
# Bar chart of each artist's share (in percent) of the songs in the second topic, split by coast.
colors = ['blue', 'red']
topic4_percentages = topic4_coast_distribution_normalized * 100
ax = topic4_percentages.plot(kind='bar', figsize=(16, 6), color=colors)
ax.set_title('Proportional Contribution of Artists to The Current Topic')
ax.set_xlabel('Artist')
ax.set_ylabel('Percentage of Contribution')
plt.xticks(rotation=90)
ax.legend(title='Coast')

# Write the percentage inside every bar that has a positive height.
for patch in ax.patches:
    patch_height = patch.get_height()
    if patch_height > 0:
        left, bottom = patch.get_xy()
        centre = (left + patch.get_width() / 2, bottom + patch_height * 0.5)
        ax.annotate(f'{patch_height:.2f}%', centre, ha='center')

plt.show()

In [144]:
# `df_artists_only` was never defined anywhere in the notebook (left-over kernel state), so
# the cell failed on a fresh kernel. Rebuild it from the topic-annotated frame.
# NOTE(review): presumably it was the artist/topic projection of `filtered_df` — confirm.
df_artists_only = filtered_df[['Artist', 'Topic Model']]

# Count songs per (artist, dominant topic) pair.
artist_topic_distribution = pd.crosstab(df_artists_only['Artist'], df_artists_only['Topic Model'])
print(artist_topic_distribution)

In [152]:
import seaborn as sns

# Long format: "Artist" becomes a regular column and every (artist, topic) pair one row.
df_melted = (
    artist_topic_distribution
    .reset_index()
    .melt(id_vars='Artist', value_name='Number of Songs', var_name='Topic Model')
)

# Predefined seaborn palette; alternatives are "pastel", "deep", "muted", "dark" or "colorblind".
palette = sns.color_palette("bright")

# Grouped bars: one group per artist, one bar per dominant topic.
plt.figure(figsize=(15, 10))
ax = sns.barplot(data=df_melted, x='Artist', y='Number of Songs', hue='Topic Model', palette=palette)
ax.set_title('Topic Distribution Across Different Artists')
ax.set_xlabel('Artist')
ax.set_ylabel('Number of Songs')
plt.xticks(rotation=90)  # vertical labels stay readable with many artists
ax.legend(title='Dominant Topic')
plt.show()

In [155]:
# Pull the topic-word matrix out of the model output for plotting.
topic_word_matrix = model_output['topic-word-matrix']

In [158]:
# Heatmap of the topic-word matrix (topics on the y axis, words on the x axis).
plt.figure(figsize=(20, 10))
ax = sns.heatmap(topic_word_matrix, cmap='viridis', linewidths=.5)
ax.set_title('Topic-Word Matrix')
ax.set_xlabel('Words')
ax.set_ylabel('Topics')
plt.show()

In [7]:
# Second grid search: rank CTM configurations by coherence only.
best_coherence = float('-inf')  # below any attainable score, so the first valid run wins
best_params = {}
topic_nums = [5, 10, 15]
epoch_nums = [100, 200, 300]

# The metric depends only on the corpus, so it is built once. The score is kept under its
# own name so the metric object is never overwritten by a float.
coherence = Coherence(texts=dataset.get_corpus(), topk=10)  # Adjust 'topk' as needed

for num_topics in topic_nums:
    for num_epochs in epoch_nums:
        # Candidate-specific names: `model` / `model_output` are used by the analysis and
        # visualisation cells and must keep pointing at the model those cells describe.
        candidate_model = CTM(num_topics=num_topics, inference_type="combined", num_epochs=num_epochs, use_partitions=False, bert_model="bert-base-nli-mean-tokens")
        candidate_output = candidate_model.train_model(dataset)
        candidate_coherence = coherence.score(candidate_output)

        if candidate_coherence > best_coherence:
            best_coherence = candidate_coherence
            best_params = {'num_topics': num_topics, 'num_epochs': num_epochs}

print(f"Best Coherence: {best_coherence}")
print(f"Best Parameters: {best_params}")

In [41]:
# Dump the complete model output dictionary.
# NOTE(review): this prints every matrix in full — consider showing only `model_output.keys()`.
model_output

In [78]:
# Collect the matrices and the vocabulary needed for the topic visualisations.
topic_word_matrix = model_output['topic-word-matrix']
topic_document_matrix = model_output['topic-document-matrix']
vocabulary = dataset.get_vocabulary()
document_ids = range(len(topic_document_matrix[0]))  # one id per column (document)

# Rows are assumed to be topics and columns documents, so summing over axis 1 gives the
# total weight ("size") of every topic.
topic_sizes = np.sum(topic_document_matrix, axis=1)

# Min-max scale the sizes to marker sizes between 10 and 110. When every topic has the same
# size the span is zero and the division would produce NaN markers, so fall back to a
# uniform mid-range size.
size_span = np.max(topic_sizes) - np.min(topic_sizes)
if size_span > 0:
    normalized_sizes = (topic_sizes - np.min(topic_sizes)) / size_span * 100 + 10
else:
    normalized_sizes = np.full(np.shape(topic_sizes), 60.0)

In [79]:
# Tooltip text: the ten highest-weighted words of every topic, joined with commas.
top_words_per_topic = []
for word_weights in topic_word_matrix:
    # Sort (word index, weight) pairs by weight, heaviest first.
    ranked = sorted(enumerate(word_weights), key=lambda pair: pair[1], reverse=True)
    top_words_per_topic.append(", ".join(vocabulary[word_id] for word_id, _ in ranked[:10]))

In [74]:
from sklearn.manifold import TSNE

# Each sample here is a topic, so there are only a handful of rows. t-SNE requires
# perplexity < n_samples and recent scikit-learn releases raise an error for the default
# of 30, so cap the perplexity at n_samples - 1.
n_topic_rows = len(topic_word_matrix)
topic_perplexity = min(30.0, max(n_topic_rows - 1, 1))

tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca',
                  perplexity=topic_perplexity)
tsne_results = tsne_model.fit_transform(topic_word_matrix)

In [87]:
import plotly.graph_objects as go
import plotly.express as px

# One trace per topic: placed at its t-SNE coordinates, sized by the normalized topic size,
# with the topic's top words shown on hover.
topic_traces = [
    go.Scatter(
        x=[point[0]],
        y=[point[1]],
        name=f"Topic {topic_id}",
        marker=dict(size=normalized_sizes[topic_id]),
        text=f"Topic {topic_id}: {top_words_per_topic[topic_id]}",
        hoverinfo='text',
    )
    for topic_id, point in enumerate(tsne_results)
]

fig = go.Figure(data=topic_traces)
fig.update_layout(
    title="Topic Visualization",
    xaxis_title="Component 1",
    yaxis_title="Component 2",
    hovermode='closest',
)
fig.show()

In [81]:
# Display the 'topics' entry of the model output.
model_output['topics']

In [88]:
# Sorted, de-duplicated topic ids that actually occur as a dominant topic.
unique_topics = np.unique(filtered_df['Topic Model'])

In [90]:
# Transpose so that every row is a document and every column a topic.
document_topic_matrix = topic_document_matrix.T

# Apply t-SNE to the document-topic matrix to get 2-D coordinates per document
# (fixed random_state for repeatable layouts).
tsne = TSNE(n_components=2, random_state=42)
document_tsne_results = tsne.fit_transform(document_topic_matrix)

In [89]:
# Give every dominant topic a colour from Plotly's qualitative Set1 palette, cycling if
# there are more topics than colours.
colors = px.colors.qualitative.Set1
topic_color_map = {topic: colors[i % len(colors)] for i, topic in enumerate(unique_topics)}

# `assign` returns a new frame instead of writing into a slice of the original dataframe,
# which avoids pandas' SettingWithCopyWarning and keeps the cell idempotent.
filtered_df = filtered_df.assign(color=filtered_df['Topic Model'].map(topic_color_map))

In [92]:
import plotly.express as px

# Scatter every song at its 2-D t-SNE position, coloured by dominant topic.
# NOTE(review): rows of `filtered_df` and rows of `document_tsne_results` are paired by
# position — both must come from the same model run and the same document order.
fig = px.scatter(
    filtered_df,
    x=document_tsne_results[:, 0],
    y=document_tsne_results[:, 1],
    color=filtered_df['Topic Model'].astype(str),  # strings make Plotly treat topics as categories
    hover_data=['Song', 'Artist'],  # assumes these columns exist in filtered_df
    title="Songs in 2D Space by Topic",
    labels={"color": "Topic"}
)

# Uniform marker size with a thin dark outline.
fig.update_traces(marker=dict(size=10, line=dict(width=1, color='DarkSlateGrey')))
fig.show()
