In [1]:
import warnings
from urllib3.exceptions import NotOpenSSLWarning

# Silence urllib3's NotOpenSSLWarning so it does not clutter the notebook output.
# NOTE(review): urllib3 may emit this warning while it is being imported (i.e. on the
# import above), before this filter is installed — confirm the filter takes effect.
warnings.filterwarnings("ignore", category=NotOpenSSLWarning)

In [4]:
import spacy
# Load spaCy's small English pipeline.
# NOTE(review): `spacy_nlp` is not referenced in any of the visible cells —
# confirm whether it is still needed.
spacy_nlp = spacy.load("en_core_web_sm")
import utility_functions as utils
import importlib
import pandas as pd
import seaborn as sns

# Text processing
from gensim import corpora

# Visualization
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
# Progress reporting, LDA training and coherence scoring
from tqdm import tqdm
from gensim import models
from gensim.models.coherencemodel import CoherenceModel

# Re-import the local helper module so that edits to it are picked up
# without restarting the kernel.
importlib.reload(utils)

# Path to the pickled, preprocessed DataFrame that is read in the next cell.
data = './preprocessed_df.pkl'

In [5]:
# Load the preprocessed DataFrame from the pickle file at `data`.
# NOTE(security): unpickling can execute arbitrary code — only load pickle
# files that come from a trusted source.
df = pd.read_pickle(data)
# Preview the identifying columns together with the ones used below
# ('Tokens', 'Lyrics' and 'Coast').
df[['Artist', 'Song', 'Tokens', 'Lyrics', 'Coast']].head()

In [6]:
# Create a dictionary representation of the documents (token <-> integer id mapping).
# Assumes each entry of df['Tokens'] is a list of token strings — TODO confirm.
dictionary = corpora.Dictionary(df['Tokens'])
# Prune the vocabulary: drop tokens that appear in fewer than 5 documents
# or in more than 50% of all documents.
dictionary.filter_extremes(no_below=5, no_above=0.5)
print(f"Number of unique tokens: {len(dictionary)}")

In [7]:
# Convert documents to a bag-of-words representation
corpus = [dictionary.doc2bow(text) for text in df['Tokens']]

In [8]:
from tqdm import tqdm
from gensim import models
from gensim.models import CoherenceModel

def compute_coherence_values(dictionary, corpus, texts, start, limit, step, chunksize, passes, iterations, eval_every):
    """Train one LDA model per candidate topic count and score each with c_v coherence.

    Candidate topic counts are ``range(start, limit, step)``, so ``limit``
    itself is excluded.

    Args:
        dictionary: gensim dictionary passed to the model as ``id2word`` and
            to the coherence scorer.
        corpus: Bag-of-words corpus the models are trained on.
        texts: Tokenised documents handed to the coherence scorer.
        start, limit, step: Bounds and stride of the topic-count sweep.
        chunksize, passes, iterations, eval_every: Forwarded unchanged to
            ``models.LdaModel``.

    Returns:
        Tuple ``(model_list, coherence_values)``; both lists follow the order
        of the topic-count sweep.
    """
    # Settings shared by every model in the sweep; only num_topics varies.
    shared_lda_args = dict(
        corpus=corpus,
        id2word=dictionary,
        random_state=42,
        chunksize=chunksize,
        passes=passes,
        iterations=iterations,
        alpha='auto',
        eta='auto',
        eval_every=eval_every,
    )

    trained_models, scores = [], []

    # tqdm wraps the sweep so progress is visible while the models train
    for topic_count in tqdm(range(start, limit, step), desc="Computing coherence"):
        lda = models.LdaModel(num_topics=topic_count, **shared_lda_args)
        trained_models.append(lda)

        # Score this model's topics with the 'c_v' coherence measure
        scorer = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
        scores.append(scorer.get_coherence())

    return trained_models, scores

In [9]:
# Candidate topic counts are range(start, limit, step), i.e. `limit` is excluded
start, limit, step = 2, 6, 1

# LDA training settings forwarded to compute_coherence_values
chunksize, passes, iterations, eval_every = 200, 15, 100, 10

# Train and score one model per candidate topic count
model_list, coherence_values = compute_coherence_values(
    dictionary=dictionary, corpus=corpus, texts=df['Tokens'],
    start=start, limit=limit, step=step,
    chunksize=chunksize, passes=passes,
    iterations=iterations, eval_every=eval_every,
)

In [10]:
# Topic counts that were evaluated, in the same order as `coherence_values`
x = list(range(start, limit, step))

# Bar chart of the coherence score for each candidate number of topics
plt.figure(figsize=(10, 6))
sns.barplot(x=x, y=coherence_values)

# Set plot labels and title
plt.xlabel("Number of Topics")
plt.ylabel("Coherence Score")
plt.title("Coherence Scores for Different Numbers of Topics")

# No explicit plt.xticks(...) call here: barplot draws the bars on a categorical
# axis at positions 0..len(x)-1 and already labels them with the integer topic
# counts. Placing ticks at the *values* start..limit-1 would detach the tick
# labels from the bars and stretch the axis past the last bar.

plt.show()

In [11]:
# Position of the best coherence score; ties resolve to the earliest entry
optimal_index = max(range(len(coherence_values)), key=coherence_values.__getitem__)
optimal_model, optimal_num_topics = model_list[optimal_index], x[optimal_index]

print(f'Optimal Number of Topics: {optimal_num_topics}')
print(f'Highest Coherence Score: {coherence_values[optimal_index]:.4f}')

In [13]:
# Prepare the visualization
# Enable pyLDAvis output inside the notebook
pyLDAvis.enable_notebook()
# Build the visualization data for the selected model from the bag-of-words
# corpus and its dictionary
vis = gensimvis.prepare(optimal_model, corpus, dictionary)

# Display the visualization (kept as the last expression so the cell renders it)
pyLDAvis.display(vis)

In [14]:
def format_topics_sentences(ldamodel, corpus, texts):
    """Build a per-document table of the dominant topic and its keywords.

    Args:
        ldamodel: Topic model. ``ldamodel[corpus]`` must yield, for every
            document, a list of ``(topic_id, probability)`` pairs, and
            ``ldamodel.show_topic(topic_id)`` a list of ``(word, weight)`` pairs.
        corpus: Bag-of-words documents, in the same order as ``texts``.
        texts: Original documents (list or Series), one per corpus entry.
            Any index on a Series is ignored; rows are matched by position.

    Returns:
        DataFrame on a 0..n-1 index with the columns ``Dominant_Topic``,
        ``Perc_Contribution``, ``Topic_Keywords`` and ``Text``. A document for
        which the model returns no topics gets missing values in the three
        topic columns instead of being dropped, so rows stay aligned with
        their text.
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # topic id -> joined keyword string, computed once per topic
    records = []

    for row_list in ldamodel[corpus]:
        if not row_list:
            # Keep a placeholder row so later documents do not shift up by one.
            records.append((None, None, None))
            continue

        # max() returns the first pair with the highest probability, which is
        # the same pair a stable descending sort would put first.
        topic_num, prop_topic = max(row_list, key=lambda pair: pair[1])
        topic_num = int(topic_num)

        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )

        records.append((topic_num, round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame once instead of concatenating one row per document.
    sent_topics_df = pd.DataFrame(records, columns=columns)

    # Add the original text by position: drop whatever index `texts` carries so
    # the column-wise concat cannot misalign or introduce extra rows.
    contents = pd.Series(texts).reset_index(drop=True).rename('Text')
    return pd.concat([sent_topics_df, contents], axis=1)

# Dominant topic, its weight and its keywords for every document, next to the raw lyrics
df_topic_sents_keywords = format_topics_sentences(optimal_model, corpus, df['Lyrics'])

# Turn the row position into a document-number column and give every column its final name
df_dominant_topic = df_topic_sents_keywords.reset_index().set_axis(
    ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text'],
    axis=1,
)

# Display the result
df_dominant_topic.head()

In [15]:
# Attach each document's coast label from the 'Coast' column of the source DataFrame.
# NOTE(review): this assignment aligns on index labels, so it assumes `df` carries
# the same 0..n-1 index as `df_dominant_topic` — confirm; otherwise rows would
# receive NaN or mismatched labels.
df_dominant_topic['Coast'] = df['Coast']

# Calculate the distribution of topics by coast in percentages.
# normalize='index' normalizes over each row (topic) before scaling by 100.
topic_region_dist = pd.crosstab(df_dominant_topic['Dominant_Topic'],
                                df_dominant_topic['Coast'],
                                normalize='index') * 100
print(topic_region_dist)

In [16]:
# Bar colour per coast: East Coast in blue, West Coast in red
palette = {"east_coast": "blue", "west_coast": "red"}

# Move 'Dominant_Topic' out of the index so it can serve as an id column
topic_region_dist = topic_region_dist.reset_index()

# Long format for seaborn: one row per (topic, coast) pair
topic_region_dist_melted = pd.melt(
    topic_region_dist,
    id_vars='Dominant_Topic',
    value_vars=list(palette),
    var_name='Coast',
    value_name='Percentage',
)

# Grouped bar chart: one group per topic, one bar per coast
plt.figure(figsize=(12, 6))
sns.barplot(
    data=topic_region_dist_melted,
    x='Dominant_Topic',
    y='Percentage',
    hue='Coast',
    palette=palette,
)

# Title and axis labels
plt.title("Percentage Contribution of East and West Coast to Each Topic")
plt.xlabel("Dominant Topic")
plt.ylabel("Percentage Contribution")

# Render the figure
plt.show()

In [17]:
# Preview the first rows of the source DataFrame with all of its columns
df.head()