# Prep

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from collections import Counter
import spacy
import en_core_web_lg
import gensim
from gensim import corpora, models
from gensim.models import word2vec, CoherenceModel
from wordcloud import WordCloud
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.cluster import DBSCAN, AgglomerativeClustering, KMeans
import hdbscan
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
import clean
import word_2dviz
import tqdm

pd.options.mode.chained_assignment = None
import pyLDAvis.gensim

pyLDAvis.enable_notebook()

import warnings

warnings.filterwarnings("ignore")

%load_ext lab_black

In [2]:
# Read from CSV
df = pd.read_csv("nightwish_lyrics.csv")

# Drop rows that no longer have lyrics
df_clean = df.dropna()

In [3]:
# Aggregate lyrics by song
df_song = (
    df_clean.groupby(["track_title"])["lyric_lemma"]
    .apply(lambda x: " ".join(x))
    .reset_index()
)

In [4]:
#  JOIN df_song with df_clean for album title and year
df_merged = df_song.merge(
    df_clean[["track_title", "album_title", "year"]],
    on="track_title",
    validate="one_to_many",
)

# Drop duplicated rows
df_merged.drop_duplicates(inplace=True)

# Reset index
df_merged.reset_index(inplace=True)

In [5]:
# List of studio albumns by Nightwish
studio_albums = [
    "Angels Fall First",
    "Oceanborn",
    "Wishmaster",
    "Century Child",
    "Once",
    "Dark Passion Play",
    "Imaginaerum",
    "Hvman. :||: Natvre.",
]

# Only keep in-album songs
df_merged = df_merged.loc[df_merged["album_title"].isin(studio_albums)]

# Reset index
df_merged.reset_index(inplace=True)

# Drop extra columns
df_merged.drop(["level_0", "index"], axis=1, inplace=True)

In [6]:
# Manually filter out more stopwords
filter = ["s", "oh", "ah", "o", "e", "ee", "ieee", "let"]

lyrics_clean = []

for lyric in df_merged["lyric_lemma"]:
    lyric_list_clean = [word for word in lyric.split() if word not in filter]
    lyric_clean = " ".join(lyric_list_clean)
    lyrics_clean.append(lyric_clean)

df_merged["lyric_lemma"] = lyrics_clean

In [7]:
# Tokenize words
lyrics = df_merged["lyric_lemma"].tolist()
lyrics = [i for i in lyrics if i]  # Remove empty strings
tokens = [lyric.split() for lyric in lyrics]

In [8]:
# Remove empty lists
tokens = [i for i in tokens if i]

In [9]:
# Create a "bag of words"
dictionary = corpora.Dictionary(tokens)
dictionary.filter_extremes(
    no_below=10, no_above=0.5
)  # Filter out words that are too rare or too common
corpus_bow = [dictionary.doc2bow(token) for token in tokens]

In [10]:
# Create TF-IDF
tfidf = models.TfidfModel(corpus_bow)
corpus_tfidf = tfidf[corpus_bow]

# Model

In [11]:
from gensim.models import LdaModel

ldamodel_bow = LdaModel(corpus_bow, id2word=dictionary, num_topics=3)

In [17]:
# Find topics across songs
def lda_model(n_topic, corpus):

    # Run LDA
    ldamodel = gensim.models.LdaMulticore(
        corpus, num_topics=n_topic, id2word=dictionary, passes=10
    )

    # Show topics
    for idx, topic in ldamodel.print_topics(-1):
        print(f"Topic: {idx} \nWords: {topic} \n")

    # Return model
    return ldamodel

In [18]:
ldamodel_bow2 = lda_model(3, corpus_bow)

Topic: 0 
Words: 0.043*"away" + 0.037*"night" + 0.036*"dream" + 0.029*"come" + 0.027*"world" + 0.025*"tale" + 0.023*"star" + 0.022*"want" + 0.022*"earth" + 0.021*"sing" 

Topic: 1 
Words: 0.057*"come" + 0.039*"wish" + 0.039*"rest" + 0.034*"dream" + 0.033*"night" + 0.029*"home" + 0.026*"deep" + 0.026*"day" + 0.023*"angel" + 0.023*"face" 

Topic: 2 
Words: 0.058*"heart" + 0.042*"world" + 0.040*"die" + 0.032*"dead" + 0.030*"lie" + 0.030*"heaven" + 0.028*"soul" + 0.026*"great" + 0.026*"time" + 0.024*"man" 



# Function

In [12]:
def dominant_topic(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(
                    pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]),
                    ignore_index=True,
                )
            else:
                break
    sent_topics_df.columns = ["Dominant_Topic", "Perc_Contribution", "Topic_Keywords"]

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

In [19]:
dominant_topic(ldamodel_bow, corpus_bow, tokens)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,0
0,0.0,0.7063,"dream, heart, wish, world, child, night, come,...","[wolf, love, come, take, home, dust, man, life..."
1,1.0,0.9533,"soul, sea, night, die, come, world, wish, hear...","[star, fall, darken, sky, new, world, bear, di..."
2,2.0,0.9741,"come, night, heart, dream, away, world, beauty...","[look, dot, home, love, know, hear, human, liv..."
3,2.0,0.9852,"come, night, heart, dream, away, world, beauty...","[baptise, perfect, doubt, heart, war, day, nee..."
4,2.0,0.9632,"come, night, heart, dream, away, world, beauty...","[angel, face, smile, headline, tragedy, smile,..."
...,...,...,...,...
86,2.0,0.9737,"come, night, heart, dream, away, world, beauty...","[want, siren, sing, hear, wolf, howl, sail, de..."
87,2.0,0.5878,"come, night, heart, dream, away, world, beauty...","[enchantress, come, say, meet, lake, tonight, ..."
88,2.0,0.9647,"come, night, heart, dream, away, world, beauty...","[seduce, dark, pain, rapture, like, ship, pass..."
89,0.0,0.9847,"dream, heart, wish, world, child, night, come,...","[deep, die, day, take, step, outside, innocent..."


In [20]:
dominant_topic(ldamodel_bow2, corpus_bow, tokens)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,0
0,1.0,0.8336,"come, wish, rest, dream, night, home, deep, da...","[wolf, love, come, take, home, dust, man, life..."
1,2.0,0.8578,"heart, world, die, dead, lie, heaven, soul, gr...","[star, fall, darken, sky, new, world, bear, di..."
2,1.0,0.8149,"come, wish, rest, dream, night, home, deep, da...","[look, dot, home, love, know, hear, human, liv..."
3,2.0,0.9857,"heart, world, die, dead, lie, heaven, soul, gr...","[baptise, perfect, doubt, heart, war, day, nee..."
4,1.0,0.6334,"come, wish, rest, dream, night, home, deep, da...","[angel, face, smile, headline, tragedy, smile,..."
...,...,...,...,...
86,0.0,0.7187,"away, night, dream, come, world, tale, star, w...","[want, siren, sing, hear, wolf, howl, sail, de..."
87,1.0,0.9591,"come, wish, rest, dream, night, home, deep, da...","[enchantress, come, say, meet, lake, tonight, ..."
88,1.0,0.7478,"come, wish, rest, dream, night, home, deep, da...","[seduce, dark, pain, rapture, like, ship, pass..."
89,1.0,0.9845,"come, wish, rest, dream, night, home, deep, da...","[deep, die, day, take, step, outside, innocent..."
