In [1]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Trends in the albums

Apart from analyzing the emotional pallete of each album I also want to extract a specific thematics - love, determination, cause which the author might want to bring into the public's awareness. 

In [2]:
songs = pd.read_csv('../data/results/processed_lyrics')

In [3]:
songs

Unnamed: 0,album,song,words,joined_words
0,11.TTPD,HowDidItEnd,"['hereby', 'conduct', 'postmortem', 'hot', 'ho...",hereby conduct postmortem hot house flower out...
1,11.TTPD,TheBolter,"['account', 'almost', 'drowned', 'six', 'frigi...",account almost drowned six frigid water confir...
2,11.TTPD,Peter,"['forgive', 'peter', 'lost', 'fearless', 'lead...",forgive peter lost fearless leader closet like...
3,11.TTPD,imgonnagetyouback,"['lilac', 'short', 'skirt', 'one', 'fit', 'lik...",lilac short skirt one fit like skin research k...
4,11.TTPD,DownBad,"['really', 'beam', 'cloud', 'sparkling', 'dust...",really beam cloud sparkling dust experiment te...
...,...,...,...,...
237,7.Lover,MissAmericanaTheHeartbreakPrince,"['know', 'adore', ""i'm"", 'crazier', '16', 'los...",know adore i'm crazier 16 lost film scene wavi...
238,7.Lover,CruelSummer,"['fever', 'dream', 'high', 'quiet', 'night', '...",fever dream high quiet night know caught oh ri...
239,7.Lover,LondonBoy,"['go', ""drivin'"", 'scooter', 'uh', 'know', ""'r...",go drivin' scooter uh know 'round london oh i'...
240,7.Lover,FalseGod,"['crazy', 'think', 'crazy', 'think', 'could', ...",crazy think crazy think could work remember sa...


In [4]:
vectorizer = TfidfVectorizer()

In [5]:
matrix = vectorizer.fit_transform([songs.joined_words[180]])

In [6]:
scores = vectorizer.transform([songs.joined_words[180]])

In [7]:
word_indices = scores.toarray().argsort(axis=1)[:,-20:]

In [8]:
top_words = [(vectorizer.get_feature_names_out()[i], scores[0, i]) for i in word_indices[0]]

In [9]:
top_words

[('rain', 0.12067769800636945),
 ('kissing', 0.12067769800636945),
 ('cursing', 0.12067769800636945),
 ('coming', 0.12067769800636945),
 ('love', 0.12067769800636945),
 ('miss', 0.12067769800636945),
 ('kind', 0.12067769800636945),
 ('insane', 0.12067769800636945),
 ('name', 0.12067769800636945),
 ('2am', 0.12067769800636945),
 ('much', 0.16090359734182594),
 ('could', 0.16090359734182594),
 ('breaking', 0.16090359734182594),
 ('knew', 0.16090359734182594),
 ('feel', 0.20112949667728242),
 ('never', 0.20112949667728242),
 ('way', 0.3218071946836519),
 ('loved', 0.3218071946836519),
 ('that', 0.3218071946836519),
 ('oh', 0.40225899335456483)]

Now lets group the songs by album and get the top 20 songs for each album. Then we are going to save this data in a new dataset which we are going to use later in the [Close-Up Analysis](./close-up.ipynb)

In [10]:
sorted_songs  = songs.sort_values(by='album')

In [11]:
groups = sorted_songs.groupby('album')

In [12]:
def get_top_words(album):
    tfidf_matrix = vectorizer.fit_transform(album['joined_words'])
    word_indices = tfidf_matrix.toarray().argsort(axis=1)[:, -20:]  # Get indices of top 20 words
    top_words = [(vectorizer.get_feature_names_out()[i], tfidf_matrix[0, i]) for i in word_indices[0]]
    return top_words

In [13]:
top_words_per_group = songs.groupby('album').apply(get_top_words)

In [14]:
result = pd.DataFrame(columns=['album', 'word', 'score'])

In [15]:
for album , top_words in top_words_per_group.items():
    for word, score in top_words:
        result = result.append({'album': album, 'word': word, 'score': score}, ignore_index=True)

In [16]:
result

Unnamed: 0,album,word,score
0,1.TaylorSwift,time,0.128101
1,1.TaylorSwift,who,0.139300
2,1.TaylorSwift,old,0.139300
3,1.TaylorSwift,concerned,0.147523
4,1.TaylorSwift,far,0.147523
...,...,...,...
215,9.Evermore,taken,0.221970
216,9.Evermore,could,0.230512
217,9.Evermore,even,0.253596
218,9.Evermore,road,0.253596


In [17]:
result[(result.album == '10.Midnights')]

Unnamed: 0,album,word,score
20,10.Midnights,think,0.068835
21,10.Midnights,sometimes,0.074407
22,10.Midnights,sunshine,0.102405
23,10.Midnights,except,0.102405
24,10.Midnights,full,0.102405
25,10.Midnights,haunted,0.102405
26,10.Midnights,guess,0.102405
27,10.Midnights,like,0.137502
28,10.Midnights,stayed,0.180031
29,10.Midnights,bride,0.180031


In [18]:
result.to_csv('../data/results/album-thematics',index=False)