In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from transformers import BertTokenizer, BertModel

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [2]:
lyrics = pd.read_csv("../data/lyrics.csv")
lyrics.head()

Unnamed: 0,track_name,lyrics
0,Tim McGraw,he said the way my blue eyes shined put those ...
1,Picture To Burn,"state the obvious, i didn't get my perfect fan..."
2,Teardrops On My Guitar,drew looks at me i fake a smile so he won't se...
3,A Place In This World,"i don't know what i want, so don't ask me caus..."
4,Cold As You,you have a way of coming easily to me and when...


In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def encode_text(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    
    with torch.no_grad(): 
        outputs = model(**inputs)
    
    embeddings = outputs.last_hidden_state[:, 0, :].numpy()
    return embeddings

In [4]:
# Attempting to encode Tim McGraw Lyrics
mcgraw_lyrics = lyrics.iloc[0,1]
print(mcgraw_lyrics)

mcgraw_embedding = encode_text(mcgraw_lyrics)
print(mcgraw_embedding.shape)
print(mcgraw_embedding)

he said the way my blue eyes shined put those georgia stars to shame that night i said, "that's a lie" just a boy in a chevy truck that had a tendency of gettin' stuck on backroads at night and i was right there beside him all summer long and then the time we woke up to find that summer gone but when you think tim mcgraw i hope you think my favorite song the one we danced to all night long the moon like a spotlight on the lake when you think happiness i hope you think that little black dress think of my head on your chest and my old faded blue jeans when you think tim mcgraw i hope you think of me september saw a month of tears and thankin' god that you weren't here to see me like that but in a box beneath my bed is a letter that you never read from three summers back it's hard not to find it all a little bittersweet and lookin' back on all of that, it's nice to believe when you think tim mcgraw i hope you think my favorite song the one we danced to all night long the moon like a spotl

In [5]:
lyrics['latent_embedding'] = lyrics['lyrics'].apply(lambda x: encode_text(x))

In [37]:
embeddings = np.array([embedding.flatten() for embedding in lyrics['latent_embedding']])

pca = PCA()

# Fit PCA on the embeddings
pca.fit(embeddings)

# Calculate the cumulative sum of explained variance ratio
cumulative_variance = pca.explained_variance_ratio_.cumsum()

# Find the number of components to keep 95% of the variance
n_components_95 = (cumulative_variance < 0.95).sum() + 1

# Print the number of components
print(f"Number of components to keep 95% of variance: {n_components_95}")

# Now, initialize PCA with n_components_95 to reduce the data while keeping 95% of the variance
pca_95 = PCA(n_components=n_components_95)

# Fit PCA on the embeddings and transform the data
reduced_data_95 = pca_95.fit_transform(embeddings)

"""k_values = range(5, 50) 
wcss = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=256, n_init=10)
    kmeans.fit(reduced_data_95)
    wcss.append(kmeans.inertia_)

# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(k_values, wcss, '-o', color='black')
plt.title('The Elbow Method')
plt.xlabel('Number of clusters (k)')
plt.ylabel('WCSS')
plt.xticks(k_values)
plt.grid(True)
plt.show()"""

k = 10
kmeans = KMeans(n_clusters=k, random_state=256, n_init=10).fit(embeddings)
lyrics['cluster_assignment'] = kmeans.labels_

Number of components to keep 95% of variance: 130


In [42]:
cluster_sizes = {}

for label in lyrics['cluster_assignment']:
    if label in cluster_sizes:
        cluster_sizes[label] += 1
    else:
        cluster_sizes[label] = 1

print(cluster_sizes)

lyrics[lyrics["cluster_assignment"] == 4]

{0: 34, 2: 39, 9: 31, 3: 62, 1: 26, 5: 42, 4: 22, 7: 30, 6: 28, 8: 11}


Unnamed: 0,track_name,lyrics,latent_embedding,cluster_assignment
24,The Way I Loved You,he is sensible and so incredible and all my si...,"[[-0.6524123, -0.04458748, -0.22136214, 0.0836...",4
87,Innocent (Taylor's Version),i guess you really did it this time left yours...,"[[-0.63805485, -0.07485211, -0.3526963, 0.0723...",4
96,Castles Crumbling (Taylor's Version) [From The...,"(once, i had an empire) (once, i had an empire...","[[-0.7658571, -0.0068979934, -0.04758803, 0.14...",4
105,I Almost Do,"i bet this time of night, you're still up i be...","[[-0.3132206, -0.05314971, -0.7422055, -0.1833...",4
119,Red (Original Demo Recording),loving him is like driving a new maserati down...,"[[-0.53848416, -0.09094588, -0.55734235, 0.164...",4
127,I Almost Do (Taylor's Version),"i bet this time of night, you're still up i be...","[[-0.33771777, -0.07272535, -0.75503176, -0.21...",4
172,Shake It Off (Taylor's Version),i stay out too late got nothin' in my brain th...,"[[-0.665012, 0.16054896, -0.089480355, -0.0595...",4
195,Look What You Made Me Do,i don't like your little games don't like your...,"[[-0.58706117, -0.20761967, -0.2624453, 0.0901...",4
198,Getaway Car,"no, nothing good starts in a getaway car it wa...","[[-0.6162787, 0.2732562, -0.4512482, 0.1393862...",4
228,mirrorball,i want you to know i'm a mirrorball i'll show ...,"[[-0.39124307, -0.2668876, -0.2012381, -0.0133...",4


In [None]:
lyrics.to_csv("lyrics.csv", index=False)