In [1]:
import pandas as pd
import nltk
import nltk.corpus
import string
import numpy as np
import gensim
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans



In [2]:
# Load the per-song lyrics CSV and trim surrounding whitespace from the song
# titles in the same step.
# NOTE(review): absolute, user-specific path -- the notebook only runs on a
# machine where this exact file exists.
newpath = '/Users/awburns2/Downloads/Taylor_FullLyrics_By_Song_up.csv'
tswizzle = pd.read_csv(newpath).assign(
    song_title=lambda frame: frame['song_title'].str.strip()
)

In [3]:
#optional print to check if it worked
#print(tswizzle.iloc[2:50,1:4])
#tswizzle['album'].unique()
#print(tswizzle[tswizzle['album'] == 'folklore'])

In [4]:
#Keep only the top 5 tracks from each album (filter currently disabled)

#songs = ['You Need To Calm Down', "exile (Ft. Bon Iver)", 'Style', 'august', 'Wildest Dreams', 'I Knew You Were Trouble','We Are Never Ever Getting Back Together','Back to December','Mean','Fifteen','...Ready for It?','cardigan','Delicate','Gorgeous','The Archer','Fearless','Tim McGraw','End Game (Ft. Ed Sheeran & Future)','Mine','White Horse','Speak Now',"Should've Said No",'Look What You Made Me Do','ME! (Ft. Brendon Urie)','Love Story','Shake It Off','The Story of Us','Blank Space','the 1','betty','The Man','Begin Again','You Belong with Me','22','Red','Teardrops On My Guitar','Bad Blood','Our Song','Lover']

#tswizzle = tswizzle[tswizzle['song_title'].isin(songs)]

In [5]:
#Optional sanity check (e.g. that the demo album removal worked):
#shows the number of distinct song titles currently in the frame.
tswizzle['song_title'].nunique()
#print(tswizzle)

39

In [6]:
#Import NLTK's stop word corpus and load the English list into a set.
#Later cells add more entries to this set and test tokens for membership in it.
from nltk.corpus import stopwords
stop_words=set(stopwords.words("english"))

In [7]:
# optional print and check stop words
#print(stop_words)

In [9]:
# Extend the stop-word set with lyric-sheet artefacts: section headers,
# vocalisations, punctuation characters, contraction fragments left behind by
# the tokenizer, and stemmed forms of header words. The author notes that some
# of these entries became obsolete after data clean-up.
# NOTE(review): tokens are lower-cased before the membership test in the
# filtering cell, so the capitalised and multi-word entries below cannot match
# any token as written.
punc = string.punctuation

extra_stop_word_groups = (
    # bracketed section headers
    ['[Verse]', '[Verse 1]', '[Chorus]', '[Verse 1: Future]', '[Verse 1: Taylor Swift]'],
    ['[Intro]', '[Intro: Idris Elba & James Corden]', '[Intro: Ed Sheeran]'],
    ['[Pre-Chorus]', '[Pre-Chorus:Taylor Swift]', '[PreChorus 1]', '[PreChorus]'],
    ['[Recorder Click]', '[Refrain]'],
    # header words without brackets
    ['Verse', '1', 'Chorus', 'Pre', 'PreChorus', 'Hook'],
    # vocalisations / filler syllables
    ["mm", "mmm", "mmmm", "i'ma"],
    ["ah", "oh", "ohoh", "ahah", "ooh", "aah", "Ha", "ohoh", "oohoohoohooh"],
    ["uh", "huh", "ey", "radidididididididididada", "dada", "oohoohoohoohooh", "e"],
    # every single punctuation character
    punc,
    # tokenizer fragments of contractions and quotes
    ["'re", "'ve", "'s", "n't", "'m", '"', "''", "``", "'ll", "na", "ai", "'d", "..."],
    # stemmed leftovers
    ["thi", "wa", "ca", "cause", "caus", "'caus", "'cause", "'em", "ha"],
    ['vers', 'post-choru', '2', 'bridg', 'choru', '1', 'pre-choru',
     'ah-aah-aah-aah-aah-hafhaa', 'ooh-ooh-ooh-ooh', 'ooh-ooh-ooh', 'eeh-eeh-eeh', 'outro'],
)

for stop_word_group in extra_stop_word_groups:
    stop_words.update(stop_word_group)

In [10]:
# optional double check that all stop words are loaded
#print(stop_words)

In [11]:
#import some stuff to tokenize and stem
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [12]:
# Porter stemmer instance, reused for every token by the tokenizing cell.
stem=PorterStemmer()

# Holds one list of stemmed tokens per song; populated by the tokenizing cell.
tokenized_lyrics=[]

In [14]:
#Run through lyrics, tokenize and stem. Creates a list with one list of
# stemmed words per song, in the same row order as the csv.
# The list is rebuilt from scratch in this cell (rather than appended to a list
# created in an earlier cell) so re-running the cell cannot duplicate songs.
# NOTE(review): column position 2 is assumed to be the 'lyrics' column --
# confirm against the csv header.
tokenized_lyrics = [
    [stem.stem(token) for token in word_tokenize(tswizzle.iloc[row, 2])]
    for row in range(len(tswizzle['lyrics']))
]

#print(tokenized_lyrics)

In [15]:
# Lower-case every stemmed token (written back into tokenized_lyrics in place)
# and keep, per song, only the tokens that are not in the stop-word set.
filtered_lyrics = []
for song in range(len(tswizzle['lyrics'])):
    song_tokens = tokenized_lyrics[song]
    song_tokens[:] = [token.lower() for token in song_tokens]
    filtered_lyric = [token for token in song_tokens if token not in stop_words]
    filtered_lyrics.append(filtered_lyric)

In [16]:
# Collapse each song's filtered tokens back into one space-separated string,
# giving a list with one string per song.
test = [" ".join(song_tokens) for song_tokens in filtered_lyrics]

In [17]:
#Make lists so we can use kmeans/LDA clustering.
#tswizzle_list: one string of stemmed, stop-word-filtered words per song.
#tswizzle_song_list / tswizzle_album_list: the title and album columns as
#plain lists. All three are in dataframe row order, so the same position
#refers to the same song in each of them.
tswizzle_list = test
tswizzle_song_list = tswizzle['song_title'].values.tolist()
tswizzle_album_list = tswizzle['album'].values.tolist()


In [18]:
#optional print
#print(tswizzle_list)



In [19]:
# Remove punctuation from the songs, keeping hyphens so hyphenated tokens
# survive.
# The punctuation characters are passed through re.escape() before being put
# in a character class. Unescaped, the class only parsed as intended because
# the backslash in string.punctuation sits directly before ']' and escaped it,
# which also meant backslashes themselves were never stripped.

punc = string.punctuation.replace( '-', '' )
punc_pattern = re.compile( '[' + re.escape( punc ) + ']+' )

tswizzle_list_abv = [ punc_pattern.sub( '', lyric ) for lyric in tswizzle_list ]

In [23]:
#print(tswizzle_list_abv)

In [24]:
#Confirms # of lines to choose from
#len(tswizzle_list_abv)

In [25]:
# Remove songs whose lyric string is empty after stop word / punctuation
# removal.
# The clustering cells look up tswizzle_list, tswizzle_song_list and
# tswizzle_album_list with positions taken from tswizzle_list_abv. Deleting
# entries from tswizzle_list_abv alone would shift every later song onto the
# wrong title/album, so all four lists are filtered with the same mask.

keep_mask = [ len( lyric ) > 0 for lyric in tswizzle_list_abv ]

# Guard against stale notebook state (e.g. only some of the list-building
# cells were re-run): the mask is meaningless if the lists differ in length.
if not ( len( tswizzle_list ) == len( tswizzle_song_list ) == len( tswizzle_album_list ) == len( keep_mask ) ):
    raise ValueError( 'lyric, title and album lists are out of sync; re-run the cells that build them' )

tswizzle_list_abv = [ v for v, keep in zip( tswizzle_list_abv, keep_mask ) if keep ]
tswizzle_list = [ v for v, keep in zip( tswizzle_list, keep_mask ) if keep ]
tswizzle_song_list = [ v for v, keep in zip( tswizzle_song_list, keep_mask ) if keep ]
tswizzle_album_list = [ v for v, keep in zip( tswizzle_album_list, keep_mask ) if keep ]

In [26]:
# Turn the cleaned lyric strings into TF-IDF vectors, then compute the cosine
# similarity between every pair of songs. X is the resulting square
# song-by-song similarity matrix.

tfidf = TfidfVectorizer(
    max_df=0.8,
    max_features=1000,
)
term_vec = tfidf.fit_transform( tswizzle_list_abv )
X = cosine_similarity( term_vec )

In [27]:
# Fit K-means with 8 clusters on the rows of the cosine-similarity matrix X.
# random_state is fixed so the fit is repeatable.
# NOTE(review): `clust` is reassigned by the later clustering cells before
# its labels are read anywhere.

clust = KMeans( n_clusters=8, random_state=1 ).fit( X )

In [28]:
#Determines # of clusters

import numpy as np
from scipy.spatial.distance import cdist

def _report_elbow( metric_by_k ):
    """Print a metric per cluster count and return the elbow cluster count.

    For every k after the first, prints the metric, its absolute drop from the
    previous k and that drop as a percentage of the previous value. The scan
    stops at the first k from the 4th entry onwards whose percentage drop is
    less than 0.5 points below the previous percentage drop.

    Parameters:
        metric_by_k: dict mapping cluster count -> metric value, in ascending
            order of cluster count.

    Returns:
        The cluster count at which the scan stopped, or 1 if it never stopped.
    """
    prev_v = None
    prev_pct = 0
    for i,(k,v) in enumerate( metric_by_k.items() ):
        if prev_v is None:
            # First entry: nothing to compare against yet
            print( f'{k}: {v:.4f}' )
            prev_v = v
            continue

        print( f'{k}: {v:.4f} ', end='' )

        diff_v = prev_v - v
        # A previous value of 0 means a perfect fit; report a 0% drop instead
        # of dividing by zero
        diff_v_pct = diff_v / prev_v * 100.0 if prev_v else 0.0
        print( f'{diff_v:.4f}, {diff_v_pct:.2f}%' )

        if i > 2 and prev_pct - diff_v_pct < 0.5:
            return k

        prev_v = v
        prev_pct = diff_v_pct

    return 1


def elbow( X, max_clust=25, random_state=1 ):
    """Estimate a cluster count for X with an elbow scan over K-means fits.

    Fits KMeans for k = 1 .. max_clust - 1 and records, for each k, the mean
    Euclidean distance of the samples to their nearest cluster centre
    (distortion) and the model's inertia. Both series are printed and scanned
    for an elbow; the larger of the two elbow values is returned.

    Parameters:
        X: 2-D array with one row per sample.
        max_clust: exclusive upper bound on the cluster counts tried. Clamped
            so that no fit asks for more clusters than there are samples.
        random_state: seed passed to every KMeans fit so repeated calls give
            the same result. Pass None for unseeded fits.

    Returns:
        int: max of the distortion elbow and the inertia elbow (1 when
        neither scan finds an elbow).
    """
    n_samples = X.shape[ 0 ]

    map_distort = { }
    map_inertia = { }

    # KMeans cannot fit more clusters than samples
    K = range( 1, min( max_clust, n_samples + 1 ) )
    for k in K:
        kmean_model = KMeans( n_clusters=k, random_state=random_state )
        kmean_model.fit( X )

        # Distance from each sample to its closest centre, computed once
        nearest = np.min( cdist( X, kmean_model.cluster_centers_, 'euclidean' ), axis=1 )

        map_distort[ k ] = sum( nearest ) / n_samples
        map_inertia[ k ] = kmean_model.inertia_

    elbow_distort = _report_elbow( map_distort )

    print()

    elbow_inertia = _report_elbow( map_inertia )

    return max( elbow_distort, elbow_inertia )

In [29]:
#Runs the elbow scan on the TF-IDF cosine-similarity matrix X. Prints the
#distortion and inertia per cluster count and shows the suggested number of
#clusters, which can be used to pick the "real" amount of clusters
elbow(X)

#This is the end of the TF-IDF section. Code below is for concept clustering

1: 0.9520
2: 0.9304 0.0216, 2.27%
3: 0.9089 0.0215, 2.32%
4: 0.8949 0.0140, 1.54%
5: 0.8745 0.0204, 2.28%

1: 35.3873
2: 33.8704 1.5169, 4.29%
3: 32.3722 1.4982, 4.42%
4: 31.2999 1.0723, 3.31%
5: 29.9705 1.3294, 4.25%


5

In [30]:
#Concept Clustering - LDA clustering

In [31]:
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer

In [32]:
# Count raw term frequencies for the cleaned lyric strings.
# NOTE: this rebinds term_vec, which previously held the TF-IDF matrix.

count = CountVectorizer()
term_vec = count.fit_transform( tswizzle_list_abv )

# Number of LDA concepts to fit in the cells below
n_concepts = 8

In [33]:
# Build the list of column labels [ 'Concept 1', 'Concept 2', ..., 'Concept n' ],
# one per LDA concept.

col_nm = [ f'Concept {concept_no}' for concept_no in range( 1, n_concepts + 1 ) ]

In [34]:
# Fit an LDA model with n_concepts components to the raw term counts.
# `concept` holds the per-song concept weights returned by fit_transform.
# X is rebound to the cosine similarity between those concept vectors
# (it previously held the TF-IDF similarities).
# NOTE(review): the clustering cells below fit on `concept`, not on this X.

lda_model = LDA( n_components=n_concepts, random_state = 1 )
concept = lda_model.fit_transform( term_vec )
X = cosine_similarity( concept )

In [35]:
# Print the 10 highest-weighted terms for each LDA concept, listed from the
# lowest to the highest weight among those 10. topic_list collects a label
# per concept built from the first three printed terms.

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() replaces it. Fall back for older installs.
if hasattr( count, 'get_feature_names_out' ):
    feat = count.get_feature_names_out()
else:
    feat = count.get_feature_names()

topic_list = [ ]
for i,topic in enumerate( lda_model.components_ ):
    # Separate index name so the concept number `i` is not shadowed
    top_n = [ feat[ term_idx ] for term_idx in topic.argsort()[ -10: ] ]
    top_feat = ' '.join( top_n )
    topic_list.append( f"topic_{'_'.join(top_n[ :3 ] ) }" )

    print( f'Concept {i}: {top_feat}' )
print()

Concept 0: back yeah night gon think know ever like ooh shake
Concept 1: sick nobodi got trust need like man look made oh
Concept 2: know think like blood bad babi knew got say troubl
Concept 3: like mani would know back time gave sign think never
Concept 4: feel hate say stay face make see could gorgeou right
Concept 5: one back see got babi time like never go know
Concept 6: yeah red reput big got love game end like wan
Concept 7: see gon away us back fifteen never mine hope la



In [36]:
#Create empty result tables (Song, Album, Cluster) for the new album groupings.
#They are populated by the clustering cells below.

#"New" albums from the 8-cluster K-means run on the LDA concepts
Album = pd.DataFrame(columns = ['Song', 'Album', 'Cluster'])

#Clustered albums from the K-means run whose cluster count follows the elbow analysis
Album_clust = pd.DataFrame(columns = ['Song', 'Album', 'Cluster'])

In [37]:
# Clusters of "new" albums: K-means on the per-song LDA concept weights.
# Prints the songs of each cluster and records them in Album_clust.
# NOTE(review): the elbow analysis above returned 5 and the original comment
# here said 5 distinct clusters, but the code fits 4 -- confirm which is
# intended.

clust = KMeans( n_clusters=4, random_state = 1 ).fit( concept )

# Rows are collected in a list and the frame is built once at the end.
# DataFrame.append was removed in pandas 2.0, and appending row by row also
# made re-running this cell pile duplicate rows onto the existing frame.
cluster_rows = [ ]
for i in range( 0, len( set( clust.labels_ ) ) ):
    print( f'Cluster {i}:' )
    for j in range( 0, len( clust.labels_ ) ):
        if clust.labels_[ j ] != i:
            continue
        print( tswizzle_song_list[j], ", ", tswizzle_album_list[j])

        cluster_rows.append({'Song' : tswizzle_song_list[j], 'Album' : tswizzle_album_list[j], 'Cluster' : i })

    print()

Album_clust = pd.DataFrame( cluster_rows, columns = ['Song', 'Album', 'Cluster'] )

Cluster 0:
Style ,  1989 (Deluxe)
Fearless ,  Fearless
Love Story ,  Fearless
ME!  (Ft. Brendon Urie) ,  Lover
...Ready for It? ,  reputation
Back to December ,  Speak Now (Deluxe)
Mean ,  Speak Now (Deluxe)
Mine ,  Speak Now (Deluxe)
The Story of Us ,  Speak Now (Deluxe)
Teardrops on My Guitar (Pop Version) ,  Taylor Swift
Our Song ,  Taylor Swift

Cluster 1:
​betty ,  folklore
Shake It Off ,  1989 (Deluxe)
You Belong with Me ,  Fearless
Lover ,  Lover
We Are Never Ever Getting Back Together ,  Red
22 ,  Red
Delicate ,  reputation
Tim McGraw ,  Taylor Swift

Cluster 2:
​exile  (Ft. Bon Iver) ,  folklore
​august ,  folklore
​the 1 ,  folklore
Blank Space ,  1989 (Deluxe)
Fifteen ,  Fearless
White Horse ,  Fearless
You Need To Calm Down ,  Lover
The Archer ,  Lover
The Man ,  Lover
Begin Again ,  Red
Red ,  Red
Gorgeous ,  reputation
End Game  (Ft. Ed Sheeran & Future) ,  reputation
Look What You Made Me Do ,  reputation
Should've Said No ,  Taylor Swift

Cluster 3:
​cardigan ,  folklor

In [38]:
# Cluster the songs into 8 groups with K-means on the per-song LDA concept
# weights. Prints each cluster's songs (filtered lyrics, title, album) and
# records them in Album.

clust = KMeans( n_clusters=8, random_state = 1 ).fit( concept )

# Rows are collected in a list and the frame is built once at the end.
# DataFrame.append was removed in pandas 2.0, and appending row by row also
# made re-running this cell pile duplicate rows onto the existing frame.
album_rows = [ ]
for i in range( 0, len( set( clust.labels_ ) ) ):
    print( f'Cluster {i}:' )
    for j in range( 0, len( clust.labels_ ) ):
        if clust.labels_[ j ] != i:
            continue
        print( tswizzle_list[ j ], tswizzle_song_list[j], tswizzle_album_list[j])

        album_rows.append({'Song' : tswizzle_song_list[j], 'Album' : tswizzle_album_list[j], 'Cluster' : i })

    print()

Album = pd.DataFrame( album_rows, columns = ['Song', 'Album', 'Cluster'] )

Cluster 0:
betti wo make assumpt whi switch homeroom think betti one time ride skateboard pass hous like could breath heard rumor inez believ word say time time true worst thing ever show parti would would want would tell go fuck lead garden garden would trust told summer thing onli seventeen know anyth know miss betti know went wrong favorit song play far side gym nowher found hate crowd know plu saw danc heard rumor inez believ word say time time true worst thing ever show parti would would want would tell go fuck lead garden garden would trust told summer thing onli seventeen know anyth know miss walk home broken cobbleston think pull like figment worst intent said jame get let drive day turn night slept next dreamt summer long 3 betti doorstep plan week final sinkin betti right last time dream happen see face onli thing wan make show parti yeah show parti yeah show parti love kiss porch front stupid friend kiss like dream patch broken wing onli seventeen know anyth know miss stand 

In [None]:
#print(Album)
#print(Album_clust)

In [39]:
# Write both song-to-cluster tables to CSV with the default to_csv settings.
# NOTE(review): absolute, user-specific output paths -- adjust before running
# on another machine.
Album.to_csv('/Users/awburns2/Documents/Top5New8Albums Full Lyrics.csv')
Album_clust.to_csv('/Users/awburns2/Documents/Top5NewAlbumConcepts Full Lyrics.csv')