In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import re

In [2]:
df = pd.read_csv('final_taylor_swift_lyrics.tsv', sep='\t')

In [3]:
print(df.head())

   index  album song_title                                              lyric  \
0      0  Lover     Lover                                           [Verse 1]   
1      0  Lover     Lover   We could leave the Christmas lights up 'til Ja...   
2      0  Lover     Lover            And this is our place, we make the rules   
3      0  Lover     Lover   And there's a dazzling haze, a mysterious way ...   
4      0  Lover     Lover            Have I known you 20 seconds or 20 years?   

   line_number release_date  
0            0   2019-08-16  
1            1   2019-08-16  
2            2   2019-08-16  
3            3   2019-08-16  
4            4   2019-08-16  


In [4]:
df.to_dict(orient = 'records')

[{'index': 0,
  'album': 'Lover',
  'song_title': 'Lover ',
  'lyric': '[Verse 1]',
  'line_number': 0,
  'release_date': '2019-08-16'},
 {'index': 0,
  'album': 'Lover',
  'song_title': 'Lover ',
  'lyric': "We could leave the Christmas lights up 'til January",
  'line_number': 1,
  'release_date': '2019-08-16'},
 {'index': 0,
  'album': 'Lover',
  'song_title': 'Lover ',
  'lyric': 'And this is our place, we make the rules',
  'line_number': 2,
  'release_date': '2019-08-16'},
 {'index': 0,
  'album': 'Lover',
  'song_title': 'Lover ',
  'lyric': "And there's a dazzling haze, a mysterious way about you, dear",
  'line_number': 3,
  'release_date': '2019-08-16'},
 {'index': 0,
  'album': 'Lover',
  'song_title': 'Lover ',
  'lyric': 'Have I known you 20 seconds or 20 years?',
  'line_number': 4,
  'release_date': '2019-08-16'},
 {'index': 0,
  'album': 'Lover',
  'song_title': 'Lover ',
  'lyric': '[Chorus]',
  'line_number': 5,
  'release_date': '2019-08-16'},
 {'index': 0,
  'album'

In [5]:
df_dict = df.to_dict(orient = 'records')

In [6]:
#song to index maps song titles to their index
#index dic maps song index to a list of its lyrics

song_to_index = {}
index_dic = {}
for i in df_dict:
    index = i['index']
    lyric = i['lyric']
    if index in index_dic:
        index_dic[index].append(lyric)
    else:
        index_dic[index] = [lyric]
        title = i['song_title']
        song_to_index[title] = index 

In [7]:
num_songs = len(song_to_index)

In [8]:
print(index_dic[1])

['[Verse 1]', "I don't like your little games", "Don't like your tilted stage", 'The role you made me play', "Of the fool, no, I don't like you", "I don't like your perfect crime", 'How you laugh when you lie', 'You said the gun was mine', "Isn't cool, no, I don't like you (Oh)", '[Pre-Chorus]', 'But I got smarter, I got harder in the nick of time', 'Honey, I rose up from the dead, I do it all the time', "I've got a list of names and yours is in red, underlined", 'I check it once, then I check it twice, oh!', '[Chorus]', 'Ooh, look what you made me do', 'Look what you made me do', 'Look what you just made me do', 'Look what you just made me—', 'Ooh, look what you made me do', 'Look what you made me do', 'Look what you just made me do', 'Look what you just made me do', '[Verse 2]', "I don't like your kingdom keys", 'They once belonged to me', 'You asked me for a place to sleep', 'Locked me out and threw a feast (What?)', 'The world moves on, another day, another drama, drama', 'But not 

In [51]:
vectorizer = TfidfVectorizer(stop_words = 'english', min_df = 2)

In [90]:
#code from 4300 class demo
word_splitter = re.compile(r"""
    (\w+)
    """, re.VERBOSE)

def getwords(lyric):
  return [w.lower() for w in word_splitter.findall(lyric)]

In [91]:
lyrics_list = []
for i in index_dic:
    lyric = index_dic[i]
    lyric_str = ""
    for l in lyric:
        l.lower()
        lyric_str += " "
        lyric_str += l
    all_words = getwords(lyric_str)
    lyrics_list.append(" ".join(all_words))
print(lyrics_list[0])
print(len(lyrics_list))

verse 1 we could leave the christmas lights up til january and this is our place we make the rules and there s a dazzling haze a mysterious way about you dear have i known you 20 seconds or 20 years chorus can i go where you go can we always be this close forever and ever ah take me out and take me home you re my my my my lover verse 2 we could let our friends crash in the living room this is our place we make the call and i m highly suspicious that everyone who sees you wants you i ve loved you three summers now honey but i want em all chorus can i go where you go can we always be this close forever and ever ah take me out and take me home forever and ever you re my my my my lover bridge ladies and gentlemen will you please stand with every guitar string scar on my hand i take this magnetic force of a man to be my lover my heart s been borrowed and yours has been blue all s well that ends well to end up with you swear to be overdramatic and true to my lover and you ll save all your di

In [92]:
tfidf_vec = vectorizer.fit_transform(lyrics_list).toarray()

In [93]:
print(sum(tfidf_vec))

[0.05411225 0.38995259 2.34563354 ... 1.5257614  1.51963972 0.10571972]


In [94]:
print(num_songs)
print(tfidf_vec.shape)

343
(343, 2728)


In [95]:
cos_sim = np.zeros((num_songs,num_songs))
i = 0
while i < num_songs:
    j = 0
    while j < num_songs:
        song1 = tfidf_vec[i]
        song2 = tfidf_vec[j]
        song1_norm = np.linalg.norm(tfidf_vec[i])
        song2_norm = np.linalg.norm(tfidf_vec[j])
        numerator = np.dot(song1, song2)
        denominator = song1_norm * song2_norm
        cosine_similarity = numerator/denominator
        cos_sim[i][j] = cosine_similarity
        j += 1
    i += 1

In [96]:
print(cos_sim[100][331])

0.061633256458969984


In [97]:
np.save('cosine_matrix', cos_sim)