In [1]:
import re
import string
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd

# Processing the grades data

In [2]:
# Load the raw grades sheet. header=None keeps every CSV row (including the
# sheet's own header rows) as data, so they can be sliced off explicitly later.
grades_data = pd.read_csv(
    'data/Taylor_Swift_Songs_Data/Grades.csv',
    header=None,
)

In [3]:
# Build the tidy grades table from the raw sheet.

# Explicit names for the 12 columns kept from the sheet.
grade_columns = [
    'song_name', 'album', 'happy_sad', 'relationship', 'feelings_of_self',
    'glass_half_full', 'stages', 'tempo', 'seriousness', 'future_prospects',
    'feelings_of_male', 'togetherness',
]

# Raw row 0 is discarded and raw row 1 was only ever used as column labels,
# which the explicit names above replace - so the data starts at raw row 2.
# Only the first 12 columns are kept.
grades = grades_data.iloc[2:, :12].reset_index(drop=True)
grades.columns = grade_columns

# Lower-case the titles so they can be matched case-insensitively later on.
grades['song_name'] = grades['song_name'].map(lambda title: str(title).lower())

# NOTE(review): the sheet was read with header=None, so the score columns
# probably hold strings rather than numbers - confirm before doing arithmetic.
grades.head()

Unnamed: 0,song_name,album,happy_sad,relationship,feelings_of_self,glass_half_full,stages,tempo,seriousness,future_prospects,feelings_of_male,togetherness
0,cold as you,Taylor Swift,-10,-8,-1,-3,-3,-3,-3,-3,-1,-1
1,i'm only me when i'm with you,Taylor Swift,9,10,3,3,1,2,2,2,3,3
2,invisible,Taylor Swift,-1,-4,0,-2,1,0,0,0,-1,-3
3,mary's song,Taylor Swift,5,12,0,2,1,2,3,3,3,3
4,our song,Taylor Swift,5,6,2,2,1,0,1,1,3,1


# Processing the lyrics data (Version 1, not usable)

In [4]:
# Per-album lyric CSVs (files numbered 01-09), used by the Version 1 pipeline.
album_csv_paths = (
    'data/Taylor_Swift_Songs_Data/lyrics/01-taylor_swift.csv',
    'data/Taylor_Swift_Songs_Data/lyrics/02-fearless_taylors_version.csv',
    'data/Taylor_Swift_Songs_Data/lyrics/03-speak_now_deluxe_package.csv',
    'data/Taylor_Swift_Songs_Data/lyrics/04-red_deluxe_edition.csv',
    'data/Taylor_Swift_Songs_Data/lyrics/05-1989_deluxe.csv',
    'data/Taylor_Swift_Songs_Data/lyrics/06-reputation.csv',
    'data/Taylor_Swift_Songs_Data/lyrics/07-lover.csv',
    'data/Taylor_Swift_Songs_Data/lyrics/08-folklore_deluxe_version.csv',
    'data/Taylor_Swift_Songs_Data/lyrics/09-evermore_deluxe_version.csv',
)

# One DataFrame per file, bound to album1..album9 in the same order as above.
(album1, album2, album3,
 album4, album5, album6,
 album7, album8, album9) = map(pd.read_csv, album_csv_paths)

In [5]:
def _normalise_track_title(raw_title):
    """Return a lower-cased track title with version suffix and odd characters removed.

    Steps, in order: lower-case; cut everything from '(taylor’s version)'
    onwards; drop zero-width spaces (U+200B); trim trailing whitespace;
    replace the typographic apostrophe with a plain one.
    """
    title = str(raw_title).lower()
    title = title.split('(taylor’s version)')[0].rstrip()
    title = title.replace('\u200b', '').rstrip()
    return title.replace('’', "'")


# Stack the nine album frames into one table with a fresh 0..n-1 index.
album_frames = [album1, album2, album3, album4, album5, album6, album7, album8, album9]
lyrics = pd.concat(album_frames, axis=0).reset_index(drop=True)

# Clean every title in a single pass.
lyrics['track_title'] = lyrics['track_title'].map(_normalise_track_title)

# Processing lyrics data (Version 2)

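**Note: some songs appear more than once in the lyrics data (for example, alternate versions whose titles collapse to the same `song_name` after cleaning), so the duplicated songs still need to be removed.**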

In [6]:
def clean_text(text):
    """Strip page boilerplate from a song's raw lyric text.

    Light cleaning only - adjust the patterns to the data at hand. Every
    removed fragment is replaced by a single space so neighbouring words do
    not run together.

    Parameters
    ----------
    text : str
        Raw lyric text.

    Returns
    -------
    str
        The text with ticket ads, the "You might also like" blurb and the
        "<digits>Embed" marker removed.
    """
    # Ticket ad: match any dollar amount instead of a fixed list of prices
    # (the original only handled $270 and $373).
    text = re.sub(r'See Taylor Swift LiveGet tickets as low as \$\d+', ' ', text)

    # Recommendation blurb injected into the lyrics.
    text = text.replace('You might also like', ' ')

    # Optional digits followed by "Embed". A raw string is used so that \d is
    # a regex digit class rather than an invalid string escape sequence.
    text = re.sub(r'\d*Embed', ' ', text)

    return text

In [7]:
# Specify the folder names with the lyric data from Genius
directory_paths = ['data/Taylor_Swift_Songs_Data/Taylor_Swift_Genius/Taylor-Swift_Taylor-Swift/',
                   'data/Taylor_Swift_Songs_Data/Taylor_Swift_Genius/Taylor-Swift_Fearless/',
                   'data/Taylor_Swift_Songs_Data/Taylor_Swift_Genius/Taylor-Swift_Speak-Now/',
                   'data/Taylor_Swift_Songs_Data/Taylor_Swift_Genius/Taylor-Swift_Red/',
                   'data/Taylor_Swift_Songs_Data/Taylor_Swift_Genius/Taylor-Swift_1989/',
                   'data/Taylor_Swift_Songs_Data/Taylor_Swift_Genius/Taylor-Swift_Reputation/',
                   'data/Taylor_Swift_Songs_Data/Taylor_Swift_Genius/Taylor-Swift_Lover/',
                   'data/Taylor_Swift_Songs_Data/Taylor_Swift_Genius/Taylor-Swift_folklore/',
                   'data/Taylor_Swift_Songs_Data/Taylor_Swift_Genius/Taylor-Swift_evermore/',
                   'data/Taylor_Swift_Songs_Data/Taylor_Swift_Genius/Taylor-Swift_Midnights/',
                   'data/Taylor_Swift_Songs_Data/Taylor_Swift_Genius/Taylor-Swift_NA/']

In [8]:
pd.options.display.max_rows = 500
pd.set_option('display.max_colwidth', 0)

# Title decorations to cut off, applied in this order: everything from the
# marker onwards is dropped and trailing whitespace is trimmed.
_TITLE_CUT_MARKERS = (
    "(Taylor's Version)",
    "(Acoustic",
    "(P",
    "(Oh My My My)",
    "(Voice Memo)",
)


def _song_name_from_file(song_path):
    """Derive a cleaned song title from a lyric file path.

    Uses only the file name without its extension, so the result does not
    depend on the operating system's path separator (the original split on
    a backslash, which only isolates the file name on Windows).
    """
    song_name = Path(song_path).stem.replace("-", " ")
    song_name = song_name.split('[')[0].strip()       # drop bracketed tags
    song_name = song_name.replace('\u200b', '')       # drop zero-width spaces
    for marker in _TITLE_CUT_MARKERS:
        song_name = song_name.split(marker)[0].rstrip()
    song_name = song_name.replace('’', "'")           # typographic -> plain apostrophe
    song_name = song_name.replace('questionMark', '?')  # placeholder used in file names
    return song_name


def _read_song_lyrics(song_path):
    """Read one lyric file and return its cleaned text.

    The first line of the file is skipped, exactly as before (presumably a
    title/header line - TODO confirm). Remaining lines are joined with a
    space and passed through clean_text. The file is closed even if reading
    fails.
    """
    with open(song_path, encoding="utf-8") as lyric_file:
        body_lines = lyric_file.readlines()[1:]
    return clean_text(' '.join(body_lines))


# Collect one record per lyric file and build the frame once at the end,
# instead of growing a DataFrame row by row.
song_records = []

for album in directory_paths:

    # Folder names look like "<artist>_<album>"; keep the album part and turn
    # dashes into spaces. Path(...).name ignores any trailing slash.
    album_name = Path(album).name.split('_')[-1].replace("-", " ")

    # glob() yields files in arbitrary order; sort for a reproducible row order.
    for song in sorted(Path(album).glob('*.txt')):
        song_records.append({
            "album": album_name,
            "song_name": _song_name_from_file(song),
            "lyrics": _read_song_lyrics(song),
        })

ts_lyrics = pd.DataFrame(song_records, columns=["album", "song_name", "lyrics"])

In [9]:
# The index-based drops below are currently disabled, so no rows are removed
# in this cell. They were written to remove 8 extra rows of data (duplicate
# songs, deluxe songs and non-songs) to match the same 147 songs that were on
# the Spotify API list.

# ts_lyrics.drop(13, axis=0, inplace=True) # Teardrops on My Guitar (Pop Version)
# ts_lyrics.drop(44, axis=0, inplace=True) # The Moment I Knew
# ts_lyrics.drop(49, axis=0, inplace=True) # Come Back... Be Here
# ts_lyrics.drop(55, axis=0, inplace=True) # Girl at Home
# ts_lyrics.drop(75, axis=0, inplace=True) # Reputation Magazine Vol. 1
# ts_lyrics.drop(77, axis=0, inplace=True) # Why She Disappeared [Poem]
# ts_lyrics.drop(86, axis=0, inplace=True) # Reputation [Prologue]
# ts_lyrics.drop(88, axis=0, inplace=True) # If You're Anything Like Me [Poem]

# Renumber the rows 0..n-1, discarding the old index.
ts_lyrics = ts_lyrics.reset_index(drop=True)

In [10]:
# Lower-case the lyric titles so they compare equal to the grades titles.
ts_lyrics['song_name'] = ts_lyrics['song_name'].map(lambda title: str(title).lower())

# Number of graded songs that have no matching lyric title.
graded_titles = set(grades['song_name'])
lyric_titles = set(ts_lyrics['song_name'])
print(len(graded_titles.difference(lyric_titles)))

0


In [20]:
# Attach lyrics to every graded song. The left join keeps all rows of
# `grades`; both frames carry an `album` column, so the merge suffixes them
# and only the grades-side one is kept (renamed back to `album`).
# NOTE(review): a title that occurs more than once in `ts_lyrics` repeats the
# matching grades row - check for duplicated song names after merging.
rag_dataset = (
    grades
    .merge(ts_lyrics, on='song_name', how='left')
    .reset_index(drop=True)
    .drop(columns='album_y')
    .rename(columns={'album_x': 'album'})
)

0      cold as you                            
1      i'm only me when i'm with you          
2      invisible                              
3      mary's song                            
4      our song                               
5      the outside                            
6      a perfectly good heart                 
7      picture to burn                        
8      a place in this world                  
9      should've said no                      
10     stay beautiful                         
11     teardrops on my guitar                 
12     teardrops on my guitar                 
13     tied together with a smile             
14     tim mcgraw                             
15     beautiful eyes                         
16     breathe                                
17     change                                 
18     come in with the rain                  
19     fearless                               
20     fifteen                                
21     foreve

In [21]:
# Tally the rows per song title in the merged data; any count above 1 marks a
# title that was duplicated by the merge. (Counter is imported with the other
# imports at the top of the notebook.)
Counter(rag_dataset['song_name'])

Counter({'cold as you': 1,
         "i'm only me when i'm with you": 1,
         'invisible': 1,
         "mary's song": 1,
         'our song': 1,
         'the outside': 1,
         'a perfectly good heart': 1,
         'picture to burn': 1,
         'a place in this world': 1,
         "should've said no": 1,
         'stay beautiful': 1,
         'teardrops on my guitar': 2,
         'tied together with a smile': 1,
         'tim mcgraw': 1,
         'beautiful eyes': 1,
         'breathe': 1,
         'change': 1,
         'come in with the rain': 1,
         'fearless': 1,
         'fifteen': 1,
         'forever & always': 2,
         'hey stephen': 1,
         'i heart ?': 1,
         'jump then fall': 1,
         'love story': 2,
         'the other side of the door': 1,
         'superstar': 1,
         'tell me why': 1,
         'untouchable': 1,
         'the way i loved you': 1,
         'white horse': 1,
         'you belong with me': 1,
         "you're not sorry": 1,
  