In [2]:
import os
import pandas as pd
import re

In [3]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity


In [4]:
file_path = 'taylor_swift_data_kaggle/Albums/'

In [5]:
albums = ['TaylorSwift',
          'Fearless_TaylorsVersion_', 
          'SpeakNow_TaylorsVersion_',
          'Red_TaylorsVersion_', 
          '1989_TaylorsVersion',  
          'Reputation', 
          'Lover',
          'Folklore',
          'evermore_deluxeversion_', 
          'Midnights_TheTilDawnEdition_', 
          'THETORTUREDPOETSDEPARTMENT_THEANTHOLOGY'
          ]

In [6]:
# method to extract from every album
def create_df(file_path, album_name):
    dir_path = f'{file_path}{album_name}'
    data = [] # create empty dataframe for this album

    # go through every file in this folder
    for filename in os.listdir(dir_path):
        if filename.endswith('.txt'):
            song_path = os.path.join(dir_path, filename) # file path for this song.txt

        with open(song_path, 'r') as file:
            file_content = file.read()

        data.append({'album_title': album_name, 'song_title': filename, 'lyrics': file_content}) # create entry for song and lyrics observation

    df = pd.DataFrame(data) # append to dataframe
    return df

In [9]:
df = pd.DataFrame()
for i in albums:
    df = pd.concat([df, create_df(file_path, i)], ignore_index = True)

In [10]:
df['song_title'].unique()

array(['PicturetoBurn.txt', 'TimMcGraw.txt', 'TheOutside.txt',
       'TeardropsOnMyGuitar.txt', 'MarysSong_OhMyMyMy_.txt',
       'TiedTogetherwithaSmile.txt', 'APerfectlyGoodHeart.txt',
       'APlaceInThisWorld.txt', 'ColdasYou.txt', 'OurSong.txt',
       'StayBeautiful.txt', 'ImOnlyMeWhenImWithYou.txt', 'Invisible.txt',
       'ShouldveSaidNo.txt', 'TeardropsonMyGuitar_PopVersion_.txt',
       'TheWayILovedYou_TaylorsVersion_.txt',
       'Fifteen_TaylorsVersion_.txt',
       'TheOtherSideoftheDoor_TaylorsVersion_.txt',
       'Mr.PerfectlyFine_TaylorsVersion__FromtheVault_.txt',
       'JumpThenFall_TaylorsVersion_.txt',
       'Superstar_TaylorsVersion_.txt',
       'ByeByeBaby_TaylorsVersion__FromtheVault_.txt',
       'YouAllOverMe_TaylorsVersion__FromtheVault_.txt',
       'WeWereHappy_TaylorsVersion__FromtheVault_.txt',
       'Untouchable_TaylorsVersion_.txt',
       'HeyStephen_TaylorsVersion_.txt',
       'IfThisWasaMovie_TaylorsVersion_.txt',
       'ThatsWhen_TaylorsVers

In [11]:
def clean_text(text):
    # 1. Remove everything before and including the word "Lyrics"
    text = re.sub(r".*?Lyrics", "", text)
    
    # 2. Remove any number (if present) followed by "Embed" at the end
    text = re.sub(r"\s*\d*\s*Embed\s*$", "", text)
    
    # 3. Replace newlines with spaces
    text = re.sub(r'\s+', ' ', text)
    
    # 4. Remove the extra slashes from quotes in songs
    text = re.sub(r"\'", "", text)
    
    # 5. Remove the [Intro], [Chorus], etc. informaiton
    text = re.sub(r'\[.*?\]', '', text)

    # 6. Unwanted characters
    text = re.sub(r'See Taylor Swift LiveGet tickets as low as \$60You might also like', '', text)
    
    text = text.strip()
    
    return text

In [12]:
def clean_title(text):
    # 1. Replace _**_ with (**)
    text = re.sub(r'_(.*?)_', r' (\1)', text)

    # 2. Remove the .txt 
    text = re.sub('.txt', '', text)

    # 3. Fix formatting
    text = re.sub(r"TaylorsVersion", "Taylor's Version", text, flags=re.IGNORECASE)
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
    text = re.sub(r"Fromthe Vault", "From The Vault", text, flags=re.IGNORECASE)

    text = re.sub(r'_', r' ', text)

    return text

In [13]:
# lemmatize is not as formulaic as stem 
# (so it takes in more consideration of the word rather than just stemming to the root)
def lemmatize_text(text):
    return ' '.join(lemmatizer.lemmatize(w) for w in (re.sub(',', '', text).lower()).split())

In [14]:
# clean dataframe
df['lyrics'] = df['lyrics'].apply(clean_text)
df['song_title'] = df['song_title'].apply(clean_title)
df['lemmatize_lyrics'] = df['lyrics'].apply(lemmatize_text)

In [15]:
df['album_title'].unique()

array(['TaylorSwift', 'Fearless_TaylorsVersion_',
       'SpeakNow_TaylorsVersion_', 'Red_TaylorsVersion_',
       '1989_TaylorsVersion', 'Reputation', 'Lover', 'Folklore',
       'evermore_deluxeversion_', 'Midnights_TheTilDawnEdition_',
       'THETORTUREDPOETSDEPARTMENT_THEANTHOLOGY'], dtype=object)

In [16]:
df

Unnamed: 0,album_title,song_title,lyrics,lemmatize_lyrics
0,TaylorSwift,Pictureto Burn,"State the obvious, I didnt get my perfect fant...",state the obvious i didnt get my perfect fanta...
1,TaylorSwift,Tim Mc Graw,He said the way my blue eyes shined Put those ...,he said the way my blue eye shined put those g...
2,TaylorSwift,The Outside,I didnt know what I would find When I went loo...,i didnt know what i would find when i went loo...
3,TaylorSwift,Teardrops On My Guitar,"Drew looks at me I fake a smile, so he wont se...",drew look at me i fake a smile so he wont see ...
4,TaylorSwift,Marys Song (Oh My My My),She said I was seven and you were nine I looke...,she said i wa seven and you were nine i looked...
...,...,...,...,...
240,THETORTUREDPOETSDEPARTMENT_THEANTHOLOGY,ICan Fix Him (No Really ICan),The smoke cloud billows out his mouth Like a f...,the smoke cloud billow out his mouth like a fr...
241,THETORTUREDPOETSDEPARTMENT_THEANTHOLOGY,Whos Afraidof Little Old Me,"The whos who of ""Whos that?"" is poised for the...","the who who of ""whos that?"" is poised for the ..."
242,THETORTUREDPOETSDEPARTMENT_THEANTHOLOGY,Fortnight,I was supposed to be sent away But they forgot...,i wa supposed to be sent away but they forgot ...
243,THETORTUREDPOETSDEPARTMENT_THEANTHOLOGY,Fresh Out The Slammer,"Now, pretty baby, Im runnin back home to you F...",now pretty baby im runnin back home to you fre...


In [18]:
df.to_csv('songs_lyrics.csv')

In [13]:
# pipline to chain multiple steps in a workflow (recipe)
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,1))), # feature extraction: transforms text into bag of words model
    ('tfidf', TfidfTransformer()), # tf-idf: transfroms the count matrix into tf-idf
    ('clf', RandomForestClassifier()) # ML: random forest to classify with decission trees 
    ])

In [182]:
# method to find search recommendations
def get_results(user_input):
    # determine the number of words in user search
    search_length = user_input.split()
    n_words = len(search_length)

    # if user is looking for phrase (trigrams to total word-grams)
    if n_words > 2:
        tfv = TfidfVectorizer(ngram_range=(3, n_words), use_idf=False)
        tfv_matrix = tfv.fit_transform(df['lemmatize_lyrics'])

    # if user is just searching for words (unigrams, bigrams)
    else:
        tfv = TfidfVectorizer(ngram_range=(1, n_words), use_idf=False)
        tfv_matrix = tfv.fit_transform(df['lemmatize_lyrics'])
    
    # fit user input 
    tfv_user = tfv.transform([lemmatize_text(user_input)])

    # calculate similarity scores between search and all songs
    cosine_similarities = cosine_similarity(tfv_user, tfv_matrix)
    cosine_similarities = cosine_similarities.flatten()

    # create matrix for all songs that match 
    matches = pd.DataFrame()
    matches['index'] = np.where(cosine_similarities > 0)[0]
    matches['score'] = cosine_similarities[matches]
    matches_sorted = matches.sort_values(by='score', ascending=False)

    # output the list of similarities
    for i in range(len(matches_sorted)):
        song_index = matches_sorted['index'].iloc[i]
        print(f"Song: {df['song_title'][song_index]}, Similarity Score: {matches_sorted['score'].iloc[i]:.4f}")


In [206]:
get_results("He said he'd love me all his life")

Song: ICanDoItWithABrokenHeart, Similarity Score: 0.0379
Song: MyBoyOnlyBreaksHisFavoriteToys, Similarity Score: 0.0169
Song: YoureOnYourOwn_Kid, Similarity Score: 0.0085


In [208]:
get_results("taylor swift")

Song: ClaraBow, Similarity Score: 0.0523
Song: 22(TaylorsVersion), Similarity Score: 0.0228
Song: BeginAgain(TaylorsVersion), Similarity Score: 0.0122
Song: ()(ReadyforIt), Similarity Score: 0.0082
Song: LookWhatYouMadeMeDo, Similarity Score: 0.0040


In [186]:
get_results("midnight")

Song: MidnightRain, Similarity Score: 0.2084
Song: NewYearsDay, Similarity Score: 0.0498
Song: happiness, Similarity Score: 0.0402
Song: LavenderHaze, Similarity Score: 0.0268
Song: thelastgreatamericandynasty, Similarity Score: 0.0259
Song: Paris, Similarity Score: 0.0247
Song: Anti_Hero, Similarity Score: 0.0221
Song: Style(TaylorsVersion), Similarity Score: 0.0185
Song: 22(TaylorsVersion), Similarity Score: 0.0159
Song: YouAreInLove(TaylorsVersion), Similarity Score: 0.0128


In [194]:
get_results("so it goes")

Song: SoItGoes()_, Similarity Score: 0.3835
Song: TheVeryFirstNight(TaylorsVersion)(FromtheVault), Similarity Score: 0.0302
Song: Style(TaylorsVersion), Similarity Score: 0.0288
Song: YouAreInLove(TaylorsVersion), Similarity Score: 0.0239


In [207]:
get_results("safe")

Song: Safe(Sound)TaylorsVersion_, Similarity Score: 0.1002
Song: Treacherous(TaylorsVersion), Similarity Score: 0.0499
Song: loml, Similarity Score: 0.0255
Song: SoLong_London, Similarity Score: 0.0248
Song: Wouldve(Couldve)Shouldve, Similarity Score: 0.0179
Song: DownBad, Similarity Score: 0.0164


In [73]:
df_train, df_test = train_test_split(df, test_size=0.5) # each are just subsets

In [75]:
# train to predict album based on the lemmatized lyrics using .fit(X, Y)
text_clf.fit(df_train['lemmatize_lyrics'], df_train['album_title'])

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier())])

In [76]:
text_clf['vect'].get_feature_names_out()[100:110]

array(['begging', 'begin', 'beginning', 'bein', 'being', 'believe',
       'believed', 'believing', 'belong', 'belt'], dtype=object)

In [77]:
len(text_clf['vect'].get_feature_names_out())

1540

In [78]:
# the computed IDF values for each observation in training
text_clf['tfidf'].idf_

array([3.97041447, 1.8303483 , 3.97041447, ..., 3.97041447, 3.97041447,
       3.05412373])

In [79]:
# an estimate of the importance of each feature used by the classifier (ie weight of each word)
text_clf['clf'].feature_importances_

array([0.        , 0.00498709, 0.        , ..., 0.        , 0.0010381 ,
       0.00120001])

In [81]:
pd.DataFrame({"feature": text_clf['vect'].get_feature_names_out(), 
                           'importance': text_clf['clf'].feature_importances_}).sort_values(by=['importance'], ascending=False)

Unnamed: 0,feature,importance
1449,were,0.017177
692,just,0.016744
859,my,0.016372
1529,you,0.014092
479,for,0.013315
...,...,...
714,landing,0.000000
713,land,0.000000
712,lady,0.000000
711,knеw,0.000000


In [82]:
y_pred = text_clf.predict(df_test['lemmatize_lyrics'])

In [86]:
y_proba = text_clf.predict_proba(df_test[text_type])

In [94]:
df_test['pred'] = y_pred
df_test['proba'] = [item[1] for item in y_proba]

In [44]:
tfv = TfidfVectorizer(ngram_range = (1,1))

In [47]:
tfv.fit_transform(df['lyrics'].apply(lambda x: re.sub(',', '', x).lower()))

<76x2222 sparse matrix of type '<class 'numpy.float64'>'
	with 9618 stored elements in Compressed Sparse Row format>

In [50]:
tfv.get_feature_names()

['2am',
 '45',
 '90s',
 'abigail',
 'about',
 'above',
 'absent',
 'accident',
 'account',
 'ache',
 'achievement',
 'achilles',
 'aching',
 'acid',
 'across',
 'act',
 'acted',
 'actin',
 'actress',
 'actually',
 'addressed',
 'admitted',
 'affair',
 'affairyou',
 'affection',
 'afraid',
 'after',
 'again',
 'age',
 'ago',
 'ah',
 'ain',
 'aint',
 'air',
 'airport',
 'album',
 'align',
 'alive',
 'all',
 'alley',
 'alls',
 'almost',
 'alone',
 'along',
 'already',
 'alright',
 'also',
 'always',
 'am',
 'amber',
 'amount',
 'an',
 'and',
 'angel',
 'angels',
 'another',
 'answer',
 'anticipatin',
 'any',
 'anymore',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'apart',
 'apartment',
 'apologies',
 'apology',
 'applauded',
 'arcade',
 'are',
 'arent',
 'arm',
 'armor',
 'arms',
 'army',
 'around',
 'art',
 'arе',
 'as',
 'ask',
 'asked',
 'askin',
 'asking',
 'asks',
 'asleep',
 'assume',
 'at',
 'attached',
 'attention',
 'attitude',
 'autumn',
 'awake',
 'away',
 'babe',
 'baby',