In [1]:
import os
import pandas as pd
import re

In [54]:
# !pip install nltk
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [72]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [2]:
file_path = 'taylor_swift_data_kaggle/Albums/'

In [3]:
albums = ['Fearless_TaylorsVersion_', 'Red_TaylorsVersion_', 'evermore_deluxeversion_']

In [4]:
# method to extract from every album
def create_df(file_path, album_name):
    dir_path = f'{file_path}{album_name}'
    data = [] # create empty dataframe for this album

    # go through every file in this folder
    for filename in os.listdir(dir_path):
        if filename.endswith('.txt'):
            song_path = os.path.join(dir_path, filename) # file path for this song.txt

        with open(song_path, 'r') as file:
            file_content = file.read()

        data.append({'album_title': album_name, 'song_title': filename, 'lyrics': file_content}) # create entry for song and lyrics observation

    df = pd.DataFrame(data) # append to dataframe
    return df

In [28]:
df = pd.DataFrame()
for i in albums:
    df = pd.concat([df, create_df(file_path, i)], ignore_index = True)

In [29]:
df

Unnamed: 0,album_title,song_title,lyrics
0,Fearless_TaylorsVersion_,TheWayILovedYou_TaylorsVersion_.txt,42 ContributorsTranslationsTürkçeСрпскиEspañol...
1,Fearless_TaylorsVersion_,Fifteen_TaylorsVersion_.txt,51 ContributorsTranslationsTürkçeEspañolСрпски...
2,Fearless_TaylorsVersion_,DontYou_TaylorsVersion__FromtheVault_.txt,62 ContributorsTranslationsTürkçeHrvatskiPortu...
3,Fearless_TaylorsVersion_,TheOtherSideoftheDoor_TaylorsVersion_.txt,33 ContributorsTranslationsTürkçeСрпскиPortugu...
4,Fearless_TaylorsVersion_,JumpThenFall_TaylorsVersion_.txt,28 ContributorsTranslationsСрпскиEspañolPortug...
...,...,...,...
71,evermore_deluxeversion_,goldrush.txt,145 ContributorsTranslationsTürkçeEspañolRomân...
72,evermore_deluxeversion_,rightwhereyouleftme.txt,111 ContributorsTranslationsTürkçeEspañolItali...
73,evermore_deluxeversion_,nobody_nocrime.txt,160 ContributorsTranslationsTürkçeEspañolСрпск...
74,evermore_deluxeversion_,champagneproblems.txt,200 ContributorsTranslationsTürkçeEspañolRomân...


In [27]:
def clean_text(text):
    # 1. Remove everything before and including the word "Lyrics"
    text = re.sub(r".*?Lyrics", "", text)
    
    # 2. Remove any number (if present) followed by "Embed" at the end
    text = re.sub(r"\s*\d*\s*Embed\s*$", "", text)
    
    # 3. Replace newlines with spaces
    text = re.sub(r'\s+', ' ', text)
    
    # 4. Remove the extra slashes from quotes in songs
    text = re.sub(r"\'", "", text)
    
    # 5. Remove the [Intro], [Chorus], etc. informaiton
    text = re.sub(r'\[.*?\]', '', text)
    
    return text

In [8]:
def clean_title(text):
    text = re.sub(r'_(.*?)_', r'(\1)', text)
    text = re.sub('.txt', '', text)
    return text

In [69]:
# lemmatize is not as formulaic as stem 
# (so it takes in more consideration of the word rather than just stemming to the root)
def lemmatize_text(text):
    return ' '.join(lemmatizer.lemmatize(w) for w in (re.sub(',', '', text).lower()).split())

In [30]:
# clean dataframe
df['lyrics'] = df['lyrics'].apply(clean_text)
df['song_title'] = df['song_title'].apply(clean_title)
df['lemmatize_lyrics'] = df['lyrics'].apply(lemmatize_text)

In [71]:
df

Unnamed: 0,album_title,song_title,lyrics,lemmatize_lyrics
0,Fearless_TaylorsVersion_,TheWayILovedYou(TaylorsVersion),He is sensible and so incredible And all my s...,he is sensible and so incredible and all my si...
1,Fearless_TaylorsVersion_,Fifteen(TaylorsVersion),You take a deep breath and you walk through t...,you take a deep breath and you walk through th...
2,Fearless_TaylorsVersion_,DontYou(TaylorsVersion)(FromtheVault),"Hey, I knew Id run into you somewhere Its bee...",hey i knew id run into you somewhere it been a...
3,Fearless_TaylorsVersion_,TheOtherSideoftheDoor(TaylorsVersion),In the heat of the fight I walked away Ignori...,in the heat of the fight i walked away ignorin...
4,Fearless_TaylorsVersion_,JumpThenFall(TaylorsVersion),I like the way you sound in the mornin Were o...,i like the way you sound in the mornin were on...
...,...,...,...,...
71,evermore_deluxeversion_,goldrush,"Gleaming, twinkling Eyes like sinking ships o...",gleaming twinkling eye like sinking ship on wa...
72,evermore_deluxeversion_,rightwhereyouleftme,"Friends break up, friends get married Strange...",friend break up friend get married stranger ge...
73,evermore_deluxeversion_,nobody_nocrime,He did it He did it Estes a friend of mine W...,he did it he did it estes a friend of mine we ...
74,evermore_deluxeversion_,champagneproblems,You booked the night train for a reason So yo...,you booked the night train for a reason so you...


In [None]:
# TF-IDF with RandomForestClassifier

In [13]:
# pipline to chain multiple steps in a workflow (recipe)
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,1))), # feature extraction: transforms text into bag of words model
    ('tfidf', TfidfTransformer()), # tf-idf: transfroms the count matrix into tf-idf
    ('clf', RandomForestClassifier()) # ML: random forest to classify with decission trees 
    ])

In [73]:
df_train, df_test = train_test_split(df, test_size=0.5) # each are just subsets

In [75]:
# train to predict album based on the lemmatized lyrics using .fit(X, Y)
text_clf.fit(df_train['lemmatize_lyrics'], df_train['album_title'])

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier())])

In [76]:
text_clf['vect'].get_feature_names_out()[100:110]

array(['begging', 'begin', 'beginning', 'bein', 'being', 'believe',
       'believed', 'believing', 'belong', 'belt'], dtype=object)

In [77]:
len(text_clf['vect'].get_feature_names_out())

1540

In [78]:
# the computed IDF values for each observation in training
text_clf['tfidf'].idf_

array([3.97041447, 1.8303483 , 3.97041447, ..., 3.97041447, 3.97041447,
       3.05412373])

In [79]:
# an estimate of the importance of each feature used by the classifier (ie weight of each word)
text_clf['clf'].feature_importances_

array([0.        , 0.00498709, 0.        , ..., 0.        , 0.0010381 ,
       0.00120001])

In [81]:
pd.DataFrame({"feature": text_clf['vect'].get_feature_names_out(), 
                           'importance': text_clf['clf'].feature_importances_}).sort_values(by=['importance'], ascending=False)

Unnamed: 0,feature,importance
1449,were,0.017177
692,just,0.016744
859,my,0.016372
1529,you,0.014092
479,for,0.013315
...,...,...
714,landing,0.000000
713,land,0.000000
712,lady,0.000000
711,knеw,0.000000


In [82]:
y_pred = text_clf.predict(df_test['lemmatize_lyrics'])

In [86]:
y_proba = text_clf.predict_proba(df_test[text_type])

In [94]:
df_test['pred'] = y_pred
df_test['proba'] = [item[1] for item in y_proba]

In [95]:
df_test

Unnamed: 0,album_title,song_title,lyrics,lemmatize_lyrics,pred,proba
36,Red_TaylorsVersion_,WeAreNeverEverGettingBackTogether(TaylorsVersion),"I remember when we broke up The first time, s...",i remember when we broke up the first time say...,Red_TaylorsVersion_,0.43
48,Red_TaylorsVersion_,SadBeautifulTragic(TaylorsVersion),"Long handwritten notes, deep in your pocket W...",long handwritten note deep in your pocket word...,Red_TaylorsVersion_,0.4
40,Red_TaylorsVersion_,NothingNew(TaylorsVersion)(FromtheVault),"They tell you while youre young ""Girls, go ou...","they tell you while youre young ""girls go out ...",Red_TaylorsVersion_,0.39
49,Red_TaylorsVersion_,Starlight(TaylorsVersion),"I said, ""Oh my, what a marvelous tune"" It was...","i said ""oh my what a marvelous tune"" it wa the...",Red_TaylorsVersion_,0.53
6,Fearless_TaylorsVersion_,ByeByeBaby(TaylorsVersion)(FromtheVault),It wasnt just like a movie The rain didnt soa...,it wasnt just like a movie the rain didnt soak...,Red_TaylorsVersion_,0.42
8,Fearless_TaylorsVersion_,WeWereHappy(TaylorsVersion)(FromtheVault),We used to walk along the streets When the po...,we used to walk along the street when the porc...,Red_TaylorsVersion_,0.45
35,Red_TaylorsVersion_,TheVeryFirstNight(TaylorsVersion)(FromtheVault),I wish I could fly Id pick you up and wed go ...,i wish i could fly id pick you up and wed go b...,Red_TaylorsVersion_,0.51
2,Fearless_TaylorsVersion_,DontYou(TaylorsVersion)(FromtheVault),"Hey, I knew Id run into you somewhere Its bee...",hey i knew id run into you somewhere it been a...,evermore_deluxeversion_,0.35
14,Fearless_TaylorsVersion_,Breathe(TaylorsVersion),I see your face in my mind as I drive away Ca...,i see your face in my mind a i drive away caus...,Red_TaylorsVersion_,0.39
52,Red_TaylorsVersion_,ForeverWinter(TaylorsVersion)(FromtheVault),He says he doesnt believe anything much he he...,he say he doesnt believe anything much he hear...,Fearless_TaylorsVersion_,0.39


In [44]:
tfv = TfidfVectorizer(ngram_range = (1,1))

In [47]:
tfv.fit_transform(df['lyrics'].apply(lambda x: re.sub(',', '', x).lower()))

<76x2222 sparse matrix of type '<class 'numpy.float64'>'
	with 9618 stored elements in Compressed Sparse Row format>

In [50]:
tfv.get_feature_names()

['2am',
 '45',
 '90s',
 'abigail',
 'about',
 'above',
 'absent',
 'accident',
 'account',
 'ache',
 'achievement',
 'achilles',
 'aching',
 'acid',
 'across',
 'act',
 'acted',
 'actin',
 'actress',
 'actually',
 'addressed',
 'admitted',
 'affair',
 'affairyou',
 'affection',
 'afraid',
 'after',
 'again',
 'age',
 'ago',
 'ah',
 'ain',
 'aint',
 'air',
 'airport',
 'album',
 'align',
 'alive',
 'all',
 'alley',
 'alls',
 'almost',
 'alone',
 'along',
 'already',
 'alright',
 'also',
 'always',
 'am',
 'amber',
 'amount',
 'an',
 'and',
 'angel',
 'angels',
 'another',
 'answer',
 'anticipatin',
 'any',
 'anymore',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'apart',
 'apartment',
 'apologies',
 'apology',
 'applauded',
 'arcade',
 'are',
 'arent',
 'arm',
 'armor',
 'arms',
 'army',
 'around',
 'art',
 'arе',
 'as',
 'ask',
 'asked',
 'askin',
 'asking',
 'asks',
 'asleep',
 'assume',
 'at',
 'attached',
 'attention',
 'attitude',
 'autumn',
 'awake',
 'away',
 'babe',
 'baby',