### In this notebook, we will determine the theme features for each song using the word representations from the Word2Vec models

In [1]:
import pandas as pd
import gensim
from collections import Counter
from ast import literal_eval
from nltk.corpus import stopwords
stopwords=set(stopwords.words('english'))

from gensim.test.utils import datapath
from gensim.models import KeyedVectors

Load the tokenized data and check its shape

In [2]:
trackLyricsFeaturesTokenized = pd.read_csv('./tracksLyricFeatures/tracksLyricFeaturesTokenzised.csv')
trackLyricsFeaturesTokenized.head()

Unnamed: 0,id,track,trackArtist,genre,lyrics,top_lang_identified,top_lang_identified_prob,regex_cleaned_lyrics,tokenized_lyrics
0,2,Food,AWOL,Hip-Hop,"The Waking Blind LyricsThe waking blind, embra...",__label__en,0.932648,"The waking blind, embraced the dark\nThey made...","['the', 'waking', 'blind', 'embraced', 'the', ..."
1,5,This World,AWOL,Hip-Hop,"E.T. Lyrics[Verse One: Mickey Factz]\nSee, I s...",__label__en,0.949876,"\nSee, I seen so many faces\nI've been to many...","['see', 'seen', 'so', 'many', 'faces', 've', '..."
2,10,Freeway,Kurt Vile,Pop,"Freeway LyricsI got a freeway in mind, let go ...",__label__en,0.960411,"I got a freeway in mind, let go of my head\nWa...","['got', 'freeway', 'in', 'mind', 'let', 'go', ..."
3,148,Blackout 2,Contradiction,Experimental,Blackout Lyrics(Verse 1: Noelz Vedere)\nAnatom...,__label__en,0.900489,"\nAnatomic, catastrophic\nMy profits probably ...","['anatomic', 'catastrophic', 'my', 'profits', ..."
4,182,Jules Lost His Jewels,Ariel Pink's Haunted Graffiti,Rock,Jules Lost His Jewels Lyrics[Intro]\nC'est la ...,__label__en,0.726533,"\nC'est la vie, c'est la vie\nComme ci, comme ...","['est', 'la', 'vie', 'est', 'la', 'vie', 'comm..."


In [3]:
trackLyricsFeaturesTokenized.shape

(3954, 9)

In [4]:
trackLyricsFeaturesTokenized['tokenized_lyrics'] = trackLyricsFeaturesTokenized['tokenized_lyrics'].apply(literal_eval)
type(trackLyricsFeaturesTokenized.iloc[0,8])

list

The remove_stopwords() function is remove the stop words from the tokenized lyrics

In [5]:
def remove_stopwords(data):
    temp_list = []
    for word in data:
        if word.lower() not in stopwords:
            temp_list.append(word)
    return temp_list

Call remove_stopwords() on the tokenized_lyrics columns

In [8]:
trackLyricsFeaturesTokenized['tokenized_lyrics_no_stop'] = trackLyricsFeaturesTokenized['tokenized_lyrics'].apply(remove_stopwords)

In [None]:
## tokenized lyrics with stopwords
print(trackLyricsFeaturesTokenized.iloc[0,8])

['the', 'waking', 'blind', 'embraced', 'the', 'dark', 'they', 'made', 'us', 'fight', 'for', 'the', 'answers', 'they', 'made', 'us', 'they', 'made', 'us', 'fight', 'fight', 'fight', 'for', 'the', 'answers', 'they', 'made', 'us', 'fight', 'time', 'is', 'up', 'the', 'lines', 'are', 'drawn', 'in', 'sand', 'it', 'like', 'ritual', 'cleaning', 'faith', 'against', 'them', 'all', 'some', 'kind', 'of', 'wicked', 'pressure', 'growing', 'below', 'bound', 'to', 'breach', 'the', 'surface', 'gloves', 'are', 'off', 'the', 'blinders', 'tighten', 'misguided', 'hands', 'leading', 'the', 'awoken', 'food', 'for', 'thought', 'once', 'your', 'brothers', 'now', 'the', 'wretches', 'the', 'dregs', 'the', 'scum', 'and', 'the', 'bastards', 'eyes', 'up', 'grab', 'your', 'pitchforks', 'just', 'before', 'the', 'dawn', 'they', 're', 'out', 'of', 'time', 'and', 'out', 'of', 'touch', 'and', 'out', 'of', 'luck', 'with', 'the', 'air', 'that', 'we', 'breath', 'and', 'the', 'ground', 'beneath', 'their', 'feet', 'mask', 'of

In [9]:
## tokenized lyrics without stopwords
## It reduced the tokens considerably
print(trackLyricsFeaturesTokenized.iloc[0,9])

['waking', 'blind', 'embraced', 'dark', 'made', 'us', 'fight', 'answers', 'made', 'us', 'made', 'us', 'fight', 'fight', 'fight', 'answers', 'made', 'us', 'fight', 'time', 'lines', 'drawn', 'sand', 'like', 'ritual', 'cleaning', 'faith', 'kind', 'wicked', 'pressure', 'growing', 'bound', 'breach', 'surface', 'gloves', 'blinders', 'tighten', 'misguided', 'hands', 'leading', 'awoken', 'food', 'thought', 'brothers', 'wretches', 'dregs', 'scum', 'bastards', 'eyes', 'grab', 'pitchforks', 'dawn', 'time', 'touch', 'luck', 'air', 'breath', 'ground', 'beneath', 'feet', 'mask', 'virtue', 'hide', 'behind', 'feed', 'lies', 'bowing', 'pressure', 'resolve', 'never', 'allow', 'fear', 'hold', 'us', 'faith', 'designed', 'deny', 'deprive', 'waking', 'blind', 'embraced', 'dark', 'made', 'us', 'fight', 'answers', 'coming', 'whipping', 'winds', 'winds', 'blow', 'house', 'straw', 'away', 'built', 'tear', 'built', 'tear', 'might', 'also', 'likethere', 'second', 'coming', 'playing', 'heart', 'something', 'admit'

The gettop5() function is to find the 5 most frequent non-stopword words in the lyrics which will determine the theme features

In [10]:
def gettop5(lyrics):
    x = list(Counter(lyrics).most_common())
    result = []
    try:
        for i in range(0,5):
            result.append(x[i][0])

        return result
    except:
        print(lyrics)
        return None

Call gettop5() on the 'tokenized_lyrics_no_stop' column

In [12]:
trackLyricsFeaturesTokenized['tokenized_lyrics_top5_tokens'] = trackLyricsFeaturesTokenized['tokenized_lyrics_no_stop'].apply(gettop5)

['slip', 'might', 'also', 'likeembed']
['transcribedyou', 'might', 'also', 'likeembed']
['embed']
['might', 'also', 'likeembed']
['might', 'also', 'likeembed']
['might', 'also', 'likeembed']
['might', 'also', 'like', 'embed']


In [13]:
# 'tokenized_lyrics_top5_tokens' column
print(trackLyricsFeaturesTokenized.iloc[0,10])

['us', 'fight', 'sky', 'made', 'blind']


Remove songs with bad lyrics i.e. lyrics with less than 5 words

In [14]:
# trackLyricsFeaturesTokenized_good_lyrics = trackLyricsFeaturesTokenized[not trackLyricsFeaturesTokenized['tokenized_lyrics_top5_tokens'].isna()]
trackLyricsFeaturesTokenized_good_lyrics = trackLyricsFeaturesTokenized[trackLyricsFeaturesTokenized['tokenized_lyrics_top5_tokens'].notna()]
trackLyricsFeaturesTokenized_good_lyrics.head()

Unnamed: 0,id,track,trackArtist,genre,lyrics,top_lang_identified,top_lang_identified_prob,regex_cleaned_lyrics,tokenized_lyrics,tokenized_lyrics_no_stop,tokenized_lyrics_top5_tokens
0,2,Food,AWOL,Hip-Hop,"The Waking Blind LyricsThe waking blind, embra...",__label__en,0.932648,"The waking blind, embraced the dark\nThey made...","[the, waking, blind, embraced, the, dark, they...","[waking, blind, embraced, dark, made, us, figh...","[us, fight, sky, made, blind]"
1,5,This World,AWOL,Hip-Hop,"E.T. Lyrics[Verse One: Mickey Factz]\nSee, I s...",__label__en,0.949876,"\nSee, I seen so many faces\nI've been to many...","[see, seen, so, many, faces, ve, been, to, man...","[see, seen, many, faces, many, places, get, ti...","[world, say, see, get, girl]"
2,10,Freeway,Kurt Vile,Pop,"Freeway LyricsI got a freeway in mind, let go ...",__label__en,0.960411,"I got a freeway in mind, let go of my head\nWa...","[got, freeway, in, mind, let, go, of, my, head...","[got, freeway, mind, let, go, head, walk, line...","[got, know, go, dump, freeway]"
3,148,Blackout 2,Contradiction,Experimental,Blackout Lyrics(Verse 1: Noelz Vedere)\nAnatom...,__label__en,0.900489,"\nAnatomic, catastrophic\nMy profits probably ...","[anatomic, catastrophic, my, profits, probably...","[anatomic, catastrophic, profits, probably, pa...","[better, blackout, gonna, take, never]"
4,182,Jules Lost His Jewels,Ariel Pink's Haunted Graffiti,Rock,Jules Lost His Jewels Lyrics[Intro]\nC'est la ...,__label__en,0.726533,"\nC'est la vie, c'est la vie\nComme ci, comme ...","[est, la, vie, est, la, vie, comme, ci, comme,...","[est, la, vie, est, la, vie, comme, ci, comme,...","[est, la, vie, comme, ci]"


In [15]:
trackLyricsFeaturesTokenized_good_lyrics.shape

(3947, 11)

Seperate the 5 theme words in 5 different columns

In [16]:
trackLyricsFeaturesTokenized_good_lyrics[['theme_token_1','theme_token_2','theme_token_3','theme_token_4','theme_token_5']] = pd.DataFrame(trackLyricsFeaturesTokenized_good_lyrics.tokenized_lyrics_top5_tokens.tolist(), index=trackLyricsFeaturesTokenized_good_lyrics.index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trackLyricsFeaturesTokenized_good_lyrics[['theme_token_1','theme_token_2','theme_token_3','theme_token_4','theme_token_5']] = pd.DataFrame(trackLyricsFeaturesTokenized_good_lyrics.tokenized_lyrics_top5_tokens.tolist(), index=trackLyricsFeaturesTokenized_good_lyrics.index)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trackLyricsFeaturesTokenized_good_lyrics[['theme_token_1','theme_token_2','theme_token_3','theme_token_4','theme_token_5']] = pd.DataFrame(trackLyricsFeaturesTokenized_good_lyrics.tokenized_lyrics_

In [17]:
trackLyricsFeaturesTokenized_good_lyrics.head(4)

Unnamed: 0,id,track,trackArtist,genre,lyrics,top_lang_identified,top_lang_identified_prob,regex_cleaned_lyrics,tokenized_lyrics,tokenized_lyrics_no_stop,tokenized_lyrics_top5_tokens,theme_token_1,theme_token_2,theme_token_3,theme_token_4,theme_token_5
0,2,Food,AWOL,Hip-Hop,"The Waking Blind LyricsThe waking blind, embra...",__label__en,0.932648,"The waking blind, embraced the dark\nThey made...","[the, waking, blind, embraced, the, dark, they...","[waking, blind, embraced, dark, made, us, figh...","[us, fight, sky, made, blind]",us,fight,sky,made,blind
1,5,This World,AWOL,Hip-Hop,"E.T. Lyrics[Verse One: Mickey Factz]\nSee, I s...",__label__en,0.949876,"\nSee, I seen so many faces\nI've been to many...","[see, seen, so, many, faces, ve, been, to, man...","[see, seen, many, faces, many, places, get, ti...","[world, say, see, get, girl]",world,say,see,get,girl
2,10,Freeway,Kurt Vile,Pop,"Freeway LyricsI got a freeway in mind, let go ...",__label__en,0.960411,"I got a freeway in mind, let go of my head\nWa...","[got, freeway, in, mind, let, go, of, my, head...","[got, freeway, mind, let, go, head, walk, line...","[got, know, go, dump, freeway]",got,know,go,dump,freeway
3,148,Blackout 2,Contradiction,Experimental,Blackout Lyrics(Verse 1: Noelz Vedere)\nAnatom...,__label__en,0.900489,"\nAnatomic, catastrophic\nMy profits probably ...","[anatomic, catastrophic, my, profits, probably...","[anatomic, catastrophic, profits, probably, pa...","[better, blackout, gonna, take, never]",better,blackout,gonna,take,never


In [18]:
trackLyricsFeaturesTokenized_good_lyrics_cp = trackLyricsFeaturesTokenized_good_lyrics

Load the Word2Vec model of 250 vector size

In [69]:
lyrics_dictionary_model = gensim.models.Word2Vec.load("./tracksLyricFeatures/Lyrics_Dictionary_Word2Vec.model")

In [70]:
lyrics_dictionary_model.wv.most_similar("love")

[('passion', 0.4768434166908264),
 ('loved', 0.4057614803314209),
 ('happiness', 0.3845676779747009),
 ('lover', 0.3454592525959015),
 ('affection', 0.3384852111339569),
 ('dreams', 0.33796426653862),
 ('hate', 0.33684098720550537),
 ('lovin', 0.3353321850299835),
 ('hope', 0.3316356837749481),
 ('dance', 0.3286251723766327)]

vectorize() function to get the vectorized value of the theme words using the Word2Vec model

In [71]:
def vectorize(x):
    return lyrics_dictionary_model.wv[x]

Call vectorize() and save the returned values in new columns

In [73]:
arr = ['theme_token_1','theme_token_2','theme_token_3','theme_token_4','theme_token_5']

for i in arr:
    i_vec = i+"_vec"
    trackLyricsFeaturesTokenized_good_lyrics[i_vec] = trackLyricsFeaturesTokenized_good_lyrics[i].apply(vectorize)

trackLyricsFeaturesTokenized_good_lyrics.head(4)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trackLyricsFeaturesTokenized_good_lyrics[i_vec] = trackLyricsFeaturesTokenized_good_lyrics[i].apply(vectorize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trackLyricsFeaturesTokenized_good_lyrics[i_vec] = trackLyricsFeaturesTokenized_good_lyrics[i].apply(vectorize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view

Unnamed: 0,id,track,trackArtist,genre,lyrics,top_lang_identified,top_lang_identified_prob,regex_cleaned_lyrics,tokenized_lyrics,tokenized_lyrics_no_stop,...,theme_token_1,theme_token_2,theme_token_3,theme_token_4,theme_token_5,theme_token_1_vec,theme_token_2_vec,theme_token_3_vec,theme_token_4_vec,theme_token_5_vec
0,2,Food,AWOL,Hip-Hop,"The Waking Blind LyricsThe waking blind, embra...",__label__en,0.932648,"The waking blind, embraced the dark\nThey made...","[the, waking, blind, embraced, the, dark, they...","[waking, blind, embraced, dark, made, us, figh...",...,us,fight,sky,made,blind,"[-0.7368858, -0.48690912, -0.6822355, 0.237681...","[-0.65264344, -0.86627525, -0.95098966, -1.614...","[-1.9216352, 0.29534137, 0.22540647, -0.280238...","[0.9126966, -0.17673089, -2.4622324, -0.719030...","[0.29462576, -0.4674098, -0.42534876, 1.163112..."
1,5,This World,AWOL,Hip-Hop,"E.T. Lyrics[Verse One: Mickey Factz]\nSee, I s...",__label__en,0.949876,"\nSee, I seen so many faces\nI've been to many...","[see, seen, so, many, faces, ve, been, to, man...","[see, seen, many, faces, many, places, get, ti...",...,world,say,see,get,girl,"[3.1807659, -0.5182738, -2.3499858, 2.371013, ...","[0.7124076, -0.25377122, -0.56789577, -1.42382...","[-1.0426443, 2.3224962, -1.6050172, 1.5430597,...","[-1.1066152, -0.6201647, -1.9317, 1.1186424, -...","[2.0542653, -1.348139, 2.0537117, -1.4452268, ..."
2,10,Freeway,Kurt Vile,Pop,"Freeway LyricsI got a freeway in mind, let go ...",__label__en,0.960411,"I got a freeway in mind, let go of my head\nWa...","[got, freeway, in, mind, let, go, of, my, head...","[got, freeway, mind, let, go, head, walk, line...",...,got,know,go,dump,freeway,"[0.59052515, -1.4287226, -2.8578434, 0.3214650...","[0.06573162, 0.6512165, -2.1585948, -1.1851531...","[-0.5467437, -0.81637454, 0.032996584, 2.33412...","[0.3819906, -0.30425143, 1.3380872, -0.1257418...","[0.35344937, -0.31860176, 0.41144308, -0.01509..."
3,148,Blackout 2,Contradiction,Experimental,Blackout Lyrics(Verse 1: Noelz Vedere)\nAnatom...,__label__en,0.900489,"\nAnatomic, catastrophic\nMy profits probably ...","[anatomic, catastrophic, my, profits, probably...","[anatomic, catastrophic, profits, probably, pa...",...,better,blackout,gonna,take,never,"[-0.36204743, 0.86772096, -0.8309755, -1.83883...","[0.21543865, -0.9045766, 0.73628676, -0.296150...","[-1.3718919, -1.2975837, 0.0046721785, -2.2206...","[-1.1461269, -0.28182548, 0.0060873674, 0.9467...","[-0.32878348, 3.0885634, -0.984712, 0.9389313,..."


In [74]:
lyrics_dictionary_model.wv["us"]

array([-7.3688579e-01, -4.8690912e-01, -6.8223548e-01,  2.3768100e-01,
       -1.9561019e+00,  1.9184388e-01, -5.2804232e-01, -4.0862875e+00,
        2.5918359e-01, -4.3167898e-01, -1.9687415e+00, -1.7977568e+00,
        1.0085567e+00, -1.6839849e+00, -8.4392041e-02, -1.3700677e+00,
        3.4134750e+00,  1.2074479e+00,  1.1471809e-01, -9.5125008e-01,
        4.8479915e-01, -9.9488795e-02,  3.1531608e-01,  1.2446588e+00,
        1.2877927e+00, -1.6323663e-02, -6.3703501e-01,  2.7935192e-01,
       -4.0926725e-01,  1.7880853e+00,  1.3745763e+00, -5.8682066e-01,
       -2.2150755e+00,  5.3085464e-01,  1.1443494e-01, -4.4122070e-01,
       -8.8140202e-01, -2.4638457e+00, -4.8468471e-01,  1.9053366e+00,
        8.5188669e-01, -7.9409224e-01, -1.4330336e+00,  1.9398315e+00,
        1.4239583e-02,  1.3069561e+00, -2.0439343e+00, -1.4490843e+00,
       -1.1134715e+00,  7.0423394e-01,  7.8106147e-01, -8.9247674e-01,
        6.4587885e-01, -1.9743184e+00, -7.7339923e-01, -1.1423810e+00,
      

Verify length of theme token vectors

In [64]:
trackLyricsFeaturesTokenized_good_lyrics.iloc[0,17].shape

(250,)

In [75]:
trackLyricsFeaturesTokenized_good_lyrics.to_csv("./tracksLyricFeatures/tracksLyricThemeVectorized.csv")

Repeat the above process for 125 vector size Word2Vec model

In [20]:
lyrics_dictionary_model_125 = gensim.models.Word2Vec.load("./tracksLyricFeatures/Lyrics_Dictionary_Word2Vec_125.model")

In [22]:
trackLyricsFeaturesTokenized_good_lyrics = trackLyricsFeaturesTokenized_good_lyrics_cp
trackLyricsFeaturesTokenized_good_lyrics.head(4)

Unnamed: 0,id,track,trackArtist,genre,lyrics,top_lang_identified,top_lang_identified_prob,regex_cleaned_lyrics,tokenized_lyrics,tokenized_lyrics_no_stop,tokenized_lyrics_top5_tokens,theme_token_1,theme_token_2,theme_token_3,theme_token_4,theme_token_5
0,2,Food,AWOL,Hip-Hop,"The Waking Blind LyricsThe waking blind, embra...",__label__en,0.932648,"The waking blind, embraced the dark\nThey made...","[the, waking, blind, embraced, the, dark, they...","[waking, blind, embraced, dark, made, us, figh...","[us, fight, sky, made, blind]",us,fight,sky,made,blind
1,5,This World,AWOL,Hip-Hop,"E.T. Lyrics[Verse One: Mickey Factz]\nSee, I s...",__label__en,0.949876,"\nSee, I seen so many faces\nI've been to many...","[see, seen, so, many, faces, ve, been, to, man...","[see, seen, many, faces, many, places, get, ti...","[world, say, see, get, girl]",world,say,see,get,girl
2,10,Freeway,Kurt Vile,Pop,"Freeway LyricsI got a freeway in mind, let go ...",__label__en,0.960411,"I got a freeway in mind, let go of my head\nWa...","[got, freeway, in, mind, let, go, of, my, head...","[got, freeway, mind, let, go, head, walk, line...","[got, know, go, dump, freeway]",got,know,go,dump,freeway
3,148,Blackout 2,Contradiction,Experimental,Blackout Lyrics(Verse 1: Noelz Vedere)\nAnatom...,__label__en,0.900489,"\nAnatomic, catastrophic\nMy profits probably ...","[anatomic, catastrophic, my, profits, probably...","[anatomic, catastrophic, profits, probably, pa...","[better, blackout, gonna, take, never]",better,blackout,gonna,take,never


In [23]:
def vectorize_125(x):
    return lyrics_dictionary_model_125.wv[x]

In [24]:
arr = ['theme_token_1','theme_token_2','theme_token_3','theme_token_4','theme_token_5']

for i in arr:
    i_vec = i+"_vec"
    trackLyricsFeaturesTokenized_good_lyrics[i_vec] = trackLyricsFeaturesTokenized_good_lyrics[i].apply(vectorize_125)

trackLyricsFeaturesTokenized_good_lyrics.head(4)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trackLyricsFeaturesTokenized_good_lyrics[i_vec] = trackLyricsFeaturesTokenized_good_lyrics[i].apply(vectorize_125)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trackLyricsFeaturesTokenized_good_lyrics[i_vec] = trackLyricsFeaturesTokenized_good_lyrics[i].apply(vectorize_125)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returnin

Unnamed: 0,id,track,trackArtist,genre,lyrics,top_lang_identified,top_lang_identified_prob,regex_cleaned_lyrics,tokenized_lyrics,tokenized_lyrics_no_stop,...,theme_token_1,theme_token_2,theme_token_3,theme_token_4,theme_token_5,theme_token_1_vec,theme_token_2_vec,theme_token_3_vec,theme_token_4_vec,theme_token_5_vec
0,2,Food,AWOL,Hip-Hop,"The Waking Blind LyricsThe waking blind, embra...",__label__en,0.932648,"The waking blind, embraced the dark\nThey made...","[the, waking, blind, embraced, the, dark, they...","[waking, blind, embraced, dark, made, us, figh...",...,us,fight,sky,made,blind,"[0.644366, -1.8408995, -0.36924273, 0.48891515...","[-1.6862539, 2.0813553, -0.75358343, -0.413386...","[-0.16251208, 1.049491, 1.8228607, -2.8737357,...","[0.42628798, -0.22753428, -1.9290991, 1.566466...","[-0.7847691, 2.0510666, -2.613047, -0.6102007,..."
1,5,This World,AWOL,Hip-Hop,"E.T. Lyrics[Verse One: Mickey Factz]\nSee, I s...",__label__en,0.949876,"\nSee, I seen so many faces\nI've been to many...","[see, seen, so, many, faces, ve, been, to, man...","[see, seen, many, faces, many, places, get, ti...",...,world,say,see,get,girl,"[-0.13129893, -1.2346379, 2.8603253, 1.2410786...","[-0.5285235, 0.8967087, -1.753356, -0.55411255...","[-2.2611797, 2.8824058, -1.6390867, -2.0671778...","[0.2112657, 2.286089, -2.550711, 0.1377382, 2....","[0.19001627, 1.3540837, -0.6024195, -0.4582716..."
2,10,Freeway,Kurt Vile,Pop,"Freeway LyricsI got a freeway in mind, let go ...",__label__en,0.960411,"I got a freeway in mind, let go of my head\nWa...","[got, freeway, in, mind, let, go, of, my, head...","[got, freeway, mind, let, go, head, walk, line...",...,got,know,go,dump,freeway,"[-0.1946999, 2.6564999, -0.8294583, -0.4025004...","[-1.4553922, 3.8754973, -1.6874942, -1.1240348...","[-0.33381423, 3.078735, -1.3531779, 0.06115425...","[0.013161531, -0.9392986, -0.471028, 0.1687076...","[1.0677713, -0.119711965, 0.10461726, 0.206433..."
3,148,Blackout 2,Contradiction,Experimental,Blackout Lyrics(Verse 1: Noelz Vedere)\nAnatom...,__label__en,0.900489,"\nAnatomic, catastrophic\nMy profits probably ...","[anatomic, catastrophic, my, profits, probably...","[anatomic, catastrophic, profits, probably, pa...",...,better,blackout,gonna,take,never,"[0.4509398, 1.6939244, -1.8020959, -0.9481517,...","[0.19974507, 0.235193, 0.22889362, 0.3196816, ...","[-1.8374147, 1.4321458, 2.168696, -1.3679633, ...","[2.5032115, 1.12036, -2.1712477, -0.68210334, ...","[-2.0304005, 0.40059865, 0.18482238, -1.408464..."


Verify the length of the theme token vector

In [25]:
trackLyricsFeaturesTokenized_good_lyrics.iloc[0,17].shape

(125,)

Save the 125 vector size data

In [26]:
trackLyricsFeaturesTokenized_good_lyrics.to_csv("./tracksLyricFeatures/tracksLyricThemeVectorized_125Vec.csv")

This data will be combined the Audio MFCC features to get the final dataset to be used in the Genre detection models