# 1. Import Packages and Libraries

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import random
import gc
import os
import ast
import lyricsgenius as lg
import pickle
%matplotlib inline
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [19]:
pd.set_option('display.max_columns',None)

# 2. Language Detection EDA and Dataset Creation

#### Read in song data w/ language labels

In [33]:
lang_det_task = pd.read_csv('Original_Data/lyrics_artist_genre/lyrics-data.csv')
lang_det_task.dropna(inplace=True)
lang_det_task.index = np.arange(0,len(lang_det_task))
lang_det_task.head()

Unnamed: 0,ALink,SName,SLink,Lyric,language
0,/ivete-sangalo/,Arerê,/ivete-sangalo/arere.html,"Tudo o que eu quero nessa vida,\nToda vida, é\...",pt
1,/ivete-sangalo/,Se Eu Não Te Amasse Tanto Assim,/ivete-sangalo/se-eu-nao-te-amasse-tanto-assim...,Meu coração\nSem direção\nVoando só por voar\n...,pt
2,/ivete-sangalo/,Céu da Boca,/ivete-sangalo/chupa-toda.html,É de babaixá!\nÉ de balacubaca!\nÉ de babaixá!...,pt
3,/ivete-sangalo/,Quando A Chuva Passar,/ivete-sangalo/quando-a-chuva-passar.html,Quando a chuva passar\n\nPra quê falar\nSe voc...,pt
4,/ivete-sangalo/,Sorte Grande,/ivete-sangalo/sorte-grande.html,A minha sorte grande foi você cair do céu\nMin...,pt


#### Map language codes to language names

In [34]:
url = 'https://www.loc.gov/standards/iso639-2/php/English_list.php'

header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}

r = requests.get(url, headers=header)

dfs = pd.read_html(r.text)[1].iloc[:,[0,-1]]
dfs.dropna(inplace=True)
dfs.set_index('ISO 639-1',inplace=True)
code_to_lang = dfs.to_dict()['English Name of Language']
lang_det_task['language full'] = lang_det_task['language'].map(code_to_lang)
top_7_langs = pd.concat([lang_det_task['language full'].value_counts().head(7),
                    (lang_det_task['language full'].value_counts()/len(lang_det_task)).head(7)],axis=1)
top_7_langs.columns = ['Count','Proportion of Total']
display(top_7_langs)
print('Number of Unique Languages / Classes: ' + str(lang_det_task['language'].nunique()))


Unnamed: 0,Count,Proportion of Total
English,191812,0.525089
Portuguese,157393,0.430867
Spanish,9917,0.027148
Kinyarwanda,1679,0.004596
Italian,1432,0.00392
French,1225,0.003353
German,844,0.00231


Number of Unique Languages / Classes: 52


#### Labels will be retained for songs within top 7 of proportion of total counts, all else will be mapped to other

In [35]:
lang_det_task['language label'] = lang_det_task['language full'].apply(lambda x:x if x in top_7_langs.index else 'Other')
print('Label Proportions')
pd.concat([lang_det_task['language label'].value_counts(),
                    (lang_det_task['language label'].value_counts()/len(lang_det_task))],axis=1)

Label Proportions


Unnamed: 0,language label,language label.1
English,191812,0.525089
Portuguese,157393,0.430867
Spanish,9917,0.027148
Kinyarwanda,1679,0.004596
Italian,1432,0.00392
French,1225,0.003353
Other,992,0.002716
German,844,0.00231


#### Keep 40K for training, 10K for val/test

In [36]:
random.seed(50)
sample = random.sample(list(lang_det_task.index),k=50000)
lang_train_sample = lang_det_task.iloc[sample[:40000]]
lang_test_sample = lang_det_task.iloc[sample[10000:]]

#### View Label Proportions and Save as CSV

In [37]:
print('Training Set label proportions')
pd.concat([lang_train_sample['language label'].value_counts(),
                    (lang_train_sample['language label'].value_counts()/len(lang_train_sample))],axis=1)

Training Set label proportions


Unnamed: 0,language label,language label.1
English,21031,0.525775
Portuguese,17237,0.430925
Spanish,1062,0.02655
Kinyarwanda,187,0.004675
Italian,145,0.003625
French,142,0.00355
German,102,0.00255
Other,94,0.00235


In [38]:
print('Val/Test Set label proportions')
pd.concat([lang_test_sample['language label'].value_counts(),
                    (lang_test_sample['language label'].value_counts()/len(lang_test_sample))],axis=1)

Val/Test Set label proportions


Unnamed: 0,language label,language label.1
English,20842,0.52105
Portuguese,17430,0.43575
Spanish,1031,0.025775
Kinyarwanda,197,0.004925
French,145,0.003625
Italian,143,0.003575
Other,110,0.00275
German,102,0.00255


In [39]:
lang_train_sample.to_csv('Language_Detection/Train_Test_Data/train.csv',index=False)
lang_test_sample.to_csv('Language_Detection/Train_Test_Data/test.csv',index=False)

In [40]:
del lang_det_task,lang_test_sample,lang_train_sample,dfs,url,header,code_to_lang,sample
gc.collect()

1073

# 3. Genre Classification EDA and Dataset Creation

#### Function to pull song lyrics, around Genius API

In [41]:
genius_api = lg.Genius(os.getenv('genius_token'))
genius_api.verbose = False

def get_song_lyrics(song_name,artist_name,genius_api):
    try:
        lyrics = genius_api.search_song(song_name,artist_name).lyrics
    except:
        lyrics = ''
    
    return lyrics

#### Create Dataset of Songs w/ Song Lyrics, Audio Features, Song Metadata (Artist, Track Name), Genre (major genre, sub-genre)

In [None]:
#filepaths
fps = ['Original_Data/multi_genre_playlist/' + x for x in os.listdir('Original_Data/multi_genre_playlist/')]
#major genres predetermined by file separation
major_genres = ['Blues','Metal','Pop','Hip Hop','Indie','Alternative','Rock']
#dataframe to store major genre dataset
major_genre_data = pd.DataFrame()
for num in range(len(fps)):
    #genre specific songs
    df = pd.read_csv(fps[num])
    #clean genre data, store string of list as list, 
    df['Genres'] = df['Genres'].apply(lambda x:ast.literal_eval(x) if x!='[]' else np.nan)
    #Pull lyrics
    df['Lyrics'] = df.apply(lambda x:get_song_lyrics(x['Track Name'],x['Artist Name'],genius_api),axis=1)
    #create major genre label according to file separation
    df['Major Genre'] = major_genres[num]
    major_genre_data = pd.concat([major_genre_data,df],axis=0)
    print(fps[num].split('/')[-1] + ': ' + str(len(df)))

#### Clean Sub-Genre Data and Create Sub-Genre Labels

Explanation:
While each song will typically belong to a singular major genre, that genre itself is partitioned into sub-genres that depict different influences and ultimately variant music within the same major genre. Individual songs can belong to multiple different sub-genres, which indicates a multilabel classification problem. This is an extension of supervised learning with the primary difference that instead of a point belonging to a single class or containing a single target label, it has multiple labels. There are a few several approaches for handling this sort of problem.
- Option 1: Treat each unique multilabel sequence as it's own class and reformat as a multiclass classification problem. The benefit of this approach is that it is less computationally expensive and does not require aggregation of results across multiple models. The issues with this approach are the assumption of distinctnesss among classes (violated), a much smaller proportion of examples per class, forced class imbalances, and the inability to predict a multilabel sequence outside of the unique sequences it was trained on.
- Option 2: Transform each label into an k dimensional vector where k corresponds to the # of individual unique sub-genres. Then, equally disperse overall density (=1) across all the present labels for each example, forming a label vector. A neural architecture for this problem will have k output nodes and a softmax across all output node outputs will be the equivalent of predicting the density of each class in a multilabel scheme. This problem setup is well suited for DL based models due to the vector output but requires additional logic for determining which labels to actually select after predicting density of each label.
- **Option 3: Binary relevence method** (will likely go with this one). Recast the problem as a binary classification problem where we have k models that independently predict the likelihood of each label for a given example, allowing us to return a sequence of label's for each example. This allows us to optimize for the ability of predicting each label independently, and does not constrain us to label sequences that have already been seen before. It also does not assume equal densities of each present label and therefore does not require recasting our labels as a label vector. However, this method is computationally more expensive because it involves building an ensemble of k models and aggregating results to determine classes as opposed to building one model that either predicts multilabel sequences or the density of each label.

#### Drop songs without available lyrics via genius, drop duplicates, reindex

In [None]:
major_genre_data = major_genre_data[major_genre_data['Lyrics'] != '']
major_genre_data.drop('Playlist',axis=1,inplace=True)
major_genre_data.index = np.arange(0,len(major_genre_data))
genre_series = major_genre_data['Genres'].copy()
major_genre_data.drop('Genres',axis=1,inplace=True)
major_genre_data.drop_duplicates(inplace=True)
major_genre_data['Genres'] = genre_series.iloc[major_genre_data.index]
major_genre_data.index = np.arange(0,len(major_genre_data))

#### Get Top 10 Sub-Genres for Each Genre

In [None]:
top_genres = []
for genre in major_genres:
    subset = major_genre_data[major_genre_data['Major Genre'] == genre].copy()
    sub_genres = []
    for row_index in subset.index:
        try:
            sub_genres.extend(subset.loc[row_index,'Genres'])
        except:
            num = num
            
    top_10 = ((pd.Series(sub_genres).value_counts()).head(10)).index
    top_genres.extend(top_10)

top_genres = list(set(top_genres))

#### Create Sub Genre Columns

In [None]:
for genre in top_genres:
    label = 'Sub-Genre: ' + genre
    sub_genre_labels = []
    for num in range(len(major_genre_data)):
        try:
            obs_genre_list = major_genre_data.loc[num,'Genres']
            if genre in obs_genre_list:
                append_value = 1
            else:
                append_value = 0
        except:
            append_value
        sub_genre_labels.append(append_value)
        
    major_genre_data[label] = sub_genre_labels

#### Create Train/Test Datasets for genre and sub-genre classification

In [None]:
random.seed(50)
train_indices = random.sample(list(major_genre_data.index),k=16500)
test_indices = [x for x in major_genre_data.index if x not in train_indices]
genre_sub_genre_train = major_genre_data.iloc[train_indices].copy()
genre_sub_genre_test = major_genre_data.iloc[test_indices].copy()

In [None]:
pickle.dump(genre_sub_genre_train,open('Genre_Classification/Train_Test_Data/genre_sub_genre_train.pkl','wb'))
pickle.dump(genre_sub_genre_test,open('Genre_Classification/Train_Test_Data/genre_sub_genre_test.pkl','wb'))

In [42]:
pickle.load(open('Genre_Classification/Train_Test_Data/genre_sub_genre_train.pkl','rb'))

Unnamed: 0,Artist Name,Track Name,Popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,uri,track_href,analysis_url,duration_ms,time_signature,Lyrics,Major Genre,Genres,Sub-Genre: electropop,Sub-Genre: blues rock,Sub-Genre: album rock,Sub-Genre: alternative dance,Sub-Genre: permanent wave,Sub-Genre: blues,Sub-Genre: new rave,Sub-Genre: rock,Sub-Genre: rap,Sub-Genre: acoustic blues,Sub-Genre: modern blues rock,Sub-Genre: post-teen pop,Sub-Genre: modern blues,Sub-Genre: hard rock,Sub-Genre: east coast hip hop,Sub-Genre: alternative metal,Sub-Genre: art pop,Sub-Genre: metal,Sub-Genre: dance pop,Sub-Genre: trap,Sub-Genre: israeli hip hop,Sub-Genre: metalcore,Sub-Genre: alternative rock,Sub-Genre: melodic metalcore,Sub-Genre: pop,Sub-Genre: mizrahi,Sub-Genre: modern rock,Sub-Genre: electric blues,Sub-Genre: pop rap,Sub-Genre: alt z,Sub-Genre: traditional blues,Sub-Genre: israeli pop,Sub-Genre: dance-punk,Sub-Genre: indie rock,Sub-Genre: underground hip hop,Sub-Genre: hip hop,Sub-Genre: art rock,Sub-Genre: rap rock,Sub-Genre: gangster rap,Sub-Genre: modern alternative rock,Sub-Genre: southern hip hop,Sub-Genre: nu metal,Sub-Genre: israeli mediterranean,Sub-Genre: thrash metal,Sub-Genre: pop rock,Sub-Genre: chicago blues,Sub-Genre: indie pop,Sub-Genre: classic rock,Sub-Genre: hardcore hip hop
16303,Escape the Fate,Unbreakable,47,0.563,0.820,1,-6.740,0,0.0489,0.00546,0.000579,0.1910,0.558,123.035,1oaQMlC6Cry1wu8v0q4Cfx,spotify:track:1oaQMlC6Cry1wu8v0q4Cfx,https://api.spotify.com/v1/tracks/1oaQMlC6Cry1...,https://api.spotify.com/v1/audio-analysis/1oaQ...,182470,4,Unbreakable Lyrics[Intro]\nGo take it all\nYou...,Rock,"[alternative metal, metalcore, modern rock, pi...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8721,COUCOU CHLOE,NOBODY,46,0.844,0.787,11,-7.044,1,0.1320,0.09410,0.000853,0.0841,0.540,110.985,7H06sYnrGsMbWzAR377J8s,spotify:track:7H06sYnrGsMbWzAR377J8s,https://api.spotify.com/v1/tracks/7H06sYnrGsMb...,https://api.spotify.com/v1/audio-analysis/7H06...,116757,4,NOBODY LyricsTake a bitch\nThat I have in one ...,Indie,"[deconstructed club, electra, escape room, flu...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
11930,beabadoobee,Worth It,51,0.576,0.751,2,-5.256,1,0.0269,0.01250,0.000459,0.0501,0.380,103.009,3r1qdSsvJEwiMKHeCWapMM,spotify:track:3r1qdSsvJEwiMKHeCWapMM,https://api.spotify.com/v1/tracks/3r1qdSsvJEwi...,https://api.spotify.com/v1/audio-analysis/3r1q...,194173,4,Worth It Lyrics[Verse 1]\nYour eyes are just l...,Alternative,"[bedroom pop, bubblegrunge, indie pop]",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
7945,Andrew Broder,Bloodrush,39,0.693,0.613,11,-10.160,0,0.2610,0.07980,0.000137,0.1080,0.294,130.113,5VrvHUYRdYRdBRcoB7MekM,spotify:track:5VrvHUYRdYRdBRcoB7MekM,https://api.spotify.com/v1/tracks/5VrvHUYRdYRd...,https://api.spotify.com/v1/audio-analysis/5Vrv...,203731,4,Bloodrush Lyrics[Intro: Denzel Curry]\nUgh\nUg...,Hip Hop,,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15504,Greta Van Fleet,Age Of Man,56,0.460,0.601,6,-5.486,0,0.0304,0.03770,0.715000,0.0954,0.247,136.912,54DIzLw4LLxB3n1XiiQftU,spotify:track:54DIzLw4LLxB3n1XiiQftU,https://api.spotify.com/v1/tracks/54DIzLw4LLxB...,https://api.spotify.com/v1/audio-analysis/54DI...,366187,4,Age of Man Lyrics[Intro]\nIn an age of darknes...,Rock,"[modern blues rock, modern hard rock, rock]",0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12579,ABSOLUTE.,Sage comme une image - Good as Gold,44,0.792,0.873,6,-8.622,0,0.3290,0.00402,0.000396,0.2500,0.672,124.996,2sjd0qvJ5t49KHIYueVBdO,spotify:track:2sjd0qvJ5t49KHIYueVBdO,https://api.spotify.com/v1/tracks/2sjd0qvJ5t49...,https://api.spotify.com/v1/audio-analysis/2sjd...,236165,4,Reflections on the Revolution in France Lyrics...,Alternative,,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
16468,Pepper,Warning (feat. Stick Figure),51,0.801,0.527,1,-7.191,0,0.1180,0.01920,0.118000,0.1310,0.520,76.022,29JOtyhGMv2gfikATHJlCF,spotify:track:29JOtyhGMv2gfikATHJlCF,https://api.spotify.com/v1/tracks/29JOtyhGMv2g...,https://api.spotify.com/v1/audio-analysis/29JO...,232160,4,The Uncommercial Traveller (Chap. 15) LyricsNu...,Rock,"[reggae fusion, reggae rock]",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5125,Kelvyn Boy,Tele,49,0.848,0.743,1,-7.095,0,0.0804,0.36200,0.001400,0.1020,0.706,100.036,7gdKO8t9G1qrAvrnQji5Wk,spotify:track:7gdKO8t9G1qrAvrnQji5Wk,https://api.spotify.com/v1/tracks/7gdKO8t9G1qr...,https://api.spotify.com/v1/audio-analysis/7gdK...,194400,4,Tele Lyrics[Intro: Kelvyn Boy]\nAnother (It's ...,Pop,"[ghanaian alternative, ghanaian pop]",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15805,Sheryl Crow,Home,38,0.485,0.409,0,-13.920,1,0.0278,0.24500,0.251000,0.0887,0.301,166.177,0VLxlACQp8DrbhLQ8QbFG3,spotify:track:0VLxlACQp8DrbhLQ8QbFG3,https://api.spotify.com/v1/tracks/0VLxlACQp8Dr...,https://api.spotify.com/v1/audio-analysis/0VLx...,290733,4,Home Lyrics[Verse 1]\nI woke up this mornin'\n...,Rock,"[lilith, mellow gold, neo mellow, new wave pop...",0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


# 4. Topic Classification EDA and Dataset Creation

In [43]:
topic_data = pd.read_csv('Original_Data/audio_topic_lyrics_genre.csv').iloc[:,1:]

In [44]:
topic_data.head()

Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,night/time,shake the audience,family/gospel,romantic,communication,obscene,music,movement/places,light/visual perceptions,family/spiritual,like/girls,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,0.000598,0.000598,0.048857,0.017104,0.263751,0.000598,0.039288,0.000598,0.000598,0.000598,0.000598,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.13711,sadness,1.0
1,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,0.001284,0.001284,0.027007,0.001284,0.001284,0.001284,0.118034,0.001284,0.212681,0.051124,0.001284,0.001284,0.001284,0.331745,0.64754,0.954819,2e-06,0.325021,0.26324,world/life,1.0
2,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...,24,0.00277,0.00277,0.00277,0.00277,0.00277,0.00277,0.158564,0.250668,0.00277,0.323794,0.00277,0.00277,0.00277,0.00277,0.00277,0.225422,0.456298,0.585288,0.840361,0.0,0.351814,0.139112,music,1.0
3,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,0.001548,0.0215,0.001548,0.411536,0.001548,0.001548,0.001548,0.12925,0.001548,0.001548,0.081132,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.77535,0.743736,romantic,1.0
4,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...,48,0.00135,0.00135,0.417772,0.00135,0.00135,0.00135,0.46343,0.00135,0.00135,0.00135,0.00135,0.00135,0.029755,0.00135,0.0688,0.00135,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375,romantic,1.0


In [45]:
display(topic_data.head(10).iloc[:,6:-8].sum(axis=1))
display(topic_data.head(10).iloc[:,6:-8].columns)
display(topic_data.iloc[:,6:-8].corr()**2)

0    0.935604
1    0.996149
2    0.991690
3    0.933036
4    0.995951
5    0.807859
6    0.970047
7    0.992481
8    0.955261
9    0.996641
dtype: float64

Index(['dating', 'violence', 'world/life', 'night/time', 'shake the audience',
       'family/gospel', 'romantic', 'communication', 'obscene', 'music',
       'movement/places', 'light/visual perceptions', 'family/spiritual',
       'like/girls', 'sadness', 'feelings'],
      dtype='object')

Unnamed: 0,dating,violence,world/life,night/time,shake the audience,family/gospel,romantic,communication,obscene,music,movement/places,light/visual perceptions,family/spiritual,like/girls,sadness,feelings
dating,1.0,0.011536,0.005429,0.001189881,0.000727,9.7e-05,0.001196118,0.003301,0.000422,8.5e-05,0.006082,0.007839679,0.004012,1.82276e-05,0.000166,0.0006631355
violence,0.011536,1.0,0.03489,0.01544016,0.000779,0.001125,0.02414528,0.005761,0.025507,0.025301,0.000337,2.825033e-06,0.005155,0.004143451,0.048889,0.006629689
world/life,0.005429,0.03489,1.0,0.01349478,0.002865,0.001561,0.005176362,0.004874,0.044691,0.016531,0.006443,0.0003054867,0.000529,0.001786619,0.025443,0.002849256
night/time,0.00119,0.01544,0.013495,1.0,7.7e-05,0.000183,0.001568769,0.001762,0.013642,0.004968,0.001828,0.002005502,0.003831,0.0003626005,0.010787,5.615182e-08
shake the audience,0.000727,0.000779,0.002865,7.710711e-05,1.0,2.4e-05,0.001520089,0.003077,0.005449,0.001819,0.001337,0.006174824,0.001233,1.149915e-05,0.005672,0.0001610883
family/gospel,9.7e-05,0.001125,0.001561,0.0001827232,2.4e-05,1.0,0.0001551645,0.003992,3.9e-05,0.000104,4.6e-05,0.004797888,1e-05,0.0004609233,0.002576,3.942031e-05
romantic,0.001196,0.024145,0.005176,0.001568769,0.00152,0.000155,1.0,0.000401,0.02515,0.001685,0.009782,9.788283e-07,0.003872,2.549555e-07,0.001992,0.000490588
communication,0.003301,0.005761,0.004874,0.001761852,0.003077,0.003992,0.000400976,1.0,0.006057,0.006291,0.021898,0.02375891,0.008271,0.004773966,3.5e-05,8.479153e-05
obscene,0.000422,0.025507,0.044691,0.01364234,0.005449,3.9e-05,0.02515038,0.006057,1.0,0.017906,0.002726,0.01775176,0.005855,9.032703e-05,0.070801,0.005691038
music,8.5e-05,0.025301,0.016531,0.004968497,0.001819,0.000104,0.001684922,0.006291,0.017906,1.0,0.000174,0.001094665,0.000323,0.0007177369,0.013178,0.002041422


#### New Topics
- Family/Religion = 'family/gospel' + 'family/spiritual'
- Music = 'music'
- Violence = 'violence'
- Explicit Content = 'obscene'
- Dating/Love = 'dating' + 'romantic' + 'like/girls' + 'communication'
- Emotion = 'sadness' + 'feelings'
- Travel/World = 'movement/places' + 'world/life'
- Time = 'night/time'
- Sensory Perception = 'light/visual perceptions'
- Energize Audience = 'shake the audience'
- Other = 1 - sum(above)

In [46]:
modified_topic_data = topic_data[['artist_name', 'track_name', 'release_date', 'genre','danceability', 'loudness',
       'acousticness', 'instrumentalness', 'valence', 'energy']].copy()
modified_topic_data['Topic: Family/Religion'] = topic_data['family/gospel'] + topic_data['family/spiritual']
modified_topic_data['Topic: Music'] = topic_data['music']
modified_topic_data['Topic: Violence'] = topic_data['violence']
modified_topic_data['Topic: Explicit Content'] = topic_data['obscene']
modified_topic_data['Topic: Dating/Love'] = topic_data['dating'] + topic_data['romantic'] + topic_data['like/girls'] + topic_data['communication']
modified_topic_data['Topic: Emotion'] = topic_data['sadness'] + topic_data['feelings']
modified_topic_data['Topic: Travel/World'] = topic_data['movement/places'] + topic_data['world/life']
modified_topic_data['Topic: Time'] = topic_data['night/time']
modified_topic_data['Topic: Sensory Perception'] = topic_data['light/visual perceptions']
modified_topic_data['Topic: Energize Audience'] = topic_data['shake the audience']
modified_topic_data['Topic: Other'] = 1 - modified_topic_data.loc[:,'Topic: Family/Religion':].sum(axis=1)

In [None]:
modified_topic_data['Lyrics'] = modified_topic_data.apply(lambda x:get_song_lyrics(x['track_name'],x['artist_name'],genius_api),axis=1)

In [None]:
modified_topic_data.iloc[:50].apply(lambda x:get_song_lyrics(x['track_name'],x['artist_name'],genius_api),axis=1)