# Final Dataset Creation


In [1]:
import pandas as pd
from tqdm import tqdm
from ast import literal_eval
import numpy as np
import random

random.seed(10)

The main aim of this notebook is to create the final dataset by merging both audio and lyric i.e. theme features.

In [2]:
audioFeatures = pd.read_csv('tracksAudioFeatures/tracksAudioFeatures_complete.csv')
lyricFeatures = pd.read_csv('tracksLyricFeatures/tracksLyricThemeVectorized.csv')



In [3]:
#Droping extra columns before merging
audioFeatures = audioFeatures.drop(['Unnamed: 0'], axis=1)
lyricFeatures = lyricFeatures.drop(['Unnamed: 0','track', 'trackArtist', 'genre', 'lyrics', 'top_lang_identified',
       'top_lang_identified_prob', 'regex_cleaned_lyrics', 'tokenized_lyrics',
       'tokenized_lyrics_no_stop', 'tokenized_lyrics_top5_tokens',
       'theme_token_1', 'theme_token_2', 'theme_token_3', 'theme_token_4',
       'theme_token_5'], axis=1)


In [4]:
lyricFeatures.head(3)


Unnamed: 0,id,theme_token_1_vec,theme_token_2_vec,theme_token_3_vec,theme_token_4_vec,theme_token_5_vec
0,127936,[-0.39857802 -0.7149435 0.19870338 -2.165562...,[ 0.54379386 -0.78404784 2.5348065 -1.979363...,[-1.9840902 0.6029517 -1.0292802 0.835790...,[-2.85954309e+00 1.30950487e+00 -2.08809757e+...,[-2.55218768e+00 8.12104702e-01 9.09771979e-...
1,52632,[-0.7222719 -1.1852437 1.3600969 -1.510474...,[ 9.34379041e-01 -1.20123565e+00 -3.79777968e-...,[-0.90332586 -1.1629791 -0.06437027 -1.990405...,[ 1.18106747e+00 -1.30691707e-01 3.56504321e-...,[-1.2248039 0.6531294 -0.6717697 0.065949...
2,99309,[-2.36634517e+00 1.00017858e+00 -6.03607595e-...,[-0.6021649 -2.528206 1.8324572 1.667492...,[-1.6888599e+00 -2.9139459e+00 -6.6858500e-01 ...,[-0.6582515 -0.3084944 1.4986044 -0.150477...,[ 1.9839271 2.0191925 -1.479551 -0.654519...


In [5]:
audioFeatures.head(3)

Unnamed: 0,id,genre,mfccFeature_0,mfccFeature_1,mfccFeature_2,mfccFeature_3,mfccFeature_4,mfccFeature_5,mfccFeature_6,mfccFeature_7,mfccFeature_8,mfccFeature_9,mfccFeature_10,mfccFeature_11,mfccFeature_12,mfccFeature_13,mfccFeature_14
0,124423_0,Old-Time / Historic,"[-328.7624, -206.09636, -150.4038, -144.09038,...","[132.91347, 150.61551, 144.43718, 140.92017, 1...","[-90.37503, -126.661026, -140.2472, -138.8731,...","[-63.457577, -50.04628, -51.89651, -52.70818, ...","[-38.716564, -46.91884, -50.58444, -47.508934,...","[-56.29294, -62.14529, -53.36077, -51.67916, -...","[-17.247395, -34.123608, -40.161243, -42.32673...","[-3.6903424, -5.2592907, -1.0339475, -0.248538...","[-16.171959, -23.949322, -23.311352, -21.07062...","[-1.7130346, -6.9061756, -2.5145044, 4.5913672...","[-17.067202, -17.211754, -16.577923, -9.204768...","[-27.971401, -30.455574, -20.908623, -16.16786...","[8.152201, 8.3445635, 1.8432193, -5.7282104, -...","[10.642556, 9.837347, 18.859747, 17.205154, 8....","[-4.3580694, -3.3247807, 6.962681, 11.387974, ..."
1,124423_1,Old-Time / Historic,"[-236.76901, -203.01874, -200.4282, -208.92003...","[164.211, 159.14294, 136.04929, 136.82301, 156...","[-85.3822, -117.465, -142.91583, -143.71155, -...","[-32.252678, -37.033913, -54.37635, -57.882896...","[-7.4214034, -20.556595, -37.773483, -39.49316...","[-31.144537, -44.259975, -59.811573, -61.17159...","[-14.987923, -23.720203, -43.693214, -45.93542...","[4.4418592, 3.9385574, -10.675606, -14.657623,...","[-12.005516, -25.275543, -41.375755, -39.71647...","[-5.6790867, 1.2063706, -4.721801, -8.164688, ...","[-6.2902756, -3.91095, -4.4361215, -4.327603, ...","[-5.6098385, -7.864852, -7.2575703, -2.4607427...","[7.336014, 14.364861, 14.558414, 20.445679, 16...","[2.09269, -0.17045055, -0.978363, 0.28908974, ...","[-4.1928773, -3.4265738, -3.9279053, -5.465062..."
2,124423_2,Old-Time / Historic,"[-390.48816, -339.31622, -317.2405, -326.96646...","[97.122345, 115.60747, 127.4955, 128.93234, 14...","[-84.24397, -109.15258, -110.69359, -99.36203,...","[-40.43964, -44.553093, -47.186836, -41.452816...","[0.28208655, -1.2433429, -6.5626597, -0.983691...","[-36.504112, -43.72944, -42.5904, -35.0524, -4...","[-22.552574, -24.396482, -21.35398, -15.982605...","[1.2889054, 0.24470624, -1.0263059, 3.6228871,...","[-9.089966, -12.901368, -15.339406, -10.585323...","[-9.621838, -6.8267345, -4.4633904, -3.513043,...","[-6.904684, -6.3937654, -4.0830145, -0.6454271...","[-2.2965891, -8.502451, -8.448065, -3.9730012,...","[11.558136, 10.830755, 6.7897396, 8.929026, 13...","[5.913314, 5.5654097, 1.5172951, 0.10546613, 2...","[-6.5406847, -3.1975913, -1.6860085, -6.242577..."


In [6]:
print(type(audioFeatures['mfccFeature_0'].iloc[0]))
print(len(audioFeatures['mfccFeature_0'].iloc[0]))
print(len(audioFeatures['mfccFeature_0'].iloc[0].split(',')))

<class 'str'>
2940
250


Since a song was divided into multiple parts ,we also duplicate the theme features for the same number of times.

In [7]:

extendedLyricFeatures = pd.DataFrame()

for index, row in lyricFeatures.iterrows():
    for i in range(0,5):
        df = pd.DataFrame()
        newId = str(row['id'])+'_'+str(i)
        df['id'] = [newId]
        df['themeFeature_0'] = row['theme_token_1_vec']
        df['themeFeature_1'] = row['theme_token_2_vec']
        df['themeFeature_2'] = row['theme_token_3_vec']
        df['themeFeature_3'] = row['theme_token_4_vec']
        df['themeFeature_4'] = row['theme_token_5_vec']
        extendedLyricFeatures = pd.concat([extendedLyricFeatures,df])
    
        
        
    

In [8]:
extendedLyricFeatures

Unnamed: 0,id,themeFeature_0,themeFeature_1,themeFeature_2,themeFeature_3,themeFeature_4
0,127936_0,[-0.39857802 -0.7149435 0.19870338 -2.165562...,[ 0.54379386 -0.78404784 2.5348065 -1.979363...,[-1.9840902 0.6029517 -1.0292802 0.835790...,[-2.85954309e+00 1.30950487e+00 -2.08809757e+...,[-2.55218768e+00 8.12104702e-01 9.09771979e-...
0,127936_1,[-0.39857802 -0.7149435 0.19870338 -2.165562...,[ 0.54379386 -0.78404784 2.5348065 -1.979363...,[-1.9840902 0.6029517 -1.0292802 0.835790...,[-2.85954309e+00 1.30950487e+00 -2.08809757e+...,[-2.55218768e+00 8.12104702e-01 9.09771979e-...
0,127936_2,[-0.39857802 -0.7149435 0.19870338 -2.165562...,[ 0.54379386 -0.78404784 2.5348065 -1.979363...,[-1.9840902 0.6029517 -1.0292802 0.835790...,[-2.85954309e+00 1.30950487e+00 -2.08809757e+...,[-2.55218768e+00 8.12104702e-01 9.09771979e-...
0,127936_3,[-0.39857802 -0.7149435 0.19870338 -2.165562...,[ 0.54379386 -0.78404784 2.5348065 -1.979363...,[-1.9840902 0.6029517 -1.0292802 0.835790...,[-2.85954309e+00 1.30950487e+00 -2.08809757e+...,[-2.55218768e+00 8.12104702e-01 9.09771979e-...
0,127936_4,[-0.39857802 -0.7149435 0.19870338 -2.165562...,[ 0.54379386 -0.78404784 2.5348065 -1.979363...,[-1.9840902 0.6029517 -1.0292802 0.835790...,[-2.85954309e+00 1.30950487e+00 -2.08809757e+...,[-2.55218768e+00 8.12104702e-01 9.09771979e-...
...,...,...,...,...,...,...
0,147735_0,[-1.90860951e+00 1.57677662e+00 -3.88008380e+...,[-0.93748844 -0.25881132 -1.1592864 -2.300666...,[-1.9301409 -1.756477 3.294717 -0.267101...,[-1.9625174 0.54614407 1.1446604 -2.486829...,[-2.4221561e+00 -2.4010549e-01 3.0238932e-01 ...
0,147735_1,[-1.90860951e+00 1.57677662e+00 -3.88008380e+...,[-0.93748844 -0.25881132 -1.1592864 -2.300666...,[-1.9301409 -1.756477 3.294717 -0.267101...,[-1.9625174 0.54614407 1.1446604 -2.486829...,[-2.4221561e+00 -2.4010549e-01 3.0238932e-01 ...
0,147735_2,[-1.90860951e+00 1.57677662e+00 -3.88008380e+...,[-0.93748844 -0.25881132 -1.1592864 -2.300666...,[-1.9301409 -1.756477 3.294717 -0.267101...,[-1.9625174 0.54614407 1.1446604 -2.486829...,[-2.4221561e+00 -2.4010549e-01 3.0238932e-01 ...
0,147735_3,[-1.90860951e+00 1.57677662e+00 -3.88008380e+...,[-0.93748844 -0.25881132 -1.1592864 -2.300666...,[-1.9301409 -1.756477 3.294717 -0.267101...,[-1.9625174 0.54614407 1.1446604 -2.486829...,[-2.4221561e+00 -2.4010549e-01 3.0238932e-01 ...


We finally merge both the datasets and check the value counts

In [9]:
musicFeatures = pd.merge(audioFeatures, extendedLyricFeatures)

In [10]:
musicFeatures.shape

(35665, 22)

In [11]:
musicFeatures['genre'].value_counts()

Rock            9550
Electronic      7980
Hip-Hop         4130
Experimental    4035
Folk            4030
Pop             3035
Instrumental    2905
Name: genre, dtype: int64

Since the classes are imbalanced, o we cut both Rock and Electronic by half.

In [12]:
rock_indexes = musicFeatures[musicFeatures['genre'] == 'Rock'].sample(frac=0.5).index
musicFeatures = musicFeatures.drop(rock_indexes)

elec_indexes = musicFeatures[musicFeatures['genre'] == 'Electronic'].sample(frac=0.5).index
musicFeatures = musicFeatures.drop(elec_indexes)


In [13]:
musicFeatures['genre'].value_counts()

Rock            4775
Hip-Hop         4130
Experimental    4035
Folk            4030
Electronic      3990
Pop             3035
Instrumental    2905
Name: genre, dtype: int64

Since we technically have a 3 dimensional csv file, we wanted to flatten it so we convert both types of features into same format where rows represents the number of songs, each column represent the feaures and the array inside represents the actual values of the features. We suse the follwing representations to do the same.

In [14]:
def convertRepresentationSimple(x):
    return np.array(literal_eval(x))

def convertRepresentationNewline(x):
    x = ','.join(x.replace('[ ', '[').split())
    return np.array(literal_eval(x))



In [15]:

cols = ['mfccFeature_0', 'mfccFeature_1', 'mfccFeature_2',
       'mfccFeature_3', 'mfccFeature_4', 'mfccFeature_5', 'mfccFeature_6',
       'mfccFeature_7', 'mfccFeature_8', 'mfccFeature_9', 'mfccFeature_10',
       'mfccFeature_11', 'mfccFeature_12', 'mfccFeature_13', 'mfccFeature_14',]

for i in tqdm(cols):
    musicFeatures[i] = musicFeatures[i].apply(convertRepresentationSimple)
    
cols2 =  ['themeFeature_0', 'themeFeature_1', 'themeFeature_2', 'themeFeature_3',
       'themeFeature_4']

for i in tqdm(cols2):
    musicFeatures[i] = musicFeatures[i].apply(convertRepresentationNewline)






100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [03:57<00:00, 15.84s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:21<00:00, 16.36s/it]


In [16]:
musicFeatures.shape

(26900, 22)

Final verification of the dataset.

In [17]:
"""
Verifying all shapres and types of data
"""
print(musicFeatures.shape)
print('MFCC Features \n')
print(type(musicFeatures['mfccFeature_0'].iloc[0]))
print(len(musicFeatures['mfccFeature_0'].iloc[0]))
print(type(musicFeatures['mfccFeature_0'].iloc[0][0]))

print('Theme Features \n')
print(type(musicFeatures['themeFeature_0'].iloc[0]))
print(musicFeatures['themeFeature_0'].iloc[0].size)
print(type(musicFeatures['themeFeature_0'].iloc[0][0]))



(26900, 22)
MFCC Features 

<class 'numpy.ndarray'>
250
<class 'numpy.float64'>
Theme Features 

<class 'numpy.ndarray'>
250
<class 'numpy.float64'>


In [18]:
#Saving as pickle so that we dont lose numpy arrays
musicFeatures.to_pickle("./finalDataset/musicFeatures.pkl") 


In [19]:
#Saving as csv file causes arrays to be represented as string, when reading use a converter
#https://stackoverflow.com/questions/42755214/how-to-keep-numpy-array-when-saving-pandas-dataframe-to-csv
musicFeatures.to_csv("./finalDataset/musicFeatures.csv") 

In [20]:
#Saving as Json for good measure too
musicFeatures.to_json("./finalDataset/musicFeatures.json")



The dataset is ready for model creation