## Data Cleaning and Preprocessing Notebook

This notebook is to be strictly used for data cleaning and preprocessing purposes. Steps:

1. Read the dataset
2. Handle Missing Values (if any).
3. Do visualizations as required
4. Explore your data here
5. Save the cleaned and processed dataset as `data/final_dataset.csv`.
6. Split the dataset obtained in step 5 as `input/train.csv`,`input/test.csv`,`input/validation.csv`

NO MODELLING WILL BE DONE IN THIS NOTEBOOK!

In [1]:
from scipy.io import wavfile
from scipy.signal import spectrogram
import numpy as np
import librosa.display
import pandas as pd
import matplotlib.pyplot as plt
from librosa.feature import mfcc
import librosa
import soundfile as sf

In [2]:
PATH='../data/genres_original/'

In [3]:
samplerate, data=wavfile.read("../data/genres_original/classical/0Gef573AJfARbMuQSoCy2r.wav")

In [4]:
data=np.mean(data,axis=1)

In [5]:
def convert_to_mfcc(filepath):
    samplerate,data=wavfile.read(filepath)
    if len(data.shape)==2:
        data=np.mean(data,axis=1)
    start=data.shape[0]//3
    stop=start+data.shape[0]//3
    data=data[start:stop]
    features=mfcc(y=data.astype("float"),n_mfcc=39).T
    return np.mean(features,axis=0)

In [6]:
import os

In [7]:
classes=os.listdir(PATH)

In [8]:
classes

['blues',
 'classical',
 'country',
 'disco',
 'hiphop',
 'metal',
 'pop',
 'reggae',
 'rock']

In [9]:
song_files=[]
mfcc_songs=[]
for c in classes:
    for song in os.listdir(os.path.join(PATH,c)):
        song_path=os.path.join(PATH,c,song)
        print(song)
        song_files.append([song_path,c])
        mfcc_songs.append(convert_to_mfcc(song_path))

09PAOMUiAjZho9HVkd5umo.wav
0Q4ubt0dvVtMcAYwAul0Nl.wav
0QpxUzXEI3ZjJeo6dDFgV7.wav
0t0CvqbxqwtETTajrzqKoW.wav
0v6tG0HoAHiluZAeany7oU.wav
0yBzzceMXi6VMMXGJDqY9A.wav
18PhtDBBYxE0vN4YvsZnMy.wav
1I32D5O8mjfQEglezN4bY0.wav
1i7QbG73o4liXhmbtaT7Z3.wav
1k6zIr7FmCDWBlEGMjkwLb.wav
1MIDCw2NTW3Dtn9c6ohSC8.wav
1nUtE7LNXLGFEJk8kG8gE6.wav
1oG6p5Ue73whPJODUWTuCe.wav
1OGh5aeiGiNvuD37LiNN99.wav
1qHFxjvmKpmwZUcXckLkYc.wav
1ska3YnfMLiOJ6YH7EpZa9.wav
1tsOB58QHINgc2FEJylsLP.wav
2EoSICrErU4WI3QSSu61Xf.wav
2h8nq99nTZQk9H4B63QjVn.wav
2jgHuKeHNfhtYEHwIuc0iJ.wav
2jXrwWE32uQJBi50xCvE9c.wav
2lWTmgOTlemZ0RJyKKhr0s.wav
2NdRldi1OaTHDdTOnTS1Z8.wav
2O7OjNCjqflzWOTVg9UlEm.wav
2Qm98Rw4rdfvo14ZOUiZTN.wav
2T6QJ7Ax9evXGxvGXBC8le.wav
2uSArwKGIy7RKMqIwNyEiN.wav
2wUDhs2MkJGHWFXHZtNmMQ.wav
2z34AUAlDL5ZVjZpCNqwD1.wav
3bhUhdJjutn7jjvkS5jHXH.wav
3BkMv30DKcUEOsRbDSAyIq.wav
3DbVBS8JojEGfuLOZRGgWX.wav
3DjwjAUPT4zvIu6ZzcgLGJ.wav
3gGKOVwsAVvwt9BcH3k18J.wav
3JPC78sNlPaiZ1NgztXW2Q.wav
3LGB6BkiSmzP8IHo03rZMo.wav
3lnavfgHUTrxdRqcPmhqUA.wav
3

In [10]:
mfcc_features=pd.DataFrame(mfcc_songs)

In [11]:
mfcc_features=mfcc_features.add_prefix("mean_mfcc_")

In [12]:
songs=pd.DataFrame(song_files,columns=["filepath","genre"])

In [13]:
dataset=pd.concat([mfcc_features,songs],axis=1)

In [14]:
dataset

Unnamed: 0,mean_mfcc_0,mean_mfcc_1,mean_mfcc_2,mean_mfcc_3,mean_mfcc_4,mean_mfcc_5,mean_mfcc_6,mean_mfcc_7,mean_mfcc_8,mean_mfcc_9,...,mean_mfcc_31,mean_mfcc_32,mean_mfcc_33,mean_mfcc_34,mean_mfcc_35,mean_mfcc_36,mean_mfcc_37,mean_mfcc_38,filepath,genre
0,389.830269,101.913146,-8.081283,38.818610,4.422686,17.671487,-0.791387,9.969514,4.886704,9.506469,...,2.253134,1.840256,5.401648,3.582529,7.066961,3.518919,3.568450,3.295176,../data/genres_original/blues\09PAOMUiAjZho9HV...,blues
1,276.542613,103.286028,-4.768063,34.023069,23.166427,-0.232400,2.411217,6.421958,-7.599663,-1.183824,...,11.157916,12.374295,10.484138,5.069859,4.025534,5.503153,7.086537,6.732503,../data/genres_original/blues\0Q4ubt0dvVtMcAYw...,blues
2,439.404157,105.610466,-2.597989,15.388756,14.379118,5.831333,1.971954,11.624723,9.441901,4.223720,...,-0.329869,2.210618,1.633187,-0.709041,2.715509,0.700981,3.634254,-1.194212,../data/genres_original/blues\0QpxUzXEI3ZjJeo6...,blues
3,388.838896,88.701932,-41.845872,48.042033,-1.511994,1.681485,6.309696,14.849037,7.332121,6.590197,...,-0.601358,3.284882,8.577535,10.382903,8.964642,7.338554,4.508686,-0.097310,../data/genres_original/blues\0t0CvqbxqwtETTaj...,blues
4,472.693570,93.625061,4.083838,39.666360,9.628457,14.310981,6.329387,4.515982,-1.258290,1.438771,...,0.241276,-2.258065,1.896969,4.441195,2.100871,2.090461,0.006407,1.208450,../data/genres_original/blues\0v6tG0HoAHiluZAe...,blues
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2675,858.461476,114.960290,-23.636703,57.113928,-8.779920,22.074693,-12.710622,19.354571,-21.565763,16.465147,...,-8.831534,-3.913821,-2.118554,4.254585,1.009141,-0.339230,-2.538879,-2.532717,../data/genres_original/rock\rock.00095.wav,rock
2676,904.530696,115.640272,-36.586899,49.134857,-9.279729,23.542973,-19.786624,20.690265,-24.077642,12.975183,...,3.839082,5.002131,-4.517260,-6.193667,-4.948907,-1.482094,-3.990042,-1.710012,../data/genres_original/rock\rock.00096.wav,rock
2677,895.492693,110.319734,-44.931821,51.391753,-16.046002,26.264275,-11.170195,19.737735,-18.653527,9.122975,...,-6.195041,-1.452225,0.561126,-0.253360,-4.183094,0.426335,-3.223836,-5.073747,../data/genres_original/rock\rock.00097.wav,rock
2678,858.691183,124.388497,-25.243616,57.232746,8.244684,23.372433,-12.558707,18.986485,-12.093466,16.685246,...,-1.853567,-1.177599,-0.939998,1.709381,-0.775198,-0.604280,-4.401733,-2.993642,../data/genres_original/rock\rock.00098.wav,rock


In [16]:
dataset.to_csv("../data/Song_Data_MFCC.csv",index=False)