# Data Cleaning

## Numeric Feature Data

In [1]:
# Imports
import pandas as pd

In [2]:
# Loading the per-file feature table from CSV into a DataFrame
genre = pd.read_csv(filepath_or_buffer='../data/genre.csv')

In [3]:
# Previewing the first five rows of the freshly loaded table
genre.head(n=5)

Unnamed: 0,files,zero_crossing_rate,spectral_centroid,spectral_rolloff,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,labels
0,<DirEntry 'pop.00027.wav'>,0.101867,2682.251387,5955.912452,-75.31012,83.352333,10.692239,14.464298,14.244169,7.392612,5.275318,-0.394537,5.729705,3.368037,-2.524067,-4.216263,-1.839258,<DirEntry 'pop
1,<DirEntry 'pop.00033.wav'>,0.13701,2835.103815,5944.979313,-68.928207,73.759949,-12.472088,9.549262,-0.037814,12.72626,3.162044,-0.072349,2.534841,2.882886,-1.671194,-3.015886,1.618294,<DirEntry 'pop
2,<DirEntry 'pop.00032.wav'>,0.228641,4225.533439,8483.474172,-113.112213,41.596046,21.518061,11.100333,17.437881,0.610943,3.028697,0.086027,3.453746,0.289805,0.620056,1.260636,2.900202,<DirEntry 'pop
3,<DirEntry 'pop.00026.wav'>,0.103494,2525.419447,5552.776916,-73.322212,97.488853,16.8584,-0.587017,7.188521,4.874156,0.759747,-5.546975,1.001608,4.935908,-5.822525,-4.91801,-0.025517,<DirEntry 'pop
4,<DirEntry 'pop.00030.wav'>,0.148679,3070.608038,6653.027004,-18.115849,72.282204,-4.89048,-0.378657,1.685867,-0.92505,-1.330334,-0.343852,0.651997,-0.186307,-1.225008,-1.049567,2.526758,<DirEntry 'pop


### Creating Labels

In [4]:
# Fixing the file names and labels
# The raw values carry a DirEntry wrapper, e.g. "<DirEntry 'pop.00027.wav'>"
# in `files` and a truncated "<DirEntry 'pop" in `labels`.
# The wrapper is removed with anchored patterns rather than fixed slice
# offsets so that re-running this cell on already-cleaned values is a no-op
# instead of chopping real characters off the names.
dir_entry_prefix = r"^<DirEntry '"
dir_entry_suffix = r"'>$"

genre['files'] = (
    genre['files']
    .str.replace(dir_entry_prefix, '', regex=True)
    .str.replace(dir_entry_suffix, '', regex=True)
)
# `labels` has no closing "'>" in the raw data, so only the prefix is removed.
genre['labels'] = genre['labels'].str.replace(dir_entry_prefix, '', regex=True)

In [5]:
# Mapping the labels to numeric values
# label_map is reused further down for the mel spectrogram data, so the
# genre -> integer assignment must stay the same for both tables.
label_map = {
    'blues': 1,
    'classical': 2,
    'country': 3,
    'disco': 4,
    'hiphop': 5,
    'jazz': 6,
    'metal': 7,
    'pop': 8,
    'reggae': 9,
    'rock': 10
}

# Series.map yields NaN for any value that is not a key of label_map, which
# would silently turn `y` into a float column with gaps. Check the result
# before storing it so a bad or missing label fails loudly here.
genre_y = genre['labels'].map(label_map)
genre_unmapped = genre_y.isna()
if genre_unmapped.any():
    unknown_genre_labels = sorted(
        genre.loc[genre_unmapped, 'labels'].astype(str).unique()
    )
    raise ValueError(
        'Labels missing from label_map: {}'.format(unknown_genre_labels)
    )

genre['y'] = genre_y

In [6]:
# Previewing the first five rows again, now with cleaned names and the y column
genre.head(n=5)

Unnamed: 0,files,zero_crossing_rate,spectral_centroid,spectral_rolloff,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,labels,y
0,pop.00027.wav,0.101867,2682.251387,5955.912452,-75.31012,83.352333,10.692239,14.464298,14.244169,7.392612,5.275318,-0.394537,5.729705,3.368037,-2.524067,-4.216263,-1.839258,pop,8
1,pop.00033.wav,0.13701,2835.103815,5944.979313,-68.928207,73.759949,-12.472088,9.549262,-0.037814,12.72626,3.162044,-0.072349,2.534841,2.882886,-1.671194,-3.015886,1.618294,pop,8
2,pop.00032.wav,0.228641,4225.533439,8483.474172,-113.112213,41.596046,21.518061,11.100333,17.437881,0.610943,3.028697,0.086027,3.453746,0.289805,0.620056,1.260636,2.900202,pop,8
3,pop.00026.wav,0.103494,2525.419447,5552.776916,-73.322212,97.488853,16.8584,-0.587017,7.188521,4.874156,0.759747,-5.546975,1.001608,4.935908,-5.822525,-4.91801,-0.025517,pop,8
4,pop.00030.wav,0.148679,3070.608038,6653.027004,-18.115849,72.282204,-4.89048,-0.378657,1.685867,-0.92505,-1.330334,-0.343852,0.651997,-0.186307,-1.225008,-1.049567,2.526758,pop,8


#### Export

In [7]:
# Writing the cleaned feature table to disk without the DataFrame index.
# NOTE(review): the 'Unnamed: 0' column read from the input file is still a
# regular column at this point, so it is written out too; confirm that
# downstream readers expect it.
genre.to_csv(path_or_buf='../data/genre_clean.csv', index=False)

## Mel Spectrogram Data

In [8]:
# Loading the flattened mel spectrogram table from CSV into a DataFrame
mel_specs = pd.read_csv(filepath_or_buffer='../data/genre_mel_specs.csv')

In [9]:
# Previewing the first five rows of the mel spectrogram table
mel_specs.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,84471,84472,84473,84474,84475,84476,84477,84478,84479,84480
0,-29.408966,-48.99612,-35.890785,-38.473793,-45.027336,-42.14228,-38.156357,-42.31077,-42.887623,-47.695614,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,pop
1,-28.04128,-25.964296,-38.926926,-34.85074,-23.611973,-20.352016,-20.710598,-18.608648,-18.044193,-20.077085,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,pop
2,-31.275974,-21.357664,-22.34517,-24.183,-24.496088,-23.788536,-22.786224,-23.287695,-23.489752,-22.041656,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,pop
3,-34.49039,-57.283905,-66.37612,-47.502922,-46.439133,-66.58425,-66.201706,-58.24589,-63.198303,-79.21695,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,pop
4,-19.389597,-24.738457,-26.721848,-26.782904,-28.27762,-29.624008,-26.266392,-20.711044,-20.775738,-20.707817,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,pop


### Creating Labels

In [10]:
# Renaming the label column and mapping them to numeric values using the same map as above
# The genre name is stored in the column named '84480' in the raw file.
mel_specs = mel_specs.rename(columns={'84480': 'labels'})

# Series.map yields NaN for any value that is not a key of label_map; check
# the result before storing it so an unexpected label fails loudly here
# instead of leaving gaps in `y`.
mel_y = mel_specs['labels'].map(label_map)
mel_unmapped = mel_y.isna()
if mel_unmapped.any():
    unknown_mel_labels = sorted(
        mel_specs.loc[mel_unmapped, 'labels'].astype(str).unique()
    )
    raise ValueError(
        'Labels missing from label_map: {}'.format(unknown_mel_labels)
    )

mel_specs['y'] = mel_y

#### Export

In [11]:
# Writing the labelled mel spectrogram table to disk without the DataFrame index.
# NOTE(review): the 'Unnamed: 0' column read from the input file is written
# out as well; confirm that downstream readers do not treat it as a feature.
mel_specs.to_csv(path_or_buf='../data/genre_mel_specs_clean.csv', index=False)