In [222]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
import librosa.display   #for loading and visualizing audio files
import IPython.display as ipd   #to play audio

plt.style.use('ggplot')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline
import time 
import pickle

## Summary Statistics Extraction

Get summary statistics for each feature (mean, median, std, min, max, kurtosis, skew : 7 total)
- We want to calculate each statistic on'b' a total of 'a' times for a given feature array with shape : (a , b)
- For each track, store each statistic for a given feature in a dictionary (dictionary within dictionary)
- Will be saved as dictionaries as will need to use this later on for classification methods b and c


Inspecting a saved feature dictionary:

In [20]:
feat_dict_fp = open('/Volumes/Extreme SSD/CAPSTONE_DATA/saved/features_dictionaries/dict_feat_000139.pkl', "rb")
feat_dict_000139 = pickle.load(feat_dict_fp)
feat_dict_000139['melspec'].shape

(128, 1292)

Face issue for certain features like 'melspec', 'H', 'R'....as these have many rows > 100 ('a') need to reduce these rows down. Otherwise will have too many feature columns!

### Pre-processing function

In [17]:
#this functions allows to check divisibility of array and reshape by largest divisor possible between 2 - 10 inclusive
#this way can reduce dimensionality for when calculating summary statistics for arrays that have many rows
#will specify minimum number of rows required to use this reshaper and in turn reducing dim. of final summary stats

def reshape_array_custom(array, display_shape = False):
    div_list = []
    for n in range(2,11):
        if array.shape[0] % n == 0:
            div_list.append(n)
    if len(div_list) == 0:
        print('{} array not able to reshape'.format(array)) #will give error if not able to divide by numbers in range
    else:
        a = int(array.shape[0]/div_list[-1])
        b = int(array.shape[1]*div_list[-1])
        
        array_reshaped = np.reshape(array, (a,b))
        if display_shape:
            return array_reshaped, array_reshaped.shape
        else:
            return array_reshaped

Main SS extraction function:

In [18]:
from scipy.stats import skew, kurtosis

def get_statistics_dict(feat_dict, reduce_limit=50):  #function which gets respective statistics of an array and its statistic name
    track_id = feat_dict['track_id'] 
    feature_list = list(feat_dict_000139.keys())[1:]
    
    stat_feat_dictionary = {}
    
    for feat_key in feature_list:
        mean_list = []
        median_list = []
        std_list = []
        min_list = []
        max_list = []
        kurtosis_list = []
        skew_list = []
        
        feature = feat_dict[feat_key] #numpy array of feature 

        if (len(feature.shape) == 1) | (len(feature.shape) == 0) :  #covers 'yharm' as array is (a,) format and 'tempo' as it is only a number
            mean_list.append(np.mean(feature))
            median_list.append(np.median(feature))
            std_list.append(np.std(feature))
            min_list.append(np.min(feature))
            max_list.append(np.max(feature))
            kurtosis_list.append(kurtosis(feature))
            skew_list.append(skew(feature))
            
        else:
            if (feature.shape[0] >= reduce_limit) & (len(feature.shape)  == 2): #checking to see if no. of rows is > 50 and also given it has rows and columns (==2)
                feature =  reshape_array_custom(feature) #uses reshape function to reduce dimensiaonlity further down the line
            else:
                pass
            for i in range(0, feature.shape[0]):
                mean_list.append(np.mean(feature[i]))
                median_list.append(np.median(feature[i]))
                std_list.append(np.std(feature[i]))
                min_list.append(np.min(feature[i]))
                max_list.append(np.max(feature[i]))
                kurtosis_list.append(kurtosis(feature[i]))
                skew_list.append(skew(feature[i]))  
         
        #dictionary of lists for each feature
        items = [('mean', mean_list),('median', median_list),('std', std_list),('min', min_list),('max', max_list),
                ('kurtosis',kurtosis_list),('skew',skew_list)]
        
        stat_dictionary = {} 
        
        for key, value in items:           #filling stat_dictionary with items list
            stat_dictionary[key] = value
        
        stat_feat_dictionary[feat_key] = stat_dictionary  #feat_key is feature name i.e 'melspec'
    
    return stat_feat_dictionary

Check keys, sub keys and test on one feature dictionary :

In [17]:
print("feature keys:",get_statistics_dict(feat_dict_000139).keys())
print(" ")
print("statistic keys:",get_statistics_dict(feat_dict_000139)['yharm'].keys())

feature keys: dict_keys(['yharm', 'melspec', 'chroma_stft_S1', 'chroma_cens', 'mfcc', 'rms', 'spec_centroid', 'spec_bw', 'contrast', 'flatness', 'rolloff', 'poly', 'tonnetz', 'ZCR', 'tempo', 'H', 'P'])
 
statistic keys: dict_keys(['mean', 'median', 'std', 'min', 'max', 'kurtosis', 'skew'])


In [21]:
get_statistics_dict(feat_dict_000139)

{'yharm': {'mean': [-4.6489004e-06],
  'median': [-0.0033945264],
  'std': [0.10561557],
  'min': [-0.5848908],
  'max': [0.5880267],
  'kurtosis': [1.0268645105716043],
  'skew': [0.08389510214328766]},
 'melspec': {'mean': [29.244663,
   8.8465185,
   5.737589,
   4.053247,
   4.1738486,
   1.4565071,
   1.5830191,
   2.2931275,
   1.4136465,
   0.70607615,
   0.58230174,
   0.37390986,
   0.11022425,
   0.044468988,
   0.025862614,
   0.01614708],
  'median': [5.8609743,
   1.2600415,
   0.7698909,
   0.53783286,
   0.41226405,
   0.2190738,
   0.45480734,
   0.4850561,
   0.4260375,
   0.25516254,
   0.27047718,
   0.18845913,
   0.048006065,
   0.015879836,
   0.008875962,
   0.0042483336],
  'std': [67.33632,
   20.39787,
   14.753952,
   10.23804,
   12.564953,
   4.567591,
   3.2658617,
   5.246332,
   2.814441,
   1.1847454,
   0.9366098,
   0.52759546,
   0.19641934,
   0.10466671,
   0.057799146,
   0.04999411],
  'min': [1.7842575e-11,
   5.4207493e-11,
   4.0820906e-14,
  

**Estimating the number of final features we will have as the number of rows, 'a', in the feature arrays changes by varying the reduce_limit variable**

- No limit
- 129 limit
- 50 limit

as number of observations approaches number of features will overfit and not approximate well to real test set, observations ~ 16,000

In [68]:
#count of how many feature columns we will in total for when arrays.shape[0] >= 1000000
reduce_limit = 1000000
stat_dict = get_statistics_dict(feat_dict_000139, reduce_limit = reduce_limit) 
counter = 0
for i in list(stat_dict.keys()):
    for j in list(stat_dict[i].keys()):
        counter = counter + len(stat_dict[i][j])
print("reduction limit :",reduce_limit)
print("No. of feature columns : ",counter)  

reduction limit : 1000000
No. of feature columns :  15652


In [71]:
#count of how many feature columns we will in total for when arrays.shape[0] >= 129
reduce_limit = 129
stat_dict = get_statistics_dict(feat_dict_000139, reduce_limit = reduce_limit)
counter = 0
for i in list(stat_dict.keys()):
    for j in list(stat_dict[i].keys()):
        counter = counter + len(stat_dict[i][j])
print("reduction limit :",reduce_limit)       
print("No. of feature columns : ",counter) 

reduction limit : 129
No. of feature columns :  4172


In [72]:
#count of how many feature columns we will in total for when arrays.shape[0] >= 50
reduce_limit = 50
stat_dict = get_statistics_dict(feat_dict_000139, reduce_limit = reduce_limit)
counter = 0
for i in list(stat_dict.keys()):
    for j in list(stat_dict[i].keys()):
        counter = counter + len(stat_dict[i][j])
print("reduction limit :",reduce_limit)   
print("No. of feature columns : ",counter) 

reduction limit : 50
No. of feature columns :  3388


***EXTRACTING (0 - 50)***

Getting summary statistic feature dictionaries for all numpy arrays stored in feature_dictionaries*** (Do not run twice*)

In [342]:
feat_dict_0_50_fp = '/Volumes/Extreme SSD/CAPSTONE_DATA/saved/features_dictionaries'
feature_dict_0_50 = os.listdir(feat_dict_0_50_fp)

In [142]:
start_index = feature_dict_0_50.index(feature_dict_0_50[0])
stop_index = feature_dict_0_50.index(feature_dict_0_50[-1])

for i in range(start_index, stop_index + 1): #allows me to control and save partially and then continue another day from where i stopped
    feat_dict = feature_dict_0_50[i]
    track_id = feature_dict_0_50[i].split('_')[2].split('.')[0]     
    load_feat_from = open('/Volumes/Extreme SSD/CAPSTONE_DATA/saved/features_dictionaries/{}'.format(feat_dict), "rb")
    a_feat_dict = pickle.load(load_feat_from)
    
    ss_feat_dict = get_statistics_dict(a_feat_dict, reduce_limit = 50)
    
    save_ss_feat_dict_to = open('/Volumes/Extreme SSD/CAPSTONE_DATA/saved/ss_feature_dictionaries/dict_ss_feat_{}.pkl'.format(track_id), "wb")
    pickle.dump(ss_feat_dict, save_ss_feat_dict_to)
    save_ss_feat_dict_to.close()  

Checking total No. of ss feature dictionaries is consistent with total No. of feature dictionaries/tracks/ATS arrays (7,949)

In [150]:
ss_feat_dict_0_50_fp = '/Volumes/Extreme SSD/CAPSTONE_DATA/saved/ss_feature_dictionaries' #numpy arrays that were extracted
ss_feat_dict_0_50 = os.listdir(ss_feat_dict_0_50_fp)
print("No. of ss feature dictionaries collected:", len(ss_feat_dict_0_50))

No. of ss feature dictionaries collected: 7949


Check that dictionaries were saved correctly - try opening one

In [154]:
ss_feat_dict_0_50[2]

'dict_ss_feat_000141.pkl'

In [491]:
ss_feat_dict_000141_fp = open('/Volumes/Extreme SSD/CAPSTONE_DATA/saved/ss_feature_dictionaries/dict_ss_feat_000141.pkl', "rb")
ss_feat_dict_000141 = pickle.load(ss_feat_dict_000141_fp)
ss_feat_dict_000141

{'yharm': {'mean': [-5.0546873e-06],
  'median': [-0.00063150603],
  'std': [0.11217831],
  'min': [-0.7301745],
  'max': [0.7713583],
  'kurtosis': [3.7140214918663217],
  'skew': [-0.012445690110325813]},
 'melspec': {'mean': [29.872717,
   5.61408,
   15.33078,
   2.8461154,
   3.9046223,
   0.67912954,
   0.7289318,
   0.41905156,
   0.22696327,
   0.17613901,
   0.1380045,
   0.07299476,
   0.028343866,
   0.010564126,
   0.0076444666,
   0.007040733],
  'median': [1.4553485,
   0.7197008,
   0.35563093,
   0.09464314,
   0.06651504,
   0.030882044,
   0.03032779,
   0.019366384,
   0.010984183,
   0.010900689,
   0.006768495,
   0.002733102,
   0.0005842892,
   0.00014348314,
   0.00022197365,
   0.00033982168],
  'std': [127.47803,
   14.300507,
   53.796978,
   13.829424,
   16.214296,
   3.820685,
   2.46332,
   1.2406027,
   0.9335616,
   0.6489097,
   0.53329456,
   0.24570027,
   0.087113455,
   0.0361242,
   0.028177591,
   0.025324496],
  'min': [4.586849e-12,
   2.722912

In [498]:
len(ss_feat_dict_000141['melspec']['median'])

16

In [496]:
len(ss_feat_dict_000141['melspec']['min'])

16

***EXTRACTING (51 - 100)***

After adding extracted features from folders 51 - 100 need to get ss for these also. New starting index, start_index_51 is old stop index + 1. stop_index_51 is last index in list.

Need to sort list of files as they could have become mixed with older files from previous 0 - 50 folder dump

In [12]:
feature_dict_0_100_fp = '/Volumes/Extreme SSD/CAPSTONE_DATA/saved/features_dictionaries'
feature_dict_0_100 = sorted(os.listdir(feature_dict_0_100_fp))

In [22]:
#old list had total 7949 (new starting index is old index + 1, 7948 + 1)
start_index_51_100 = feature_dict_0_100.index(feature_dict_0_100[7949])
stop_index_51_100 =  feature_dict_0_100.index(feature_dict_0_100[-1])

for i in range(start_index_51_100, stop_index_51_100 + 1): #allows me to control and save partially and then continue another day from where i stopped
    feat_dict = feature_dict_0_100[i]
    track_id = feature_dict_0_100[i].split('_')[2].split('.')[0]     
    load_feat_from = open('/Volumes/Extreme SSD/CAPSTONE_DATA/saved/features_dictionaries/{}'.format(feat_dict), "rb")
    a_feat_dict = pickle.load(load_feat_from)
    
    ss_feat_dict = get_statistics_dict(a_feat_dict, reduce_limit = 50)
    
    save_ss_feat_dict_to = open('/Volumes/Extreme SSD/CAPSTONE_DATA/saved/ss_feature_dictionaries/dict_ss_feat_{}.pkl'.format(track_id), "wb")
    pickle.dump(ss_feat_dict, save_ss_feat_dict_to)
    save_ss_feat_dict_to.close()  

Checking new total No. of ss feature dictionaries is consistent with total No. of feature dictionaries/tracks/ATS arrays (7,949 + 8073 = 16022)

In [23]:
ss_feat_dict_0_100_fp = '/Volumes/Extreme SSD/CAPSTONE_DATA/saved/ss_feature_dictionaries' #numpy arrays that were extracted
ss_feat_dict_0_100 = os.listdir(ss_feat_dict_0_100_fp)
print("No. of ss feature dictionaries collected:", len(ss_feat_dict_0_100))

No. of ss feature dictionaries collected: 16022


## Collate Genre meta-data into dataframe (0 - 50)
1. Combine genres, sub genres with associated track_id
2. Add features for given track_id
3. preliminarly modelling

Loading track_id and folder dictionary for all tracks collected so far (folders 0 - 50)

In [29]:
dict_id_0_50_fp = open("/Volumes/Extreme SSD/CAPSTONE_DATA/saved/dictionaries/dict_ATS_0_50.pkl", "rb")
dict_id_0_50 = pickle.load(dict_id_0_50_fp)
df_genres_raw = pd.DataFrame.from_dict(dict_id_0_50)

In [30]:
df_genres_raw.head()

Unnamed: 0,track_id,folder
0,2,0
1,3,0
2,5,0
3,10,0
4,134,0


from genre meta-data csv load associated genre for each track_id

In [43]:
tracks = pd.read_csv('/Users/KarimChammaa/Desktop/GA/CAPSTONE_METADATA/fma_metadata/tracks.csv')
pd.set_option('display.max_columns', None)
tracks

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0.1,Unnamed: 0,album,album.1,album.2,album.3,album.4,album.5,album.6,album.7,album.8,album.9,album.10,album.11,album.12,artist,artist.1,artist.2,artist.3,artist.4,artist.5,artist.6,artist.7,artist.8,artist.9,artist.10,artist.11,artist.12,artist.13,artist.14,artist.15,artist.16,set,set.1,track,track.1,track.2,track.3,track.4,track.5,track.6,track.7,track.8,track.9,track.10,track.11,track.12,track.13,track.14,track.15,track.16,track.17,track.18,track.19
0,,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,tags,title,tracks,type,active_year_begin,active_year_end,associated_labels,bio,comments,date_created,favorites,id,latitude,location,longitude,members,name,related_projects,tags,website,wikipedia_page,split,subset,bit_rate,comments,composer,date_created,date_recorded,duration,favorites,genre_top,genres,genres_all,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
1,track_id,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,[],AWOL - A Way Of Life,7,Album,2006-01-01 00:00:00,,,"<p>A Way Of Life, A Collective of Hip-Hop from...",0,2008-11-26 01:42:32,9,1,40.0583238,New Jersey,-74.4056612,"Sajje Morocco,Brownbum,ZawidaGod,Custodian of ...",AWOL,The list of past projects is 2 long but every1...,['awol'],http://www.AzillionRecords.blogspot.com,,training,small,256000,0,,2008-11-26 01:48:12,2008-11-26 00:00:00,168,2,Hip-Hop,[21],[21],,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food
3,3,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,[],AWOL - A Way Of Life,7,Album,2006-01-01 00:00:00,,,"<p>A Way Of Life, A Collective of Hip-Hop from...",0,2008-11-26 01:42:32,9,1,40.0583238,New Jersey,-74.4056612,"Sajje Morocco,Brownbum,ZawidaGod,Custodian of ...",AWOL,The list of past projects is 2 long but every1...,['awol'],http://www.AzillionRecords.blogspot.com,,training,medium,256000,0,,2008-11-26 01:48:14,2008-11-26 00:00:00,237,1,Hip-Hop,[21],[21],,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
4,5,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,[],AWOL - A Way Of Life,7,Album,2006-01-01 00:00:00,,,"<p>A Way Of Life, A Collective of Hip-Hop from...",0,2008-11-26 01:42:32,9,1,40.0583238,New Jersey,-74.4056612,"Sajje Morocco,Brownbum,ZawidaGod,Custodian of ...",AWOL,The list of past projects is 2 long but every1...,['awol'],http://www.AzillionRecords.blogspot.com,,training,small,256000,0,,2008-11-26 01:48:20,2008-11-26 00:00:00,206,6,Hip-Hop,[21],[21],,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106571,155316,0,2017-03-30 15:20:35,2017-02-17 00:00:00,,0,22940,"<p>A live performance at Monty Hall on Feb 17,...",1506,Monty Hall,[],"Live at Monty Hall, 2/17/2017",6,Live Performance,,,,,0,2017-03-30 15:18:28,0,24357,,New Jersey,,GILLIAN/JENNA/DECLAN/JAIME,Spowder,,['spowder'],https://spowder.bandcamp.com/,,training,large,320000,0,,2017-03-30 15:23:34,,162,1,Rock,[25],"[25, 12]",,122,,Creative Commons Attribution-NonCommercial-NoD...,102,,3,,[],The Auger
106572,155317,0,2017-03-30 15:20:35,2017-02-17 00:00:00,,0,22940,"<p>A live performance at Monty Hall on Feb 17,...",1506,Monty Hall,[],"Live at Monty Hall, 2/17/2017",6,Live Performance,,,,,0,2017-03-30 15:18:28,0,24357,,New Jersey,,GILLIAN/JENNA/DECLAN/JAIME,Spowder,,['spowder'],https://spowder.bandcamp.com/,,training,large,320000,0,,2017-03-30 15:23:36,,217,1,Rock,[25],"[25, 12]",,194,,Creative Commons Attribution-NonCommercial-NoD...,165,,4,,[],Let's Skin Ruby
106573,155318,0,2017-03-30 15:20:35,2017-02-17 00:00:00,,0,22940,"<p>A live performance at Monty Hall on Feb 17,...",1506,Monty Hall,[],"Live at Monty Hall, 2/17/2017",6,Live Performance,,,,,0,2017-03-30 15:18:28,0,24357,,New Jersey,,GILLIAN/JENNA/DECLAN/JAIME,Spowder,,['spowder'],https://spowder.bandcamp.com/,,training,large,320000,0,,2017-03-30 15:23:37,,404,2,Rock,[25],"[25, 12]",,214,,Creative Commons Attribution-NonCommercial-NoD...,168,,6,,[],My House Smells Like Kim Deal/Pulp
106574,155319,0,2017-03-30 15:20:35,2017-02-17 00:00:00,,0,22940,"<p>A live performance at Monty Hall on Feb 17,...",1506,Monty Hall,[],"Live at Monty Hall, 2/17/2017",6,Live Performance,,,,,0,2017-03-30 15:18:28,0,24357,,New Jersey,,GILLIAN/JENNA/DECLAN/JAIME,Spowder,,['spowder'],https://spowder.bandcamp.com/,,training,large,320000,0,,2017-03-30 15:23:39,,146,0,Rock,[25],"[25, 12]",,336,,Creative Commons Attribution-NonCommercial-NoD...,294,,5,,[],The Man With Two Mouths


Have to convert 'Unnamed: 0' column to string as it has track_id entries in both string and int. Track_id im looking for is string type only in function below :

In [83]:
tracks['Unnamed: 0'] = tracks['Unnamed: 0'].astype(str)

In [96]:
tracks[['Unnamed: 0','album.10','track.7','track.8']].iloc[16402]

Unnamed: 0            26555
album.10          Suspenzia
track.7                 NaN
track.8       [15, 38, 514]
Name: 16402, dtype: object

In [88]:
def retrieve_genre(track_id, from_column):  #retrieves input from_column for a given track_id from the meta dataframe, tracks
    track_numb = int(track_id) #or String.TrimStart()
    try:
        return tracks[tracks['Unnamed: 0'] == str(int(track_id))][from_column].values[0]
    except:
        return 'None'

In [None]:
#Creating new columns for each genre columns found in meta data, 'track.7', 8, 9
df_genres_raw['genres_top'] = df_genres_raw.track_id.apply(lambda x : retrieve_genre(x, 'track.7'))
df_genres_raw['genres'] = df_genres_raw.track_id.apply(lambda x : retrieve_genre(x, 'track.8'))
df_genres_raw['genres_all'] = df_genres_raw.track_id.apply(lambda x : retrieve_genre(x, 'track.9'))

In [118]:
df_genres_raw

Unnamed: 0,track_id,folder,genres_top,genres,genres_all
0,000002,000,Hip-Hop,[21],[21]
1,000003,000,Hip-Hop,[21],[21]
2,000005,000,Hip-Hop,[21],[21]
3,000010,000,Pop,[10],[10]
4,000134,000,Hip-Hop,[21],[21]
...,...,...,...,...,...
7944,050975,050,Jazz,[4],[4]
7945,050988,050,Jazz,[4],[4]
7946,050989,050,Jazz,[4],[4]
7947,050990,050,Jazz,[4],[4]


In [119]:
df_genres_raw.isnull().value_counts()

track_id  folder  genres_top  genres  genres_all
False     False   False       False   False         7949
dtype: int64

In [124]:
print("genres_top 'None' values total :",sum(df_genres_raw.genres_top == 'None'))
print("genres 'None' values total :",sum(df_genres_raw.genres == 'None'))
print("genres_all 'None' values total :",sum(df_genres_raw.genres_all == 'None'))

genres_top 'None' values total : 0
genres 'None' values total : 0
genres_all 'None' values total : 0


No null values or values with 'None'

Save / loading df_genres_raw csv file:

In [4]:
'''df_genres_raw.to_csv('/Volumes/Extreme SSD/CAPSTONE_DATA/saved/dictionaries/df_genres_raw.csv')'''#saved csv
df_genres = pd.read_csv('/Volumes/Extreme SSD/CAPSTONE_DATA/saved/dictionaries/df_genres_raw.csv', dtype = {'track_id':str, 'folder':str}) #loading csv
df_genres.drop(axis =1,columns='Unnamed: 0',inplace=True)

In [5]:
df_genres

Unnamed: 0,track_id,folder,genres_top,genres,genres_all
0,000002,000,Hip-Hop,[21],[21]
1,000003,000,Hip-Hop,[21],[21]
2,000005,000,Hip-Hop,[21],[21]
3,000010,000,Pop,[10],[10]
4,000134,000,Hip-Hop,[21],[21]
...,...,...,...,...,...
7944,050975,050,Jazz,[4],[4]
7945,050988,050,Jazz,[4],[4]
7946,050989,050,Jazz,[4],[4]
7947,050990,050,Jazz,[4],[4]


Create function which extracts from each ss feature dictionary all features and places in dataframe as well as respective genre / style for that track (use track_id to match)

Combine each feature dictionary into df_genres_raw dataframe

In [6]:
ss_feat_dict_000139_fp = open('/Volumes/Extreme SSD/CAPSTONE_DATA/saved/ss_feature_dictionaries/dict_ss_feat_000139.pkl', "rb")
ss_feat_dict_000139 = pickle.load(ss_feat_dict_000139_fp)
ss_feat_dict_000139

{'yharm': {'mean': [-4.6489004e-06],
  'median': [-0.0033945264],
  'std': [0.10561557],
  'min': [-0.5848908],
  'max': [0.5880267],
  'kurtosis': [1.0268645105716043],
  'skew': [0.08389510214328766]},
 'melspec': {'mean': [29.244663,
   8.8465185,
   5.737589,
   4.053247,
   4.1738486,
   1.4565071,
   1.5830191,
   2.2931275,
   1.4136465,
   0.70607615,
   0.58230174,
   0.37390986,
   0.11022425,
   0.044468988,
   0.025862614,
   0.01614708],
  'median': [5.8609743,
   1.2600415,
   0.7698909,
   0.53783286,
   0.41226405,
   0.2190738,
   0.45480734,
   0.4850561,
   0.4260375,
   0.25516254,
   0.27047718,
   0.18845913,
   0.048006065,
   0.015879836,
   0.008875962,
   0.0042483336],
  'std': [67.33632,
   20.39787,
   14.753952,
   10.23804,
   12.564953,
   4.567591,
   3.2658617,
   5.246332,
   2.814441,
   1.1847454,
   0.9366098,
   0.52759546,
   0.19641934,
   0.10466671,
   0.057799146,
   0.04999411],
  'min': [1.7842575e-11,
   5.4207493e-11,
   4.0820906e-14,
  

Create empty dataframe from dictionary of feature values after flattening individual dictionaries

In [10]:
dict_X = ss_feat_dict_000139
ss_dict = {}
feat_list = ['yharm', 'melspec', 'chroma_stft_S1', 'chroma_cens', 'mfcc', 'rms', 'spec_centroid', 'spec_bw', 
             'contrast', 'flatness', 'rolloff', 'poly', 'tonnetz', 'ZCR', 'tempo', 'H', 'P']
stat_list = ['mean', 'median', 'std', 'min', 'max', 'kurtosis', 'skew']

ss_dict['track_id'] = 0
for feat in feat_list:
    for stat in stat_list:
        for i, value in enumerate(dict_X[feat][stat]):
            key = '{}_{}_{}'.format(feat,stat, i +1)
            ss_dict[key] = 0

In [12]:
#create empty dataframe from dictionary above
df_ss_feat = pd.DataFrame(columns = ss_dict)
df_ss_feat

Unnamed: 0,track_id,yharm_mean_1,yharm_median_1,yharm_std_1,yharm_min_1,yharm_max_1,yharm_kurtosis_1,yharm_skew_1,melspec_mean_1,melspec_mean_2,...,P_skew_196,P_skew_197,P_skew_198,P_skew_199,P_skew_200,P_skew_201,P_skew_202,P_skew_203,P_skew_204,P_skew_205


In [13]:
df_ss_feat.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Columns: 3389 entries, track_id to P_skew_205
dtypes: object(3389)
memory usage: 0.0+ bytes


Add new row to dataframe after updating dictionary in for loop

**Note :** To avoid mixing up rows and losing order when retrieving a given dictionary in ss_feature_dictionaries folder, i will only pick dictionaries based on track_id given in df_genres, hence preserving order relative to df_genres

'H' & 'P' have values with complex numbers i.e R + Cj, only take real part of these features with .real

In [15]:
total = 0
count = -1
for track_id in df_genres.track_id.values: #preserving order relative to df_genres track_id
    dict_X_fp = open('/Volumes/Extreme SSD/CAPSTONE_DATA/saved/ss_feature_dictionaries/dict_ss_feat_{}.pkl'.format(track_id), "rb")
    dict_X = pickle.load(dict_X_fp)
    ss_dict = {}
    ss_dict['track_id'] = track_id
    for feat in feat_list:
        for stat in stat_list:
            for i, value in enumerate(dict_X[feat][stat]):
                key = '{}_{}_{}'.format(feat,stat, i +1)
                ss_dict[key] = value.real  #only take real part of values as some features have complex numbers
                
    df_ss_feat = df_ss_feat.append(ss_dict, ignore_index=True)
    count = count + 1
    total = total + 1
    if count == 500:
        #print("track_id :",track_id)
        df_ss_feat.to_csv('/Volumes/Extreme SSD/CAPSTONE_DATA/saved/dictionaries/df_ss_feat.csv')
        count = 0
        print("Current no. of iterations :",count)
print("Final no. of iterations :",total)
df_ss_feat.to_csv('/Volumes/Extreme SSD/CAPSTONE_DATA/saved/dictionaries/df_ss_feat.csv')

track_id : 003572
track_id : 006368
track_id : 010279
track_id : 012406
track_id : 014862
track_id : 017644
track_id : 020374
track_id : 023516
track_id : 027148
track_id : 030507
track_id : 034254
track_id : 038521
track_id : 041802
track_id : 044748
track_id : 047882


In [60]:
df_ss_feat

Unnamed: 0,track_id,yharm_mean_1,yharm_median_1,yharm_std_1,yharm_min_1,yharm_max_1,yharm_kurtosis_1,yharm_skew_1,melspec_mean_1,melspec_mean_2,...,P_skew_196,P_skew_197,P_skew_198,P_skew_199,P_skew_200,P_skew_201,P_skew_202,P_skew_203,P_skew_204,P_skew_205
0,000002,1.719662e-04,0.001340,0.065617,-0.441674,0.490110,1.713664,-0.168763,52.794807,9.047201,...,-32.036869,-1.592035,-0.774322,-2.433592,-10.650544,114.682968,-10.176087,4.382184,-9.046790,-9.193595
1,000003,1.352439e-05,0.000432,0.077000,-0.456636,0.403431,1.760048,-0.132668,34.337761,7.709792,...,-16.850039,-0.620028,4.098160,-0.011083,24.893883,-1.973635,-24.022272,1.053429,16.086792,6.467587
2,000005,-1.006557e-04,0.000111,0.092693,-0.543117,0.513209,2.822287,-0.044496,80.876183,7.042160,...,78.939896,-21.763428,-3.254653,-93.607475,9.272054,-14.719774,14.525596,-9.384195,-4.520085,0.508466
3,000010,3.410103e-06,-0.002461,0.137815,-0.684655,0.805533,0.253197,0.127628,38.179165,17.256195,...,-9.478538,-21.217518,0.750832,17.683798,8.193560,5.263644,-16.086927,15.080028,-19.146044,3.932007
4,000134,-5.839608e-05,-0.001115,0.094457,-0.508414,0.490558,0.983781,0.087950,91.566048,6.390040,...,-285.603058,-35.680218,-8.016828,-26.696106,-3.300993,-7.459848,-14.930106,-4.171641,-0.412970,23.477673
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7952,050975,1.933872e-04,-0.000133,0.129812,-0.589347,0.549743,0.669336,0.041043,75.710068,2.339389,...,-7.193809,1.489245,-0.182940,-134.752701,10.910300,7.404191,-80.671669,6.300637,-7.662754,7.088624
7953,050988,2.681863e-07,0.000777,0.145474,-0.701874,0.789463,1.678167,-0.007445,91.458458,1.072967,...,-1.576484,0.675101,3.625332,6.056493,-4.312865,6.530749,-9.423316,5.822365,-7.953242,7.475033
7954,050989,-3.340889e-06,0.000727,0.067113,-0.484633,0.483251,1.903812,-0.123613,15.975612,4.637448,...,-4.987584,-2.886751,-0.522574,5.348998,32.115612,-41.931553,5.557606,-6.900227,2.986025,-20.631742
7955,050990,2.299570e-07,-0.000119,0.048076,-0.441218,0.413483,15.526840,-0.190782,12.147303,0.740133,...,8.662319,47.503395,3.676044,1.118666,0.113696,18.061014,11.878851,-6.704487,7.828660,-7.316547


Drop duplicates from 2 areas : 

- From over saving
- Have some audio files that correspond to the same track. For example, track_id: '033200' and '040370'are same file in different folders.

In [104]:
df_ss_feat.drop_duplicates(inplace= True)
df_ss_feat.reset_index(inplace= True)

In [106]:
df_ss_feat

Unnamed: 0,index,track_id,yharm_mean_1,yharm_median_1,yharm_std_1,yharm_min_1,yharm_max_1,yharm_kurtosis_1,yharm_skew_1,melspec_mean_1,...,P_skew_196,P_skew_197,P_skew_198,P_skew_199,P_skew_200,P_skew_201,P_skew_202,P_skew_203,P_skew_204,P_skew_205
0,0,000002,1.719662e-04,0.001340,0.065617,-0.441674,0.490110,1.713664,-0.168763,52.794807,...,-32.036869,-1.592035,-0.774322,-2.433592,-10.650544,114.682968,-10.176087,4.382184,-9.046790,-9.193595
1,1,000003,1.352439e-05,0.000432,0.077000,-0.456636,0.403431,1.760048,-0.132668,34.337761,...,-16.850039,-0.620028,4.098160,-0.011083,24.893883,-1.973635,-24.022272,1.053429,16.086792,6.467587
2,2,000005,-1.006557e-04,0.000111,0.092693,-0.543117,0.513209,2.822287,-0.044496,80.876183,...,78.939896,-21.763428,-3.254653,-93.607475,9.272054,-14.719774,14.525596,-9.384195,-4.520085,0.508466
3,3,000010,3.410103e-06,-0.002461,0.137815,-0.684655,0.805533,0.253197,0.127628,38.179165,...,-9.478538,-21.217518,0.750832,17.683798,8.193560,5.263644,-16.086927,15.080028,-19.146044,3.932007
4,4,000134,-5.839608e-05,-0.001115,0.094457,-0.508414,0.490558,0.983781,0.087950,91.566048,...,-285.603058,-35.680218,-8.016828,-26.696106,-3.300993,-7.459848,-14.930106,-4.171641,-0.412970,23.477673
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7944,7952,050975,1.933872e-04,-0.000133,0.129812,-0.589347,0.549743,0.669336,0.041043,75.710068,...,-7.193809,1.489245,-0.182940,-134.752701,10.910300,7.404191,-80.671669,6.300637,-7.662754,7.088624
7945,7953,050988,2.681863e-07,0.000777,0.145474,-0.701874,0.789463,1.678167,-0.007445,91.458458,...,-1.576484,0.675101,3.625332,6.056493,-4.312865,6.530749,-9.423316,5.822365,-7.953242,7.475033
7946,7954,050989,-3.340889e-06,0.000727,0.067113,-0.484633,0.483251,1.903812,-0.123613,15.975612,...,-4.987584,-2.886751,-0.522574,5.348998,32.115612,-41.931553,5.557606,-6.900227,2.986025,-20.631742
7947,7955,050990,2.299570e-07,-0.000119,0.048076,-0.441218,0.413483,15.526840,-0.190782,12.147303,...,8.662319,47.503395,3.676044,1.118666,0.113696,18.061014,11.878851,-6.704487,7.828660,-7.316547


Find 2nd form of duplicates by getting rid of track_id column as this would make each row unique and hide duplicates.

In [108]:
df_ss_feat_dup = df_ss_feat.drop(columns=['track_id','index'])
duplicates = df_ss_feat_dup.duplicated()
df_ss_feat_dup[duplicates]

Unnamed: 0,yharm_mean_1,yharm_median_1,yharm_std_1,yharm_min_1,yharm_max_1,yharm_kurtosis_1,yharm_skew_1,melspec_mean_1,melspec_mean_2,melspec_mean_3,...,P_skew_196,P_skew_197,P_skew_198,P_skew_199,P_skew_200,P_skew_201,P_skew_202,P_skew_203,P_skew_204,P_skew_205
6292,-5.098969e-06,0.012151,0.323956,-0.972242,0.985925,-0.440115,-0.049848,397.495361,47.913536,26.23464,...,-8.816051,273.466583,-28.71958,14.811497,-29.23468,6.912237,11.437916,2.883056,-22.962215,4.694305
6494,-2.457019e-06,0.000284,0.131137,-0.515469,0.472397,0.37268,-0.018325,87.17997,4.574124,1.651052,...,-102.208717,19.395346,11.835208,30.031336,-122.225296,6.271557,-7.775839,5.813003,-7.705883,7.473331
6502,-2.241971e-06,8.2e-05,0.157927,-0.672559,0.676193,0.12928,-0.007573,91.016167,34.236732,17.373453,...,51.999081,-4.585421,9.947951,6.633286,4.045849,-10.795014,-27.084635,-18.558674,-0.59366,-0.275077
7285,-0.5647093,-0.614704,0.221583,-0.955487,0.566226,0.412059,0.892261,273.516846,49.733101,32.836468,...,-7.815525,39.054955,-15.208184,3.858125,12.346665,-1.275226,-22.451639,-5.353716,-9.797816,4.83797
7301,-1.96517e-07,2.8e-05,0.066599,-0.571344,0.644098,6.941342,0.123282,0.776448,2.463841,5.321877,...,8.984597,5.589285,-2.606553,-4.598745,0.065438,-3.474626,-2.283206,-14.791294,-12.778749,-2.260308
7707,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7945,2.681863e-07,0.000777,0.145474,-0.701874,0.789463,1.678167,-0.007445,91.458458,1.072967,0.704753,...,-1.576484,0.675101,3.625332,6.056493,-4.312865,6.530749,-9.423316,5.822365,-7.953242,7.475033
7946,-3.340889e-06,0.000727,0.067113,-0.484633,0.483251,1.903812,-0.123613,15.975612,4.637448,2.380274,...,-4.987584,-2.886751,-0.522574,5.348998,32.115612,-41.931553,5.557606,-6.900227,2.986025,-20.631742
7947,2.29957e-07,-0.000119,0.048076,-0.441218,0.413483,15.52684,-0.190782,12.147303,0.740133,0.242292,...,8.662319,47.503395,3.676044,1.118666,0.113696,18.061014,11.878851,-6.704487,7.82866,-7.316547
7948,0.0001933872,-0.000133,0.129812,-0.589347,0.549743,0.669336,0.041043,75.710068,2.339389,0.170948,...,-7.193809,1.489245,-0.18294,-134.752701,10.9103,7.404191,-80.671669,6.300637,-7.662754,7.088624


Get index of duplicates and drop these index's in original df_ss_feat dataframe

In [109]:
index_dup = df_ss_feat_dup[duplicates].index
index_dup

Int64Index([6292, 6494, 6502, 7285, 7301, 7707, 7945, 7946, 7947, 7948], dtype='int64')

Making sure these indexes are correct, see that both table match

In [115]:
df_ss_feat.iloc[list(index_dup),:]

Unnamed: 0,index,track_id,yharm_mean_1,yharm_median_1,yharm_std_1,yharm_min_1,yharm_max_1,yharm_kurtosis_1,yharm_skew_1,melspec_mean_1,...,P_skew_196,P_skew_197,P_skew_198,P_skew_199,P_skew_200,P_skew_201,P_skew_202,P_skew_203,P_skew_204,P_skew_205
6292,6300,40370,-5.098969e-06,0.012151,0.323956,-0.972242,0.985925,-0.440115,-0.049848,397.495361,...,-8.816051,273.466583,-28.71958,14.811497,-29.23468,6.912237,11.437916,2.883056,-22.962215,4.694305
6494,6502,41752,-2.457019e-06,0.000284,0.131137,-0.515469,0.472397,0.37268,-0.018325,87.17997,...,-102.208717,19.395346,11.835208,30.031336,-122.225296,6.271557,-7.775839,5.813003,-7.705883,7.473331
6502,6510,41809,-2.241971e-06,8.2e-05,0.157927,-0.672559,0.676193,0.12928,-0.007573,91.016167,...,51.999081,-4.585421,9.947951,6.633286,4.045849,-10.795014,-27.084635,-18.558674,-0.59366,-0.275077
7285,7293,46514,-0.5647093,-0.614704,0.221583,-0.955487,0.566226,0.412059,0.892261,273.516846,...,-7.815525,39.054955,-15.208184,3.858125,12.346665,-1.275226,-22.451639,-5.353716,-9.797816,4.83797
7301,7309,46648,-1.96517e-07,2.8e-05,0.066599,-0.571344,0.644098,6.941342,0.123282,0.776448,...,8.984597,5.589285,-2.606553,-4.598745,0.065438,-3.474626,-2.283206,-14.791294,-12.778749,-2.260308
7707,7715,48949,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7945,7953,50988,2.681863e-07,0.000777,0.145474,-0.701874,0.789463,1.678167,-0.007445,91.458458,...,-1.576484,0.675101,3.625332,6.056493,-4.312865,6.530749,-9.423316,5.822365,-7.953242,7.475033
7946,7954,50989,-3.340889e-06,0.000727,0.067113,-0.484633,0.483251,1.903812,-0.123613,15.975612,...,-4.987584,-2.886751,-0.522574,5.348998,32.115612,-41.931553,5.557606,-6.900227,2.986025,-20.631742
7947,7955,50990,2.29957e-07,-0.000119,0.048076,-0.441218,0.413483,15.52684,-0.190782,12.147303,...,8.662319,47.503395,3.676044,1.118666,0.113696,18.061014,11.878851,-6.704487,7.82866,-7.316547
7948,7956,50993,0.0001933872,-0.000133,0.129812,-0.589347,0.549743,0.669336,0.041043,75.710068,...,-7.193809,1.489245,-0.18294,-134.752701,10.9103,7.404191,-80.671669,6.300637,-7.662754,7.088624


In [120]:
df_ss_feat.drop(index = list(index_dup), columns = 'index', inplace= True)
df_ss_feat.reset_index(inplace=True)
df_ss_feat.drop(columns = 'index', inplace=True)

In [136]:
df_ss_feat

Unnamed: 0,track_id,yharm_mean_1,yharm_median_1,yharm_std_1,yharm_min_1,yharm_max_1,yharm_kurtosis_1,yharm_skew_1,melspec_mean_1,melspec_mean_2,...,P_skew_196,P_skew_197,P_skew_198,P_skew_199,P_skew_200,P_skew_201,P_skew_202,P_skew_203,P_skew_204,P_skew_205
0,000002,1.719662e-04,0.001340,0.065617,-0.441674,0.490110,1.713664,-0.168763,52.794807,9.047201,...,-32.036869,-1.592035,-0.774322,-2.433592,-10.650544,114.682968,-10.176087,4.382184,-9.046790,-9.193595
1,000003,1.352439e-05,0.000432,0.077000,-0.456636,0.403431,1.760048,-0.132668,34.337761,7.709792,...,-16.850039,-0.620028,4.098160,-0.011083,24.893883,-1.973635,-24.022272,1.053429,16.086792,6.467587
2,000005,-1.006557e-04,0.000111,0.092693,-0.543117,0.513209,2.822287,-0.044496,80.876183,7.042160,...,78.939896,-21.763428,-3.254653,-93.607475,9.272054,-14.719774,14.525596,-9.384195,-4.520085,0.508466
3,000010,3.410103e-06,-0.002461,0.137815,-0.684655,0.805533,0.253197,0.127628,38.179165,17.256195,...,-9.478538,-21.217518,0.750832,17.683798,8.193560,5.263644,-16.086927,15.080028,-19.146044,3.932007
4,000134,-5.839608e-05,-0.001115,0.094457,-0.508414,0.490558,0.983781,0.087950,91.566048,6.390040,...,-285.603058,-35.680218,-8.016828,-26.696106,-3.300993,-7.459848,-14.930106,-4.171641,-0.412970,23.477673
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7934,050958,3.007578e-04,0.000181,0.066795,-0.323913,0.342199,0.602974,0.020923,1.225565,7.489033,...,1.032571,-123.603004,-11.316278,-7.615706,8.117843,-7.173779,7.829058,-5.985820,7.685109,-7.216407
7935,050970,2.681863e-07,0.000777,0.145474,-0.701874,0.789463,1.678167,-0.007445,91.458458,1.072967,...,-1.576484,0.675101,3.625332,6.056493,-4.312865,6.530749,-9.423316,5.822365,-7.953242,7.475033
7936,050971,-3.340889e-06,0.000727,0.067113,-0.484633,0.483251,1.903812,-0.123613,15.975612,4.637448,...,-4.987584,-2.886751,-0.522574,5.348998,32.115612,-41.931553,5.557606,-6.900227,2.986025,-20.631742
7937,050972,2.299570e-07,-0.000119,0.048076,-0.441218,0.413483,15.526840,-0.190782,12.147303,0.740133,...,8.662319,47.503395,3.676044,1.118666,0.113696,18.061014,11.878851,-6.704487,7.828660,-7.316547


No NaN values :

In [130]:
sum(df_ss_feat.isna().sum())

0

Save cleaned dataframe 

In [139]:
'''df_ss_feat.to_csv('/Volumes/Extreme SSD/CAPSTONE_DATA/saved/dictionaries/df_ss_feat_clean.csv')'''

In dictionaries folder changed file names :
- df_ss_feat --> df_ss_feat_0_50

- df_ss_feat_clean --> df_ss_feat_clean_0_50

In [140]:
df_ss_clean_0_50 = pd.read_csv('/Volumes/Extreme SSD/CAPSTONE_DATA/saved/dictionaries/df_ss_feat_clean_0_50.csv', dtype = {'track_id':str}, ) #loading csv
df_ss_clean_0_50.drop(axis =1,columns='Unnamed: 0',inplace=True)

Need to load df_genres and drop duplicates of corresponding track_id's from above duplicate list found.

In [154]:
dup_list_id = ['040370','041752','041809','046514','046648','048949','050988','050989','050990','050993']
df_genres_clean_0_50 = df_genres[~df_genres.track_id.isin(dup_list_id)]

In [None]:
df_genres_clean_0_50.reset_index(inplace=True, drop =True)

Save df_genres after clean to new dictionary and load again

In [162]:
'''df_genres_clean_0_50.to_csv('/Volumes/Extreme SSD/CAPSTONE_DATA/saved/dictionaries/df_genres_clean_0_50.csv')'''
df_genres_clean_0_50 = pd.read_csv('/Volumes/Extreme SSD/CAPSTONE_DATA/saved/dictionaries/df_genres_clean_0_50.csv',dtype = {'track_id':str, 'folder':str}) #loading csv
df_genres_clean_0_50.drop(axis =1,columns='Unnamed: 0',inplace=True)

In [163]:
df_genres_clean_0_50

Unnamed: 0,track_id,folder,genres_top,genres,genres_all
0,000002,000,Hip-Hop,[21],[21]
1,000003,000,Hip-Hop,[21],[21]
2,000005,000,Hip-Hop,[21],[21]
3,000010,000,Pop,[10],[10]
4,000134,000,Hip-Hop,[21],[21]
...,...,...,...,...,...
7934,050958,050,Classical,[659],"[659, 5]"
7935,050970,050,Jazz,[4],[4]
7936,050971,050,Jazz,[4],[4]
7937,050972,050,Jazz,[4],[4]


Combine genres dataframe with ss features :

In [169]:
df_classifier1_0_50 = pd.merge(df_genres_clean_0_50, df_ss_clean_0_50, on = 'track_id')

In [29]:
'''df_classifier1_0_50.to_csv('/Volumes/Extreme SSD/CAPSTONE_DATA/saved/final_dfs/df_classifier1_0_50.csv')'''
df_classifier1_0_50 = pd.read_csv('/Volumes/Extreme SSD/CAPSTONE_DATA/saved/final_dfs/df_classifier1_0_50.csv',dtype = {'track_id':str, 'folder':str}) #loading csv
df_classifier1_0_50.drop(axis = 1, columns='Unnamed: 0',inplace =True)

Repeated same extraction/conversion procedure as in this section for tracks in folders 51 - 100, found in notebook "collate_genre_meta_into_df_51_100".

In [30]:
df_classifier1_51_100 = pd.read_csv('/Volumes/Extreme SSD/CAPSTONE_DATA/saved/final_dfs/df_classifier1_51_100.csv',dtype = {'track_id':str, 'folder':str}) #loading csv
df_classifier1_51_100.drop(axis = 1, columns='Unnamed: 0',inplace =True)

### Combine 0 - 50 with 51 - 100 for final dataframe

Combine both full dataframes from into complete one covering tracks from folders 0 - 100. Have to combine row-wise.

In [38]:
df_classifier1_0_100 = pd.concat([df_classifier1_0_50, df_classifier1_51_100], axis=0)
df_classifier1_0_100.reset_index(inplace=True, drop =True)

In [527]:
'''df_classifier1_0_100.to_csv('/Volumes/Extreme SSD/CAPSTONE_DATA/saved/final_dfs/df_classifier1_0_100.csv')'''
df_classifier1_0_100 = pd.read_csv('/Volumes/Extreme SSD/CAPSTONE_DATA/saved/final_dfs/df_classifier1_0_100.csv',dtype = {'track_id':str, 'folder':str}) #loading csv
df_classifier1_0_100.drop(axis = 1, columns='Unnamed: 0',inplace =True)