In [40]:
import pandas as pd
import numpy as np

import os
import re
from pathlib import Path
from tqdm.auto import tqdm
import shutil

RANDOM_SEED = 90210

In [41]:
!ls *.csv

class_labels_indices.csv  eval_segments_no_header.csv
eval_segments.csv	  eval_segments_tmp.csv


In [42]:
df = pd.read_csv('class_labels_indices.csv', index_col='index')
df.head()

Unnamed: 0_level_0,mid,display_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,/m/09x0r,Speech
1,/m/05zppz,"Male speech, man speaking"
2,/m/02zsn,"Female speech, woman speaking"
3,/m/0ytgt,"Child speech, kid speaking"
4,/m/01h8n0,Conversation


In [43]:
mid_lookup = {}
for r in df.itertuples():
    mid_lookup[r.mid] = r.display_name
    
mid_lookup

{'/m/09x0r': 'Speech',
 '/m/05zppz': 'Male speech, man speaking',
 '/m/02zsn': 'Female speech, woman speaking',
 '/m/0ytgt': 'Child speech, kid speaking',
 '/m/01h8n0': 'Conversation',
 '/m/02qldy': 'Narration, monologue',
 '/m/0261r1': 'Babbling',
 '/m/0brhx': 'Speech synthesizer',
 '/m/07p6fty': 'Shout',
 '/m/07q4ntr': 'Bellow',
 '/m/07rwj3x': 'Whoop',
 '/m/07sr1lc': 'Yell',
 '/m/04gy_2': 'Battle cry',
 '/t/dd00135': 'Children shouting',
 '/m/03qc9zr': 'Screaming',
 '/m/02rtxlg': 'Whispering',
 '/m/01j3sz': 'Laughter',
 '/t/dd00001': 'Baby laughter',
 '/m/07r660_': 'Giggle',
 '/m/07s04w4': 'Snicker',
 '/m/07sq110': 'Belly laugh',
 '/m/07rgt08': 'Chuckle, chortle',
 '/m/0463cq4': 'Crying, sobbing',
 '/t/dd00002': 'Baby cry, infant cry',
 '/m/07qz6j3': 'Whimper',
 '/m/07qw_06': 'Wail, moan',
 '/m/07plz5l': 'Sigh',
 '/m/015lz1': 'Singing',
 '/m/0l14jd': 'Choir',
 '/m/01swy6': 'Yodeling',
 '/m/02bk07': 'Chant',
 '/m/01c194': 'Mantra',
 '/t/dd00003': 'Male singing',
 '/t/dd00004': 'Female

In [44]:
df_all = pd.read_csv('eval_segments.csv', skiprows=3, sep=',\s+', names=['YTID', 'start_seconds', 'end_seconds', 'positive_labels'], header=None, engine='python').replace('"','', regex=True)
# df_all = pd.read_csv('balanced_train_segments.csv', skiprows=3, sep=',\s+', names=['YTID', 'start_seconds', 'end_seconds', 'positive_labels'], header=None, engine='python').replace('"','', regex=True)
df_all.head()


Unnamed: 0,YTID,start_seconds,end_seconds,positive_labels
0,--4gqARaEJE,0.0,10.0,"/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk"
1,--BfvyPmVMo,20.0,30.0,/m/03l9g
2,--U7joUcTCo,0.0,10.0,/m/01b_21
3,--i-y1v8Hy8,0.0,9.0,"/m/04rlf,/m/09x0r,/t/dd00004,/t/dd00005"
4,-0BIyqJj9ZU,30.0,40.0,"/m/07rgt08,/m/07sq110,/t/dd00001"


# Keep only existing files (Map to mat file)

In [45]:
ls /datasets/spiking/Sound2spks/result_mats/*kuihSZXsIJ4*.mat

/datasets/spiking/Sound2spks/result_mats/16232_kuihSZXsIJ4.mat


In [46]:
files = []
sizes = []
missing = 0
lst_exists = []
for ix in tqdm(df_all.YTID):
    exists = False
#     print(ix)
    rootdir = "/datasets/spiking/Sound2spks/result_mats/"
#     rootdir = "/datasets/spiking/Sound2spks/wav_data/"
    for path in Path(rootdir).glob(f'*_{ix}*.mat'):
#         print(path)
        exists = True
        files.append(path.name)
        sizes.append(path.stat().st_size/1024/1024) #Mbs
        break

    if not exists:
#         print(ix, end=',')
        missing = missing+1
    
    lst_exists.append(exists)

# print()
print(missing)
print(df_all.shape)

HBox(children=(IntProgress(value=0, max=20371), HTML(value='')))


7026
(20371, 4)


In [47]:
path.stat().st_size/1024/1024

42.671417236328125

In [48]:
df_all = df_all[lst_exists]
df_all['file'] = files
df_all['size'] = sizes
df_all.shape

(13345, 6)

# Prepare for binary classification

In [49]:
df_all.head()

Unnamed: 0,YTID,start_seconds,end_seconds,positive_labels,file,size
0,--4gqARaEJE,0.0,10.0,"/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk",0_--4gqARaEJE.mat,47.410202
1,--BfvyPmVMo,20.0,30.0,/m/03l9g,1_--BfvyPmVMo.mat,42.671417
2,--U7joUcTCo,0.0,10.0,/m/01b_21,2_--U7joUcTCo.mat,42.671417
3,--i-y1v8Hy8,0.0,9.0,"/m/04rlf,/m/09x0r,/t/dd00004,/t/dd00005",3_--i-y1v8Hy8.mat,37.951828
4,-0BIyqJj9ZU,30.0,40.0,"/m/07rgt08,/m/07sq110,/t/dd00001",4_-0BIyqJj9ZU.mat,47.410202


In [50]:
# https://stackoverflow.com/a/39946744/1640414
def tidy_split(df, column, sep='|', keep=False):
    """
    Split the values of a column and expand so the new DataFrame has one split
    value per row. Filters rows where the column is missing.

    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        whether to retain the presplit value as it's own row

    Returns
    -------
    pandas.DataFrame
        Returns a dataframe with the same columns as `df`.
    """
    indexes = list()
    new_values = list()
    df = df.dropna(subset=[column])
    for i, presplit in enumerate(df[column].astype(str)):
        values = presplit.split(sep)
        if keep and len(values) > 1:
            indexes.append(i)
            new_values.append(presplit)
        for value in values:
            indexes.append(i)
            new_values.append(value)
    new_df = df.iloc[indexes, :].copy()
    new_df[column] = new_values
    return new_df

df_exploded = tidy_split(df_all, 'positive_labels', sep=',')
df_exploded.rename(columns={'positive_labels': 'mid'}, inplace=True)
df_exploded['annotation'] = df_exploded["mid"].replace(mid_lookup, inplace=False)
df_exploded.head()

Unnamed: 0,YTID,start_seconds,end_seconds,mid,file,size,annotation
0,--4gqARaEJE,0.0,10.0,/m/068hy,0_--4gqARaEJE.mat,47.410202,"Domestic animals, pets"
0,--4gqARaEJE,0.0,10.0,/m/07q6cd_,0_--4gqARaEJE.mat,47.410202,Squeak
0,--4gqARaEJE,0.0,10.0,/m/0bt9lr,0_--4gqARaEJE.mat,47.410202,Dog
0,--4gqARaEJE,0.0,10.0,/m/0jbk,0_--4gqARaEJE.mat,47.410202,Animal
1,--BfvyPmVMo,20.0,30.0,/m/03l9g,1_--BfvyPmVMo.mat,42.671417,Hammer


In [51]:
df_exploded['annotation'].value_counts()

Music                                      3559
Speech                                     3417
Vehicle                                     507
Animal                                      444
Inside, small room                          379
Musical instrument                          270
Singing                                     228
Domestic animals, pets                      222
Water                                       201
Guitar                                      194
Car                                         187
Percussion                                  177
Wind instrument, woodwind instrument        173
Boat, Water vehicle                         171
Plucked string instrument                   166
Bird                                        157
Fowl                                        153
Siren                                       146
Brass instrument                            145
Outside, urban or manmade                   139
Dog                                     

In [52]:
# df_binary = df_exploded[df_exploded['annotation'].isin(['Musical instrument'])]
df_binary = df_exploded[df_exploded['annotation'].isin(['Water'])]
df_binary.iloc[:10, :]

Unnamed: 0,YTID,start_seconds,end_seconds,mid,file,size,annotation
16,-1EXhfqLLwQ,150.0,160.0,/m/0838f,16_-1EXhfqLLwQ.mat,47.410202,Water
100,-DSNfC2EJhU,20.0,30.0,/m/0838f,100_-DSNfC2EJhU.mat,42.671417,Water
117,-Gr9P8Abnlk,110.0,120.0,/m/0838f,117_-Gr9P8Abnlk.mat,42.671417,Water
146,-NRx0SBMjo0,24.0,34.0,/m/0838f,146_-NRx0SBMjo0.mat,42.671417,Water
187,-UTdhK0lwuw,30.0,40.0,/m/0838f,187_-UTdhK0lwuw.mat,47.410202,Water
190,-V9U5F14AVY,40.0,50.0,/m/0838f,190_-V9U5F14AVY.mat,47.410202,Water
390,-zgGL2o1jqw,80.0,90.0,/m/0838f,390_-zgGL2o1jqw.mat,42.671417,Water
611,0W1sNicXboU,60.0,70.0,/m/0838f,611_0W1sNicXboU.mat,47.410202,Water
796,1009ux1xbkg,0.0,10.0,/m/0838f,796_1009ux1xbkg.mat,47.410202,Water
839,16vy6Lz8tkY,310.0,320.0,/m/0838f,839_16vy6Lz8tkY.mat,42.671417,Water


In [33]:
!paplay "/datasets/spiking/Sound2spks/wav_data/796_1009ux1xbkg.wav"

In [53]:
CLASS0 = 'Water'
CLASS1 = 'Musical instrument'

In [54]:
df_binary = df_exploded[df_exploded['annotation'].isin(['Water', 'Musical instrument'])]
df_binary.head()

Unnamed: 0,YTID,start_seconds,end_seconds,mid,file,size,annotation
16,-1EXhfqLLwQ,150.0,160.0,/m/0838f,16_-1EXhfqLLwQ.mat,47.410202,Water
37,-3Kv4fdm7Uk,30.0,40.0,/m/04szw,37_-3Kv4fdm7Uk.mat,42.671417,Musical instrument
69,-8cgbhIR_pw,30.0,40.0,/m/04szw,69_-8cgbhIR_pw.mat,42.671417,Musical instrument
100,-DSNfC2EJhU,20.0,30.0,/m/0838f,100_-DSNfC2EJhU.mat,42.671417,Water
117,-Gr9P8Abnlk,110.0,120.0,/m/0838f,117_-Gr9P8Abnlk.mat,42.671417,Water


In [55]:
df_binary.shape

(471, 7)

In [56]:
df_binary['YTID'].nunique()

471

In [57]:
df_binary = df_binary.drop_duplicates(subset=['YTID'], keep=False, inplace=False) #drop dups
df_binary.shape

(471, 7)

In [58]:
ser = df_binary['annotation'].value_counts()
ser = ser.sort_values()
total_small = ser.values[0]
ser

Water                 201
Musical instrument    270
Name: annotation, dtype: int64

# Subsample according to small class

In [59]:
df_binary = pd.concat([df_binary[df_binary['annotation'] == CLASS0].sample(total_small, random_state=RANDOM_SEED), df_binary[df_binary['annotation'] == CLASS1].sample(total_small, random_state=RANDOM_SEED)])
df_binary['annotation'].value_counts()

Musical instrument    201
Water                 201
Name: annotation, dtype: int64

In [60]:
df_binary['label'] = df_binary['annotation'].map({CLASS0: 0, CLASS1: 1})
df_binary.head()

Unnamed: 0,YTID,start_seconds,end_seconds,mid,file,size,annotation,label
3657,9PdzaYwgQBE,30.0,40.0,/m/0838f,3657_9PdzaYwgQBE.mat,42.671417,Water,0
187,-UTdhK0lwuw,30.0,40.0,/m/0838f,187_-UTdhK0lwuw.mat,47.410202,Water,0
7497,Jlc4DETobHE,50.0,60.0,/m/0838f,7497_Jlc4DETobHE.mat,47.410202,Water,0
8582,N7MQjLZYJKc,10.0,20.0,/m/0838f,8582_N7MQjLZYJKc.mat,47.410202,Water,0
16571,m8JVAsG1vp0,40.0,50.0,/m/0838f,16571_m8JVAsG1vp0.mat,47.410202,Water,0


In [61]:
df_binary.set_index('YTID', drop=True, inplace=True)
df_binary.head()

Unnamed: 0_level_0,start_seconds,end_seconds,mid,file,size,annotation,label
YTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9PdzaYwgQBE,30.0,40.0,/m/0838f,3657_9PdzaYwgQBE.mat,42.671417,Water,0
-UTdhK0lwuw,30.0,40.0,/m/0838f,187_-UTdhK0lwuw.mat,47.410202,Water,0
Jlc4DETobHE,50.0,60.0,/m/0838f,7497_Jlc4DETobHE.mat,47.410202,Water,0
N7MQjLZYJKc,10.0,20.0,/m/0838f,8582_N7MQjLZYJKc.mat,47.410202,Water,0
m8JVAsG1vp0,40.0,50.0,/m/0838f,16571_m8JVAsG1vp0.mat,47.410202,Water,0


# Store only existing files dataset

In [62]:
# df_binary.to_csv('balanced_dataset_for_tempotron.v1.csv', index_label='YTID')
# df_binary.to_csv('/datasets/spiking/Sound2spks/tempotron/eval_dataset_for_tempotron.v1.csv', index_label='YTID')
df_binary.to_csv('/datasets/spiking/Sound2spks/tempotron/eval_dataset_for_tempotron.v2.csv', index_label='YTID')

In [63]:
df_binary['annotation'].value_counts()

Musical instrument    201
Water                 201
Name: annotation, dtype: int64

In [64]:
df_binary['size'].sum()/1024 #Gb

17.163724526762962

# Copy to another dir

In [66]:
src_dir = "/datasets/spiking/Sound2spks/result_mats/"
dest_dir = "/datasets/spiking/Sound2spks/tempotron/raw/"
with tqdm(total=df_binary.shape[0]) as t:
    for tup in df_binary.itertuples():
        f = tup.file
        label = tup.label
        
        t.set_description(f'{label}/{f}')
        t.update()
                
#         print(f, end=",")
        src = Path(src_dir, f)
        dest = Path(dest_dir + str(label), f)
        shutil.copyfile(src, dest)

HBox(children=(IntProgress(value=0, max=402), HTML(value='')))




# Split into sets

In [67]:
# df_binary = pd.read_csv('/datasets/spiking/Sound2spks/tempotron/eval_dataset_for_tempotron.v1.csv', index_col='YTID')
df_binary = pd.read_csv('/datasets/spiking/Sound2spks/tempotron/eval_dataset_for_tempotron.v2.csv', index_col='YTID')
df_binary.head()

Unnamed: 0_level_0,start_seconds,end_seconds,mid,file,size,annotation,label
YTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9PdzaYwgQBE,30.0,40.0,/m/0838f,3657_9PdzaYwgQBE.mat,42.671417,Water,0
-UTdhK0lwuw,30.0,40.0,/m/0838f,187_-UTdhK0lwuw.mat,47.410202,Water,0
Jlc4DETobHE,50.0,60.0,/m/0838f,7497_Jlc4DETobHE.mat,47.410202,Water,0
N7MQjLZYJKc,10.0,20.0,/m/0838f,8582_N7MQjLZYJKc.mat,47.410202,Water,0
m8JVAsG1vp0,40.0,50.0,/m/0838f,16571_m8JVAsG1vp0.mat,47.410202,Water,0


In [68]:
test_size = .2
train_size = 1-test_size
val_size = .15 #of train_size

TRAIN_SET, VAL_SET, TEST_SET = 0, 1, 2

In [69]:
df_binary['set'] = TRAIN_SET;

In [70]:
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1, random_state=RANDOM_SEED, test_size=test_size)
train_ix, test_ix = next(iter(sss.split(df_binary.index, df_binary['label'])))
df_binary['set'][test_ix] = TEST_SET

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [71]:
df_binary['set'].value_counts()

0    321
2     81
Name: set, dtype: int64

In [72]:
df_train = df_binary[df_binary['set'] == TRAIN_SET]
sss = StratifiedShuffleSplit(n_splits=1, random_state=RANDOM_SEED, test_size=val_size)
train_ix, test_ix = next(iter(sss.split(df_train.index, df_train['label'])))
df_train['set'][test_ix] = VAL_SET

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._set_with(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [73]:
df_train['set'].value_counts()

0    272
1     49
Name: set, dtype: int64

In [74]:
df_binary['set'].loc[df_train.index] = df_train['set']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [75]:

ser = df_binary['set'].value_counts()

assert np.isclose(ser[VAL_SET]/(ser[VAL_SET]+ser[TRAIN_SET]), val_size, atol=1e-2), ser[VAL_SET]/(ser[VAL_SET]+ser[TRAIN_SET])
assert np.isclose(ser[TEST_SET]/(ser.sum()), test_size, atol=1e-2), ser[TEST_SET]/(ser.sum())

ser

0    272
2     81
1     49
Name: set, dtype: int64

In [76]:
#ensure stratified
df_binary.groupby(['label', 'set']).size()

label  set
0      0      136
       1       25
       2       40
1      0      136
       1       24
       2       41
dtype: int64

In [77]:
# df_binary.to_csv('/datasets/spiking/Sound2spks/tempotron/eval_dataset_for_tempotron.v1.csv', index_label='YTID')
df_binary.to_csv('/datasets/spiking/Sound2spks/tempotron/eval_dataset_for_tempotron.v2.csv', index_label='YTID')