In [1]:
import pandas as pd
import numpy as np
import glob
import scipy
import matplotlib.pyplot as plt
import re
import os

In [2]:
# The twelve target classes; a label's position in this list is its integer id.
POSSIBLE_LABELS = 'yes no up down left right on off stop go unknown silence'.split()
# Forward (id -> name) and reverse (name -> id) lookup tables.
id2name = dict(enumerate(POSSIBLE_LABELS))
name2id = {label: idx for idx, label in id2name.items()}

In [3]:
def load_data(data_dir):
    """Index the training audio under ``data_dir`` and split it by speaker.

    Every file found in ``<data_dir>/train/audio/<label>/`` whose path matches
    ``<label>/<user_id>_<rest>wav`` becomes one row.  A row goes to the
    validation frame when its ``user_id`` appears in any entry of
    ``<data_dir>/train/validation_list.txt`` (so the split is per user, not
    per file); all other rows go to the train frame.

    Folder names are mapped to labels as follows: ``_background_noise_``
    becomes ``'silence'``, any folder not in ``POSSIBLE_LABELS`` becomes
    ``'unknown'``.

    Parameters
    ----------
    data_dir : str
        Directory that contains the ``train`` folder.

    Returns
    -------
    (train_df, valid_df) : tuple of pandas.DataFrame
        Both frames have the columns ``label``, ``label_id``, ``user_id`` and
        ``wav_file``.  ``wav_file`` is the '/'-separated path relative to
        ``data_dir`` (``train/audio/<folder>/<file>``).
    """
    # Three groups: optional prefix, label folder, user id (text before the
    # first underscore of the file name).  Raw string: the pattern contains
    # backslashes that are not valid Python string escapes.
    pattern = re.compile(r"(.+\/)?(\w+)\/([^_]+)_.+wav")

    # List the audio relative to data_dir (not the current working directory),
    # consistent with how the validation list is located below.
    audio_dir = os.path.join(data_dir, 'train', 'audio')
    all_files = []
    for folder in os.listdir(audio_dir):
        folder_path = os.path.join(audio_dir, folder)
        if not os.path.isdir(folder_path):
            # Stray files next to the label folders carry no samples.
            continue
        for fname in os.listdir(folder_path):
            all_files.append('train/audio/' + folder + '/' + fname)

    # Collect the ids of all users that own at least one validation clip.
    valset = set()
    with open(os.path.join(data_dir, 'train/validation_list.txt'), 'r') as fin:
        for entry in fin:
            r = pattern.match(entry)
            if r:
                valset.add(r.group(3))

    possible = set(POSSIBLE_LABELS)
    train, val = [], []
    for entry in all_files:
        r = pattern.match(entry)
        if not r:
            # Not a "<user>_<suffix>wav" file (e.g. a README) - ignore it.
            continue
        label, uid = r.group(2), r.group(3)
        if label == '_background_noise_':
            label = 'silence'
        if label not in possible:
            label = 'unknown'

        label_id = name2id[label]

        sample = (label, label_id, uid, entry)
        if uid in valset:
            val.append(sample)
        else:
            train.append(sample)

    print('There are {} train and {} val samples'.format(len(train), len(val)))

    columns_list = ['label', 'label_id', 'user_id', 'wav_file']

    train_df = pd.DataFrame(train, columns=columns_list)
    valid_df = pd.DataFrame(val, columns=columns_list)

    return train_df, valid_df

In [4]:
# Use the current working directory as the dataset root passed to load_data.
data_dir = os.getcwd()
print(data_dir)

C:\Users\cc\Desktop\Project


In [5]:
# Index the audio files and split them into train and validation frames.
train_df, valid_df = load_data(data_dir)

There are 57929 train and 6798 val samples


In [6]:
# Persist the validation index so later work can reload it without re-scanning the folders.
valid_df.to_pickle('valid_df.pkl')

In [7]:
from scipy.io import wavfile

In [8]:
# Decode every training clip except the 'silence' rows (label_id 11 is the
# position of 'silence' in POSSIBLE_LABELS). Each element of word_data is
# whatever wavfile.read returns for that path.
speech_paths = train_df.loc[train_df.label_id != 11, 'wav_file']
word_data = np.array(speech_paths.apply(wavfile.read))

In [9]:
# Keep only element 1 of each wavfile.read result (the sample data).
waveforms = [sr_wav[1] for sr_wav in word_data]
# The recorded shape (57923,) shows the clips do not all have the same length.
# np.array() on such a ragged list raises ValueError on NumPy >= 1.24, so
# allocate a 1-D object array explicitly and store one waveform per slot.
word_data2 = np.empty(len(waveforms), dtype=object)
for idx, wav in enumerate(waveforms):
    word_data2[idx] = wav
word_data2.shape

(57923,)

In [10]:
# Drop the 'silence' rows (label_id 11) so the frame lines up with the decoded
# clips. .copy() makes the result an independent frame, so adding a column to
# it later does not raise SettingWithCopyWarning.
train_df = train_df[train_df.label_id != 11].copy()
train_df.shape

(57923, 4)

In [11]:
# Turn the array of waveforms into a plain list with one entry per clip.
all_sounds = list(word_data2)
len(all_sounds)

57923

In [12]:
# Store each clip's samples next to its metadata row; all_sounds must hold
# exactly one entry per row of train_df, in the same order.
train_df['sound'] = all_sounds
# Persist the enriched training frame.
train_df.to_pickle('train_df.pkl')
# Number of training clips per label.
train_df.label.value_counts()

unknown    36818
stop        2134
yes         2116
up          2115
go          2112
right       2111
on          2110
left        2106
no          2105
off         2101
down        2095
Name: label, dtype: int64

In [13]:
# Preview the first ten rows, including the new 'sound' column.
train_df.head(10)

Unnamed: 0,label,label_id,user_id,wav_file,sound
0,unknown,10,00176480,train/audio/bed/00176480_nohash_0.wav,"[3, 1, 6, 6, -1, 1, 8, 3, -3, -2, 3, 9, 4, -9,..."
1,unknown,10,004ae714,train/audio/bed/004ae714_nohash_0.wav,"[-109, -149, -144, -110, -120, -137, -132, -87..."
2,unknown,10,004ae714,train/audio/bed/004ae714_nohash_1.wav,"[-82, -69, -57, -92, -75, -107, -122, -121, -1..."
3,unknown,10,00f0204f,train/audio/bed/00f0204f_nohash_0.wav,"[-8, -11, -11, -12, -20, -26, -22, -29, -39, -..."
4,unknown,10,00f0204f,train/audio/bed/00f0204f_nohash_1.wav,"[-12, -15, -15, -23, -22, -24, -22, -24, -25, ..."
5,unknown,10,012c8314,train/audio/bed/012c8314_nohash_0.wav,"[218, 239, 212, 263, 282, 234, 276, 260, 233, ..."
6,unknown,10,012c8314,train/audio/bed/012c8314_nohash_1.wav,"[269, 327, 377, 414, 361, 378, 396, 344, 308, ..."
7,unknown,10,0132a06d,train/audio/bed/0132a06d_nohash_0.wav,"[-3, -5, -4, -3, -1, 0, 2, 5, 7, 5, 4, 6, 3, 1..."
8,unknown,10,0135f3f2,train/audio/bed/0135f3f2_nohash_0.wav,"[2, 1, 0, 0, 0, 0, 0, 0, -1, 0, 0, 1, 2, 1, 2,..."
9,unknown,10,0137b3f4,train/audio/bed/0137b3f4_nohash_0.wav,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
