# 載入套件

In [1]:
import numpy as np
import scipy.io.wavfile as wav
import pandas as pd
import tensorflow as tf
import os, time, glob, json, csv
import librosa.display

from python_speech_features import mfcc
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Flatten, Embedding, GRU, SimpleRNN, Activation, Reshape
from tensorflow import keras
#from wave import open

Using TensorFlow backend.


# 限制GPU記憶體使用

In [2]:
physical_gpus = tf.config.list_physical_devices("GPU")
tf.config.experimental.set_virtual_device_configuration(
    physical_gpus[0],
    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096)] 
)
logical_gpus = tf.config.list_logical_devices("GPU")

2022-07-26 23:52:21.279661: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2022-07-26 23:52:21.299887: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-26 23:52:21.299959: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 3090 computeCapability: 8.6
coreClock: 1.695GHz coreCount: 82 deviceMemorySize: 23.70GiB deviceMemoryBandwidth: 871.81GiB/s
2022-07-26 23:52:21.300089: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2022-07-26 23:52:21.301265: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2022-07-26 23:52:21.302274: I tensorflow/stream_executor/pl

# 準備資料

In [3]:
#各檔案路徑
audio_path = 'dataset/data_sr/data'
label_path = 'dataset/data_sr/data'

audio_csv_path = 'dataset/data_sr/audio_sample.csv'
#label_csv_path = 'dataset/data_sr/label_sample.csv'
label_csv_path = 'dataset/data_sr/label_sample_twoclass.csv'

In [4]:
y_path = label_csv_path
y = pd.read_csv(y_path)
y.head()

name = y[y.columns[0]].values
name = name.tolist()
name = map(str, name)
name = list(name)

In [5]:
# wav not exists
name.remove('1621260098')
name.remove('1621260094')
name.remove('1630587053')
name.remove('1620877500')
name.remove('1618243179')
name.remove('1622944740')
name.remove('1623546872')
name.remove('1620877516')
name.remove('1620955238')
name.remove('1623546876')
name.remove('1629341577')
name.remove('1626860174')
name.remove('1629341588')
name.remove('1620898178')
name.remove('1637748622')
name.remove('1626860172')
name.remove('1620898181')
name.remove('1620955240')
name.remove('1627775236')
name.remove('1618243233')
name.remove('1650866474')
name.remove('1618243237')
name.remove('1629450355')
name.remove('1620877512')
name.remove('1626860184')
name.remove('1627775229')
name.remove('1622340186')
name.remove('1621260096')
name.remove('1635565854')
name.remove('1635565858')
name.remove('1620955233')
name.remove('1620898192')
name.remove('1620955236')
name.remove('1621260104')
name.remove('1627775238')
name.remove('1650866472')
name.remove('1627956275')

In [6]:
# 取得 wav 音檔路徑
def get_wavs(wav_path):
    wavs = []
    for (dirpath, dirnames, filenames) in os.walk(wav_path):
        for n in name:
            n = n + ".wav"
            filename_path = os.path.join(dirpath, n)
            wavs.append(filename_path)
            '''
        for filename in filenames:
            if filename.endswith('.wav') or filename.endswith('.WAV'):
                filename_path = os.path.join(dirpath, filename)
                wavs.append(filename_path)
                '''
    return wavs

wavs = get_wavs(audio_path)

In [None]:
# mfcc function
# 将音频数据转为时间序列（列）和MFCC（行）的矩阵，将对应的译文转成字向量
def get_mfccs(wavs, n_input, contexts):
    file = []
    audio = []
    audio_len = []
    
    for wav_file in wavs:
        # load audio and convert to features
        audio_data = audiofile_to_input_vector(wav_file, n_input, contexts)
        try:
            audio_data = audio_data.astype('float32')
            
            file.append(wav_file)
            audio.append(audio_data)
            audio_len.append(np.int32(len(audio_data)))
        except AttributeError:
            print()
        except FileNotFoundError:
            print()
 
    audio = np.asarray(audio,dtype=object)
    audio_len = np.asarray(audio_len)
    return file, audio, audio_len
     
# 将音频信息转成MFCC特征
# 参数说明---audio_filename：音频文件   numcep：梅尔倒谱系数个数
#       numcontext：对于每个时间段，要包含的上下文样本个数
def audiofile_to_input_vector(audio_filename, numcep, numcontext):
    # 加载音频文件
    try:
        fs, audio = wav.read(audio_filename)
        orig_inputs = mfcc(audio, samplerate=fs, numcep=numcep)
        # 打印MFCC系数的形状，得到比如(955, 26)的形状
        # 955表示时间序列，26表示每个序列的MFCC的特征值为26个
        # 这个形状因文件而异，不同文件可能有不同长度的时间序列，但是，每个序列的特征值数量都是一样的
        #print('orig_inputs shape', np.shape(orig_inputs))

        # 因为我们使用双向循环神经网络来训练,它的输出包含正、反向的结
        # 果,相当于每一个时间序列都扩大了一倍,所以
        # 为了保证总时序不变,使用orig_inputs =
        # orig_inputs[::2]对orig_inputs每隔一行进行一次
        # 取样。这样被忽略的那个序列可以用后文中反向
        # RNN生成的输出来代替,维持了总的序列长度。
        orig_inputs = orig_inputs[::2]  # (478, 26)
        # print(np.shape(orig_inputs))
        # 因为我们讲解和实际使用的numcontext=9，所以下面的备注我都以numcontext=9来讲解
        # 这里装的就是我们要返回的数据，因为同时要考虑前9个和后9个时间序列，
        # 所以每个时间序列组合了19*26=494个MFCC特征数
        train_inputs = np.array([], np.float32)
        train_inputs.resize((orig_inputs.shape[0], numcep + 2 * numcep * numcontext))
        #print('train_inputs shape', np.shape(train_inputs))#)(478, 494)

        # Prepare pre-fix post fix context
        empty_mfcc = np.array([])
        empty_mfcc.resize((numcep))

        # Prepare train_inputs with past and future contexts
        # time_slices保存的是时间切片，也就是有多少个时间序列
        time_slices = range(train_inputs.shape[0])

        # context_past_min和context_future_max用来计算哪些序列需要补零
        context_past_min = time_slices[0] + numcontext
        context_future_max = time_slices[-1] - numcontext

        # 开始遍历所有序列
        for time_slice in time_slices:
            # 对前9个时间序列的MFCC特征补0，不需要补零的，则直接获取前9个时间序列的特征
            need_empty_past = max(0, (context_past_min - time_slice))
            empty_source_past = list(empty_mfcc for empty_slots in range(need_empty_past))
            data_source_past = orig_inputs[max(0, time_slice - numcontext):time_slice]
            assert (len(empty_source_past) + len(data_source_past) == numcontext)

            # 对后9个时间序列的MFCC特征补0，不需要补零的，则直接获取后9个时间序列的特征
            need_empty_future = max(0, (time_slice - context_future_max))
            empty_source_future = list(empty_mfcc for empty_slots in range(need_empty_future))
            data_source_future = orig_inputs[time_slice + 1:time_slice + numcontext + 1]
            assert (len(empty_source_future) + len(data_source_future) == numcontext)

            # 前9个时间序列的特征
            if need_empty_past:
                past = np.concatenate((empty_source_past, data_source_past))
            else:
                past = data_source_past
            # 后9个时间序列的特征
            if need_empty_future:
                future = np.concatenate((data_source_future, empty_source_future))
            else:
                future = data_source_future

            # 将前9个时间序列和当前时间序列以及后9个时间序列组合
            past = np.reshape(past, numcontext * numcep)
            now = orig_inputs[time_slice]
            future = np.reshape(future, numcontext * numcep)

            train_inputs[time_slice] = np.concatenate((past, now, future))
            assert (len(train_inputs[time_slice]) == numcep + 2 * numcep * numcontext)

        # 将数据使用正太分布标准化，减去均值然后再除以方差
        train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)

        return train_inputs
    except ValueError:
        print(audio_filename)
    except FileNotFoundError:
        print(audio_filename)
    
 
#对齐处理
def pad_sequences(sequences, maxlen=None, dtype=np.float32,
                  padding='post', truncating='post', value=0.):
    #[478 512 503 406 481 509 422 465]
    lengths = np.asarray([len(s) for s in sequences], dtype=np.int64)
 
    nb_samples = len(sequences)
 
    #maxlen，该批次中，最长的序列长度
    if maxlen is None:
        maxlen = np.max(lengths)
 
    # 在下面的主循环中，从第一个非空序列中获取样本形状以检查一致性
    sample_shape = tuple()
    for s in sequences:
        if len(s) > 0:
            sample_shape = np.asarray(s).shape[1:]
            break
 
    x = (np.ones((nb_samples, maxlen) + sample_shape) * value).astype(dtype)
    for idx, s in enumerate(sequences):
        if len(s) == 0:
            continue  # 序列为空，跳过
 
        #post表示后补零，pre表示前补零
        if truncating == 'pre':
            trunc = s[-maxlen:]
        elif truncating == 'post':
            trunc = s[:maxlen]
        else:
            raise ValueError('Truncating type "%s" not understood' % truncating)
 
        # check `trunc` has expected shape
        trunc = np.asarray(trunc, dtype=dtype)
        if trunc.shape[1:] != sample_shape:
            raise ValueError('Shape of sample %s of sequence at position %s is different from expected shape %s' %
                             (trunc.shape[1:], idx, sample_shape))
 
        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError('Padding type "%s" not understood' % padding)
 
    return x, lengths


# 梅尔倒谱系数的个数
n_input = 26
# 对于每个时间序列，要包含上下文样本的个数
contexts = 9

file, audio, audio_len = get_mfccs(wavs, n_input, contexts)
audio, audio_len = pad_sequences(audio)

In [None]:
df = pd.DataFrame(name)

df.columns=["filename"]
df["filename"] = df["filename"].astype(int)

df1 = pd.concat([df,y], axis="columns")
df1 = df1.dropna()

In [None]:
print(len(df1))

In [27]:
# 音檔的 label
y = df1.drop(["filename"], axis=1)
#y.shape
y = y[y.columns[1:6]].values
print(y.shape)
y = y.tolist()
y = np.array(y)

(91111, 1)


In [29]:
x = audio.reshape((91111, 108, 494))

In [39]:
X_train, X_temp, y_train, y_temp = train_test_split(x, y, test_size=0.4, random_state=0)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=0)

# 建立模型

In [None]:
'''
### 1 
model = Sequential()
model.add(Embedding(input_dim = 100, output_dim = 20, input_shape=(108, 494, )))
model.add(Flatten())
model.add(Dense(12, activation='relu'))
model.add(Dense(4, activation='sigmoid'))
model.summary()
'''
#model.add(Embedding(input_dim = 100, output_dim = 20, input_shape=(108, 494, )))
'''
### 2
HIDDEN_LAYERS = 4
model = Sequential()
model.add(Dense(input_dim=108, units=4))
model.add(Activation("relu"))
model.add(Reshape((1,4)))
model.add(SimpleRNN(4))
model.add(Dense(units=2))
model.add(Activation("softmax"))
model.summary()
'''

'''
### 3
model = keras.Sequential()

model.add(
    layers.Bidirectional(layers.LSTM(64, return_sequences=True), input_shape=(5, 10))
)
model.add(layers.Bidirectional(layers.LSTM(32)))
model.add(layers.Dense(10))

model.summary()
'''

In [69]:
model = Sequential()
model.add(Embedding(input_dim = 100, output_dim = 20))
#model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Reshape((1,4)))
model.add(SimpleRNN(4))
model.add(Dense(4, activation='sigmoid'))
model.summary()

Model: "sequential_19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, None, 20)          2000      
_________________________________________________________________
dense_17 (Dense)             (None, None, 16)          336       
_________________________________________________________________
reshape_4 (Reshape)          (None, 1, 4)              0         
_________________________________________________________________
simple_rnn_11 (SimpleRNN)    (None, 4)                 36        
_________________________________________________________________
dense_18 (Dense)             (None, 4)                 20        
Total params: 2,392
Trainable params: 2,392
Non-trainable params: 0
_________________________________________________________________


In [70]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])

# 訓練模型

In [71]:
if not os.path.exists('./model/checkpoints'):
    os.makedirs('./model/checkpoints')

# 按照 val_f1 保存模型
ck_callback = tf.keras.callbacks.ModelCheckpoint('./model/checkpoints/weights.{epoch:02d}-{val_f1:.4f}.hdf5',
                                                 monitor='val_f1', 
                                                 mode='max', verbose=2,
                                                 save_best_only=True,
                                                 save_weights_only=True)
tb_callback = tf.keras.callbacks.TensorBoard(log_dir='./model/logs', profile_batch=0)

model.fit(X_train, y_train, batch_size=32, epochs=20)
model_path = 'model/model_rnn_2class_orig.h5'
model.save(model_path)

ValueError: Error when checking input: expected embedding_14_input to have 2 dimensions, but got array with shape (54666, 108, 494)

# 模型評估

In [37]:
# 載入模型
model_path = 'model/model.h5'

model = keras.models.load_model(model_path)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])





In [38]:
model.evaluate(X_train, y_train)

2022-07-19 09:26:54.934764: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 11666161728 exceeds 10% of free system memory.


ValueError: in user code:

    /home/bason/.conda/envs/SR/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:941 test_function  *
        outputs = self.distribute_strategy.run(
    /home/bason/.conda/envs/SR/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/bason/.conda/envs/SR/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/bason/.conda/envs/SR/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/bason/.conda/envs/SR/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:911 test_step  **
        self.compiled_loss(
    /home/bason/.conda/envs/SR/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:205 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    /home/bason/.conda/envs/SR/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:143 __call__
        losses = self.call(y_true, y_pred)
    /home/bason/.conda/envs/SR/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:246 call
        return self.fn(y_true, y_pred, **self._fn_kwargs)
    /home/bason/.conda/envs/SR/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:1595 binary_crossentropy
        K.binary_crossentropy(y_true, y_pred, from_logits=from_logits), axis=-1)
    /home/bason/.conda/envs/SR/lib/python3.8/site-packages/tensorflow/python/keras/backend.py:4692 binary_crossentropy
        return nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output)
    /home/bason/.conda/envs/SR/lib/python3.8/site-packages/tensorflow/python/ops/nn_impl.py:171 sigmoid_cross_entropy_with_logits
        raise ValueError("logits and labels must have the same shape (%s vs %s)" %

    ValueError: logits and labels must have the same shape ((None, 4) vs (None, 1))


In [42]:
model.evaluate(X_valid, y_valid)



[0.17106951463400447, 0.9271210432052612]

In [43]:
model.evaluate(X_test, y_test)



[0.17132094184849792, 0.9254788160324097]

# 其他

In [27]:
with open('audio_test.csv', 'w') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in d.items():
        writer.writerow([key, value])

In [129]:
print(audio.shape)

(91111, 108, 494)


In [None]:
x = pd.DataFrame(audio_reshape)
x.head()

In [None]:
df = pd.merge(x,y, how="right", left_on="檔名", right_on="檔名")

In [None]:
audio_reshape = audio.reshape(audio.shape[0], -1)

files = []
for(dirpath, dirnames, filenames) in os.walk(audio_path):
    for filename in filenames:
            if filename.endswith('.wav') or filename.endswith('.WAV'):                
                #取得 wav的檔名
                index = filename.index('.')
                filename = filename[:index]
                files.append(filename)
                
#files.remove(1618243237)
#files.remove(1618243179)
#files.remove(1618243233)
'''              
b = {}
c = []
i = 0
for a in aa:
    b = {"檔名":aa[i], "特徵":(audio_reshape)}
    c.append(b)
    i += 1
col = ["檔名", "特徵"]
try:
    with open('audio_csv.csv', 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=col, delimiter =",")
        writer.writeheader()

        for data in c:
            writer.writerow(data)
except IOError:
    print("I/O error")
'''

In [None]:
#
audio_reshape.astype(np.float32)
print(audio_reshape.shape)

In [None]:
#
aa = []
for(dirpath, dirnames, filenames) in os.walk(audio_path):
    for filename in filenames:
            if filename.endswith('.wav') or filename.endswith('.WAV'):                
                #取得 wav的檔名
                index = filename.index('.')
                filename = filename[:index]
                aa.append(filename)
                
b = {}
c = []
i = 0
for a in aa:
    b = {"檔名":aa[i], "特徵":(audio_reshape)}
    c.append(b)
    i += 1
col = ["檔名", "特徵"]
try:
    with open('audio_csv.csv', 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=col, delimiter =",")
        writer.writeheader()

        for data in c:
            writer.writerow(data)
except IOError:
    print("I/O error")

In [46]:
# 取得 label 檔名與內容
def get_labels_and_texts(label_path):
    labels = []
    labels_path = []
    label_texts = []
    for(dirpath, dirnames, filenames) in os.walk(label_path):
        for filename in filenames:
            if filename.endswith('.json'):
                #取得 label 的完整路徑，後面進行讀檔
                filename_path = os.path.join(dirpath, filename)
                labels_path.append(filename_path)
                
                #取得 label的檔名，後面對應到音檔
                try:
                    index = filename.index("_")
                except ValueError:
                    index = filename.index(".")
                    
                filename = filename[:index]
                labels.append(filename)
    for label_file in labels_path:
        #讀取 label 檔，並將 command 以 one-hot encoding 方式改寫 label
        try:
            with open(label_file) as fd:
                data = json.load(fd)
                a = 0
                b = 0
                #c = 0
                #d = 0
                if data['command'] == 'Not Activated':
                    a = 1
                else:
                    b = 1
                data = [a, b]
                label_texts.append(data)
                fd.close()
        except ValueError:
            continue
    return labels, label_texts

labels, label_texts = get_labels_and_texts(label_path)

print(labels[0], label_texts[0])

1633056677 [1, 0]


In [47]:
#將 檔名與 label 寫入 csv 檔

i = 0
for label_text in label_texts:
    label_text.insert(0, labels[i])
    i+=1

with open('dataset/data_sr/label_sample_twoclass.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['檔名','沒指令','有指令'])
    
    for label_text in label_texts:
        writer.writerow(label_text)
print(label_texts[0])


['1633056677', 1, 0]


In [None]:
# 將 檔名與特徵寫入 csv 檔

aa = []
for(dirpath, dirnames, filenames) in os.walk(audio_path):
    for filename in filenames:
            if filename.endswith('.wav') or filename.endswith('.WAV'):                
                #取得 wav的檔名
                index = filename.index('.')
                filename = filename[:index]
                aa.append(filename)

a_list = list(audio)
i = 0
b = {}
c = []
for a in a_list:
    b = {"檔名":aa[i],"特徵": a}
    c.append(b)
    i+=1
col = ["檔名", "特徵"]
try:
    with open(audio_csv_path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=col)
        writer.writeheader()
        #writer.writerow(['檔名','特徵'])

        for data in c:
            writer.writerow(data)
except IOError:
    print("I/O error")


In [None]:
#load label csv into pd to preview data
labels = pd.read_csv(label_csv_path)
labels

In [None]:
#load audio csv into pd to preview data
audios = pd.read_csv(audio_csv_path)
audios

In [None]:
m,n,r = audio.shape
out_arr = np.column_stack((np.repeat(np.arange(m),n),audio.reshape(m*n,-1)))
out_df = pd.DataFrame(out_arr)

X = out_df[out_df.columns[:]].values   #dead kernel


#np.savetxt('audio.csv', audio, delimiter=',')    # can't save filename

In [None]:
# using python_speech_features to extract mfcc feartures
from python_speech_features import mfcc
from python_speech_features import logfbank
import scipy.io.wavfile as wav
import numpy
import os

# directory where we your .wav files are
directoryName = "dataset/data_sr/data" # put your own directory here
# directory to put our results in, you can change the name if you like
resultsDirectory = 'dataset/data_sr/audio_sample'

# make a new folder in this directory to save our results in
if not os.path.exists(resultsDirectory):
    os.makedirs(resultsDirectory)

# get MFCCs for every .wav file in our specified directory 
for filename in os.listdir(directoryName):
    if filename.endswith('.wav'): # only get MFCCs from .wavs
        # read in our file
        
        try:
            (rate,sig) = wav.read(directoryName + "/" +filename)
        except ValueError:
            continue
        # get mfcc
        mfcc_feat = mfcc(sig,rate)

        # get filterbank energies
        fbank_feat = logfbank(sig,rate)
        
        # create a file to save our results in
       
        outputFile = resultsDirectory + "/" + os.path.splitext(filename)[0] + ".csv"
        file = open(outputFile, 'w+') # make file/over write existing file
        numpy.savetxt(file, fbank_feat, delimiter=",") #save MFCCs as .csv
        file.close() # close file

In [None]:
# read all audio sample csv
audio_csvs_path = 'dataset/data_sr/audio_sample'
csv = glob.glob(audio_csvs_path + "/*.csv")

li = []
for filename in csv:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)
    
df = pd.concat(li, axis=0, ignore_index=True)
df.head()

In [None]:
# combine all csvs into one
def return_contents(file_name):
    with open(file_name) as infile:
        reader = csv.reader(infile)
        return list(reader)

all_files = os.listdir('dataset/data_sr/audio_sample')
combined_output = []

for file in all_files:
    data = return_contents('dataset/data_sr/audio_sample/{}'.format(file))
    for row in data:
        combined_output.extend(row)

with open('dataset/data_sr/audio_sample_2.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(combined_output)

In [None]:
# using librosa to extract mfcc features
x, sr = librosa.load(wavs[0])

#Plot the signal:
plt.figure(figsize=(14, 5))
librosa.display.waveshow(x, sr=sr)
# Zooming in
n0 = 9000
n1 = 9100
plt.figure(figsize=(14, 5))
plt.plot(x[n0:n1])
plt.grid()

fs=10
mfccs = librosa.feature.mfcc(x, sr=fs)

print(mfccs.shape)
#Displaying  the MFCCs:
plt.figure(figsize=(15, 7))
librosa.display.specshow(mfccs, sr=sr, x_axis='time')

In [None]:
audio_reshaped = audio.reshape(audio.shape[0], -1)

np.savetxt("audio.csv", audio_reshaped)

In [45]:
files = []
for(dirpath, dirnames, filenames) in os.walk(audio_path):
    for filename in filenames:
            if filename.endswith('.wav') or filename.endswith('.WAV'):                
                #取得 wav的檔名
                index = filename.index('.')
                filename = filename[:index]
                files.append(filename)
                
d = dict()
for i in range(len(audio)):
    d[files[i]] = audio[i]
print(len(d))

### Store dictionary d into csv

85484


In [6]:
# 將 flac 音檔轉成 wav 音檔
def get_flacs(audio_path):
    flacs = []
    flac_filename = []
    for (dirpath, dirnames, filenames) in os.walk(audio_path):
        for filename in filenames:
            if filename.endswith('.flac'):
                filename_path = os.path.join(dirpath, filename)
                
                index = filename.index(".")
                    
                filename = filename[:index]
                flac_filename.append(filename)
                
                flacs.append(filename_path)
    return flacs, flac_filename

flacs,flac_filename = get_flacs(audio_path)

i = 0
for flac in flacs:
    try:
        audio, sr = sf.read(flac)
    except RuntimeError:
        print(flac)
    sf.write(audio_path+'/'+flac_filename[i]+'.wav', audio, sr, 'PCM_16')
    i+=1

dataset/data_sr/data/1623372994.flac
dataset/data_sr/data/1623373198.flac
dataset/data_sr/data/1624246529.flac
dataset/data_sr/data/1619576718.flac
dataset/data_sr/data/1623373076.flac
dataset/data_sr/data/1623373306.flac
dataset/data_sr/data/1620697758.flac
dataset/data_sr/data/1624246517.flac
dataset/data_sr/data/1620697415.flac
dataset/data_sr/data/1620697761.flac
dataset/data_sr/data/1620698062.flac
dataset/data_sr/data/1623373169.flac
dataset/data_sr/data/1620698334.flac
dataset/data_sr/data/1623373060.flac
dataset/data_sr/data/1620697764.flac
dataset/data_sr/data/1623372974.flac
dataset/data_sr/data/1620697767.flac
dataset/data_sr/data/1620697512.flac
