# SVM、MFCCのハイパーパラメーターの値と音声ファイルの加工方法ごとの比較検証結果

In [4]:
import numpy as np
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [5]:
# MFCCで特徴量抽出
class create_mfcc():
    def mfcc(self, audio_path, mfcc_param):
        y, sr = librosa.load(audio_path, sr=4096)    # 約4kHzでリサンプリングして読み込む
        mfccs = librosa.feature.mfcc(y, sr=sr, n_mfcc=mfcc_param)
        ceps = mfccs.mean(axis=1)
        #print(len(ceps))
        return ceps
    
    def fit(self, path_list, mfcc_param):
        self.x_test = []
        self.y_test = []
        count = 0
        current_path = os.getcwd()
        for data_path in path_list:
            os.chdir(data_path)
            for file in os.listdir():
                filename, ext = os.path.splitext(file)
                if ext == ".wav":
                    self.x_test.append(self.mfcc(file, mfcc_param))
                    self.y_test.append(count)
            count += 1
            os.chdir(current_path)
            

In [9]:
class create_model():
    def main(self, path_list, mfcc_param):
        raspi_data_mfcc = create_mfcc()
        raspi_data_mfcc.fit(path_list, mfcc_param)
        x_test = np.array(raspi_data_mfcc.x_test)
        y_test = np.array(raspi_data_mfcc.y_test)
        x_train, x_val, y_train, y_val = train_test_split(x_test, y_test, test_size = 0.2, random_state = 1)
        #標準化
        sc = StandardScaler()
        sc.fit(x_train)
        x_train_std = sc.transform(x_train)
        x_val_std = sc.transform(x_val)
        #SVMのインスタンスを生成
        self.model_linear = SVC(kernel='linear', random_state = 1)
        self.model_poly = SVC(kernel = "poly", random_state = 1)
        self.model_rbf = SVC(kernel = "rbf", random_state =1)
        self.model_linear.fit(x_train_std, y_train)
        self.model_poly.fit(x_train_std, y_train)
        self.model_rbf.fit(x_train_std, y_train)
        pred_linear_train = self.model_linear.predict(x_train_std)
        pred_poly_train = self.model_poly.predict(x_train_std)
        pred_rbf_train = self.model_rbf.predict(x_train_std)
        accuracy_linear_train =accuracy_score(y_train, pred_linear_train)
        accuracy_poly_train =accuracy_score(y_train, pred_poly_train)
        accuracy_rbf_train =accuracy_score(y_train, pred_rbf_train)
        print("train_result")
        print("Linear : "+str(accuracy_linear_train))
        print("Poly : "+str(accuracy_poly_train))
        print("RBF : "+ str(accuracy_rbf_train))
        pred_linear_val = self.model_linear.predict(x_val_std)
        pred_poly_val = self.model_poly.predict(x_val_std)
        pred_rbf_val = self.model_rbf.predict(x_val_std)
        accuracy_linear_val = accuracy_score(y_val, pred_linear_val)
        accuracy_poly_val = accuracy_score(y_val, pred_poly_val)
        accuracy_rbf_val = accuracy_score(y_val, pred_rbf_val)

        print("val_result")
        print("Linear : "+str(accuracy_linear_val))
        print("Poly : "+str(accuracy_poly_val))
        print("RBF : "+ str(accuracy_rbf_val))
        acclist = [[accuracy_linear_train, accuracy_poly_train, accuracy_rbf_train],[accuracy_linear_val, accuracy_poly_val, accuracy_rbf_val]]
        return acclist

MFCC特徴量を100、ラズパイで5秒間収録した音声

In [11]:
path_list = ["sound/raspi/a", "sound/raspi/k", "sound/raspi/endo", "sound/raspi/takahashi"]
param100_m = create_model()
param100_acclist = param100_m.main(path_list, 100)

train_result
Linear : 1.0
Poly : 0.967741935483871
RBF : 1.0
val_result
Linear : 0.875
Poly : 0.25
RBF : 0.75


MFCC特徴量を12、ラズパイで5秒間収録した音声

In [12]:
path_list = ["sound/raspi/a", "sound/raspi/k", "sound/raspi/endo", "sound/raspi/takahashi"]
param12_m = create_model()
param12_acclist = param12_m.main(path_list, 12)

train_result
Linear : 1.0
Poly : 0.6451612903225806
RBF : 1.0
val_result
Linear : 1.0
Poly : 0.375
RBF : 0.875


MFCC特徴量を24、ラズパイで5秒間収録した音声

In [13]:
path_list = ["sound/raspi/a", "sound/raspi/k", "sound/raspi/endo", "sound/raspi/takahashi"]
param24_m = create_model()
param24_acclist = param24_m.main(path_list, 24)

train_result
Linear : 1.0
Poly : 0.7096774193548387
RBF : 1.0
val_result
Linear : 0.75
Poly : 0.25
RBF : 0.75


MFCC特徴量を1000、ラズパイで5秒間収録した音声

In [19]:
path_list = ["sound/raspi/a", "sound/raspi/k", "sound/raspi/endo", "sound/raspi/takahashi"]
param1000_m = create_model()
param1000_acclist = param1000_m.main(path_list, 1000)

train_result
Linear : 1.0
Poly : 0.967741935483871
RBF : 1.0
val_result
Linear : 0.875
Poly : 0.125
RBF : 0.625


MFCC特徴量を500、ラズパイで5秒間収録した音声

In [20]:
path_list = ["sound/raspi/a", "sound/raspi/k", "sound/raspi/endo", "sound/raspi/takahashi"]
param500_m = create_model()
param500_acclist = param1000_m.main(path_list, 500)

train_result
Linear : 1.0
Poly : 0.967741935483871
RBF : 1.0
val_result
Linear : 0.875
Poly : 0.125
RBF : 0.625


MFCC特徴量を100、ラズパイで5秒間収録した音声からWavPadを使用してノイズを除去した5秒間の音声

In [15]:
path_list = ["sound/raspi_noise/a", "sound/raspi_noise/k", "sound/raspi_noise/endo", "sound/raspi_noise/takahashi"]
noise_param100_m = create_model()
noise_param100_acclist = noise_param100_m.main(path_list, 100)

train_result
Linear : 1.0
Poly : 1.0
RBF : 1.0
val_result
Linear : 1.0
Poly : 0.875
RBF : 1.0


MFCC特徴量を12、ラズパイで5秒間収録した音声からWavPadを使用してノイズを除去した5秒間の音声

In [17]:
path_list = ["sound/raspi_noise/a", "sound/raspi_noise/k", "sound/raspi_noise/endo", "sound/raspi_noise/takahashi"]
noise_param12_m = create_model()
noise_param12_acclist = noise_param12_m.main(path_list, 12)

train_result
Linear : 1.0
Poly : 0.90625
RBF : 1.0
val_result
Linear : 0.75
Poly : 0.25
RBF : 0.875


MFCC特徴量を24、ラズパイで5秒間収録した音声からWavPadを使用してノイズを除去した5秒間の音声

In [18]:
path_list = ["sound/raspi_noise/a", "sound/raspi_noise/k", "sound/raspi_noise/endo", "sound/raspi_noise/takahashi"]
noise_param24_m = create_model()
noise_param24_acclist = noise_param24_m.main(path_list, 24)

train_result
Linear : 1.0
Poly : 0.96875
RBF : 1.0
val_result
Linear : 0.75
Poly : 0.875
RBF : 1.0


MFCC特徴量を500、ラズパイで5秒間収録した音声からWavPadを使用してノイズを除去した5秒間の音声

In [21]:
path_list = ["sound/raspi_noise/a", "sound/raspi_noise/k", "sound/raspi_noise/endo", "sound/raspi_noise/takahashi"]
noise_param500_m = create_model()
noise_param500_acclist = noise_param500_m.main(path_list, 500)

train_result
Linear : 1.0
Poly : 1.0
RBF : 1.0
val_result
Linear : 1.0
Poly : 0.875
RBF : 1.0


MFCC特徴量を100、ラズパイで5秒間収録し、無音部分をカットした音声

In [22]:
path_list = ["sound/raspi_cut/a", "sound/raspi_cut/k", "sound/raspi_cut/endo", "sound/raspi_cut/takahashi"]
cut_param100_m = create_model()
cut_param100_acclist = cut_param100_m.main(path_list, 100)

train_result
Linear : 1.0
Poly : 1.0
RBF : 1.0
val_result
Linear : 1.0
Poly : 0.5
RBF : 1.0


MFCC特徴量を12、ラズパイで5秒間収録し、無音部分をカットした音声

In [23]:
path_list = ["sound/raspi_cut/a", "sound/raspi_cut/k", "sound/raspi_cut/endo", "sound/raspi_cut/takahashi"]
cut_param12_m = create_model()
cut_param12_acclist = cut_param12_m.main(path_list, 12)

train_result
Linear : 0.96875
Poly : 0.8125
RBF : 0.96875
val_result
Linear : 0.75
Poly : 0.375
RBF : 0.75


MFCC特徴量を24、ラズパイで5秒間収録し、無音部分をカットした音声

In [24]:
path_list = ["sound/raspi_cut/a", "sound/raspi_cut/k", "sound/raspi_cut/endo", "sound/raspi_cut/takahashi"]
cut_param24_m = create_model()
cut_param24_acclist = cut_param24_m.main(path_list, 24)

train_result
Linear : 1.0
Poly : 0.6875
RBF : 1.0
val_result
Linear : 0.875
Poly : 0.25
RBF : 0.875


MFCC特徴量を500、ラズパイで5秒間収録し、無音部分をカットした音声

In [25]:
path_list = ["sound/raspi_cut/a", "sound/raspi_cut/k", "sound/raspi_cut/endo", "sound/raspi_cut/takahashi"]
cut_param500_m = create_model()
cut_param500_acclist = cut_param500_m.main(path_list, 500)

train_result
Linear : 1.0
Poly : 0.96875
RBF : 1.0
val_result
Linear : 1.0
Poly : 0.5
RBF : 1.0


MFCC特徴量を1000、ラズパイで5秒間収録し、無音部分をカットした音声

In [26]:
path_list = ["sound/raspi_cut/a", "sound/raspi_cut/k", "sound/raspi_cut/endo", "sound/raspi_cut/takahashi"]
cut_param1000_m = create_model()
cut_param1000_acclist = cut_param1000_m.main(path_list, 1000)

train_result
Linear : 1.0
Poly : 0.96875
RBF : 1.0
val_result
Linear : 1.0
Poly : 0.5
RBF : 1.0


MFCC特徴量を100、ラズパイで5秒間収録し、WavPadでノイズ除去と無音部分をカットした音声

In [28]:
path_list = ["sound/raspi_cut_e/a", "sound/raspi_cut_e/k", "sound/raspi_cut_e/endo", "sound/raspi_cut_e/takahashi"]
e_change_param100_m = create_model()
e_change_param100_acclist = e_change_param100_m.main(path_list, 100)

train_result
Linear : 1.0
Poly : 0.90625
RBF : 1.0
val_result
Linear : 1.0
Poly : 0.375
RBF : 1.0


MFCC特徴量を12、ラズパイで5秒間収録し、WavPadでノイズ除去と無音部分をカットした音声

In [29]:
path_list = ["sound/raspi_cut_e/a", "sound/raspi_cut_e/k", "sound/raspi_cut_e/endo", "sound/raspi_cut_e/takahashi"]
e_change_param12_m = create_model()
e_change_param12_acclist = e_change_param12_m.main(path_list, 12)

train_result
Linear : 1.0
Poly : 0.78125
RBF : 1.0
val_result
Linear : 1.0
Poly : 0.625
RBF : 1.0


MFCC特徴量を24、ラズパイで5秒間収録し、WavPadでノイズ除去と無音部分をカットした音声

In [30]:
path_list = ["sound/raspi_cut_e/a", "sound/raspi_cut_e/k", "sound/raspi_cut_e/endo", "sound/raspi_cut_e/takahashi"]
e_change_param24_m = create_model()
e_change_param24_acclist = e_change_param24_m.main(path_list, 24)

train_result
Linear : 1.0
Poly : 0.65625
RBF : 1.0
val_result
Linear : 1.0
Poly : 0.25
RBF : 1.0


MFCC特徴量を500、ラズパイで5秒間収録し、WavPadでノイズ除去と無音部分をカットした音声

In [31]:
path_list = ["sound/raspi_cut_e/a", "sound/raspi_cut_e/k", "sound/raspi_cut_e/endo", "sound/raspi_cut_e/takahashi"]
e_change_param500_m = create_model()
e_change_param500_acclist = e_change_param500_m.main(path_list, 500)

train_result
Linear : 1.0
Poly : 0.90625
RBF : 1.0
val_result
Linear : 1.0
Poly : 0.375
RBF : 1.0


In [34]:
dict1=dict(p12base=param12_acclist[1], p24base=param24_acclist[1], p100base=param100_acclist[1], p500base=param500_acclist[1],
          p12noise=noise_param12_acclist[1], p24noise=noise_param24_acclist[1],p100noise=noise_param100_acclist[1],p500noise=noise_param500_acclist[1],
          p12cut=cut_param12_acclist[1],p24cut=cut_param24_acclist[1],p100cut=cut_param100_acclist[1],p500cut=cut_param500_acclist[1],
          p12data=e_change_param12_acclist[1],p24data=e_change_param24_acclist[1],p100data=e_change_param100_acclist[1],p500data=e_change_param500_acclist[1])
index1 = ["Linear", "Poly", "RBF"]
pd.DataFrame(data=dict1, index=index1)

Unnamed: 0,p12base,p24base,p100base,p500base,p12noise,p24noise,p100noise,p500noise,p12cut,p24cut,p100cut,p500cut,p12data,p24data,p100data,p500data
Linear,1.0,0.75,1.0,0.875,0.75,0.75,1.0,1.0,0.75,0.875,1.0,1.0,1.0,1.0,1.0,1.0
Poly,0.375,0.25,0.875,0.125,0.25,0.875,0.875,0.875,0.375,0.25,0.5,0.5,0.625,0.25,0.375,0.375
RBF,0.875,0.75,1.0,0.625,0.875,1.0,1.0,1.0,0.75,0.875,1.0,1.0,1.0,1.0,1.0,1.0


【考察】
MFCC特徴量は100、SVMのカーネルはRBF、音声は加工しないまたはノイズ除去のみが良さそうであった<br>
無音音声をカットすることで長さが揃わなくなったことが悪影響を及ぼしている可能性も高いと推察される<br>
収録した音声では精度が出ても、実際にRaspberryPi上で検証を行うと誰が声をかけても同じ人と認識してしまうため、根本的に見直す必要がありそうだ