<a href="https://colab.research.google.com/github/catforest/mfcc_sample/blob/master/mfcc_sample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# ------------------------------
# Module Check
# ------------------------------
hasModule = !pip list | grep python-speech-features

if hasModule:
  print('module ... ok')
else:
  print('module ... installing')
  !pip install python_speech_features

# ------------------------------
# Runtime Check
# ------------------------------
# Required runtime: Python3 & GPU
# (can be set via "Runtime" > "Change runtime type")

import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  # The first GPU is visible to TensorFlow.
  print('Found GPU at: {}'.format(device_name))
else:
  # Abort the cell: the notebook expects a GPU runtime.
  raise SystemError('GPU device not found')

# [REQUIRED] Upload the label file and the audio files
# The label file is labels.tsv

In [0]:
# ------------------------------
# Timestamp
# ------------------------------
# Print the wall-clock start time of the run in Japan Standard Time (UTC+9).
from datetime import datetime, timedelta, timezone

JST = timezone(timedelta(hours=9), name='JST')
now = datetime.now(tz=JST)
print('start @ {0:%m/%d %H:%M:%S}'.format(now))


import numpy as np
from scipy.io.wavfile import read
from python_speech_features import mfcc
from python_speech_features import delta

# Compute MFCC features for a SOURCE_LENGTH-sample excerpt of a WAV file.
#  https://python-speech-features.readthedocs.io/en/latest/
def _pcm_to_float(samples):
  """Scale raw WAV samples to floats in a nominal [-1.0, 1.0) range.

  The divisor is derived from the sample dtype instead of assuming 16-bit
  PCM, so int16 input is still divided by 32768 while other widths are
  scaled by their own full-scale value.
  """
  kind = samples.dtype.kind
  if kind == 'f':
    # Floating-point WAV data needs no rescaling.
    return samples.astype(float)
  half_range = 2 ** (8 * samples.dtype.itemsize - 1)
  if kind == 'u':
    # Unsigned PCM is offset-binary: re-center around zero before scaling.
    return (samples.astype(float) - half_range) / half_range
  return samples / half_range


def _fit_fft_size(minimum, window_seconds, samplerate):
  """Return `minimum`, doubled until it spans one analysis window."""
  size = minimum
  while size < window_seconds * samplerate:
    size *= 2
  return size


def get_mfcc(filename):
  """Read a WAV file and compute MFCCs for an excerpt of it.

  The excerpt begins at the midpoint of the recording and is at most
  SOURCE_LENGTH samples long (shorter if the file ends first).

  Args:
    filename: path of the WAV file, as accepted by scipy.io.wavfile.read.

  Returns:
    A tuple (feat, signal, filename): the array returned by
    python_speech_features.mfcc, the normalized mono excerpt it was
    computed from, and the path that was passed in.
  """
  SOURCE_LENGTH = 3000          # excerpt length in samples
  WINDOW_LENGTH = 0.025         # analysis window, seconds
  WINDOW_STEP = 0.01            # step between windows, seconds
  CEPSTRUM_NUMBER = 13
  FILTER_NUMBER = 26
  FFT_SIZE = 512                # lower bound; enlarged below when needed
  PREEMPH_COEFFICIENT = 0.97
  CEPSTRUM_FILTER = 26          # passed to mfcc as `ceplifter`
  IS_APPEND_ENERGY = False

  sampling, data = read(filename)
  start = len(data) // 2
  excerpt = data[start : start + SOURCE_LENGTH]
  signal = _pcm_to_float(excerpt)
  if signal.ndim > 1:
    # Multi-channel files are read as (samples, channels); mix down to mono.
    signal = signal.mean(axis=1)

  # A 25 ms window holds more than 512 samples above ~20 kHz; grow the FFT so
  # that frames are not truncated. At 16 kHz this stays at FFT_SIZE.
  fft_size = _fit_fft_size(FFT_SIZE, WINDOW_LENGTH, sampling)

  feat = mfcc(signal,
              samplerate=sampling,
              winlen = WINDOW_LENGTH,
              winstep = WINDOW_STEP,
              numcep = CEPSTRUM_NUMBER,
              nfilt = FILTER_NUMBER,
              nfft = fft_size,
              preemph = PREEMPH_COEFFICIENT,
              ceplifter = CEPSTRUM_FILTER,
              appendEnergy = IS_APPEND_ENERGY)
  return feat, signal, filename

# MFCC processing for ./*.wav
# Each entry of `mfccs` is the (feat, signal, filename) tuple from get_mfcc.
mfccs = []

import glob

path = './*.wav'
# glob returns matches in arbitrary order; sort them so that mfccs[0] (the
# entry plotted further down) refers to the same file on every run.
files = sorted(glob.glob(path))

for f in files:
    mfccs.append(get_mfcc(f))

# 描画用
import matplotlib.pyplot as plt

In [0]:
# Inspect the audio source: plot the normalized excerpt (element 1 of the
# tuple returned by get_mfcc) for the first file.
# [TASK] Check that the selected excerpt does not consist of silence only,
# since that may affect the final accuracy
# [TASK] Work out how to select only the voiced part
plt.plot(mfccs[0][1])

In [0]:
# Inspect the MFCC: plot the feature array (element 0 of the tuple returned
# by get_mfcc) for the first file.
# [TASK] Check what the MFCC looks like for a silent section
plt.plot(mfccs[0][0])

In [0]:
# Read the tab-separated label table; the file has no header row, so the
# column names are supplied here.
import pandas as pd

LABEL_FILE = 'labels.tsv'
df = pd.read_table(
    LABEL_FILE,
    names=['filename', 'status'],
)

In [0]:
# 【TASK】t-SNE, PCA等を用いて次元圧縮
# 【TASK】どのような前処理を行うと分類し易くなるかを確認