# Basic Framework
First attempt to create a graph and a clustering of the words

In [1]:
import os
from os.path import isdir, join
from pathlib import Path
import pandas as pd
from tqdm import tqdm
# Math
import numpy as np
import scipy.stats
from scipy.fftpack import fft
from scipy import signal
from scipy.io import wavfile
import librosa
import librosa.display
from scipy import sparse, stats, spatial
import scipy.sparse.linalg

from sklearn.decomposition import PCA

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import IPython.display as ipd

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import pandas as pd



%matplotlib inline

## 1. Import the Data
----
Use `N` random samples for each word 

In [30]:
# Maximum number of recordings sampled per word folder.
N = 100
# Fixed seed so that Restart & Run All draws the same sample every time.
SEED = 42

train_audio_path = '../Project/data/train/audio'
dirs = sorted(f for f in os.listdir(train_audio_path) if isdir(join(train_audio_path, f)))

path = []
word = []
speaker = []
iteration = []

rng = np.random.RandomState(SEED)

for direct in dirs:
    # Folders whose name starts with '_' are skipped (presumably they do not hold word recordings).
    if direct.startswith('_'):
        continue

    # os.listdir returns files in arbitrary order: sort them so the seeded draw is reproducible.
    wav_files = sorted(f for f in os.listdir(join(train_audio_path, direct)) if f.endswith('.wav'))
    if not wav_files:
        continue

    # Random selection of at most N files per folder; sampling without replacement
    # would raise if the folder holds fewer than N recordings.
    n_selected = min(N, len(wav_files))
    wave_selected = list(rng.choice(wav_files, n_selected, replace=False))

    # Extraction of file information for the dataframe: the first '_'-separated
    # token of the file stem is stored as the speaker, the last one as the iteration.
    for wave in wave_selected:
        tokens = wave.split('.')[0].split('_')
        word.append(direct)
        speaker.append(tokens[0])
        iteration.append(tokens[-1])
        path.append(train_audio_path + '/' + direct + '/' + wave)


Initialize the dataframe of the original data

In [31]:
# Every column is labelled by a 3-tuple; the metadata columns live under the
# 'info' group and leave the last level empty.
info_columns = {
    ('info', 'word', ''): word,
    ('info', 'speaker', ''): speaker,
    ('info', 'iteration', ''): iteration,
    ('info', 'path', ''): path,
}
features_og = pd.DataFrame(info_columns)

# Columns that identify a recording. They are NOT set as the row index here:
# the frame keeps its default integer index.
index_og = [
    ('info', 'word', ''),
    ('info', 'speaker', ''),
    ('info', 'iteration', ''),
]
features_og.head()

Unnamed: 0_level_0,info,info,info,info
Unnamed: 0_level_1,iteration,path,speaker,word
,,,,
0.0,0.0,../Project/data/train/audio/bed/7fd25f7c_nohas...,7fd25f7c,bed
1.0,0.0,../Project/data/train/audio/bed/a1a59b72_nohas...,a1a59b72,bed
2.0,0.0,../Project/data/train/audio/bed/982babaf_nohas...,982babaf,bed
3.0,0.0,../Project/data/train/audio/bed/105a0eea_nohas...,105a0eea,bed
4.0,0.0,../Project/data/train/audio/bed/c6ca5d01_nohas...,c6ca5d01,bed


## 2. Features Extraction
----
### 2.1 MFCC
A classical but reliable set of features

In [32]:
# Number of MFCC coefficients extracted per frame.
N_MFCC = 20

def compute_mfcc(filepath, n_mfcc=N_MFCC):
    """Load an audio file and return its MFCC matrix.

    Parameters
    ----------
    filepath : str
        Path of the audio file to load.
    n_mfcc : int, optional
        Number of MFCC coefficients to compute (defaults to ``N_MFCC``).

    Returns
    -------
    The array returned by ``librosa.feature.mfcc``; per the librosa
    documentation it has one row per coefficient and one column per frame.
    """
    # sr=None is passed so librosa does not resample; mono=True requests a single channel.
    audio, sampling_rate = librosa.load(filepath, sr=None, mono=True)
    # Pass n_mfcc explicitly so the result follows N_MFCC instead of silently
    # relying on librosa's own default.
    return librosa.feature.mfcc(y=audio, sr=sampling_rate, n_mfcc=n_mfcc)

In [33]:
# Summary statistics computed along the time axis of every MFCC coefficient.
# The order of this list defines the order of the values in each feature row.
stat_funcs = [
    ('mean', np.mean),
    ('std', np.std),
    ('skew', scipy.stats.skew),
    ('kurtosis', scipy.stats.kurtosis),
    ('median', np.median),
]
stat_name = [name for name, _ in stat_funcs]
col_names = [('mfcc', stat, coef) for stat in stat_name for coef in range(N_MFCC)]

# MFCC FEATURES :
# Collect one flat row of statistics per recording and build the frame once,
# instead of writing cell by cell into an (object-dtype) empty frame.
paths = features_og[('info', 'path', '')]
rows = []
for filepath in tqdm(paths, total=len(paths), unit='waves'):
    mfcc = compute_mfcc(filepath)
    rows.append(np.concatenate([func(mfcc, axis=1) for _, func in stat_funcs]))

features_mfcc = pd.DataFrame(
    np.asarray(rows, dtype=float).reshape(-1, len(col_names)),
    columns=pd.MultiIndex.from_tuples(col_names),
    index=features_og.index,
)
# sorting the columns in order to improve index performances (see lexsort errors)
features_mfcc = features_mfcc.sort_index(axis=1)

# Drop any 'mfcc' columns left over from a previous run of this cell so that
# re-running it does not merge the features a second time.
not_mfcc = features_og.columns.get_level_values(0) != 'mfcc'
features_og = features_og.loc[:, not_mfcc].merge(features_mfcc, left_index=True, right_index=True)
features_og.head()

100%|██████████| 3000/3000 [03:06<00:00, 16.12waves/s]


Unnamed: 0_level_0,info,info,info,info,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc
Unnamed: 0_level_1,iteration,path,speaker,word,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,std,std,std,std,std,std,std
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,0,1,2,3,4,5,...,10,11,12,13,14,15,16,17,18,19
0,0,../Project/data/train/audio/bed/7fd25f7c_nohas...,7fd25f7c,bed,-0.939983,1.66579,-1.23714,-1.26997,-1.30172,1.04863,...,7.58005,7.58285,4.69827,8.3393,16.1328,4.59683,6.51323,5.25445,5.28337,6.03412
1,0,../Project/data/train/audio/bed/a1a59b72_nohas...,a1a59b72,bed,-1.3397,-0.435846,-1.00314,-1.13759,-1.106,-1.30456,...,9.46385,9.73336,9.92067,10.6509,8.20809,7.22551,10.314,7.68922,8.27049,6.19148
2,0,../Project/data/train/audio/bed/982babaf_nohas...,982babaf,bed,-0.455215,1.94067,4.07365,-1.03242,1.48366,-1.18843,...,9.534,5.88814,7.34011,3.54251,5.60392,8.07473,4.75943,6.45313,4.50032,4.02977
3,0,../Project/data/train/audio/bed/105a0eea_nohas...,105a0eea,bed,-1.31689,-1.29381,-1.18099,-0.994555,1.5672,-1.33937,...,12.061,9.84947,6.7949,13.0806,9.51939,5.24062,7.40497,4.49693,6.30983,10.277
4,0,../Project/data/train/audio/bed/c6ca5d01_nohas...,c6ca5d01,bed,-0.941066,0.802391,0.228132,-1.17898,-0.811291,0.104499,...,8.47487,6.11479,5.48952,3.74991,9.35402,7.23911,12.2956,3.9111,5.39558,7.0583
5,0,../Project/data/train/audio/bed/d1a4fb3f_nohas...,d1a4fb3f,bed,-0.409955,-1.30618,0.54494,-0.724295,-0.871406,-1.10088,...,13.6584,7.95274,10.3159,5.96423,7.04317,9.91171,6.41331,9.91748,7.22836,6.04972
6,1,../Project/data/train/audio/bed/d84829e0_nohas...,d84829e0,bed,-0.558749,-0.0460948,0.192449,-0.731776,-0.284511,-0.490825,...,8.67176,5.06176,5.76797,6.98137,9.53667,5.07878,5.5909,12.9445,8.61666,3.25395
7,0,../Project/data/train/audio/bed/3e3fa7fd_nohas...,3e3fa7fd,bed,-0.303959,-1.61208,-0.951477,-1.25223,-0.626595,-0.294564,...,7.33095,3.46869,6.708,8.26273,3.11376,5.82007,5.51999,7.72597,8.53526,5.99106
8,0,../Project/data/train/audio/bed/18c54a68_nohas...,18c54a68,bed,-0.586626,0.613767,1.60205,-0.826557,-0.621347,-0.540083,...,7.71385,4.29924,10.2393,3.59411,4.95462,9.13367,5.6534,9.29754,10.7797,3.79273
9,0,../Project/data/train/audio/bed/e2362167_nohas...,e2362167,bed,-0.885091,-0.92037,-0.70515,-0.787198,-0.939759,-0.495751,...,7.22032,3.6914,5.9333,5.67845,5.76905,8.64617,5.86275,6.87285,4.10098,3.65971


Save the extracted features to a pickle file so that they do not have to be recomputed on the training set

In [35]:
# to_pickle does not create missing folders: make sure the target directory exists first.
os.makedirs('./Features Data', exist_ok=True)
features_og.to_pickle('./Features Data/trainingFeatures.pickle')

## 3. Analysis

Importing the pickle containing the previously saved features

In [36]:
# Reload the feature table from disk. Unpickling executes code stored in the
# file, so only load pickles that come from a trusted source.
features_pickle_path = './Features Data/trainingFeatures.pickle'
features_og = pd.read_pickle(features_pickle_path)

In [None]:
# Keep only the numeric MFCC statistics: the 'info' columns hold strings
# (word, speaker, path, ...) and cannot enter a distance computation.
# astype(float) also returns a new frame, so features_og is left untouched.
features = features_og['mfcc'].astype(float)

# Standardise every feature column (zero mean, unit standard deviation).
features = (features - features.mean(axis=0)) / features.std(axis=0)

# Dense matrix of pairwise cosine distances between recordings.
distances = spatial.distance.squareform(spatial.distance.pdist(features.values, 'cosine'))

n = distances.shape[0]
# Gaussian kernel whose width is the mean pairwise distance.
kernel_width = distances.mean()
weights = np.exp(-np.square(distances) / kernel_width**2)
# No self-loops in the graph.
np.fill_diagonal(weights, 0)

In [None]:
fix, axes = plt.subplots(2, 2, figsize=(17, 8))

def plot(weights, axes):
    """Show a weight matrix on a pair of axes.

    The first axis receives the sparsity pattern of ``weights``, the second
    one a 50-bin histogram of its strictly positive entries.
    """
    pattern_ax, hist_ax = axes[0], axes[1]
    pattern_ax.spy(weights)
    positive_weights = weights[weights > 0].ravel()
    hist_ax.hist(positive_weights, bins=50)

# Left column of the figure: the weights as they are before sparsification.
plot(weights, axes[:, 0])

NEIGHBORS = 30

# For every node, keep only its NEIGHBORS strongest edges and zero the others,
# in the row and in the column so the matrix stays symmetric. The matrix is
# edited in place, so each node sees the zeros written for the previous ones.
n_nodes = weights.shape[0]
for node in range(n_nodes):
    weakest = np.argsort(weights[node, :])[:-NEIGHBORS]
    weights[node, weakest] = 0
    weights[weakest, node] = 0

# Right column of the figure: the sparsified weights.
plot(weights, axes[:, 1])
   

In [None]:
# Weighted degree of every node.
degrees = np.sum(weights, axis=0)

# D^(-1/2) stored as a vector. A node with zero degree would give
# 0 * inf = NaN in the normalisation, so it gets a zero scaling factor
# instead: its row and column of the Laplacian then stay zero.
inv_sqrt_degrees = np.zeros(degrees.shape, dtype=float)
connected = degrees > 0
inv_sqrt_degrees[connected] = degrees[connected] ** -0.5

# Normalised Laplacian D^(-1/2) (D - W) D^(-1/2); scaling rows and columns by
# broadcasting avoids building and multiplying dense diagonal matrices.
laplacian = (np.diag(degrees) - weights) * inv_sqrt_degrees[:, np.newaxis] * inv_sqrt_degrees[np.newaxis, :]
laplacian = sparse.csr_matrix(laplacian)
plt.matshow(laplacian.toarray());

In [None]:
# Ten eigenpairs of the Laplacian, requested with which='SM'
# (eigenvalues of smallest magnitude).
eigenvalues, eigenvectors = sparse.linalg.eigsh(laplacian, which='SM', k=10)

# Eigenvalue profile.
plt.plot(eigenvalues, '.-', markersize=15);

# Eigenvectors at positions 1 and 2 serve as 2-D coordinates.
x, y = eigenvectors[:, 1], eigenvectors[:, 2]

In [None]:
# 5 x 5 grid: the panel in row r, column c plots eigenvector r against
# eigenvector c (positions 1..5), coloured by the sign of the row eigenvector.
fix, axes = plt.subplots(5, 5, figsize=(17, 8))
for row, axes_row in enumerate(axes, start=1):
    x = eigenvectors[:, row]
    labels = np.sign(x)
    for col, ax in enumerate(axes_row, start=1):
        y = eigenvectors[:, col]
        ax.scatter(x, y, c=labels, cmap='RdBu', alpha=0.5)