# Basic Framework, adapted for semi-supervised learning
This framework borrows heavily from the basic framework. It can be used to analyze semi-supervised learning for the problem of speech recognition.

In [None]:
import os
from os.path import isdir, join
from pathlib import Path
import pandas as pd
from tqdm import tqdm
# Math
import numpy as np
import scipy.stats
from scipy.fftpack import fft
from scipy import signal
from scipy.io import wavfile
import librosa
import librosa.display
from scipy import sparse, stats, spatial
import scipy.sparse.linalg

from sklearn.decomposition import PCA

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import IPython.display as ipd

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import pandas as pd
import pygsp

%matplotlib inline

plt.rcParams['figure.figsize'] = (17, 5)
pygsp.plotting.BACKEND = 'matplotlib'

## 1. Import the Data
----
Use `N` random samples for each word 

In [None]:
# Number of recordings drawn at random from each word folder.
N = 100

train_audio_path = '../Data/train/audio'
# Sub-folders of the training directory, in alphabetical order.
dirs = sorted(
    entry for entry in os.listdir(train_audio_path)
    if isdir(join(train_audio_path, entry))
)

# Parallel lists: one entry per selected recording.
path, word, speaker, iteration = [], [], [], []

for direct in dirs:
    # Folders whose name starts with an underscore are skipped.
    if direct.startswith('_'):
        continue

    # Draw N distinct .wav files from this folder (no replacement).
    wav_files = [name for name in os.listdir(join(train_audio_path, direct))
                 if name.endswith('.wav')]
    wave_selected = list(np.random.choice(wav_files, N, replace=False))

    # The part of the file name before the first '.' is split on '_':
    # its first token is stored as the speaker, its last token as the iteration.
    stems = [name.split('.')[0] for name in wave_selected]
    word.extend(list(np.repeat(direct, N, axis=0)))
    speaker.extend(stem.split('_')[0] for stem in stems)
    iteration.extend(stem.split('_')[-1] for stem in stems)
    path.extend(train_audio_path + '/' + direct + '/' + name for name in wave_selected)


Initialize the dataframe of the original data

In [None]:
# Column keys are 3-level tuples ('info', field, '') so that they share the
# same depth as the ('mfcc', statistic, coefficient) feature columns.
info_fields = {'word': word, 'speaker': speaker, 'iteration': iteration, 'path': path}
features_og = pd.DataFrame({('info', field, ''): values
                            for field, values in info_fields.items()})
# Candidate index columns; only used by the disabled set_index call below.
index_og = [('info', field, '') for field in ('word', 'speaker', 'iteration')]
#features_og.set_index(index_og,inplace=True)
features_og.head()

## 2. Features Extraction
----
### 2.1 MFCC
A classical but reliable set of features

In [None]:
# Number of MFCC coefficients extracted per recording frame.
N_MFCC = 20


def compute_mfcc(filepath, n_mfcc=None):
    """
    Load an audio file and compute its MFCC matrix.

    Parameters:
        filepath: path of the audio file, handed to librosa.load
        n_mfcc: number of coefficients to compute; defaults to N_MFCC

    Returns:
        the array returned by librosa.feature.mfcc
    """
    # Resolve the default at call time so that a later change of N_MFCC
    # is picked up without redefining the function.
    if n_mfcc is None:
        n_mfcc = N_MFCC
    # sr=None keeps the sampling rate stored in the file instead of resampling.
    audio, sampling_rate = librosa.load(filepath, sr=None, mono=True)
    # Pass the coefficient count explicitly instead of relying on librosa's
    # default matching N_MFCC.
    return librosa.feature.mfcc(y=audio, sr=sampling_rate, n_mfcc=n_mfcc)

In [None]:
# Summary statistics computed along axis 1 of each MFCC matrix.
stat_name = ['mean', 'std', 'skew', 'kurtosis', 'median']
# Reducers, listed in the same order as stat_name.
stat_funcs = [np.mean, np.std, scipy.stats.skew, scipy.stats.kurtosis, np.median]

# One column per (statistic, coefficient) pair.
col_names = [('mfcc', stat, coef) for stat in stat_name for coef in range(N_MFCC)]
features_mfcc = pd.DataFrame(columns=pd.MultiIndex.from_tuples(col_names),
                             index=features_og.index)
# sorting the columns in order to improve index performances (see lexsort errors)
features_mfcc.sort_index(axis=1, inplace=True, sort_remaining=True)

# MFCC FEATURES :
n_waves = len(features_og)
for w in tqdm(range(n_waves), total=n_waves, unit='waves'):
    mfcc = compute_mfcc(features_og[('info', 'path')].iloc[w])
    # Each reducer fills all coefficient columns of its statistic for row w.
    for stat, reduce_fn in zip(stat_name, stat_funcs):
        features_mfcc.loc[w, ('mfcc', stat)] = reduce_fn(mfcc, axis=1)

# Attach the feature columns to the info columns, matching rows by index.
features_og = features_og.merge(features_mfcc, left_index=True, right_index=True)
features_og.head()

Save the dataset features to a pickle file to avoid redoing the computation on the training set.

In [None]:
# Create the output folder first: writing the pickle fails if it is missing.
os.makedirs('./Features Data', exist_ok=True)
features_og.to_pickle('./Features Data/trainingFeatures.pickle')

## 3. Set up graph using the features

In this section we take the features and build a weight matrix from them using the cosine distance (for now). We sparsify the weight matrix using the nearest-neighbour method and finally make sure that the weight matrix is symmetric.

In [None]:
# Importing the pickle containing the previously saved features
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that were produced by this notebook.
features_og = pd.read_pickle('./Features Data/trainingFeatures.pickle')

In [None]:
# Normalize Features: subtract the column mean, divide by the column std
features = features_og['mfcc']
features -= features.mean(axis=0)
features /= features.std(axis=0)

# Pairwise cosine distances, expanded from condensed form to a square matrix
condensed = spatial.distance.pdist(features, 'cosine')
distances = spatial.distance.squareform(condensed)

n = distances.shape[0]
# Gaussian kernel; its width is the mean of the whole distance matrix
kernel_width = distances.mean()
weights = np.exp(-np.square(distances) / kernel_width ** 2)
# Remove self-loops
np.fill_diagonal(weights, 0)

# Show the weight matrix
plt.matshow(weights)

In [None]:
# Sparsify using nearest neighbours
fix, axes = plt.subplots(2, 2, figsize=(17, 8))


def plot(weights, axes):
    """
    Draw the sparsity pattern of a weight matrix and the histogram of its
    strictly positive entries.

    Parameters:
        weights: the weight matrix to display
        axes: two axes; axes[0] receives the spy plot, axes[1] the histogram
    """
    spy_ax, hist_ax = axes[0], axes[1]
    spy_ax.spy(weights)
    positive = weights[weights > 0]
    hist_ax.hist(positive.reshape(-1), bins=50)


# Left column of the figure: the matrix before sparsification
plot(weights, axes[:, 0])

NEIGHBORS = 200

# For every node, clear all but its NEIGHBORS largest weights. The mirrored
# entries in the node's column are cleared too, so a symmetric matrix stays
# symmetric.
for node in range(weights.shape[0]):
    weakest = weights[node, :].argsort()[:-NEIGHBORS]
    weights[node, weakest] = 0
    weights[weakest, node] = 0

# Right column of the figure: the matrix after sparsification
plot(weights, axes[:, 1])

## Semi-supervised clustering

In this section we perform semi-supervised clustering on the training set, i.e. we assume that we only know a percentage of the labels of the points and infer the other labels by optimizing a certain cost function. From here on we are working with the PyGSP package.

In [None]:
# Initialize Graph using the weights matrix
# (the sparsified weight matrix is used as the graph's adjacency matrix)
G = pygsp.graphs.Graph(weights)

# Compute the normalized Graph Laplacian corresponding to the above constructed graph
# (read later as G.L when solving for the labels)
G.compute_laplacian('normalized')

# Compute the Fourier basis of the Laplacian
# (read later as G.U to get 2D coordinates; recompute=True asks PyGSP to
# recompute it even if one already exists)
G.compute_fourier_basis(recompute=True)

We can use any two eigenvectors to plot the graph; I chose eigenvectors 1 and 2 (not sure if this is the best choice).

In [None]:
# Use Laplacian Eigenmaps to plot the graph in 2D
# Columns 1 and 2 of the Fourier basis G.U are used as the node coordinates.
G.set_coordinates(G.U[:,(1,2)])
G.plot()

Now we create a vector containing the label of each class that we have to classify (leaving out silence for now). We then define the label vector as a graph signal and plot it on the graph.

In [None]:
# Build Label vector
# Class names: a word at position k in this list gets the label k + 1.
class_names = ['unknown', 'yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
# Every node starts at 1, so no node ends up with label 0 and words that are
# not listed keep the same label as 'unknown'.
label_vec = np.ones(G.N)
for class_id, class_name in enumerate(class_names):
    label_vec += (features_og['info', 'word'] == class_name) * class_id

# Display the labels as a signal on the graph
G.plot_signal(label_vec, vertex_size=20)

The next two functions are used to randomly pick some percentage of samples from the label vector (`prepare_observations`) and to solve for the estimated vector (`solve`). The problem is set up in exactly the same way as in assignment 4 (except that we have more than two kinds of labels).

In [None]:
def prepare_observations(p):
    """
    Prepare observations, where p is the percentage of values to keep.

    Parameters:
        p: threshold in [0, 1]; a label is kept when its uniform draw is below p

    Returns:
        an array with the values of label_vec where the label is kept and 0
        where it is dropped
    """
    # Fixed seed: every call uses the same uniform draw, so the set of kept
    # labels only depends on p.
    rs = np.random.RandomState(42)
    keep = rs.uniform(size=G.N) < p
    # Element-wise masking gives the same values as multiplying by diag(keep),
    # without building a dense N x N matrix.
    return np.asarray(label_vec) * keep

def solve(y, alpha):
    """
    Estimate the full label vector from partial observations.

    Solves the linear system (M + alpha * L) x = y, where M is the diagonal
    indicator of the non-zero entries of y and L is the Laplacian G.L.

    Parameters:
        y: the observations; entries equal to 0 are treated as unobserved
        alpha: the balance between fidelity and smoothness prior.

    Returns:
        x_pred: the predicted class (x_star rounded to the nearest integer)
        x_star: the solution of the optimization problem
    """
    observed = np.diag(y != 0)
    system = observed + alpha * G.L
    x_star = np.linalg.solve(system, y)
    return np.round(x_star), x_star

# Play with the percentage of observed values.
# y holds the label values for the kept nodes and 0 for the dropped ones.
y = prepare_observations(p=0.5)

In [None]:
# Plot the label_vector with only 50% of its labels
# (nodes whose value is 0 are the ones whose label was dropped)
G.plot_signal(y, vertex_size=20)

Finally we compute and plot the error rate of the estimation for different compression rates.

In [None]:
# Fractions of labels that are kept as observations
p = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

err = np.zeros(len(p))
for i, keep_fraction in enumerate(p):
    y = prepare_observations(keep_fraction)
    x_pred, x_star = solve(y, alpha=1e-5)
    # Share of nodes (observed ones included) whose rounded estimate
    # differs from the true label
    err[i] = np.count_nonzero(label_vec - x_pred) / G.N

plt.plot(p, err)
plt.title('Error vs. Compression Rate')
plt.xlabel('Compression rate p')
plt.ylabel('Error Rate')
plt.grid()

## Conclusion: 
Using exactly the same model formulation as in assignment 4, with only one vector for the clustering, does not seem to work well. We have to adapt the model to our more complex case.