# Vocalization Classification

Using the spectral features, I will train several binary classifiers on each pair of the vocalized words {BRICK, CLOCK, JUICE, PANTS, GLASS} and compare how well the different classifiers separate the two words, using leave-one-out cross-validation accuracy.

In [1]:
# Import Necessary Libraries
import itertools
import os

import numpy as np
import scipy.io
import scipy.stats as stats
from scipy.spatial import distance as Distance

import matplotlib
from matplotlib import *
from matplotlib import pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

from sklearn.decomposition import PCA

# pretty charting
import seaborn as sns
sns.set_palette('muted')
sns.set_style('darkgrid')

%matplotlib inline

In [11]:
#### Extract one subject's vocalization data for a target word into a dictionary
#### dictionary{'meta': {...}, channel: {timeZero, timeVocalization, powerMat}}
def extractSubjVocalizedData(subj, word):
    """Load every channel file for one subject and target word.

    Each file found in
    ``../../condensed_data_<subj>/summary_vocalization/<word>`` is read with
    ``scipy.io.loadmat`` and three fields are pulled out of its ``data``
    struct.

    Parameters
    ----------
    subj : str
        Subject identifier used to build the data directory name.
    word : str
        Target word; names the sub-directory that holds the channel files.

    Returns
    -------
    dict
        ``{'meta': {'subject': subj, 'word': word}}`` plus one entry per
        file, keyed by the file name up to its first underscore. Each entry
        holds ``'timeZero'``, ``'timeVocalization'`` and ``'powerMat'``.
    """
    word_dir = '../../condensed_data_' + subj + '/summary_vocalization/' + word

    # start with the meta data; channel entries are added below
    result = {'meta': {'subject': subj, 'word': word}}

    for fname in os.listdir(word_dir):  # one file per channel
        record = scipy.io.loadmat(word_dir + '/' + fname)['data']

        # the file name up to the first underscore identifies the channel
        channel_key = fname.split('_')[0]

        result[channel_key] = {
            # time point for probe word on
            'timeZero': record['timeZero'][0][0][0],
            # time point of vocalization
            'timeVocalization': record['vocalization'][0][0][0],
            # power matrix, stored under 'powerMatZ' in the file
            'powerMat': record['powerMatZ'][0][0],
        }
    return result

def extractSubjVocalizedDataCat(subj, word, window=25):
    """Load all channels for one subject/word and concatenate their power.

    Every file in
    ``../../condensed_data_<subj>/summary_vocalization/<word>`` is read with
    ``scipy.io.loadmat``. From each file's ``powerMatZ`` the samples
    ``[timeZero, timeZero + window)`` along the last axis are kept, and the
    per-channel blocks are joined along axis 1.

    Parameters
    ----------
    subj : str
        Subject identifier used to build the data directory name.
    word : str
        Target word; names the sub-directory that holds the channel files.
    window : int, optional
        Number of samples kept along the last axis, starting at ``timeZero``
        (default 25, the value that used to be hard-coded).

    Returns
    -------
    dict
        ``{'meta': {...}, 'data': {'timeZero', 'timeVocalization',
        'powerMat'}}``. ``timeZero`` and ``timeVocalization`` come from the
        first channel file in sorted order. The ``'data'`` entry is absent
        when the directory contains no files.
    """
    # file directory for a subj/word
    filedir = '../../condensed_data_' + subj + '/summary_vocalization/' + word

    # initialize data dictionary with meta data
    data_dict = {'meta': {'subject': subj, 'word': word}}

    # os.listdir returns entries in arbitrary order. Sorting keeps the
    # channel blocks along axis 1 in the same order for every word, so the
    # concatenated columns stay comparable between words.
    all_channel_mats = sorted(os.listdir(filedir))

    channel_windows = []
    for cdx, channel in enumerate(all_channel_mats):  # loop thru all channels
        ## 00: load in data
        data = scipy.io.loadmat(filedir + '/' + channel)['data']

        ## 01: get the time point for probeword on
        timeZero = data['timeZero'][0][0][0]

        ## 02: get the time point of vocalization
        vocalization = data['vocalization'][0][0][0]

        ## 03: Get Power Matrix, cut to the window that starts at timeZero
        # loadmat hands back timeZero as a one-element array that may carry a
        # narrow unsigned dtype; a plain int avoids wrap-around in
        # `start + window` and is always accepted as a slice bound.
        start = int(np.ravel(timeZero)[0])
        channel_windows.append(data['powerMatZ'][0][0][:, :, start:start + window])

        if cdx == 0:
            # timing information is recorded once, from the first channel
            data_dict['data'] = {'timeZero': timeZero,
                                 'timeVocalization': vocalization}

    if channel_windows:
        # one concatenation instead of re-copying the growing array per channel
        data_dict['data']['powerMat'] = np.concatenate(channel_windows, axis=1)

    return data_dict

In [13]:
######## Get list of target words (one sub-directory per word) ########
subj = 'NIH034' # change the directories if you want
filedir = '../../condensed_data_' + subj + '/summary_vocalization/'
# os.listdir returns entries in arbitrary order; sort so the word list (and
# everything derived from it below) is the same on every run
targetWords = sorted(os.listdir(filedir))

print(targetWords)

# spectralData maps each target word to its concatenated power matrix
spectralData = {}
for word in targetWords:
    wordDir = filedir + word  # not used by the loader, which rebuilds the path from subj/word

    ## 01: Extract the data of every channel for this subject and targetWord
    wordData = extractSubjVocalizedDataCat(subj, word)
    spectralData[word] = wordData['data']['powerMat']

['BRICK', 'CLOCK', 'GLASS', 'JUICE', 'PANTS']




In [14]:
# Sanity check: list the words that ended up as keys of spectralData
print(list(spectralData.keys()))

['JUICE', 'GLASS', 'BRICK', 'PANTS', 'CLOCK']


# Train Classifiers

In [15]:
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


In [16]:
# Seed NumPy's global random number generator first so reruns are repeatable
np.random.seed(12345678)

# Display names, in the same order as the estimators in `classifiers` below
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "Random Forest",
    "Linear Discriminant Analysis",
    "Quadratic Discriminant Analysis",
    "Logistic Regression",
]

# One estimator per entry in `names`; names[k] labels classifiers[k]
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    SVC(C=0.5, kernel="linear"),
    RandomForestClassifier(n_estimators=10, max_depth=5, max_features=1),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression(),
]

In [18]:
# Every unordered pair of target words, each pair as a two-element list.
# (The previous sum([...], []) wrapped the list only to flatten it again and
# fails on Python 3, where map() no longer returns a list.)
comb = list(map(list, itertools.combinations(targetWords, 2)))
print(comb)

[['BRICK', 'CLOCK'], ['BRICK', 'GLASS'], ['BRICK', 'JUICE'], ['BRICK', 'PANTS'], ['CLOCK', 'GLASS'], ['CLOCK', 'JUICE'], ['CLOCK', 'PANTS'], ['GLASS', 'JUICE'], ['GLASS', 'PANTS'], ['JUICE', 'PANTS']]


In [22]:
# accuracy[i, j] holds (mean, std) of the leave-one-out scores for
# word pair comb[i] and classifier classifiers[j]
accuracy = np.zeros((len(comb), len(classifiers), 2))
print(accuracy.shape)
for i, pair in enumerate(comb):
    # Create classes and feature vects: average each word's power over axis 2
    # (the axis that was windowed from timeZero -- presumably time; confirm)
    firstWordData = np.mean(spectralData[pair[0]], axis=2)
    secondWordData = np.mean(spectralData[pair[1]], axis=2)

    # rows of the first word are labelled 1, rows of the second word 0
    features = np.append(firstWordData, secondWordData, axis=0)
    y = np.concatenate((np.ones((firstWordData.shape[0],)),
                        np.zeros((secondWordData.shape[0],))))

    # The splitter depends only on the number of samples, so build it once
    # per pair instead of once per classifier.
    loo = LeaveOneOut(len(features))

    print("\n")
    print("Accuracy for pair: %s" % (pair,))
    for idx, cla in enumerate(classifiers):
        # cross_val_score refits the estimator on every fold, so the earlier
        # train_test_split + fit on a 60/40 split never influenced the scores
        # and has been dropped.
        # NOTE(review): without that extra fit the global RNG is consumed
        # differently, so Random Forest numbers may shift slightly from
        # previously recorded output.
        scores = cross_validation.cross_val_score(cla, features, y, cv=loo)
        accuracy[i, idx] = [scores.mean(), scores.std()]
        print("Accuracy of %s: %0.2f (+/- %0.2f)" % (names[idx], scores.mean(), scores.std() * 2))

(10, 6, 2)


('Accuracy for pair: ', ['BRICK', 'CLOCK'])
Accuracy of Nearest Neighbors: 0.55 (+/- 0.99)
Accuracy of Linear SVM: 0.58 (+/- 0.99)
Accuracy of Random Forest: 0.67 (+/- 0.94)
Accuracy of Linear Discriminant Analysis: 0.54 (+/- 1.00)




Accuracy of Quadratic Discriminant Analysis: 0.52 (+/- 1.00)
Accuracy of Logistic Regression: 0.62 (+/- 0.97)


('Accuracy for pair: ', ['BRICK', 'GLASS'])
Accuracy of Nearest Neighbors: 0.89 (+/- 0.63)
Accuracy of Linear SVM: 0.90 (+/- 0.60)
Accuracy of Random Forest: 0.88 (+/- 0.65)
Accuracy of Linear Discriminant Analysis: 0.67 (+/- 0.94)
Accuracy of Quadratic Discriminant Analysis: 0.51 (+/- 1.00)


KeyboardInterrupt: 