In [63]:
import pandas as pd
import numpy as np

from sklearn import  neighbors, svm
from sklearn.linear_model import LogisticRegressionCV
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,\
    AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB 

from sklearn import preprocessing
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score,\
    mean_absolute_error, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, train_test_split,\
    learning_curve
from sklearn.feature_selection import RFE
from sklearn.neural_network import MLPClassifier
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import itertools
import pickle

# Notebook-wide settings.
# NOTE(review): none of these three names is referenced in the cells visible
# here - confirm they are still needed.
DIR_DATA = "data"  # presumably the raw-data directory name - TODO confirm
DIR_PROCESSED = "processed"  # presumably where derived data is stored - TODO confirm
CV_FOLDS = 5  # presumably the number of cross-validation folds - TODO confirm

In [1]:
# %load examples/python/utility.py
#!/usr/bin/env python
""" collections of utility functions """

import sys
import os
import csv
import numpy as np
import scipy.io.wavfile


class Instance(object):
    """
        Instance class represents set of raw data collected per data instance

        An instance directory holds "audio.wav" and "touch.csv" and is laid out
        as <surface>-<user>/<timestamp>[-<classlabel>].

        attribute audio: samples read from audio.wav (the sample rate is dropped)
        attribute touch: dictionary of floats built from the first row of touch.csv
        attribute info: dictionary with keys "surface", "user", "timestamp"
                        and "classlabel"
    """
    def __init__(self, dir):
        self.audio = self._load_audio(dir)
        self.touch = self._load_touch_data(dir)
        self.info = self._load_instance_info(dir)

    def _load_audio(self, dir):
        """ load audio data
            param dir: a path to an instance directory
            return: audio data
        """
        # wavfile.read returns (rate, samples); only the samples are kept.
        _rate, wav = scipy.io.wavfile.read(os.path.join(dir, "audio.wav"))
        return wav

    def _load_touch_data(self, dir):
        """ load touch data
            param dir: a path to an instance directory
            return : a dictionary contains touch data (first data row only)
            raises ValueError: if touch.csv has no data row, or a value is
                               not convertible to float
        """
        # The "U" mode flag was removed in Python 3.11; newline="" is what the
        # csv module asks for so that it can handle line endings itself.
        with open(os.path.join(dir, "touch.csv"), "r", newline="") as f:
            first_row = next(csv.DictReader(f), None)
        if first_row is None:
            # Previously this surfaced as an UnboundLocalError on `touch`.
            raise ValueError("touch.csv in %r contains no data rows" % (dir,))
        return {key: float(value) for key, value in first_row.items()}

    def _load_instance_info(self, dir):
        """ load instance info from a directory path
            param dir: a path to an instance directory
            return: a dictionary contains basic instance information
        """
        # abspath strips a trailing separator and resolves a bare relative
        # name, so basename/dirname always see the real directory components.
        instance_path = os.path.abspath(dir)
        info = {}
        user_dirnames = os.path.basename(os.path.dirname(instance_path)).split("-")
        info["surface"] = user_dirnames[0]
        info["user"] = user_dirnames[1]
        instance_dirnames = os.path.basename(instance_path).split("-")
        info["timestamp"] = instance_dirnames[0]
        # set None to classlabel if it's test data
        info["classlabel"] = instance_dirnames[1] if len(instance_dirnames) == 2 else None
        return info


def load_instances(dir):
    """ collect one Instance for every directory that holds an audio.wav
        param dir: a path to a data directory (i.e. task_data/train or task_data/test)
        return: a list of data instance objects, in os.walk order
    """
    data_root = os.path.join(dir)
    return [
        Instance(folder)
        for folder, _subdirs, names in os.walk(data_root)
        for name in names
        if name == "audio.wav"
    ]


def load_labels(instances):
    """ encode the class label of every instance as an integer
        param instances: a list of data instance objects
        return: integer numpy array of class labels (0=pad, 1=knuckle)
    """
    label_to_index = {"pad": 0, "knuckle": 1}
    encoded = []
    for sample in instances:
        # Any label outside the mapping (including None) raises KeyError.
        encoded.append(label_to_index[sample.info["classlabel"]])
    return np.array(encoded, dtype=int)


def load_timestamps(instances):
    """ collect the timestamp of every instance
        param instances: a list of data instance objects
        return: a list with each instance's info["timestamp"], in input order
    """
    collected = []
    for sample in instances:
        collected.append(sample.info["timestamp"])
    return collected


def convert_to_classlabels(y):
    """ translate integer labels back to their names
        param y: mapped class labels (0=pad, 1=knuckle)
        return: a list of class label strings
    """
    names = ["pad", "knuckle"]
    decoded = []
    # y is read through y[position] so containers are indexed exactly as before.
    for position in range(len(y)):
        decoded.append(names[y[position]])
    return decoded


def write_results(timestamps, classlabels, output):
    """ dump predictions as a two-column CSV file
        param timestamps: a list of timestamps (strings)
        param classlabels: a list of predicted class labels (strings)
        param output: path of the file to (over)write
        return : None
        raises Exception: when the two lists differ in length
    """
    n_timestamps, n_labels = len(timestamps), len(classlabels)
    if n_timestamps != n_labels:
        raise Exception("The number of timestamps and classlabels doesn't match.")
    rows = zip(timestamps, classlabels)
    with open(output, "w") as sink:
        sink.write("timestamp,label\n")
        sink.writelines(stamp + "," + label + "\n" for stamp, label in rows)

In [44]:
# %load examples/python/qeexo_ml_challenge_example.py
#!/usr/bin/env python
""" an example of performing a simple classification with Qeexo ML Challenge data set """

import sys
import os
import numpy as np

def generate_audio_features(instances):
    """ build the audio feature matrix
        param instances: a list of Instance class objects
        return: a DataFrame with one row per instance holding its audio
                samples cast to float
    """
    waveforms = []
    for sample in instances:
        waveforms.append(sample.audio.astype(float))
    return pd.DataFrame(np.array(waveforms))

def generate_features(instances):
    """ build the combined feature matrix
        param instances: a list of Instance class objects
        return: a DataFrame made of the generate_touch_features columns
                followed by the generate_audio_features columns
    """
    # TODO: load info as well later
    feature_blocks = [
        generate_touch_features(instances),
        generate_audio_features(instances),
    ]
    # axis=1 puts the two frames side by side, aligned on their row index.
    return pd.concat(feature_blocks, axis=1)

def generate_touch_features(instances):
    """ build the touch feature matrix
        param instances: a list of Instance class objects
        return: a DataFrame with one row per instance and one column per
                key of the instance's touch dictionary
    """
    # Read from the `instances` argument; the earlier version iterated the
    # notebook global `train_instances`, so any other input was ignored.
    touch_records = [instance.touch for instance in instances]
    return pd.DataFrame(touch_records)


def train_model(X, y):
    """ "train" a 1 nearest neighbor model
        param X: a feature matrix
        param y: a vector contains labels
        return : a dictionary holding the training data under "X" and "y"
    """
    # 1-NN has no fitting step: the model is just the stored training set.
    return dict(X=X, y=y)

                  
def test_model(X_test, model):
    """ test a model (1 nearest neighbor)
        param X_test: a feature matrix
        param model: trained 1 NN model (copy of training data)
        return : predicted labels for test data
    """
    # test a 1-NN model
    X_train = model["X"]
    y_train = model["y"]
    y_test = np.zeros(X_test.shape[0], dtype=int)
    for i in range(X_test.shape[0]):
        y_test[i] = y_train[np.argmin(np.sum((X_train-X_test[i,:])**2, axis=1)**0.5)]
    return y_test





In [49]:
# Preview the first rows of the combined training frame.
# NOTE(review): df_train is not created in any visible cell, and the recorded
# preview shows no 'label' column although predict_left later drops one from
# df_train - confirm where the frame is built and when the label is attached.
df_train.head()

Unnamed: 0,major,minor,orientation,pressure,x,y,0,1,2,3,...,246,247,248,249,250,251,252,253,254,255
0,5.0,5.0,-1.0,0.0,186.0,1054.0,-616.0,-653.0,-628.0,-647.0,...,-529.0,-534.0,-538.0,-535.0,-543.0,-539.0,-551.0,-582.0,-602.0,-611.0
1,5.0,4.0,-1.0,0.0,659.0,25.0,-1329.0,-1303.0,-1297.0,-1328.0,...,-1523.0,-1559.0,-1586.0,-1549.0,-1546.0,-1534.0,-1549.0,-1552.0,-1579.0,-1590.0
2,4.0,4.0,-1.0,0.0,550.0,1232.0,-1250.0,-1296.0,-1289.0,-1277.0,...,-827.0,-813.0,-844.0,-853.0,-850.0,-853.0,-859.0,-883.0,-880.0,-862.0
3,4.0,3.0,-1.0,0.0,185.0,1683.0,-1519.0,-1508.0,-1535.0,-1518.0,...,-1065.0,-1064.0,-1058.0,-1118.0,-1106.0,-1110.0,-1128.0,-1128.0,-1134.0,-1165.0
4,4.0,4.0,-1.0,0.0,655.0,1150.0,-2013.0,-1978.0,-1971.0,-1998.0,...,-1636.0,-1642.0,-1662.0,-1648.0,-1650.0,-1649.0,-1632.0,-1666.0,-1673.0,-1663.0


In [51]:
def predict_left(df, clf, test_size=0.2, random_state=None):
    """ fit a classifier on a random split of df and print its accuracy
        param df: a DataFrame whose 'label' column is the target and whose
                  remaining columns are the features
        param clf: an estimator exposing fit(X, y) and score(X, y)
        param test_size: held-out share, forwarded to train_test_split
        param random_state: forwarded to train_test_split; the default None
                  keeps the previous behaviour of drawing from the global
                  numpy random state
        return : None (the training and testing scores are printed)
    """
    # `axis` must be passed by keyword: pandas 2.0 no longer accepts it
    # positionally in DataFrame.drop.
    X = df.drop(['label'], axis=1)
    y = df['label']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    clf.fit(X_train, y_train)

    # clf.score predicts internally, so the separate predict calls whose
    # results were never used have been removed.
    print('Training Score: {:.3f}'.format(clf.score(X_train, y_train)))
    print('Testing Score: {:.3f}'.format(clf.score(X_test, y_test)))

    print()

In [55]:
# Audio-only experiment: raw audio samples as features, class index as label.
# NOTE(review): train_instances and y_train are not defined in any visible
# cell - confirm the loading cell that creates them.
df_audio = generate_audio_features(train_instances)
df_audio['label'] = y_train

# Trailing comments give training / testing accuracy from the output recorded
# below this cell.
classifiers = [RandomForestClassifier(n_jobs=-1), # 0.995 / 0.885 - large train/test gap
               RandomForestClassifier(criterion='entropy', n_jobs=-1), # 0.995 / 0.885
               LogisticRegressionCV(n_jobs=-1), # 0.615 / 0.613 - lowest here; unless feature engineering helps
               AdaBoostClassifier(), # 0.811 / 0.793
               GradientBoostingClassifier(), # 0.862 / 0.842 - small train/test gap
               neighbors.KNeighborsClassifier(n_jobs=-1) # 0.924 / 0.888 - best testing score in this cell
#                MultinomialNB(),
#                svm.SVC()
              ] 

# Seeded once before the loop: predict_left passes no random_state, so the
# splits draw from numpy's global state and successive classifiers do not
# necessarily see the same split.
np.random.seed(0)

for i, clf in enumerate(classifiers):
    print('Classifier ', i)
    
    predict_left(df_audio, clf, test_size=0.2)

Classifier  0
Training Score: 0.995
Testing Score: 0.885

Classifier  1
Training Score: 0.995
Testing Score: 0.885

Classifier  2
Training Score: 0.615
Testing Score: 0.613

Classifier  3
Training Score: 0.811
Testing Score: 0.793

Classifier  4
Training Score: 0.862
Testing Score: 0.842

Classifier  5
Training Score: 0.924
Testing Score: 0.888



In [54]:
# Touch-only experiment: touch.csv values as features, class index as label.
# NOTE(review): train_instances and y_train are not defined in any visible
# cell - confirm the loading cell that creates them.
df_touch = generate_touch_features(train_instances)
df_touch['label'] = y_train

# Trailing comments give training / testing accuracy from the output recorded
# below this cell.
classifiers = [RandomForestClassifier(n_jobs=-1), # 0.986 / 0.743 - large train/test gap
               RandomForestClassifier(criterion='entropy', n_jobs=-1), # 0.984 / 0.737 - large train/test gap
               LogisticRegressionCV(n_jobs=-1), # 0.790 / 0.795
               AdaBoostClassifier(), # 0.793 / 0.799 - best testing score in this cell
               GradientBoostingClassifier(), # 0.799 / 0.796
               neighbors.KNeighborsClassifier(n_jobs=-1) # 0.705 / 0.536 - lowest testing score in this cell
#                MultinomialNB(),
#                svm.SVC()
              ] 

# Seeded once before the loop: predict_left passes no random_state, so the
# splits draw from numpy's global state and successive classifiers do not
# necessarily see the same split.
np.random.seed(0)

for i, clf in enumerate(classifiers):
    print('Classifier ', i)
    
    predict_left(df_touch, clf, test_size=0.2)

Classifier  0
Training Score: 0.986
Testing Score: 0.743

Classifier  1
Training Score: 0.984
Testing Score: 0.737

Classifier  2
Training Score: 0.790
Testing Score: 0.795

Classifier  3
Training Score: 0.793
Testing Score: 0.799

Classifier  4
Training Score: 0.799
Testing Score: 0.796

Classifier  5
Training Score: 0.705
Testing Score: 0.536



In [62]:
# Combined experiment on df_train.
# NOTE(review): df_train is not built in any visible cell - confirm its origin.
# Trailing comments give training / testing accuracy from the output recorded
# below this cell.
classifiers = [RandomForestClassifier(n_jobs=-1), # 0.998 / 0.926
               RandomForestClassifier(criterion='entropy', n_jobs=-1), # 0.997 / 0.930 - best testing score in this cell
               LogisticRegressionCV(n_jobs=-1), # 0.614 / 0.596 - lowest here; unless feature engineering helps
               AdaBoostClassifier(), # 0.898 / 0.888
               GradientBoostingClassifier(), # 0.923 / 0.912 - small train/test gap
               neighbors.KNeighborsClassifier(n_jobs=-1), # 0.923 / 0.891
               # NOTE(review): the recorded output stops at classifier 5, so
               # there is no stored result for this entry - re-run to confirm.
               MLPClassifier(hidden_layer_sizes=(200,50,30),max_iter=500)
#                MultinomialNB(),
#                svm.SVC()
              ] 

# Seeded once before the loop: predict_left passes no random_state, so the
# splits draw from numpy's global state and successive classifiers do not
# necessarily see the same split.
np.random.seed(0)

for i, clf in enumerate(classifiers):
    print('Classifier ', i)
    
    predict_left(df_train, clf, test_size=0.2)

Classifier  0
Training Score: 0.998
Testing Score: 0.926

Classifier  1
Training Score: 0.997
Testing Score: 0.930

Classifier  2
Training Score: 0.614
Testing Score: 0.596

Classifier  3
Training Score: 0.898
Testing Score: 0.888

Classifier  4
Training Score: 0.923
Testing Score: 0.912

Classifier  5
Training Score: 0.923
Testing Score: 0.891



In [68]:
# Stand-alone MLP run on df_train with three hidden layers (200, 100, 20).
# The recorded output below shows 0.904 training / 0.899 testing accuracy.
# NOTE(review): no seed is set in this cell, so the split and the network
# initialisation depend on whatever random state earlier cells left behind.
clf = MLPClassifier(hidden_layer_sizes=(200,100,20),max_iter=1000)
predict_left(df_train, clf, test_size=0.2)

Training Score: 0.904
Testing Score: 0.899

