In [16]:
import os

import numpy as np
import pandas as pd
import scipy as sp
import sklearn
from sklearn import cross_validation
from sklearn import preprocessing
from sklearn import svm
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import zero_one_loss
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

In [17]:
# Setting up data input types. 

#Train 

X_train = np.loadtxt('/home/aditya/Documents/UIUC/spring_2016/STAT427/project/uci_har_dataset/train/X_train.txt')
y_train = np.loadtxt('/home/aditya/Documents/UIUC/spring_2016/STAT427/project/uci_har_dataset/train/y_train.txt', dtype=np.int)
persons_train = np.loadtxt('/home/aditya/Documents/UIUC/spring_2016/STAT427/project/uci_har_dataset/train/subject_train.txt', dtype=np.int)

#Test 
X_test = np.loadtxt('/home/aditya/Documents/UIUC/spring_2016/STAT427/project/uci_har_dataset/test/X_test.txt')
y_test = np.loadtxt('/home/aditya/Documents/UIUC/spring_2016/STAT427/project/uci_har_dataset/test/y_test.txt', dtype=np.int)
persons_test = np.loadtxt('/home/aditya/Documents/UIUC/spring_2016/STAT427/project/uci_har_dataset/test/subject_test.txt', dtype=np.int)

# X and Y for both axises. 
X_all = np.concatenate([X_train, X_test])
y_all = np.concatenate([y_train, y_test])
    
feature_names = [x.split(' ')[1] for x in open('/home/aditya/Documents/UIUC/spring_2016/STAT427/project/uci_har_dataset/features.txt').read().split('\n') if len(x) > 0]


In [18]:
def SVM_feature_extraction():
    clf = svm.LinearSVC
    clf.fit(X_train,y_train)
    X_train_t = clf.decision_function(X_train)
    X_test_t = clf.decision_function(X_test)
    
    return (X_train_t, X_test_t)


In [19]:
def run_clfs_on_data(classifiers, Xs, ys, add_last_action = False):
    results = {}
    for name, clf in classifiers.iteritems():
        print "running %s" % name
        clf_results = fit_clf_kfold(clf['clf'], Xs, ys, flatten=not clf['structured'], add_last_action=add_last_action)
        # with feature selection:
        clf_results = fit_clf_kfold(clf['clf'], [X[:,select_features] for X in X_pers_all], y_pers_all,flatten=not clf['structured'])
        results[name] = clf_results
    return results


In [23]:
def plot_most_important_features(clf, label_names, feature_names, n=10, best=True, absolut=True):
    if absolut:
        ranked_features = np.argsort(np.abs(clf.coef_), axis=None)
    else:
        ranked_features = np.argsort(clf.coef_, axis=None)
        
    if best:
        ranked_features = ranked_features[::-1] #inverse to get the best first
        
    for i, fweights_idx in enumerate(ranked_features[:n]):
            label_idx,feature_idx = np.unravel_index(fweights_idx, clf.coef_.shape)
            print "%d. f: %s\t\t c: %s\t value: %f" % (i, feature_names[feature_idx], label_names[label_idx], clf.coef_[(label_idx,feature_idx)])


In [25]:
def unflatten_per_person(X_all,y_all,persons_all):
    """
        X: n_samples, n_features
            The full feature matrix.
        y: label for each row in X
        person: person label for each row in X
        
        returns: (X_person, y_person) 
            X_person: n_persons array of X and y that apply to this person.
    """
    Xtotal = []
    y_total = []
    
    Xperson = []
    y_person = []
    last_person = persons_all[0]
    for row,y,person in zip(X_all,y_all,persons_all):
        if person != last_person:
            Xtotal.append(Xperson)
            y_total.append(y_person)
            Xperson = []
            y_person = []
            
        Xperson.append(row)
        y_person.append(y)
        
        last_person = person
        
    Xtotal.append(Xperson)
    y_total.append(y_person)
    
    return ([np.array(x) for x in Xtotal], [np.array(y) for y in y_total])

In [28]:
def main(X_train,y_train,persons_train, X_test, y_test, persons_test, X_all, y_all): 
    print "Data:"
    X_train_pers, y_train_pers = unflatten_per_person(X_train, y_train, persons_train)
    X_test_pers, y_test_pers = unflatten_per_person(X_test, y_test, persons_test)
    X_pers_all = []
    X_pers_all.extend(X_train_pers)
    X_pers_all.extend(X_test_pers)
    y_pers_all = []
    y_pers_all.extend(y_train_pers)
    y_pers_all.extend(y_test_pers)
    
    
        print "training classifier"
    
    ensemble_classifiers = {
                                "linear Support Vector Classifier": {'clf': LinearSVC(), 'structured': False},
                                "Logistic Regression": {'clf': LogisticRegression(), 'structured': False},
                                "SGDClassifier":{'clf': SGDClassifier(),'structured':False},
                                }
    
    crf_ensemble = LinearCRFEnsemble(ensemble_classifiers, addone=True, regularization=None, lmbd=0.01, sigma=100, transition_weighting=True)
    
    classifiers = {
                   "SGDClassifier":{'clf': SGDClassifier(),'structured':False},
                   "Logistic Regression": {'clf': LogisticRegression(), 'structured': False},
                   "linear Support Vector Classifier": {'clf': LinearSVC(), 'structured': False},
                   "Gaussian Naive Bayes": {'clf': GaussianNB(), 'structured': False},
                   #"SVMHMM": {'clf': SVMHMMCRF(C=1), 'structured': True},
                   "KNN (weights: uniform, neighbors=5)": {'clf': KNeighborsClassifier(), 'structured': False},
                   "Decision Tree": {'clf': DecisionTreeClassifier(), 'structured': False},
                   "RandomForest": {'clf': RandomForestClassifier(), 'structured': False},
                   "CRF": {'clf': LinearCRF(feature_names=feature_names, label_names=labels, addone=True, regularization="l2", lmbd=0.01, sigma=100, transition_weighting=False),
                            'structured': True},
                   }
    
    results = run_clfs_on_data(classifiers, X_pers_all, y_pers_all)
    
    results_last_action = run_clfs_on_data(classifiers, X_pers_all, y_pers_all, add_last_action=True)
    
    for clf_name in results:
        clf_results = results[clf_name]
        accuracies = np.array([accuracy_score(gold, predict) for gold, predict in clf_results])
        print accuracies
        print "%s accuracy: %f +- %f" % (clf_name, accuracies.mean(), accuracies.std())
        smoothness_predict = np.array([label_smoothness(predict) for gold, predict in clf_results])
        print "%s smoothness: %f +- %f" % (clf_name, smoothness_predict.mean(), smoothness_predict.std())
        smoothness_gold = np.array([label_smoothness(gold) for gold, predict in clf_results])
        print "smoothess(gold): %f +- %f" % (smoothness_gold.mean(), smoothness_gold.std())
        
        y_all_gold = np.concatenate(zip(*clf_results)[0])
        y_all_predict = np.concatenate(zip(*clf_results)[1])
        
        print classification_report(y_all_gold, y_all_predict, target_names = labels)
        print confusion_matrix_report(y_all_gold, y_all_predict, labels)
        print confusion_matrix(y_all_gold, y_all_predict)
        
        
    
    crf_classifiers =  {
                        "CRF": {'clf': LinearCRF(feature_names=feature_names, label_names=labels, addone=True, regularization="l2", lmbd=0.01, sigma=100, transition_weighting=False),
                            'structured': True},
                        "CRF transition weights": {'clf': LinearCRF(feature_names=feature_names, label_names=labels, addone=True, regularization="l2", lmbd=0.01, sigma=100, transition_weighting=True),
                            'structured': True},
                        }
    
    crf_unregularized_classifiers =  {
                        "CRF": {'clf': LinearCRF(feature_names=feature_names, label_names=labels, addone=True, regularization=None, lmbd=0.01, sigma=10, transition_weighting=False),
                            'structured': True},
                        "CRF transition weights": {'clf': LinearCRF(feature_names=feature_names, label_names=labels, addone=True, regularization=None, lmbd=0.01, sigma=10, transition_weighting=True),
                            'structured': True},
                        }
    
    crf_classifiers_l2_best = {
                   "CRF (sigma=1)": {'clf': LinearCRF(feature_names=feature_names, label_names=labels, addone=True, regularization="l2", lmbd=0.01, sigma=1, transition_weighting=False),
                            'structured': True},
                   "CRF (sigma=10)": {'clf': LinearCRF(feature_names=feature_names, label_names=labels, addone=True, regularization="l2", lmbd=0.01, sigma=10, transition_weighting=False),
                            'structured': True},
                    "CRF (sigma=100)": {'clf': LinearCRF(feature_names=feature_names, label_names=labels, addone=True, regularization="l2", lmbd=0.01, sigma=100, transition_weighting=False),
                            'structured': True},
                    "CRF (sigma=1000)": {'clf': LinearCRF(feature_names=feature_names, label_names=labels, addone=True, regularization="l2", lmbd=0.01, sigma=1000, transition_weighting=False),
                            'structured': True},
                    "CRF (sigma=.1)": {'clf': LinearCRF(feature_names=feature_names, label_names=labels, addone=True, regularization="l2", lmbd=0.01, sigma=0.1, transition_weighting=False),
                            'structured': True},
                   }

    

([array([[ 0.28858451, -0.02029417, -0.13290514, ..., -0.84124676,
         0.17994061, -0.05862692],
       [ 0.27841883, -0.01641057, -0.12352019, ..., -0.8447876 ,
         0.18028889, -0.05431672],
       [ 0.27965306, -0.01946716, -0.11346169, ..., -0.84893347,
         0.18063731, -0.04911782],
       ..., 
       [ 0.18594631, -0.01180373, -0.03613971, ..., -0.69993026,
         0.2892392 ,  0.099656  ],
       [ 0.33584724,  0.01144556, -0.08003737, ..., -0.71841233,
         0.28283857,  0.08020741],
       [ 0.2860648 ,  0.01679982, -0.16313273, ..., -0.74048689,
         0.26956937,  0.07169672]]), array([[ 0.23601277, -0.02691314,  0.18639534, ..., -0.75078761,
         0.26836677,  0.0445423 ],
       [ 0.26731751, -0.03235518, -0.06292407, ..., -0.74284374,
         0.2689676 , -0.03227248],
       [ 0.28335241, -0.0107287 , -0.11733272, ..., -0.7274617 ,
         0.27574901, -0.04613594],
       ..., 
       [ 0.27764222, -0.04387914, -0.08554055, ..., -0.58565511,
     