## Imports

In [335]:
import pandas as pd
import numpy as np
import os
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
import math
from statsmodels import robust
from scipy.stats import iqr

In [336]:
## Features

def mean(x, y, z):
    """Calculates mean"""
    mean_x = np.mean(x)
    mean_y = np.mean(y)
    mean_z = np.mean(z)
    return mean_x, mean_y, mean_z

def std_dev(x, y, z):
    """Calculates standard deviation"""
    std_x = np.std(x)
    std_y = np.std(y)
    std_z = np.std(z)
    return std_x, std_y, std_z  
    
def mad(x, y, z):
    """Calculates median absolute deviation"""
    mad_x = np.median(np.abs(x - np.median(x)))
    mad_y = np.median(np.abs(y - np.median(y)))
    mad_z = np.median(np.abs(z - np.median(z)))
    return mad_x, mad_y, mad_z

def minimum(x, y, z):
    """Calculates minimum"""
    return min(x), min(y), min(z)

def maximum(x, y, z):
    """Calculates maximum"""
    return max(x), max(y), max(z)

def energy_measure(x, y, z):
    """Calculates energy measures"""
    em_x = np.mean(np.square(x))
    em_y = np.mean(np.square(y))
    em_z = np.mean(np.square(z))
    return em_x, em_y, em_z

def inter_quartile_range(x, y, z):
    """Calculates inter-quartile range"""
    iqr_x = np.subtract(*np.percentile(x, [75, 25]))
    iqr_y = np.subtract(*np.percentile(y, [75, 25]))
    iqr_z = np.subtract(*np.percentile(z, [75, 25]))
    return iqr_x, iqr_y, iqr_z

def sma(x, y, z):
    """Calculates signal magnitude area"""   
    abs_x = np.absolute(x)
    abs_y = np.absolute(y)
    abs_z = np.absolute(z)
    return np.mean(abs_x), np.mean(abs_y), np.mean(abs_z)

In [337]:
def process_avg_acceleration(input_df, output_df):
    i = 0
    while i < len(input_df[0]):
        max_window = input_df[0][i] + 2000
        min_window = input_df[0][i]
        if i != 0:                                  #handling overlap
            min_window -= 1000
            max_window -= 1000
            i -= int(count/2)
        x = [input_df[1][i]]
        y = [input_df[2][i]]
        z = [input_df[3][i]]
        
        count = 0
        while min_window < max_window:
            i += 1
            count += 1
            try:
                x.append(input_df[1][i])
                y.append(input_df[2][i])
                z.append(input_df[3][i])
                min_window = input_df[0][i]
            except KeyError:
                break
        
        acc_columns = ['mean_acc_x', 'mean_acc_y', 'mean_acc_z', 
                       'std_acc_x', 'std_acc_y', 'std_acc_z', 
                       'mad_acc_x', 'mad_acc_y', 'mad_acc_z',
                       'min_acc_x', 'min_acc_y', 'min_acc_z', 
                       'max_acc_x', 'max_acc_y', 'max_acc_z', 
                       'em_acc_x', 'em_acc_y', 'em_acc_z', 
                       'iqr_acc_x', 'iqr_acc_y', 'iqr_acc_z', 
                       'sma_acc_x', 'sma_acc_y', 'sma_acc_z']

        acc_data = list(mean(x, y, z)+std_dev(x, y, z)+mad(x, y, z)+minimum(x, y, z)+maximum(x, y, z)+energy_measure(x, y, z)+inter_quartile_range(x, y, z)+sma(x, y, z))
        output_df = output_df.append(pd.Series(acc_data, index=acc_columns), ignore_index=True)

    return output_df

In [338]:
def process_avg_gyroscope(input_df, output_df):

    label = []
    
    i = 0
    while i < len(input_df[0]):
        max_window = input_df[0][i] + 2000
        min_window = input_df[0][i]
        if i != 0:                                                                 #handling overlap
            min_window -= 1000
            max_window -= 1000
            i -= int(count/2)
        x = [input_df[1][i]]
        y = [input_df[2][i]]
        z = [input_df[3][i]]
        count = 0
        while min_window < max_window:
            i += 1
            count += 1
            try:
                x.append(input_df[1][i])
                y.append(input_df[2][i])
                z.append(input_df[3][i])
                min_window = input_df[0][i]
            except KeyError:
                break
        
#         label.append(input_df[5][1])
        gy_columns = ['mean_gy_x', 'mean_gy_y', 'mean_gy_z', 
                       'std_gy_x', 'std_gy_y', 'std_gy_z', 
                       'mad_gy_x', 'mad_gy_y', 'mad_gy_z',
                       'min_gy_x', 'min_gy_y', 'min_gy_z', 
                       'max_gy_x', 'max_gy_y', 'max_gy_z', 
                       'em_gy_x', 'em_gy_y', 'em_gy_z', 
                       'iqr_gy_x', 'iqr_gy_y', 'iqr_gy_z', 
                       'sma_gy_x', 'sma_gy_y', 'sma_gy_z', 
                       'label']
        
        l = (input_df[5][1], )
        
        gy_data = list(mean(x, y, z)+std_dev(x, y, z)+mad(x, y, z)+minimum(x, y, z)+maximum(x, y, z)+energy_measure(x, y, z)+inter_quartile_range(x, y, z)+sma(x, y, z)+l) 
        output_df = output_df.append(pd.Series(gy_data, index=gy_columns), ignore_index=True)
        
    return output_df

In [339]:
def get_session_dirs(path):
    path = path
    dirs = []
    dir_list = [x[0] for x in os.walk(path)]
    last_dirs = ["1_android.sensor.accelerometer.data.csv.gz","4_android.sensor.gyroscope.data.csv.gz"]
    dirs = []
    for each_dir in dir_list[2:]:

        if "data" in each_dir:
            dirs.append(each_dir)

    dirs.sort(key=lambda x:x[-22:])
    return dirs

In [347]:
dirs = get_session_dirs("Sessions/")

acc_columns = ['mean_acc_x', 'mean_acc_y', 'mean_acc_z', 
               'std_acc_x', 'std_acc_y', 'std_acc_z', 
               'mad_acc_x', 'mad_acc_y', 'mad_acc_z',
               'min_acc_x', 'min_acc_y', 'min_acc_z', 
               'max_acc_x', 'max_acc_y', 'max_acc_z', 
               'em_acc_x', 'em_acc_y', 'em_acc_z', 
               'iqr_acc_x', 'iqr_acc_y', 'iqr_acc_z', 
               'sma_acc_x', 'sma_acc_y', 'sma_acc_z']

output_acc_df = pd.DataFrame(columns = acc_columns)

gy_columns = ['mean_gy_x', 'mean_gy_y', 'mean_gy_z', 
                       'std_gy_x', 'std_gy_y', 'std_gy_z', 
                       'mad_gy_x', 'mad_gy_y', 'mad_gy_z',
                       'min_gy_x', 'min_gy_y', 'min_gy_z', 
                       'max_gy_x', 'max_gy_y', 'max_gy_z', 
                       'em_gy_x', 'em_gy_y', 'em_gy_z', 
                       'iqr_gy_x', 'iqr_gy_y', 'iqr_gy_z', 
                       'sma_gy_x', 'sma_gy_y', 'sma_gy_z',
                       'label']

output_gy_df = pd.DataFrame(columns = gy_columns)

acc_gy_df = pd.DataFrame()

for i in dirs: 
    full_path_acc = os.path.normpath(i + "/" + "1_android.sensor.accelerometer.data.csv.gz")
    full_path_gy = os.path.normpath(i + "/" + "4_android.sensor.gyroscope.data.csv.gz")
    
    print("processing : " + full_path_acc)
    in_df_ac = pd.read_csv(full_path_acc, compression='gzip', sep=',', header=None)
    
    print("processing : " + full_path_gy)
    in_df_gy = pd.read_csv(full_path_gy, compression='gzip', sep=',', header=None)

    output_acc_df = process_avg_acceleration(in_df_ac, output_acc_df)
    output_gy_df = process_avg_gyroscope(in_df_gy, output_gy_df)
    
    if output_gy_df.shape[0] < output_acc_df.shape[0]:
        output_acc_df = output_acc_df[:output_gy_df.shape[0]]
    
    acc_gy_df = pd.concat([output_acc_df, output_gy_df], axis=1)
    
print(acc_gy_df.shape)


processing : Sessions/14442D57F7E1B88_Thu_Feb_08_00-16_2018_PST/data/1_android.sensor.accelerometer.data.csv.gz
processing : Sessions/14442D57F7E1B88_Thu_Feb_08_00-16_2018_PST/data/4_android.sensor.gyroscope.data.csv.gz
processing : Sessions/14442D57F7E1B88_Thu_Feb_08_00-18_2018_PST/data/1_android.sensor.accelerometer.data.csv.gz
processing : Sessions/14442D57F7E1B88_Thu_Feb_08_00-18_2018_PST/data/4_android.sensor.gyroscope.data.csv.gz
processing : Sessions/14442D57F7E1B88_Thu_Feb_08_00-25_2018_PST/data/1_android.sensor.accelerometer.data.csv.gz
processing : Sessions/14442D57F7E1B88_Thu_Feb_08_00-25_2018_PST/data/4_android.sensor.gyroscope.data.csv.gz
processing : Sessions/14442D57F7E1B88_Thu_Feb_08_00-31_2018_PST/data/1_android.sensor.accelerometer.data.csv.gz
processing : Sessions/14442D57F7E1B88_Thu_Feb_08_00-31_2018_PST/data/4_android.sensor.gyroscope.data.csv.gz
processing : Sessions/14442D57F7E1B88_Thu_Feb_08_00-35_2018_PST/data/1_android.sensor.accelerometer.data.csv.gz
processi

In [348]:
combined_df = acc_gy_df
combined_df = combined_df.iloc[np.random.permutation(len(combined_df))]
combined_df.head()

Unnamed: 0,mean_acc_x,mean_acc_y,mean_acc_z,std_acc_x,std_acc_y,std_acc_z,mad_acc_x,mad_acc_y,mad_acc_z,min_acc_x,...,em_gy_x,em_gy_y,em_gy_z,iqr_gy_x,iqr_gy_y,iqr_gy_z,sma_gy_x,sma_gy_y,sma_gy_z,label
7521,1.740143,8.0861,-5.306657,0.062205,0.052877,0.098737,0.047873,0.038298,0.057448,1.594164,...,0.00063,4.6e-05,8.2e-05,0.02397,0.005327,0.010653,0.017726,0.004429,0.006055,laying_down
529,-9.60076,-2.65805,1.392206,2.163885,1.303792,1.310334,1.665974,1.0915,0.698943,-14.548545,...,0.945236,0.75051,2.137364,1.438887,1.596026,2.813853,0.825823,0.750627,1.341247,walking
9803,4.156679,-8.333373,3.738062,0.06692,0.160455,0.112797,0.047873,0.134044,0.086171,3.987804,...,0.0024,0.000128,0.000272,0.063921,0.01598,0.026634,0.038368,0.008792,0.013392,sitting
12113,3.611503,-7.140243,4.63064,1.484362,2.848403,2.077562,0.201066,1.014903,1.359588,1.986721,...,0.846008,0.803491,0.960294,1.126607,0.409494,0.212404,0.695187,0.522282,0.447819,sitting
11651,-8.663305,-3.891155,-0.575098,0.636207,0.659318,1.292362,0.277663,0.268088,0.349471,-11.350641,...,0.822735,0.163191,0.31372,1.243795,0.48207,0.527348,0.759617,0.288386,0.397516,standing


In [362]:
data = combined_df[acc_columns + gy_columns].as_matrix()
targets = data[:, -1]
data = data[:, :-1]
print(len(data), len(data[0]))
print(len(targets))

16022 48
16022


In [365]:
cache = {'laying_down' : 0, 'sitting' : 1, 'walking' : 2, 'standing' : 3}
targets.reshape(len(targets))
print(targets)
print(targets.shape)

['laying_down' 'walking' 'sitting' ..., 'laying_down' 'laying_down'
 'laying_down']
(16022,)


In [371]:
bin_targets = []
for i in targets:
    bin_targets.append(cache[i])
print(len(bin_targets))

16022


In [374]:
bin_targets[: 5]

[0, 2, 1, 1, 3]

In [375]:
clf = DecisionTreeClassifier(random_state=0)

In [376]:
acc = cross_val_score(clf, data, bin_targets, cv=10)
acc

array([ 0.92955112,  0.91328759,  0.93075483,  0.93761697,  0.92326887,
        0.93695381,  0.92509363,  0.92259675,  0.90875   ,  0.920625  ])

In [377]:
np.mean(acc)

0.92484985764644789

In [378]:
avg_acc_gy.groupby(by='label').groups

{'sitting': Int64Index([18, 12,  1, 33, 40,  7,  4, 32, 37, 39, 13,  8, 10, 26,  5, 28,  0,
             14,  9, 22, 15, 19, 30, 17, 41, 23, 24,  6, 29, 21, 25, 34, 20, 31,
             35, 11, 36, 42, 27,  3, 38, 16,  2],
            dtype='int64'),
 'walking': Int64Index([264, 174, 277, 108, 258, 104, 206, 241, 170, 107,
             ...
             219, 230, 190,  60, 315, 194, 121, 231, 259, 216],
            dtype='int64', length=282)}