## Import the libraries we'll be using.

In [1]:
# Global imports and settings

# OS for I/O operations
import os
import csv
import glob

# Matplotlib
%matplotlib inline
from matplotlib import pyplot as plt
plt.rcParams["figure.figsize"] = (8, 8)
plt.rcParams["figure.max_open_warning"] = -1

# Numpy & print options
import numpy as np
np.set_printoptions(precision=3)

# Pandas
import pandas as pd

# Sklearn
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn import svm

# Sklearn-pandas
from sklearn_pandas import DataFrameMapper, cross_val_score

# Silence warnings
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)
warnings.simplefilter(action="ignore", category=RuntimeWarning)

## Import all CSVs in the `data` subdirectory

In [2]:
path = os.path.join(os.getcwd(), 'data') # pull files from the 'data' subdirectory
allFiles = glob.glob(path + "/*.csv")
df = pd.DataFrame()
list_ = []
for file_ in allFiles:
    print('Reading file:' + file_)
    this_df = pd.read_csv(file_,index_col=None, header=0)
    this_df['user'] = os.path.basename(file_)
    list_.append(this_df)
df = pd.concat(list_)

print df.columns
print df.shape

Reading file:/Users/afaucher/Documents/Grad School 2017/CMU HCI/Courses/_0X Applied ML/Assignments/Final Project/_Notebook/data/0BFC35E2-4817-4865-BFA7-764742302A2D.features_labels.csv
Reading file:/Users/afaucher/Documents/Grad School 2017/CMU HCI/Courses/_0X Applied ML/Assignments/Final Project/_Notebook/data/0A986513-7828-4D53-AA1F-E02D6DF9561B.features_labels.csv
Reading file:/Users/afaucher/Documents/Grad School 2017/CMU HCI/Courses/_0X Applied ML/Assignments/Final Project/_Notebook/data/0E6184E1-90C0-48EE-B25A-F1ECB7B9714E.features_labels.csv
Index([u'timestamp', u'raw_acc:magnitude_stats:mean',
       u'raw_acc:magnitude_stats:std', u'raw_acc:magnitude_stats:moment3',
       u'raw_acc:magnitude_stats:moment4',
       u'raw_acc:magnitude_stats:percentile25',
       u'raw_acc:magnitude_stats:percentile50',
       u'raw_acc:magnitude_stats:percentile75',
       u'raw_acc:magnitude_stats:value_entropy',
       u'raw_acc:magnitude_stats:time_entropy',
       ...
       u'label:ELEVAT

In [3]:
df.head()

Unnamed: 0,timestamp,raw_acc:magnitude_stats:mean,raw_acc:magnitude_stats:std,raw_acc:magnitude_stats:moment3,raw_acc:magnitude_stats:moment4,raw_acc:magnitude_stats:percentile25,raw_acc:magnitude_stats:percentile50,raw_acc:magnitude_stats:percentile75,raw_acc:magnitude_stats:value_entropy,raw_acc:magnitude_stats:time_entropy,...,label:ELEVATOR,label:OR_standing,label:AT_SCHOOL,label:PHONE_IN_HAND,label:PHONE_IN_BAG,label:PHONE_ON_TABLE,label:WITH_CO-WORKERS,label:WITH_FRIENDS,label_source,user
0,1445366534,1.023488,0.024622,-0.028726,0.066325,1.01879,1.021975,1.026449,0.765474,6.684317,...,,0.0,1.0,,,,0.0,0.0,2,0BFC35E2-4817-4865-BFA7-764742302A2D.features_...
1,1445366594,1.025689,0.065634,0.042226,0.129952,1.011969,1.020782,1.032243,1.446182,6.682553,...,,0.0,1.0,,,,0.0,0.0,2,0BFC35E2-4817-4865-BFA7-764742302A2D.features_...
2,1445366674,1.022835,0.039641,-0.052978,0.100098,1.0201,1.022686,1.025343,0.616602,6.683829,...,,0.0,1.0,,,,0.0,0.0,2,0BFC35E2-4817-4865-BFA7-764742302A2D.features_...
3,1445366703,1.022907,0.008549,-0.015235,0.029863,1.020766,1.022855,1.025048,0.807471,6.684576,...,,0.0,1.0,,,,0.0,0.0,2,0BFC35E2-4817-4865-BFA7-764742302A2D.features_...
4,1445366761,1.023375,0.011622,0.017318,0.030025,1.020367,1.022504,1.024599,0.682306,6.684548,...,,0.0,1.0,,,,0.0,0.0,2,0BFC35E2-4817-4865-BFA7-764742302A2D.features_...


## Select the features we actually want to use.

In [4]:
# What is the class value we'll want to predict?
class_ft = 'label:LOC_home'

relevant_ft = [
    
    ### USER REFERENCE ###
    'user',
    
    ### NUMERIC FEATURES ###
    
    # Timestamp
    'timestamp',
    # Accelerometer Features
    'raw_acc:magnitude_stats:percentile25',
    'raw_acc:magnitude_stats:percentile50',
    'raw_acc:magnitude_stats:percentile75',
    'raw_acc:magnitude_stats:value_entropy',
    'raw_acc:magnitude_stats:time_entropy',
    'raw_acc:magnitude_autocorrelation:period',
    'raw_acc:magnitude_autocorrelation:normalized_ac',
    # Gyroscope Features
    'proc_gyro:magnitude_stats:mean',
    'proc_gyro:magnitude_stats:std',
    'proc_gyro:magnitude_stats:percentile25',
    'proc_gyro:magnitude_stats:percentile50',
    'proc_gyro:magnitude_stats:percentile75',
    'proc_gyro:magnitude_stats:value_entropy',
    'proc_gyro:magnitude_autocorrelation:period',
    'proc_gyro:magnitude_autocorrelation:normalized_ac',
    # Location Features
    'location_quick_features:std_lat',
    'location_quick_features:std_long',
    'location_quick_features:lat_change',
    'location_quick_features:long_change',
    'location_quick_features:mean_abs_lat_deriv',
    'location_quick_features:mean_abs_long_deriv',
    # Audio Features
    'audio_properties:max_abs_value',
    
    ### BINARY FEATURES ###
    
    # App Status
    'discrete:app_state:is_active',
    'discrete:app_state:is_inactive',
    'discrete:app_state:is_background',
    'discrete:app_state:missing',
    # Battery
    'discrete:battery_plugged:is_ac',
    'discrete:battery_plugged:is_usb',
    'discrete:battery_plugged:is_wireless',
    'discrete:battery_plugged:missing',
    'discrete:battery_state:is_unknown',
    'discrete:battery_state:is_unplugged',
    'discrete:battery_state:is_not_charging',
    'discrete:battery_state:is_discharging',
    'discrete:battery_state:is_charging',
    'discrete:battery_state:is_full',
    'discrete:battery_state:missing',
    # On the Phone
    'discrete:on_the_phone:is_False',
    'discrete:on_the_phone:is_True',
    'discrete:on_the_phone:missing',
    # Ringer Mode
    'discrete:ringer_mode:is_normal',
    'discrete:ringer_mode:is_silent_no_vibrate',
    'discrete:ringer_mode:is_silent_with_vibrate',
    'discrete:ringer_mode:missing',
    # Wifi Status
    'discrete:wifi_status:is_not_reachable',
    'discrete:wifi_status:is_reachable_via_wifi',
    'discrete:wifi_status:is_reachable_via_wwan',
    'discrete:wifi_status:missing',
    # Time of Day
    'discrete:time_of_day:between0and6',
    'discrete:time_of_day:between3and9',
    'discrete:time_of_day:between6and12',
    'discrete:time_of_day:between9and15',
    'discrete:time_of_day:between12and18',
    'discrete:time_of_day:between15and21',
    'discrete:time_of_day:between18and24',
    'discrete:time_of_day:between21and3',
    
    ### CLASS ###
    class_ft]

df = df[relevant_ft]

## Eliminate instances where the class value is `NaN`

In [5]:
print('Before NaN elimination: ' + str(df.shape))
df = df.dropna(axis=0, subset=[class_ft])
print('After NaN elimination: ' + str(df.shape))

Before NaN elimination: (14589, 59)
After NaN elimination: (14589, 59)


## Assign the right `dtypes` to these features.

In [8]:
for col in df.columns:
    
    # If the column starts with 'discrete' or 'label',
    # make it a 'bool' dtype.
    if col.startswith('discrete') or col.startswith('label'):
        df[col] = df[col].astype(bool)
    elif col.startswith('user'):
        df[col] = df[col].astype(str)
    # Otherwise, make it 'float64'
    else:
        df[col] = df[col].astype(float)
    


In [9]:
df.dtypes

user                                                  object
timestamp                                            float64
raw_acc:magnitude_stats:percentile25                 float64
raw_acc:magnitude_stats:percentile50                 float64
raw_acc:magnitude_stats:percentile75                 float64
raw_acc:magnitude_stats:value_entropy                float64
raw_acc:magnitude_stats:time_entropy                 float64
raw_acc:magnitude_autocorrelation:period             float64
raw_acc:magnitude_autocorrelation:normalized_ac      float64
proc_gyro:magnitude_stats:mean                       float64
proc_gyro:magnitude_stats:std                        float64
proc_gyro:magnitude_stats:percentile25               float64
proc_gyro:magnitude_stats:percentile50               float64
proc_gyro:magnitude_stats:percentile75               float64
proc_gyro:magnitude_stats:value_entropy              float64
proc_gyro:magnitude_autocorrelation:period           float64
proc_gyro:magnitude_auto

In [10]:
df.head()

Unnamed: 0,user,timestamp,raw_acc:magnitude_stats:percentile25,raw_acc:magnitude_stats:percentile50,raw_acc:magnitude_stats:percentile75,raw_acc:magnitude_stats:value_entropy,raw_acc:magnitude_stats:time_entropy,raw_acc:magnitude_autocorrelation:period,raw_acc:magnitude_autocorrelation:normalized_ac,proc_gyro:magnitude_stats:mean,...,discrete:wifi_status:missing,discrete:time_of_day:between0and6,discrete:time_of_day:between3and9,discrete:time_of_day:between6and12,discrete:time_of_day:between9and15,discrete:time_of_day:between12and18,discrete:time_of_day:between15and21,discrete:time_of_day:between18and24,discrete:time_of_day:between21and3,label:LOC_home
0,0BFC35E2-4817-4865-BFA7-764742302A2D.features_...,1445367000.0,1.01879,1.021975,1.026449,0.765474,6.684317,0.482794,0.073672,0.246308,...,False,False,False,True,True,False,False,False,False,False
1,0BFC35E2-4817-4865-BFA7-764742302A2D.features_...,1445367000.0,1.011969,1.020782,1.032243,1.446182,6.682553,7.221784,0.143749,0.357748,...,False,False,False,True,True,False,False,False,False,False
2,0BFC35E2-4817-4865-BFA7-764742302A2D.features_...,1445367000.0,1.0201,1.022686,1.025343,0.616602,6.683829,3.600838,0.146281,0.019397,...,False,False,False,True,True,False,False,False,False,False
3,0BFC35E2-4817-4865-BFA7-764742302A2D.features_...,1445367000.0,1.020766,1.022855,1.025048,0.807471,6.684576,3.962931,0.277193,0.004336,...,False,False,False,True,True,False,False,False,False,False
4,0BFC35E2-4817-4865-BFA7-764742302A2D.features_...,1445367000.0,1.020367,1.022504,1.024599,0.682306,6.684548,2.936994,0.141068,0.00722,...,False,False,False,True,True,False,False,False,False,False


In [None]:
df.plot.scatter(x='label_sitting',y='num_one')

In [None]:
df.plot.scatter(x='label_sitting',y='num_two')

In [None]:
df.plot.scatter(x='label_sitting',y='num_three')

In [None]:
df.plot.scatter(x='label_sitting',y='disc_one')

In [None]:
df['num_three'] = df['num_three'] + np.arange(3960)
df.plot.hexbin(x='label_sitting', y='num_three', gridsize=25)

In [None]:
df.hist(bins=2)