In [1]:
import pandas as pd
import numpy as np
import random
from lib.get_plots import *
from lib.sba_transform import *
from lib.model_pipeline import *
from lib.dt_model import *
from lib.demo_model_pipeline import * 

In [2]:
%pylab inline
pylab.rcParams['figure.figsize'] = (20, 7)   # caution: plt namespace may be taken

Populating the interactive namespace from numpy and matplotlib



pylab import has clobbered these variables: ['figure', 'random']
`%matplotlib` prevents importing * from pylab and numpy



In [3]:
# Read the CSV (parsing 'date_time' as timestamps), pass it straight through
# filter_na, and print summary statistics of the result.
data = filter_na(
    pd.read_csv('demo/fitbituser.csv', parse_dates=['date_time'])
)
print(data.describe())

             heart       steps
count  5950.000000  5950.00000
mean     71.840504    93.72084
std      16.009608   228.97116
min      43.000000     0.00000
25%      58.000000     0.00000
50%      70.000000     0.00000
75%      80.000000    49.00000
max     156.000000  2306.00000


# Fill missing data with different methods

In [4]:
# One filled copy of `data` per fill method, evaluated in the order listed.
# The method='none' result is bound to `ndatacontrol`.
ndata1, ndata2, ndata3, ndata4, ndata5, ndatacontrol = (
    fill_missing(data, method=strategy)
    for strategy in ('mean', 'rndminmax', 'rnd50zeromax', 'zero', 'max', 'none')
)

In [5]:
# Run every filled variant through sba_pipeline and rebind the same names.
# NOTE(review): the inputs are overwritten, so re-running this cell feeds
# already-transformed frames back into sba_pipeline.
ndata1, ndata2, ndata3, ndata4, ndata5, ndatacontrol = map(
    sba_pipeline,
    (ndata1, ndata2, ndata3, ndata4, ndata5, ndatacontrol),
)

# Heatmap

In [None]:
# Summary statistics as plain text, then the first five rows as the cell's
# displayed value (five is the default row count of head()).
print(ndata1.describe())
ndata1.head()

In [None]:
# Draw one 'steps' heatmap per filled variant.  The (frame, title) table
# replaces six copy-pasted calls, so adding or dropping a variant is a
# one-line change.  Calls run in the same order as before; because they now
# sit inside a loop, any value returned by the last call is no longer echoed
# as the cell's output.
for _frame, _title in (
    (ndata1, 'ndata1 steps'),
    (ndata2, 'ndata2 steps'),
    (ndata3, 'ndata3 steps'),
    (ndata4, 'ndata4 steps'),
    (ndata5, 'ndata5 steps'),
    (ndatacontrol, 'ndatacontrol steps'),
):
    get_heatmap(_frame, 'steps', _title)

# Autocorrelation Analysis

In [None]:
# with sleeping time
# The head() output in this notebook shows rows 15 minutes apart, so one full
# day is 96 samples; the calls below span 21 such days.
# NOTE(review): the second and third positional arguments are presumably the
# lag range and the per-day period -- confirm against get_plot_acf.
SAMPLES_PER_DAY = 96
ACF_DAYS = 21

# Table-driven instead of six copy-pasted calls; order is unchanged.
for _frame, _title in (
    (ndatacontrol, 'Autocorrelations of steps in ndatacontrol'),
    (ndata1, 'Autocorrelations of steps in ndata1'),
    (ndata2, 'Autocorrelations of steps in ndata2'),
    (ndata3, 'Autocorrelations of steps in ndata3'),
    (ndata4, 'Autocorrelations of steps in ndata4'),
    (ndata5, 'Autocorrelations of steps in ndata5'),
):
    get_plot_acf(_frame.steps, SAMPLES_PER_DAY * ACF_DAYS, SAMPLES_PER_DAY, title=_title)


In [None]:
# without sleeping time
# 68 samples per day = 17 hours of 15-minute rows.
# NOTE(review): presumably the number of rows filter_sleeping_time keeps per
# day -- confirm against its implementation.
AWAKE_SAMPLES_PER_DAY = 68
ACF_DAYS = 21

# The titles now say that sleeping time was removed; previously they were
# identical to the titles of the unfiltered autocorrelation plots, so the two
# sets of figures could not be told apart.
for _frame, _title in (
    (ndatacontrol, 'Autocorrelations of steps in ndatacontrol (without sleeping time)'),
    (ndata1, 'Autocorrelations of steps in ndata1 (without sleeping time)'),
    (ndata2, 'Autocorrelations of steps in ndata2 (without sleeping time)'),
    (ndata3, 'Autocorrelations of steps in ndata3 (without sleeping time)'),
    (ndata4, 'Autocorrelations of steps in ndata4 (without sleeping time)'),
    (ndata5, 'Autocorrelations of steps in ndata5 (without sleeping time)'),
):
    get_plot_acf(
        filter_sleeping_time(_frame).steps,
        AWAKE_SAMPLES_PER_DAY * ACF_DAYS,
        AWAKE_SAMPLES_PER_DAY,
        title=_title,
    )

# Get features
    1) Weekly and daily circadian rhythmicity of physical activity
    2) Autocorrelation analysis for feature selection

In [22]:
# 1. Circadian-rhythm features: time, time of week, day of week.
ndata1_cr, ndata1_cr_features = get_features(ndata1, method='rhythm')

# 2. Autocorrelation features with the default parameters (steps).
ndata1_ac_s, ndata1_ac_s_features = get_features(ndata1, method='autoco')

# 3. Autocorrelation features for both steps and heart.
ndata1_ac_sh_params = ['steps', 'heart']
ndata1_ac_sh, ndata1_ac_sh_features_subset = get_features(
    ndata1,
    method='autoco',
    params=ndata1_ac_sh_params,
)
# The returned feature collection is nested here; flatten it into one list.
ndata1_ac_sh_features = [
    name
    for group in ndata1_ac_sh_features_subset
    for name in group
]

# 4. Rhythm features added on top of the steps+heart autocorrelation frame:
#    time, time of week, day of week, steps, heart.
ndata1_ac_shr, ndata1_ac_shr_features_subset = get_features(
    ndata1_ac_sh,
    method='rhythm',
)
ndata1_ac_shr_features = [*ndata1_ac_shr_features_subset, *ndata1_ac_sh_features]

the 671 th index has ac of 0.283958
the 672 th index has ac of 0.211081
the 1343 th index has ac of 0.230691
feature indexes are 671, 672, 1343
index 671 is valid
index 672 is valid
index 1343 is valid
the 671 th index has ac of 0.283958
the 672 th index has ac of 0.211081
the 1343 th index has ac of 0.230691
feature indexes are 671, 672, 1343
index 671 is valid
index 672 is valid
index 1343 is valid
the 671 th index has ac of 0.283958
the 672 th index has ac of 0.211081
the 1343 th index has ac of 0.230691
feature indexes are 671, 672, 1343
index 671 is valid
index 672 is valid
index 1343 is valid


In [48]:
# Preview the first five rows of the combined rhythm + autocorrelation frame.
ndata1_ac_shr.head(5)

Unnamed: 0,date_time,heart,steps,inactive,stepsfeature_index671,stepsfeature_index672,stepsfeature_index1343,heartfeature_index671,heartfeature_index672,heartfeature_index1343,time,dayofweek,timeofweek
0,2016-09-03 23:45:00,65.0,0.0,10,166.0,93.72084,0.0,115.0,71.840504,74.0,1425,5,8620
1,2016-09-04 00:00:00,65.0,0.0,10,1.0,166.0,93.72084,73.0,115.0,71.840504,0,6,8634
2,2016-09-04 00:15:00,64.0,0.0,10,0.0,1.0,93.72084,68.0,73.0,71.840504,15,6,8649
3,2016-09-04 00:30:00,88.0,284.0,-10,0.0,0.0,93.72084,70.0,68.0,71.840504,30,6,8664
4,2016-09-04 00:45:00,73.0,52.0,-10,0.0,0.0,93.72084,64.0,70.0,71.840504,45,6,8679


# Machine Learning Models
1. Use rhythm features
2. Use autocorrelation (`autoco`) features
    - 1) steps
    - 2) steps, heart

In [None]:
# Split the combined-feature frame into train and test parts (split=0.8).
ndata1_ac_shr_train, ndata1_ac_shr_test = get_train_test(
    ndata1_ac_shr,
    split=0.8,
)

In [49]:
# Decision tree on the combined rhythm + autocorrelation features.
# Figures noted from an earlier run: 80.59% / 58.92%
#   prediction score on test data is 0.589200
#   false positive rate is 0.207329
#   false negative rate is 0.203472
ndata1_ac_shr_model, ndata1_ac_shr_model_features = get_dt_model(
    ndata1_ac_shr_train,
    ndata1_ac_shr_features,
    'inactive',
    optimized=True,
)

# Predict the 'inactive' target on the held-out split with the features the
# model step returned.
ndata1_ac_shr_model_predicted, ndata1_ac_shr_model_accuracy = get_prediction(
    ndata1_ac_shr_test,
    ndata1_ac_shr_model,
    'inactive',
    ndata1_ac_shr_model_features,
)

# Compare the predictions against the true 'inactive' column of the test split.
get_model_accuracy(ndata1_ac_shr_model_predicted, ndata1_ac_shr_test['inactive'])

accuracy based on training data is 0.656461
accuracy based on training data is 0.564127
accuracy based on training data is 0.716731
accuracy based on training data is 0.626567
accuracy based on training data is 0.626808
accuracy based on training data is 0.630665
accuracy based on training data is 0.624156
accuracy based on training data is 0.630183
accuracy based on training data is 0.629942
accuracy based on training data is 0.711427
accuracy based on training data is 0.717936
accuracy based on training data is 0.703230
accuracy based on training data is 0.715284
accuracy based on training data is 0.721311
accuracy based on training data is 0.720588
accuracy based on training data is 0.722758
accuracy based on training data is 0.729026
accuracy based on training data is 0.716731
accuracy based on training data is 0.662006
accuracy based on training data is 0.659836
accuracy based on training data is 0.670685
accuracy based on training data is 0.679605
accuracy based on training data 

prediction score on test data is 0.589200
false positive rate is 0.207329
false negative rate is 0.203472


In [30]:
# NOTE(review): star import placed mid-notebook.  Any name it exports that
# matches an earlier import is rebound from this cell onward, so cells above
# and below may be calling different implementations -- confirm this is intended.
from lib.newMachineLearningModels import *

In [45]:
# Fit the 'Random Forest' option of get_selected_model on the training split,
# using the feature list without its first two entries, then predict on the
# test split.
# NOTE(review): the output recorded for this cell is a ValueError ("too many
# values to unpack (expected 2)") and a TypeError ("list indices must be
# integers or slices, not tuple").  Which call raises is not visible from here;
# check how many values get_selected_model / get_prediction return.
# NOTE(review): this rebinds ndata1_ac_shr_model, ndata1_ac_shr_model_features,
# ndata1_ac_shr_model_predicted and ndata1_ac_shr_model_accuracy, which the
# decision-tree cell also assigns.
ndata1_ac_shr_model, ndata1_ac_shr_model_features = get_selected_model(ndata1_ac_shr_train, 
                                                                       ndata1_ac_shr_features[2:], 
                                                                       'inactive', 'Random Forest', optimized=False)
ndata1_ac_shr_model_predicted, ndata1_ac_shr_model_accuracy = get_prediction(ndata1_ac_shr_test, 
                                                                             ndata1_ac_shr_model, 
                                                                             'inactive', 
                                                                             ndata1_ac_shr_model_features)

ValueError: too many values to unpack (expected 2)

TypeError: list indices must be integers or slices, not tuple

In [35]:
# Predict 'inactive' on the test split, passing the full combined feature list
# (ndata1_ac_shr_features) rather than the model's own selected features.
ndata1_ac_shr_model_predicted, ndata1_ac_shr_model_accuracy = get_prediction(
    ndata1_ac_shr_test,
    ndata1_ac_shr_model,
    'inactive',
    ndata1_ac_shr_features,
)

['time', 'dayofweek', 'timeofweek', 'stepsfeature_index671', 'stepsfeature_index672', 'stepsfeature_index1343', 'heartfeature_index671', 'heartfeature_index672', 'heartfeature_index1343']
prediction accuracy based on test data is 0.615236 


In [15]:
# One optimized decision tree per feature set.  Each call receives the whole
# feature frame (ndata1_cr / ndata1_ac_s / ndata1_ac_sh), not a train split.

# Rhythm features only.
ndata1_cr_model, ndata1_cr_model_features = get_dt_model(
    ndata1_cr,
    ndata1_cr_features,
    'inactive',
    optimized=True,
)

# Autocorrelation features: steps.
ndata1_ac_s_model, ndata1_ac_s_model_features = get_dt_model(
    ndata1_ac_s,
    ndata1_ac_s_features,
    'inactive',
    optimized=True,
)

# Autocorrelation features: steps and heart.
ndata1_ac_sh_model, ndata1_ac_sh_model_features = get_dt_model(
    ndata1_ac_sh,
    ndata1_ac_sh_features,
    'inactive',
    optimized=True,
)

accuracy based on training data is 0.652727
accuracy based on training data is 0.551011
accuracy based on training data is 0.696844
accuracy based on training data is 0.694547
accuracy based on training data is 0.696538
accuracy based on training data is 0.696844
accuracy based on training data is 0.696538
best features are: 'timeofweek'
accuracy based on training data is 0.696844
accuracy based on training data is 0.629894
accuracy based on training data is 0.629894
accuracy based on training data is 0.630665
accuracy based on training data is 0.672710
accuracy based on training data is 0.686789
accuracy based on training data is 0.692960
accuracy based on training data is 0.716297
best features are: 'stepsfeature_index671', 'stepsfeature_index672', 'stepsfeature_index1343'
accuracy based on training data is 0.716297
accuracy based on training data is 0.629894
accuracy based on training data is 0.629894
accuracy based on training data is 0.630665
accuracy based on training data is 0.6

In [None]:
# The visible cells never define `ndata1_ac` / `ndata1_ac_features`, so the
# original call raised NameError on a fresh kernel.  The steps-only
# autocorrelation outputs are named `ndata1_ac_s` / `ndata1_ac_s_features`,
# so train on those.
# NOTE(review): assumed mapping -- use ndata1_ac_sh / ndata1_ac_sh_features
# instead if the steps+heart feature set was intended.
ndata1_ac_model, ndata1_ac_model_features = get_dt_model(
    ndata1_ac_s,
    ndata1_ac_s_features,
    'inactive',
    optimized=True,
)


# Scratch cell: small pandas / random API experiments.

# Bind the stdlib module explicitly: a pylab-style star import shadows the
# name `random` with numpy.random, whose sample() has a different signature.
import random

# 4x3 frame of standard-normal draws, labelled by state name.
frame = pd.DataFrame(
    np.random.randn(4, 3),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
)

# Duplicates collapse in a set, so this holds five distinct letters.
foo = {'a', 'b', 'c', 'd', 'e'}

# random.sample requires a sequence; passing a set is rejected on
# Python 3.11+.  Sorting first also makes the population order deterministic.
print(random.sample(sorted(foo), 2))

# Sets are not subscriptable (`set(...)[0]` raises TypeError); take an
# element through an iterator instead.
type(next(iter(set(range(7)))))

# Move the state labels into an 'index' column.  Reassigning instead of
# using inplace=True keeps the step explicit.
frame = frame.reset_index()

# With the fresh 0..3 integer index, select the rows labelled 2 and 3.
frame.loc[[2, 3], :]