In [2]:
import pandas as pd
import numpy as np
import random
from lib.get_plots import *
from lib.sba_transform import *
from lib.model_pipeline import *
# from lib.dt_model import *
from lib.demo_model_pipeline import * 

In [None]:
# Render matplotlib figures inline. Per the IPython documentation, %pylab also
# star-imports numpy and matplotlib.pyplot names into the notebook namespace,
# so later cells may silently depend on names injected here.
%pylab inline
# Default size (width, height in inches) for every figure drawn in this notebook.
pylab.rcParams['figure.figsize'] = (20, 7)   # caution: plt namespace may be taken

In [3]:
# Read the demo Fitbit export, parsing `date_time` as timestamps, and keep the
# unfiltered frame under its own name so the lineage raw -> filtered is explicit.
fitbit_raw = pd.read_csv('demo/fitbituser.csv', parse_dates=['date_time'])

# `data` is the frame returned by filter_na; every later cell starts from it.
data = filter_na(fitbit_raw)

# Plain-text summary statistics of the filtered frame.
data_summary = data.describe()
print(data_summary)

             heart       steps
count  5950.000000  5950.00000
mean     71.840504    93.72084
std      16.009608   228.97116
min      43.000000     0.00000
25%      58.000000     0.00000
50%      70.000000     0.00000
75%      80.000000    49.00000
max     156.000000  2306.00000


# Fill missing data with different methods

In [4]:
# One filled copy of `data` per strategy. The calls run left to right in the
# same order as the method names below; 'none' serves as the control.
fill_methods = ('mean', 'rndminmax', 'rnd50zeromax', 'zero', 'max', 'none')
ndata1, ndata2, ndata3, ndata4, ndata5, ndatacontrol = (
    fill_missing(data, method=fill_method) for fill_method in fill_methods
)

In [5]:
# Run every filled variant through sba_pipeline, in the original order.
# NOTE(review): each name is rebound to its own transformed value, so running
# this cell twice feeds already-processed frames back into sba_pipeline —
# confirm that is harmless, or re-run the fill_missing cell first.
ndata1, ndata2, ndata3, ndata4, ndata5, ndatacontrol = (
    sba_pipeline(filled_frame)
    for filled_frame in (ndata1, ndata2, ndata3, ndata4, ndata5, ndatacontrol)
)

# Heatmap

In [None]:
# Plain-text summary statistics of the mean-filled variant ...
ndata1_summary = ndata1.describe()
print(ndata1_summary)
# ... followed by its first five rows as the cell's displayed value.
ndata1.head(n=5)

In [None]:
# Draw one heatmap of the same column for each filled variant; the third
# argument is the per-variant label passed to get_heatmap.
heatmap_column = 'steps'
get_heatmap(ndata1, heatmap_column, 'ndata1 steps')
get_heatmap(ndata2, heatmap_column, 'ndata2 steps')
get_heatmap(ndata3, heatmap_column, 'ndata3 steps')
get_heatmap(ndata4, heatmap_column, 'ndata4 steps')
get_heatmap(ndata5, heatmap_column, 'ndata5 steps')
# Control variant last; kept as the cell's trailing expression.
get_heatmap(ndatacontrol, heatmap_column, 'ndatacontrol steps')

# Autocorrelation Analysis

In [None]:
# with sleeping time
# Every call shares the same two numeric arguments.
# NOTE(review): presumably 96 is the number of 15-minute slots per day and 21 a
# number of days — confirm against get_plot_acf's signature.
acf_full_arg2, acf_full_arg3 = 96*21, 96
get_plot_acf(ndatacontrol.steps, acf_full_arg2, acf_full_arg3,
             title='Autocorrelations of steps in ndatacontrol')
get_plot_acf(ndata1.steps, acf_full_arg2, acf_full_arg3,
             title='Autocorrelations of steps in ndata1')
get_plot_acf(ndata2.steps, acf_full_arg2, acf_full_arg3,
             title='Autocorrelations of steps in ndata2')
get_plot_acf(ndata3.steps, acf_full_arg2, acf_full_arg3,
             title='Autocorrelations of steps in ndata3')
get_plot_acf(ndata4.steps, acf_full_arg2, acf_full_arg3,
             title='Autocorrelations of steps in ndata4')
get_plot_acf(ndata5.steps, acf_full_arg2, acf_full_arg3,
             title='Autocorrelations of steps in ndata5')


In [None]:
# without sleeping time
# Same plots as the "with sleeping time" cell, but each frame is first passed
# through filter_sleeping_time and the numeric arguments use 68 instead of 96.
# NOTE(review): presumably 68 is the number of waking slots per day — confirm.
acf_awake_arg2, acf_awake_arg3 = 68*21, 68
get_plot_acf(filter_sleeping_time(ndatacontrol).steps, acf_awake_arg2, acf_awake_arg3,
             title='Autocorrelations of steps in ndatacontrol')
get_plot_acf(filter_sleeping_time(ndata1).steps, acf_awake_arg2, acf_awake_arg3,
             title='Autocorrelations of steps in ndata1')
get_plot_acf(filter_sleeping_time(ndata2).steps, acf_awake_arg2, acf_awake_arg3,
             title='Autocorrelations of steps in ndata2')
get_plot_acf(filter_sleeping_time(ndata3).steps, acf_awake_arg2, acf_awake_arg3,
             title='Autocorrelations of steps in ndata3')
get_plot_acf(filter_sleeping_time(ndata4).steps, acf_awake_arg2, acf_awake_arg3,
             title='Autocorrelations of steps in ndata4')
get_plot_acf(filter_sleeping_time(ndata5).steps, acf_awake_arg2, acf_awake_arg3,
             title='Autocorrelations of steps in ndata5')

# Get features
1. Weekly and daily circadian rhythmicity of physical activity
2. Autocorrelation analysis for feature selection

In [None]:
# Feature set 1 — circadian rhythmicity: time, time of week, day of week.
ndata1_cr, ndata1_cr_features = get_features(ndata1, method='rhythm')

# Feature set 2 — autocorrelated parameters, default parameter selection (steps).
ndata1_ac_s, ndata1_ac_s_features = get_features(ndata1, method='autoco')

# Feature set 3 — autocorrelated parameters for both steps and heart.
# With explicit params, get_features hands back one feature-name list per
# parameter, so the nested result is flattened into a single list.
ndata1_ac_sh_params = ['steps', 'heart']
ndata1_ac_sh, ndata1_ac_sh_features_subset = get_features(
    ndata1, method='autoco', params=ndata1_ac_sh_params
)
ndata1_ac_sh_features = []
for per_param_features in ndata1_ac_sh_features_subset:
    ndata1_ac_sh_features.extend(per_param_features)

# Feature set 4 — rhythm features added on top of set 3
# (time, time of week, day of week, steps, heart).
ndata1_ac_shr, ndata1_ac_shr_features_subset = get_features(ndata1_ac_sh, method='rhythm')
ndata1_ac_shr_features = ndata1_ac_shr_features_subset + ndata1_ac_sh_features

In [None]:
# Preview the first rows of the frame carrying both the autocorrelation and the rhythm features.
ndata1_ac_shr.head()

# Machine Learning Models
1. Use rhythm features
2. Use autocorrelation (autoco) features
    - 1) steps
    - 2) steps, heart

In [6]:
# All features
# Circadian rhythmicity features + autocorrelated parameters:
# time, time of week, day of week, steps, heart.
# NOTE(review): `ndata1` is rebound to the output of get_features twice, so
# re-running this cell feeds the already-augmented frame back in — confirm
# get_features tolerates that, or rebuild ndata1 before re-running.
sh_params = ['steps', 'heart']

# Autocorrelation features come back as one list per parameter; flatten them.
ndata1, autoco_features_nested = get_features(ndata1, method='autoco', params=sh_params)
features_subset1 = []
for per_param_features in autoco_features_nested:
    features_subset1.extend(per_param_features)

# Rhythm features are appended after the autocorrelation ones.
ndata1, features_subset2 = get_features(ndata1, method='rhythm')
ndata1_features = features_subset1 + features_subset2

the 671 th index has ac of 0.283958
the 672 th index has ac of 0.211081
the 1343 th index has ac of 0.230691
feature indexes are 671, 672, 1343
index 671 is valid
index 672 is valid
index 1343 is valid
the 671 th index has ac of 0.283958
the 672 th index has ac of 0.211081
the 1343 th index has ac of 0.230691
feature indexes are 671, 672, 1343
index 671 is valid
index 672 is valid
index 1343 is valid


In [7]:
# Display the combined feature-name list (autocorrelation features first, then rhythm features).
ndata1_features

['stepsfeature_index671',
 'stepsfeature_index672',
 'stepsfeature_index1343',
 'heartfeature_index671',
 'heartfeature_index672',
 'heartfeature_index1343',
 'time',
 'dayofweek',
 'timeofweek']

In [8]:
# get train and test
# split=0.8 is passed to get_train_test; the recorded output reports 4148
# training and 1037 testing observations, i.e. an 80/20 partition.
# NOTE(review): whether the split is chronological or shuffled is not visible here — confirm.
ndata1_train, ndata1_test = get_train_test(ndata1, split=0.8)

number of observations in training data is: 4148
number of observations in testing data is: 1037


In [10]:
from lib.MachineLearningModels import *

In [50]:
# Decision Tree
# Figures noted from an earlier run (presumably train accuracy, then test accuracy):
# 80.59% 58.92%
# prediction score on test data is 0.589200
# false positive rate is 0.207329
# false negative rate is 0.203472
# NOTE(review): the output recorded for this cell shows slightly different
# numbers, so results apparently vary between runs — consider fixing a seed.

# Fit on the training partition with target column 'inactive'.
dt_fit_result = get_selected_model(
    ndata1_train, ndata1_features, 'inactive', 'Decision Tree', optimized=True
)
ndata1_dt_model, ndata1_dt_model_features = dt_fit_result

# Score the fitted model on the held-out partition using the returned features.
dt_prediction_result = get_prediction(
    ndata1_test, ndata1_dt_model, 'inactive', ndata1_dt_model_features
)
ndata1_dt_model_predicted, ndata1_dt_model_accuracy = dt_prediction_result

# Compare predictions with the true labels (recorded output reports score, FPR, FNR).
get_model_accuracy(ndata1_dt_model_predicted, ndata1_test['inactive'])

best features are: 'stepsfeature_index671', 'stepsfeature_index672', 'stepsfeature_index1343', 'heartfeature_index671', 'heartfeature_index672', 'heartfeature_index1343', 'time', 'dayofweek', 'timeofweek'
accuracy based on training data is 0.805689
['stepsfeature_index671', 'stepsfeature_index672', 'stepsfeature_index1343', 'heartfeature_index671', 'heartfeature_index672', 'heartfeature_index1343', 'time', 'dayofweek', 'timeofweek']
prediction accuracy based on test data is 0.593057 
prediction score on test data is 0.593057
false positive rate is 0.203472
false negative rate is 0.203472


In [49]:
# Random Forest
# NOTE(review): `ndata1_features[1:]` drops the first feature
# ('stepsfeature_index671' in the recorded feature list), while the other model
# cells pass the full list — confirm this is a deliberate ablation and not a leftover.
rf_fit_result = get_selected_model(
    ndata1_train, ndata1_features[1:], 'inactive', 'Random Forest', optimized=False
)
ndata1_rf_model, ndata1_rf_model_features = rf_fit_result

# Score the fitted model on the held-out partition using the returned features.
rf_prediction_result = get_prediction(
    ndata1_test, ndata1_rf_model, 'inactive', ndata1_rf_model_features
)
ndata1_rf_model_predicted, ndata1_rf_model_accuracy = rf_prediction_result

['stepsfeature_index672', 'stepsfeature_index1343', 'heartfeature_index671', 'heartfeature_index672', 'heartfeature_index1343', 'time', 'dayofweek', 'timeofweek']
prediction accuracy based on test data is 0.628737 


In [71]:
# Bagging
# Fit on the training partition with the full feature list and target 'inactive'.
bg_fit_result = get_selected_model(
    ndata1_train, ndata1_features, 'inactive', 'Bagging', optimized=False
)
ndata1_bg_model, ndata1_bg_model_features = bg_fit_result

# Score the fitted model on the held-out partition using the returned features.
bg_prediction_result = get_prediction(
    ndata1_test, ndata1_bg_model, 'inactive', ndata1_bg_model_features
)
ndata1_bg_model_predicted, ndata1_bg_model_accuracy = bg_prediction_result

['stepsfeature_index671', 'stepsfeature_index672', 'stepsfeature_index1343', 'heartfeature_index671', 'heartfeature_index672', 'heartfeature_index1343', 'time', 'dayofweek', 'timeofweek']
prediction accuracy based on test data is 0.633558 


In [102]:
# Adaboost
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Sweep n_estimators over 10..99 with depth-1 trees (decision stumps) as the
# base learner, collecting the test-set accuracy returned for each setting.
# NOTE(review): the sweep is scored on the test partition and no random_state
# is set, so the averaged figure is neither a clean hold-out estimate nor
# exactly reproducible — confirm this is acceptable for the comparison.
ada_score_lst = []
for i in range(10, 100):
    ada_method = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=i)
    adaboost_model, adaboost_feature = get_model(ndata1_train, ndata1_features, 'inactive', ada_method,
                                                 optimized=False)
    ada_predicted, ada_accuracy = get_prediction(ndata1_test, adaboost_model, 'inactive', adaboost_feature)
    ada_score_lst.append(ada_accuracy)

# Use numpy's mean explicitly: the bare name `mean` is only available when
# something such as `%pylab` has injected it into the notebook namespace.
print(np.mean(ada_score_lst))  # 0.661523625844 (value noted from an earlier run)

['stepsfeature_index671', 'stepsfeature_index672', 'stepsfeature_index1343', 'heartfeature_index671', 'heartfeature_index672', 'heartfeature_index1343', 'time', 'dayofweek', 'timeofweek']
prediction accuracy based on test data is 0.657666 
['stepsfeature_index671', 'stepsfeature_index672', 'stepsfeature_index1343', 'heartfeature_index671', 'heartfeature_index672', 'heartfeature_index1343', 'time', 'dayofweek', 'timeofweek']
prediction accuracy based on test data is 0.656702 
['stepsfeature_index671', 'stepsfeature_index672', 'stepsfeature_index1343', 'heartfeature_index671', 'heartfeature_index672', 'heartfeature_index1343', 'time', 'dayofweek', 'timeofweek']
prediction accuracy based on test data is 0.660559 
['stepsfeature_index671', 'stepsfeature_index672', 'stepsfeature_index1343', 'heartfeature_index671', 'heartfeature_index672', 'heartfeature_index1343', 'time', 'dayofweek', 'timeofweek']
prediction accuracy based on test data is 0.657666 
['stepsfeature_index671', 'stepsfeature_