In [1]:
import pandas as pd
import numpy as np
import patsy
import time

from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from datetime_truncate import truncate
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

In [2]:
# nightly sleep log for the exercise, one row per night
sleep_csv_path = "../../Data_Science_Coding_Exercise/sleepdata.csv"
sleep_df = pd.read_csv(sleep_csv_path)

In [3]:
# inspect the sleep log exactly as it was loaded from the CSV
sleep_df

Unnamed: 0,Night_date,Sleep_time,Wakeup_time,Time_in_bed,Sleep_rating,Notes,Steps_taken
0,2014-04-29,2014-04-30 00:00:40,2014-04-30 08:14:20,8:13,OK,Drank tea:Early day tomorrow:Massage,3874.0
1,2014-04-30,2014-05-01 00:36:25,2014-05-01 09:13:31,8:37,OK,Ate late:Drank coffee:Early day tomorrow:Massage,3973.0
2,2014-05-01,2014-05-02 00:23:13,2014-05-02 09:27:02,9:03,OK,Drank coffee:Worked out,2513.0
3,2014-05-02,2014-05-03 01:06:28,2014-05-03 09:26:28,8:19,OK,Ate late:Had alcohol:cool night:Massage:Worked...,159.0
4,2014-05-03,2014-05-04 01:21:08,2014-05-04 10:17:36,8:56,OK,Ate late:Had alcohol:Worked out,2177.0
5,2014-05-07,2014-05-08 00:31:23,2014-05-08 09:32:38,9:01,OK,Early day tomorrow:Had alcohol:cool night,5294.0
6,2014-05-08,2014-05-08 23:34:45,2014-05-09 09:54:17,10:19,OK,Ate late:Had alcohol,8666.0
7,2014-05-16,2014-05-17 00:43:17,2014-05-17 08:28:26,7:45,OK,Stressful day:Worked out,14686.0
8,2014-05-18,2014-05-19 00:07:10,2014-05-19 09:13:54,9:06,Poor,Early day tomorrow,1975.0
9,2014-05-22,2014-05-23 00:04:19,2014-05-23 08:20:27,8:16,OK,Early day tomorrow:Worked out,2322.0


In [4]:
# converting time_in_bed column to minutes
# the 'h:mm' strings are parsed as clock times so the hour and minute parts can be read off.
# the parsed index is called bed_times rather than `time`, so the `time` module imported
# at the top of the notebook is not overwritten by this cell.
bed_times = pd.DatetimeIndex(sleep_df['Time_in_bed'])
time_mins = bed_times.hour*60 + bed_times.minute

In [5]:
# replacing previous hh:mm format to simple minutes
# (this overwrites the column in place, so the original hh:mm strings are no longer in sleep_df)
sleep_df['Time_in_bed'] = time_mins

In [6]:
# check that Time_in_bed now holds minutes instead of hh:mm strings
sleep_df

Unnamed: 0,Night_date,Sleep_time,Wakeup_time,Time_in_bed,Sleep_rating,Notes,Steps_taken
0,2014-04-29,2014-04-30 00:00:40,2014-04-30 08:14:20,493,OK,Drank tea:Early day tomorrow:Massage,3874.0
1,2014-04-30,2014-05-01 00:36:25,2014-05-01 09:13:31,517,OK,Ate late:Drank coffee:Early day tomorrow:Massage,3973.0
2,2014-05-01,2014-05-02 00:23:13,2014-05-02 09:27:02,543,OK,Drank coffee:Worked out,2513.0
3,2014-05-02,2014-05-03 01:06:28,2014-05-03 09:26:28,499,OK,Ate late:Had alcohol:cool night:Massage:Worked...,159.0
4,2014-05-03,2014-05-04 01:21:08,2014-05-04 10:17:36,536,OK,Ate late:Had alcohol:Worked out,2177.0
5,2014-05-07,2014-05-08 00:31:23,2014-05-08 09:32:38,541,OK,Early day tomorrow:Had alcohol:cool night,5294.0
6,2014-05-08,2014-05-08 23:34:45,2014-05-09 09:54:17,619,OK,Ate late:Had alcohol,8666.0
7,2014-05-16,2014-05-17 00:43:17,2014-05-17 08:28:26,465,OK,Stressful day:Worked out,14686.0
8,2014-05-18,2014-05-19 00:07:10,2014-05-19 09:13:54,546,Poor,Early day tomorrow,1975.0
9,2014-05-22,2014-05-23 00:04:19,2014-05-23 08:20:27,496,OK,Early day tomorrow:Worked out,2322.0


In [7]:
# changing any nan values in steps_taken to simply 0
# fillna on the whole column replaces the per-row chained assignment
# (sleep_df['Steps_taken'][i] = 0), which pandas flags with SettingWithCopyWarning
# because the write may land on a temporary copy rather than on sleep_df itself
sleep_df['Steps_taken'] = sleep_df['Steps_taken'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [8]:
# summarise each night's accelerometer trace by the variance of its movement readings,
# giving one number per night that describes how much the user moved during sleep

def _movement_variance(night):
    """Return the variance of the 'movement' column in one night's accelerometer file."""
    # the first 10 characters of the night value (YYYY-MM-DD) select that night's file
    night_stamp = str(night)[:10]
    trace_path = '../../Data_Science_Coding_Exercise/accelerometer/data_%s.csv' % night_stamp
    # the files are read without a header row, so the two column names are supplied here
    trace = pd.read_csv(trace_path, names = ['time(ms)', 'movement'])
    return np.var(trace['movement'])

var_movements = [_movement_variance(sleep_df['Night_date'][row]) for row in range(len(sleep_df))]

sleep_df['var_movements'] = var_movements

In [9]:
# check that the var_movements column was added
sleep_df

Unnamed: 0,Night_date,Sleep_time,Wakeup_time,Time_in_bed,Sleep_rating,Notes,Steps_taken,var_movements
0,2014-04-29,2014-04-30 00:00:40,2014-04-30 08:14:20,493,OK,Drank tea:Early day tomorrow:Massage,3874.0,18204360000000.0
1,2014-04-30,2014-05-01 00:36:25,2014-05-01 09:13:31,517,OK,Ate late:Drank coffee:Early day tomorrow:Massage,3973.0,31228970000000.0
2,2014-05-01,2014-05-02 00:23:13,2014-05-02 09:27:02,543,OK,Drank coffee:Worked out,2513.0,19378950000000.0
3,2014-05-02,2014-05-03 01:06:28,2014-05-03 09:26:28,499,OK,Ate late:Had alcohol:cool night:Massage:Worked...,159.0,27002560000000.0
4,2014-05-03,2014-05-04 01:21:08,2014-05-04 10:17:36,536,OK,Ate late:Had alcohol:Worked out,2177.0,6340275000000.0
5,2014-05-07,2014-05-08 00:31:23,2014-05-08 09:32:38,541,OK,Early day tomorrow:Had alcohol:cool night,5294.0,2671272000000.0
6,2014-05-08,2014-05-08 23:34:45,2014-05-09 09:54:17,619,OK,Ate late:Had alcohol,8666.0,2426893000000.0
7,2014-05-16,2014-05-17 00:43:17,2014-05-17 08:28:26,465,OK,Stressful day:Worked out,14686.0,5731557000000.0
8,2014-05-18,2014-05-19 00:07:10,2014-05-19 09:13:54,546,Poor,Early day tomorrow,1975.0,5918767000000.0
9,2014-05-22,2014-05-23 00:04:19,2014-05-23 08:20:27,496,OK,Early day tomorrow:Worked out,2322.0,3557770000000.0


In [10]:
# here we code each possible note in the 'Notes' column to their own separate binary column (0 for no, 1 for yes)

# (new column name, text searched for inside 'Notes'); the list order is the order
# in which the columns are appended to sleep_df
note_flags = [
    ('drank_tea', 'Drank tea'),
    ('drank_coffee', 'Drank coffee'),
    ('early_day_tmrw', 'Early day tomorrow'),
    ('had_alcohol', 'Had alcohol'),
    ('ate_late', 'Ate late'),
    ('worked_out', 'Worked out'),
    ('stressful_day', 'Stressful day'),
    ('cool_night', 'cool night'),
    ('massage', 'Massage'),
]

# cast to str first so a missing note becomes the text 'nan', which matches none of the labels
notes_text = sleep_df['Notes'].astype(str)

for flag_col, note_label in note_flags:
    # regex=False makes this a plain, case-sensitive substring test; the boolean result is stored as 0/1
    sleep_df[flag_col] = notes_text.str.contains(note_label, regex=False).astype(int)

In [11]:
# check that the nine binary note columns were added
sleep_df

Unnamed: 0,Night_date,Sleep_time,Wakeup_time,Time_in_bed,Sleep_rating,Notes,Steps_taken,var_movements,drank_tea,drank_coffee,early_day_tmrw,had_alcohol,ate_late,worked_out,stressful_day,cool_night,massage
0,2014-04-29,2014-04-30 00:00:40,2014-04-30 08:14:20,493,OK,Drank tea:Early day tomorrow:Massage,3874.0,18204360000000.0,1,0,1,0,0,0,0,0,1
1,2014-04-30,2014-05-01 00:36:25,2014-05-01 09:13:31,517,OK,Ate late:Drank coffee:Early day tomorrow:Massage,3973.0,31228970000000.0,0,1,1,0,1,0,0,0,1
2,2014-05-01,2014-05-02 00:23:13,2014-05-02 09:27:02,543,OK,Drank coffee:Worked out,2513.0,19378950000000.0,0,1,0,0,0,1,0,0,0
3,2014-05-02,2014-05-03 01:06:28,2014-05-03 09:26:28,499,OK,Ate late:Had alcohol:cool night:Massage:Worked...,159.0,27002560000000.0,0,0,0,1,1,1,0,1,1
4,2014-05-03,2014-05-04 01:21:08,2014-05-04 10:17:36,536,OK,Ate late:Had alcohol:Worked out,2177.0,6340275000000.0,0,0,0,1,1,1,0,0,0
5,2014-05-07,2014-05-08 00:31:23,2014-05-08 09:32:38,541,OK,Early day tomorrow:Had alcohol:cool night,5294.0,2671272000000.0,0,0,1,1,0,0,0,1,0
6,2014-05-08,2014-05-08 23:34:45,2014-05-09 09:54:17,619,OK,Ate late:Had alcohol,8666.0,2426893000000.0,0,0,0,1,1,0,0,0,0
7,2014-05-16,2014-05-17 00:43:17,2014-05-17 08:28:26,465,OK,Stressful day:Worked out,14686.0,5731557000000.0,0,0,0,0,0,1,1,0,0
8,2014-05-18,2014-05-19 00:07:10,2014-05-19 09:13:54,546,Poor,Early day tomorrow,1975.0,5918767000000.0,0,0,1,0,0,0,0,0,0
9,2014-05-22,2014-05-23 00:04:19,2014-05-23 08:20:27,496,OK,Early day tomorrow:Worked out,2322.0,3557770000000.0,0,0,1,0,0,1,0,0,0


In [12]:
# target: the sleep rating recorded for each night
Y = sleep_df['Sleep_rating'].values

# predictors: every remaining column except the date/time stamps (left out because they
# could confound the findings), the target itself and the raw free-text notes
non_feature_cols = ['Night_date', 'Sleep_time', 'Wakeup_time', 'Sleep_rating', 'Notes']
X = sleep_df[[col for col in sleep_df.columns if col not in non_feature_cols]]
X_cols = X.columns

# standardise each predictor (subtract its mean, divide by its standard deviation) before modeling
# NOTE(review): a column with zero spread would divide by 0 here and come out as NaN
col_means = X.mean()
col_stds = X.std()
X = (X - col_means)/col_stds
X = X.values

In [13]:
# we'll start our modeling by gridsearching over a knn model below
knn = KNeighborsClassifier()

knn_params = {
    # odd neighbour counts 1, 3, ..., 25 as integers. np.linspace(1, 25, 13) produces the
    # same values as floats, and n_neighbors is used as a slice bound inside scikit-learn,
    # so it has to be an int
    'n_neighbors': list(range(1, 26, 2)),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree']
}

# 5-fold grid search over every combination of the parameters above
knn_gs = GridSearchCV(knn, knn_params, cv = 5)
knn_gs.fit(X, Y)

# keep the best configuration, report its mean 5-fold score, then fit it on all of X, Y
# so that later cells can call knn.predict
knn = knn_gs.best_estimator_
knn_score = np.mean(cross_val_score(knn, X, Y, cv = 5))
knn.fit(X, Y)
print(knn_score)

  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  neigh_ind = neigh_ind[:, :n_neighbors]


0.682619047619


In [14]:
# a cross-validated score of about 0.68 looks respectable, but what does the classification report tell us?
# note: the predictions below are made on the same X the model was fit on, so this report is in-sample
rating_labels = ['Great', 'OK', 'Poor']
knn_pred = knn.predict(X)
knn_true = Y
knn_report = classification_report(knn_true, knn_pred, target_names = rating_labels)
print(knn_report)

             precision    recall  f1-score   support

      Great       0.67      0.29      0.40         7
         OK       0.71      0.96      0.81        23
       Poor       0.00      0.00      0.00         4

avg / total       0.62      0.71      0.63        34



  'precision', 'predicted', average, warn_for)


In [15]:
# ratings predicted by the KNN model for every night in X
knn_pred

array(['OK', 'OK', 'OK', 'OK', 'OK', 'OK', 'OK', 'OK', 'OK', 'OK', 'OK',
       'OK', 'OK', 'OK', 'OK', 'OK', 'Great', 'OK', 'OK', 'OK', 'Great',
       'OK', 'OK', 'OK', 'OK', 'OK', 'OK', 'OK', 'OK', 'Great', 'OK', 'OK',
       'OK', 'OK'], dtype=object)

In [16]:
# recorded ratings for the same nights, for comparison with the KNN predictions
knn_true

array(['OK', 'OK', 'OK', 'OK', 'OK', 'OK', 'OK', 'OK', 'Poor', 'OK', 'OK',
       'OK', 'Great', 'OK', 'Poor', 'OK', 'Great', 'Great', 'Great', 'OK',
       'OK', 'OK', 'Poor', 'OK', 'Poor', 'OK', 'Great', 'Great', 'OK',
       'Great', 'OK', 'OK', 'OK', 'OK'], dtype=object)

The results above show that the model almost exclusively predicts OK, which, while resulting in a fairly high accuracy (the score reported for a classifier here is mean accuracy, not R²), isn't really what we want. Below we try a different classifier: a decision tree.

In [17]:
# fixed seed so that the choice between equally good splits, and therefore the scores
# printed below, is the same on every run
dtc = DecisionTreeClassifier(random_state = 42)

dtc_params = {
    'max_features': [None],
    'max_depth': [1, 2, 3, None],
    # same five values as np.linspace(1, 101, 5), written as integers: scikit-learn reads an
    # int here as a sample count, whereas newer releases read a float as a fraction of the samples
    'min_samples_leaf': [1, 26, 51, 76, 101]
}

# 5-fold grid search over every combination of the parameters above
dtc_gs = GridSearchCV(dtc, dtc_params, cv = 5)
dtc_gs.fit(X, Y)

# keep the best configuration, report its mean 5-fold score, then fit it on all of X, Y
# so that later cells can call dtc.predict
dtc = dtc_gs.best_estimator_
dtc_score = np.mean(cross_val_score(dtc, X, Y, cv = 5))
dtc.fit(X, Y)
print(dtc_score)

0.75119047619


In [18]:
# per-class precision/recall for the decision tree
# note: the predictions are made on the same X the tree was fit on, so this report is in-sample
rating_labels = ['Great', 'OK', 'Poor']
dtc_pred = dtc.predict(X)
dtc_true = Y
dtc_report = classification_report(dtc_true, dtc_pred, target_names = rating_labels)
print(dtc_report)

             precision    recall  f1-score   support

      Great       0.75      0.86      0.80         7
         OK       0.81      0.91      0.86        23
       Poor       0.00      0.00      0.00         4

avg / total       0.70      0.79      0.74        34



In [19]:
# ratings predicted by the decision tree for every night in X
dtc_pred

array(['OK', 'OK', 'OK', 'OK', 'OK', 'OK', 'Great', 'OK', 'OK', 'OK', 'OK',
       'OK', 'OK', 'OK', 'OK', 'OK', 'Great', 'Great', 'Great', 'OK',
       'Great', 'OK', 'OK', 'OK', 'OK', 'OK', 'Great', 'Great', 'OK',
       'Great', 'OK', 'OK', 'OK', 'OK'], dtype=object)

In [20]:
# recorded ratings for the same nights, for comparison with the decision tree predictions
dtc_true

array(['OK', 'OK', 'OK', 'OK', 'OK', 'OK', 'OK', 'OK', 'Poor', 'OK', 'OK',
       'OK', 'Great', 'OK', 'Poor', 'OK', 'Great', 'Great', 'Great', 'OK',
       'OK', 'OK', 'Poor', 'OK', 'Poor', 'OK', 'Great', 'Great', 'OK',
       'Great', 'OK', 'OK', 'OK', 'OK'], dtype=object)

Now we see greater variability in the model's predictions. It still doesn't attempt to predict any Poors, but we see more Greats and, thus, a higher accuracy. As a final test, let's try wrapping a Decision Tree Classifier in a Bagging Classifier.

In [21]:
# seeded so the base trees are built the same way on every run
dtc = DecisionTreeClassifier(random_state = 42)

# setting our parameters for the bagging classifier below
# the base_estimator__ entries are passed through to the wrapped decision tree;
# max_features / max_samples are the fractions of columns / rows drawn for each tree
bag_params = {
    'base_estimator__max_features': [None],
    'base_estimator__max_depth': [None],
    'base_estimator__min_samples_leaf': [1],
    'max_features': [0.33, 0.66, 0.99],
    'max_samples': [0.1, 0.2, 0.4, 0.6, 0.8, 0.9],
    'n_estimators': [100]
}

# instantiating the bagging classifier with our decision tree classifier as the base estimator
# bagging draws random subsets of rows and columns, so without a fixed random_state the
# selected parameters, the score and the predictions change from run to run
bag = BaggingClassifier(dtc, random_state = 42)

# using the gridsearch method to search over our parameters above and see which set of parameters is the most optimal
bag_gs = GridSearchCV(bag, bag_params, cv = 5)
bag_gs.fit(X, Y)

# keep the best configuration and report its mean 5-fold score
bag = bag_gs.best_estimator_
bag_score = np.mean(cross_val_score(bag, X, Y, cv = 5))
print(bag_score)

0.644047619048


In [22]:
# classification report for the bagged model above
# it does well on Greats and OKs but poorly on Poors, perhaps because
# only 4 nights in the data were rated Poor
# note: the predictions are made on the same X used for fitting, so this report is in-sample
rating_labels = ['Great', 'OK', 'Poor']
bag_pred = bag.predict(X)
bag_true = Y
bag_report = classification_report(bag_true, bag_pred, target_names = rating_labels)
print(bag_report)

             precision    recall  f1-score   support

      Great       1.00      0.86      0.92         7
         OK       0.82      1.00      0.90        23
       Poor       0.00      0.00      0.00         4

avg / total       0.76      0.85      0.80        34



In [23]:
# recorded ratings for every night, for comparison with the bagging predictions
bag_true

array(['OK', 'OK', 'OK', 'OK', 'OK', 'OK', 'OK', 'OK', 'Poor', 'OK', 'OK',
       'OK', 'Great', 'OK', 'Poor', 'OK', 'Great', 'Great', 'Great', 'OK',
       'OK', 'OK', 'Poor', 'OK', 'Poor', 'OK', 'Great', 'Great', 'OK',
       'Great', 'OK', 'OK', 'OK', 'OK'], dtype=object)

In [24]:
# ratings predicted by the bagged model for every night in X
bag_pred

array(['OK', 'OK', 'OK', 'OK', 'OK', 'OK', 'OK', 'OK', 'OK', 'OK', 'OK',
       'OK', 'OK', 'OK', 'OK', 'OK', 'Great', 'Great', 'Great', 'OK', 'OK',
       'OK', 'OK', 'OK', 'OK', 'OK', 'Great', 'Great', 'OK', 'Great', 'OK',
       'OK', 'OK', 'OK'], dtype=object)

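In earlier runs the bagged model was willing to try predicting Poors, which is nice, but in the run shown above `bag_pred` contains no Poor predictions (recall for Poor is 0.00); bagging is randomised, so the predictions change from run to run unless a seed is fixed. Its cross-validated accuracy (0.64) is also lower than the single decision tree's (0.75), as it's not very accurate on the held-out folds. The model's accuracy should, however, increase if it is exposed to more data.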

If we were to receive more data, it'd be a simple task to clean the data, normalize it, and use our model's `predict` method to see its predictions and how accurate they are compared to the true values.