In [21]:
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt
import re
import ray
import datetime
from scipy.signal import argrelextrema
%matplotlib inline

In [22]:
## put in the file directory where you stored the data
## e.g. '/home/ab/cd/FITriMS_Intraday_All.09.10.17 (2).csv'
mypath = './data/FITriMS_Intraday_All_09_10_17.csv'
fitM = pd.read_csv(mypath)
# sample = pd.read_csv('/home/eli/Downloads/sample_trainset.csv')

In [23]:
fitM['ID'].value_counts().sort_values()

10        220
11        484
2         966
96       1211
101      1335
        ...  
44     183366
35     224171
51     240132
50     254002
24     257303
Name: ID, Length: 98, dtype: int64

In [24]:
fitM['device'].value_counts().sort_values()

Charge        37270
Charge 2      40063
Flex 2        45392
Flex        6550036
Name: device, dtype: int64

In [25]:
fitM['resource'].value_counts()

steps    6672761
Name: resource, dtype: int64

In [26]:
new_df = fitM[fitM['ID'] == 7]
new_df

Unnamed: 0,ID,date,minute,steps,resource,device
0,7,20JUL2015,582,90,steps,Flex
1,7,20JUL2015,583,112,steps,Flex
2,7,20JUL2015,584,46,steps,Flex
3,7,20JUL2015,587,4,steps,Flex
4,7,20JUL2015,588,4,steps,Flex
...,...,...,...,...,...,...
120720,7,25AUG2017,1331,12,steps,Flex
120721,7,25AUG2017,1334,11,steps,Flex
120722,7,25AUG2017,1335,44,steps,Flex
120723,7,25AUG2017,1336,19,steps,Flex


In [27]:
def for_trainset(df, pat_ID, dest_dir):
    
    """
    Select data from df corresponding to patient with ID "pat_ID". Then convert it
    to the format as required by the trainset. Save the dataframe as a csv file in 
    the local directory with the name "trainset_patient + ID" (e.g. "trainset_patient7"). Also return
    the dataframe as the output.
    
    Parameters: 
    df (pandas.DataFrame): the dataframe containing all the data
    patient_ID (int) : the ID of the desired patient data
    dest_dir: path to the directory to store the output dataframe (should end with '/')

  
    Returns: 
    panda.DataFrame: A dataframe contains the data for a certain patient in the trainset format

    """
    
    
    ## Get the data for the patient
    new_df = df[df['ID'] == pat_ID]
    
    ## Drop unnecessary columns
    new_df.drop(columns=['resource', 'device'], inplace=True)
    
    ## Convert the date into datetime format and add minutes
    new_df['date'] = pd.to_datetime(new_df['date'])
    new_df['minute dt'] = pd.TimedeltaIndex(new_df['minute'], unit='m')
    new_df['date minute'] = new_df['date'] + new_df['minute dt']
    
    # Drop columns not needed for trainset
    new_df.drop(columns=['date', 'minute', 'minute dt', 'ID'], inplace=True)
    
    ## Add columns 'label', 'filename', 'timestamp' as required by the trainset
    new_df.rename(columns={"steps": "value", "date minute": "timestamp"}, inplace=True)
    new_df['label'] = 0
    new_df['filename'] = "trainset_patient" + str(pat_ID)
    new_df = new_df[['filename', 'timestamp', 'value', 'label']]
    new_df['timestamp'] = new_df['timestamp'].apply(datetime.datetime.isoformat)
    new_df['timestamp'] = new_df['timestamp'] + 'Z'
    new_df['value'] = new_df['value'].apply(float)
    
    ## Save the new_df to a csv file
    new_df.to_csv(dest_dir + 'trainset_patient' + str(pat_ID) + '.csv',  index=False)
    
    return new_df

In [28]:
trainset_patient7_full = for_trainset(fitM, 7, 'data/')
trainset_patient7_full

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['date'] = pd.to_datetime(new_df['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['minute dt'] = pd.TimedeltaIndex(new_df['minute'], unit='m')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

S

Unnamed: 0,filename,timestamp,value,label
0,trainset_patient7,2015-07-20T09:42:00Z,90.0,0
1,trainset_patient7,2015-07-20T09:43:00Z,112.0,0
2,trainset_patient7,2015-07-20T09:44:00Z,46.0,0
3,trainset_patient7,2015-07-20T09:47:00Z,4.0,0
4,trainset_patient7,2015-07-20T09:48:00Z,4.0,0
...,...,...,...,...
120720,trainset_patient7,2017-08-25T22:11:00Z,12.0,0
120721,trainset_patient7,2017-08-25T22:14:00Z,11.0,0
120722,trainset_patient7,2017-08-25T22:15:00Z,44.0,0
120723,trainset_patient7,2017-08-25T22:16:00Z,19.0,0


In [29]:
trainset_patient7_20150720 = trainset_patient7_full[trainset_patient7_full['timestamp'].str.contains('2015-07-20')]
trainset_patient7_20150720

Unnamed: 0,filename,timestamp,value,label
0,trainset_patient7,2015-07-20T09:42:00Z,90.0,0
1,trainset_patient7,2015-07-20T09:43:00Z,112.0,0
2,trainset_patient7,2015-07-20T09:44:00Z,46.0,0
3,trainset_patient7,2015-07-20T09:47:00Z,4.0,0
4,trainset_patient7,2015-07-20T09:48:00Z,4.0,0
...,...,...,...,...
212,trainset_patient7,2015-07-20T22:46:00Z,33.0,0
213,trainset_patient7,2015-07-20T22:52:00Z,8.0,0
214,trainset_patient7,2015-07-20T23:52:00Z,6.0,0
215,trainset_patient7,2015-07-20T23:53:00Z,100.0,0


In [30]:
trainset_patient7_20150720.to_csv('data/trainset_patient7_20150720.csv',  index=False)

In [31]:
trainset_patient7_201507 = trainset_patient7_full[trainset_patient7_full['timestamp'].str.contains('2015-07')]
trainset_patient7_201507

Unnamed: 0,filename,timestamp,value,label
0,trainset_patient7,2015-07-20T09:42:00Z,90.0,0
1,trainset_patient7,2015-07-20T09:43:00Z,112.0,0
2,trainset_patient7,2015-07-20T09:44:00Z,46.0,0
3,trainset_patient7,2015-07-20T09:47:00Z,4.0,0
4,trainset_patient7,2015-07-20T09:48:00Z,4.0,0
...,...,...,...,...
2757,trainset_patient7,2015-07-31T22:28:00Z,2.0,0
2758,trainset_patient7,2015-07-31T22:59:00Z,12.0,0
2759,trainset_patient7,2015-07-31T23:00:00Z,16.0,0
2760,trainset_patient7,2015-07-31T23:09:00Z,4.0,0


In [33]:
trainset_patient7_201507.to_csv('data/trainset_patient7_201507.csv',  index=False)