In [4]:
import tensorflow as tf
from tensorflow.keras.models import load_model
import numpy as np
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import StructField, StructType, DoubleType,MapType, StringType,ArrayType, FloatType, TimestampType, IntegerType
from cerebralcortex.core.datatypes import DataStream
from cerebralcortex.core.metadata_manager.stream.metadata import Metadata, DataDescriptor, \
ModuleMetadata
from scipy import signal
from scipy.interpolate import interp1d
from cerebralcortex import Kernel
from scipy.stats import skew,kurtosis,mode
from collections import Counter
import pandas as pd
import os
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings(
    'ignore',
)
import pickle
# Kernel handle used by the cells below to read streams; built from the
# config directory given here for the 'mperf' study.
CC = Kernel(
    "/home/jupyter/cc3_conf/",
    study_name='mperf',
)

In [None]:
# Export per-user training/testing pickles for each activity stream. Output goes
# to './data/<window_size>/<activity>_moral/{training,testing}/<user_id>'.
activities = ['Stationery','Stairs','Exercise','Walking','Sports']
window_size = 20            # used in the '<n>.secs.' part of the stream name and in the output path
training_percentage = .8    # leading fraction of each user's rows written to 'training'
for activity in activities[::-1]:   # the activity list is walked in reverse order
    directory = './data/'+str(window_size)+'/'+activity.lower()+'_moral/'
    # exist_ok=True replaces the separate isdir() checks.
    os.makedirs(directory+'training', exist_ok=True)
    os.makedirs(directory+'testing', exist_ok=True)
    stream_name = 'accelerometer--org.md2k.motionsense--motion_sense--right_wrist'+'.activity.all.three.'+str(window_size)+'.secs.'+str(activity.lower())
    users = CC.get_stream(stream_name).select('user').distinct().toPandas()['user'].values
    for i,user_id in enumerate(users):
        data = CC.get_stream(stream_name,user_id=user_id)
        data = data.toPandas()
        data = data.sort_values('timestamp').reset_index(drop=True)
        # 'time' holds the value returned by .timestamp() on each 'start' entry.
        data['time'] = data['start'].apply(lambda a:a.timestamp())
        # Each 'data' entry becomes an array reshaped to (1, -1, 3).
        data['data'] = data['data'].apply(lambda a:np.array(a).reshape(1,-1,3))
        # Positional split of the timestamp-sorted rows: head -> training, tail -> testing.
        split_at = int(data.shape[0]*training_percentage)
        training_data = data.iloc[:split_at]
        testing_data = data.iloc[split_at:]
        # Context managers make sure each file is flushed and closed before the next user.
        with open(directory+'training/'+str(user_id),'wb') as training_file:
            pickle.dump(training_data,training_file)
        with open(directory+'testing/'+str(user_id),'wb') as testing_file:
            pickle.dump(testing_data,testing_file)
        # Progress: user index and number of testing rows.
        print(i,testing_data.shape[0],end=',')
    print()

In [5]:
def get_shortened_data(data_windowed,minutes=2000,window_size=20,training_percentage=.8):
    """Keep only the trailing part of each user's windowed data.

    Two grouped-map pandas UDFs are applied one after the other:

    1. Per ``(user, day)`` group: rows are sorted by ``timestamp`` and the
       leading ``training_percentage`` fraction is dropped.
    2. Per ``user`` group: rows are sorted by ``timestamp`` and only the last
       ``m`` rows are kept, where ``m = min(n, rows // 2)`` and
       ``n = int(minutes * 60 / window_size)``.

    Args:
        data_windowed: object exposing ``.schema`` and ``.groupBy(...).apply(...)``;
            it must have ``user``, ``day`` and ``timestamp`` columns.
        minutes: with ``window_size``, determines the per-user row cap ``n``.
        window_size: divisor used when turning ``minutes`` into a row count.
        training_percentage: fraction of each (user, day) group that is dropped
            from the front.

    Returns:
        The result of the two ``groupBy(...).apply(...)`` calls; both UDFs are
        declared with the input schema.
    """
    schema = data_windowed.schema

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def get_user_data(df):
        # Row order inside a Spark group is not guaranteed, so sort before the
        # positional slice; otherwise "the last part of the day" is arbitrary.
        df = df.sort_values('timestamp').reset_index(drop=True)
        first_kept = int(df.shape[0]*training_percentage)
        return df.iloc[first_kept:]

    data_windowed = data_windowed.groupBy(['user','day']).apply(get_user_data)

    # Per-user cap on the number of rows.
    n = int(minutes*60/window_size)

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def get_user_data_sort(df):
        df = df.sort_values('timestamp').reset_index(drop=True)
        total_rows = df.shape[0]
        # Never keep more than half of the user's rows, even when n is larger.
        m = min(n, total_rows//2)
        return df.iloc[total_rows-m:]

    data_windowed = data_windowed.groupBy('user').apply(get_user_data_sort)
    return data_windowed
    
# Export per-user *testing* pickles to './data/<window_size>/<activity>/testing/<user_id>'.
# The training directory is created but nothing is written to it by this cell.
activities = ['walking']
window_size = 20
training_percentage = .8    # rows before this fraction are skipped for non-stationery activities
max_testing_rows = 3000     # cap on testing rows per user for non-stationery activities
for activity in activities:
    print('-'*20+activity+'-'*20)
    directory = './data/'+str(window_size)+'/'+activity+'/'
    # exist_ok=True replaces the separate isdir() checks.
    os.makedirs(directory+'training', exist_ok=True)
    os.makedirs(directory+'testing', exist_ok=True)
    stream_name = 'org.md2k.feature.motionsensehrv.decoded.rightwrist.all.activity.all.three.'+str(window_size)+'.secs.'+str(activity)
    users = CC.get_stream(stream_name).select('user').distinct().toPandas()['user'].values
    for i,user_id in enumerate(users):
        data = CC.get_stream(stream_name,user_id=user_id)
        if activity=='stationery':
            # Stationery data is trimmed by get_shortened_data before collecting it.
            data = get_shortened_data(data,minutes=1000,window_size=window_size)
        data = data.toPandas()
        data = data.sort_values('timestamp').reset_index(drop=True)
        # 'time' holds the value returned by .timestamp() on each 'start' entry.
        data['time'] = data['start'].apply(lambda a:a.timestamp())
        # Each 'data' entry becomes an array reshaped to (1, -1, 3).
        data['data'] = data['data'].apply(lambda a:np.array(a).reshape(1,-1,3))
        if activity=='stationery':
            # Already trimmed above; everything that is left is testing data.
            testing_data = data
        else:
            # Tail of the timestamp-sorted rows, capped at max_testing_rows.
            split_at = int(data.shape[0]*training_percentage)
            testing_data = data.iloc[split_at:].iloc[:max_testing_rows]
        # Context manager makes sure the file is flushed and closed before the next user.
        with open(directory+'testing/'+str(user_id),'wb') as testing_file:
            pickle.dump(testing_data,testing_file)
        # Progress: user index and number of testing rows.
        print(i,testing_data.shape[0],end=',')
    print()

--------------------walking--------------------
0 1310,1 1827,2 971,3 9,4 661,5 3,6 2047,7 1284,8 1213,9 891,10 593,11 1251,12 67,13 137,14 885,15 593,16 1537,17 1517,18 1707,19 1119,20 465,21 1675,22 1140,23 497,24 581,25 701,26 10,27 938,28 1054,29 937,30 509,31 531,32 681,33 1418,34 1694,35 671,36 505,37 62,38 1437,39 741,40 1806,41 1257,42 1086,43 985,44 681,45 1046,46 600,47 628,48 1554,49 988,50 455,51 1623,52 828,53 703,54 485,55 441,56 403,57 1731,58 1977,59 1493,60 2067,61 961,62 1077,63 778,64 899,65 189,66 456,67 1158,68 2347,69 1133,70 701,71 1111,72 468,73 84,74 1361,75 844,76 689,77 560,78 770,79 1880,80 1415,81 1081,82 1015,83 845,84 110,85 1392,86 634,87 1310,88 1479,89 832,90 539,91 1231,92 1071,93 716,94 490,95 4,96 1100,97 1556,98 1035,99 940,100 629,101 688,102 3000,103 1987,104 31,105 1502,106 975,107 813,108 468,109 1119,110 410,111 89,112 71,113 427,114 765,115 929,116 732,117 457,118 83,119 54,120 947,121 741,122 857,123 855,124 482,125 1051,126 1119,127 664,128

In [None]:
def get_shortened_data(data_windowed,minutes=2000,window_size=20,training_percentage=.3):
    """Keep only the leading part of each user's windowed data.

    NOTE: this redefines the ``get_shortened_data`` from the earlier cell, which
    keeps the *trailing* part instead; whichever cell ran last wins.

    Two grouped-map pandas UDFs are applied one after the other:

    1. Per ``(user, day)`` group: rows are sorted by ``timestamp`` and only the
       leading ``training_percentage`` fraction is kept.
    2. Per ``user`` group: rows are sorted by ``timestamp`` and only the first
       ``n = int(minutes * 60 / window_size)`` rows are kept.

    Args:
        data_windowed: object exposing ``.schema`` and ``.groupBy(...).apply(...)``;
            it must have ``user``, ``day`` and ``timestamp`` columns.
        minutes: with ``window_size``, determines the per-user row cap ``n``.
        window_size: divisor used when turning ``minutes`` into a row count.
        training_percentage: fraction of each (user, day) group that is kept
            from the front.

    Returns:
        The result of the two ``groupBy(...).apply(...)`` calls; both UDFs are
        declared with the input schema.
    """
    schema = data_windowed.schema

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def get_user_data(df):
        # Row order inside a Spark group is not guaranteed, so sort before the
        # positional slice; otherwise "the first part of the day" is arbitrary.
        df = df.sort_values('timestamp').reset_index(drop=True)
        last_kept = int(df.shape[0]*training_percentage)
        return df.iloc[:last_kept]

    data_windowed = data_windowed.groupBy(['user','day']).apply(get_user_data)

    # Per-user cap on the number of rows.
    n = int(minutes*60/window_size)

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def get_user_data_sort(df):
        df = df.sort_values('timestamp').reset_index(drop=True)
        return df.iloc[:n]

    data_windowed = data_windowed.groupBy('user').apply(get_user_data_sort)
    return data_windowed
    
# Export per-user pickles to './data/<window_size>/<activity>/{training,testing}/<user_id>'.
# For 'stationery' only a training file is written, from the data trimmed by
# get_shortened_data; any other activity gets a training/testing split.
activities = ['stationery']
window_size = 20
training_percentage = .8    # leading fraction used as training data for non-stationery activities
for activity in activities:
    print('-'*20+activity+'-'*20)
    directory = './data/'+str(window_size)+'/'+activity+'/'
    # exist_ok=True replaces the separate isdir() checks.
    os.makedirs(directory+'training', exist_ok=True)
    os.makedirs(directory+'testing', exist_ok=True)
    stream_name = 'org.md2k.feature.motionsensehrv.decoded.rightwrist.all.activity.all.three.'+str(window_size)+'.secs.'+str(activity)
    users = CC.get_stream(stream_name).select('user').distinct().toPandas()['user'].values
    for i,user_id in enumerate(users):
        data = CC.get_stream(stream_name,user_id=user_id)
        if activity=='stationery':
            data = get_shortened_data(data,minutes=3000,window_size=window_size)
        data = data.toPandas()
        data = data.sort_values('timestamp').reset_index(drop=True)
        # 'time' holds the value returned by .timestamp() on each 'start' entry.
        data['time'] = data['start'].apply(lambda a:a.timestamp())
        # Each 'data' entry becomes an array reshaped to (1, -1, 3).
        data['data'] = data['data'].apply(lambda a:np.array(a).reshape(1,-1,3))
        if activity=='stationery':
            # Already trimmed above; everything that is left is training data.
            training_data = data
        else:
            # Positional split of the timestamp-sorted rows: head -> training, tail -> testing.
            split_at = int(data.shape[0]*training_percentage)
            training_data = data.iloc[:split_at]
            testing_data = data.iloc[split_at:]
            with open(directory+'testing/'+str(user_id),'wb') as testing_file:
                pickle.dump(testing_data,testing_file)
        # Context manager makes sure the file is flushed and closed before the next user.
        with open(directory+'training/'+str(user_id),'wb') as training_file:
            pickle.dump(training_data,training_file)
        # Progress: user index and number of training rows.
        print(i,training_data.shape[0],end=',')
    print()

In [None]:
import shutil

In [None]:
# Zip the contents of './data/20/std/' into './data/20/std.zip'.
# base_name is given WITHOUT a trailing slash: make_archive appends '.zip' to it,
# and depending on the Python version a trailing slash is not normalised away,
# which would put a hidden '.zip' inside the directory being archived.
shutil.make_archive('./data/20/std','zip','./data/20/std/')

In [None]:
# Add a 'time' column: 'timestamp' cast to double.
# NOTE(review): this needs `data` to expose withColumn(); the export cells above
# leave a pandas DataFrame in `data` (via toPandas()) - confirm where `data`
# is meant to come from before running this cell.
data = data.withColumn(
    'time',
    F.col('timestamp').cast('double'),
)

In [None]:
# Distinct 'time' values collected into an array.
df = (
    data.select('time')
    .distinct()
    .toPandas()['time']
    .values
)

In [None]:
# Look up the distinct users present in the first activity's stream.
activities = ['stationery']
window_size = 20
stream_name = (
    'org.md2k.feature.motionsensehrv.decoded.rightwrist.all.activity.all.three.'
    + str(window_size)
    + '.secs.'
    + str(activities[0])
)
users = (
    CC.get_stream(stream_name)
    .select('user')
    .distinct()
    .toPandas()['user']
    .values
)

In [None]:
# Distinct 'timestamp' values of the whole stream, collected into an array.
times = (
    CC.get_stream(stream_name)
    .select('timestamp')
    .distinct()
    .toPandas()['timestamp']
    .values
)

In [None]:
import os

In [None]:
# Number of entries in the stationery testing directory (shown as the cell output).
len(
    os.listdir('./data/20/stationery/testing')
)

In [1]:
import shutil

In [2]:
# Destructive: recursively deletes the std5 training directory.
shutil.rmtree(
    './data/20/std5/training/',
)

In [None]:
import shutil

# shutil.make_archive('./data/20/std10/testing/','zip','./data/20/std10/testing/')

In [None]:
# Destructive: recursively deletes this absolute directory.
shutil.rmtree(
    '/home/jupyter/mullah/cc3/rice_data/ecg_ppg_5_left_final_v1//',
)