In [None]:
import tensorflow as tf
from tensorflow.keras.models import load_model
import numpy as np
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import StructField, StructType, DoubleType,MapType, StringType,ArrayType, FloatType, TimestampType, IntegerType
from cerebralcortex.core.datatypes import DataStream
from cerebralcortex.core.metadata_manager.stream.metadata import Metadata, DataDescriptor, \
ModuleMetadata
from scipy import signal
from scipy.interpolate import interp1d
from cerebralcortex import Kernel
from scipy.stats import skew,kurtosis,mode
from collections import Counter
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')
import pickle
# Shared CerebralCortex kernel used by every cell below to fetch streams.
# NOTE(review): the first argument is a hard-coded absolute path (presumably the
# cc3 configuration directory) — confirm it exists on the machine running this.
CC = Kernel(
    "/home/jupyter/cc3_conf/",
    study_name='mperf',
)

In [None]:
def get_shortened_data(data_windowed,minutes=2000,window_size=20,training_percentage=.8):
    """Keep only the held-out (testing) tail of a windowed stream.

    Two grouped-map passes are applied to ``data_windowed``:

    1. Per ``(user, day)`` group, rows are ordered by ``timestamp`` and the
       leading ``training_percentage`` fraction is dropped, so only the latest
       part of each day survives.
    2. Per ``user`` group, rows are ordered by ``timestamp`` and only the
       trailing rows are kept. The number kept is
       ``int(minutes * 60 / window_size)``, capped at half of the rows the
       user has left after step 1.

    NOTE: a later cell in this notebook redefines ``get_shortened_data`` with
    the training-split variant; whichever cell ran last wins.

    Parameters
    ----------
    data_windowed :
        Object exposing ``schema`` and ``groupBy(...).apply(...)`` with
        ``user``, ``day`` and ``timestamp`` columns (presumably a CerebralCortex
        DataStream / Spark DataFrame — confirm against the caller).
    minutes : number
        Upper bound on the amount of data kept per user; converted to a row
        count via ``minutes * 60 / window_size``.
    window_size : number
        Divisor used in that conversion (presumably the window length in
        seconds — TODO confirm).
    training_percentage : float
        Fraction of each ``(user, day)`` group, counted from the start, that is
        discarded here.

    Returns
    -------
    The result of the second ``groupBy('user').apply(...)``, with the same
    schema as the input.
    """
    schema = data_windowed.schema
    max_rows = int(minutes*60/window_size)

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def get_user_data(df):
        # The cut below is positional, so the rows must be in time order first;
        # otherwise "the last 20%" is just whatever order the group arrived in.
        df = df.sort_values('timestamp').reset_index(drop=True)
        return df.iloc[int(df.shape[0]*training_percentage):]

    data_windowed = data_windowed.groupBy(['user','day']).apply(get_user_data)

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def get_user_data_sort(df):
        df = df.sort_values('timestamp').reset_index(drop=True)
        total = df.shape[0]
        # Never keep more than half of what the user has left.
        keep = min(max_rows, total//2)
        return df.iloc[total-keep:]

    data_windowed = data_windowed.groupBy('user').apply(get_user_data_sort)
    return data_windowed
    
# Export per-user pickles for each activity stream.
# For 'stationery' the whole (shortened) frame is written as testing data;
# for any other activity the frame is split into training/testing parts.
activities = ['stationery']
window_size = 20
# Only used by the non-'stationery' branch below; get_shortened_data is called
# without it and therefore falls back to its own default.
training_percentage = .8
activity = activities[0]
for activity in activities:
    print('-'*20+activity+'-'*20)
    directory = './data/'+str(window_size)+'/'+activity+'/'
    # exist_ok keeps the cell re-runnable without a separate isdir check.
    os.makedirs(directory+'training', exist_ok=True)
    os.makedirs(directory+'testing', exist_ok=True)
    stream_name = 'org.md2k.feature.motionsensehrv.decoded.rightwrist.all.activity.all.three.'+str(window_size)+'.secs.'+str(activity)
    users = CC.get_stream(stream_name).select('user').distinct().toPandas()['user'].values
    for i,user_id in enumerate(users):
        data = CC.get_stream(stream_name,user_id=user_id)
        if activity=='stationery':
            # Pass the configured window size so the row cap stays in step
            # with the stream that was actually loaded.
            data = get_shortened_data(data,minutes=1000,window_size=window_size)
        data = data.toPandas()
        data = data.sort_values('timestamp').reset_index(drop=True)
        data['time'] = data['start'].apply(lambda a:a.timestamp())
        # Each window becomes a (1, rows, 3) array.
        data['data'] = data['data'].apply(lambda a:np.array(a).reshape(1,-1,3))
        if activity =='stationery':
            testing_data = data
            # Context managers close the file even if pickling fails.
            with open(directory+'testing/'+str(user_id),'wb') as out_file:
                pickle.dump(testing_data,out_file)
        else:
            split_at = int(data.shape[0]*training_percentage)
            training_data = data[:split_at]
            testing_data = data[split_at:]
            with open(directory+'training/'+str(user_id),'wb') as out_file:
                pickle.dump(training_data,out_file)
            with open(directory+'testing/'+str(user_id),'wb') as out_file:
                pickle.dump(testing_data,out_file)
        print(i,testing_data.shape[0],end=',')
    print()

In [None]:
def get_shortened_data(data_windowed,minutes=2000,window_size=20,training_percentage=.3):
    """Keep only the leading (training) part of a windowed stream.

    Two grouped-map passes are applied to ``data_windowed``:

    1. Per ``(user, day)`` group, rows are ordered by ``timestamp`` and only
       the leading ``training_percentage`` fraction is kept.
    2. Per ``user`` group, rows are ordered by ``timestamp`` and at most
       ``int(minutes * 60 / window_size)`` leading rows are kept.

    NOTE: an earlier cell in this notebook defines ``get_shortened_data`` with
    the testing-split variant; this definition replaces it once the cell runs.

    Parameters
    ----------
    data_windowed :
        Object exposing ``schema`` and ``groupBy(...).apply(...)`` with
        ``user``, ``day`` and ``timestamp`` columns (presumably a CerebralCortex
        DataStream / Spark DataFrame — confirm against the caller).
    minutes : number
        Upper bound on the amount of data kept per user; converted to a row
        count via ``minutes * 60 / window_size``.
    window_size : number
        Divisor used in that conversion (presumably the window length in
        seconds — TODO confirm).
    training_percentage : float
        Fraction of each ``(user, day)`` group, counted from the start, that is
        kept.

    Returns
    -------
    The result of the second ``groupBy('user').apply(...)``, with the same
    schema as the input.
    """
    schema = data_windowed.schema
    max_rows = int(minutes*60/window_size)

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def get_user_data(df):
        # The cut below is positional, so the rows must be in time order first;
        # otherwise "the first 30%" is just whatever order the group arrived in.
        df = df.sort_values('timestamp').reset_index(drop=True)
        return df.iloc[:int(df.shape[0]*training_percentage)]

    data_windowed = data_windowed.groupBy(['user','day']).apply(get_user_data)

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def get_user_data_sort(df):
        df = df.sort_values('timestamp').reset_index(drop=True)
        return df.iloc[:max_rows]

    data_windowed = data_windowed.groupBy('user').apply(get_user_data_sort)
    return data_windowed
    
# Export per-user pickles for each activity stream.
# For 'stationery' the whole (shortened) frame is written as training data;
# for any other activity the frame is split into training/testing parts.
activities = ['stationery']
window_size = 20
# Only used by the non-'stationery' branch below; get_shortened_data is called
# without it and therefore falls back to its own default.
training_percentage = .8
activity = activities[0]
for activity in activities:
    print('-'*20+activity+'-'*20)
    directory = './data/'+str(window_size)+'/'+activity+'/'
    # exist_ok keeps the cell re-runnable without a separate isdir check.
    os.makedirs(directory+'training', exist_ok=True)
    os.makedirs(directory+'testing', exist_ok=True)
    stream_name = 'org.md2k.feature.motionsensehrv.decoded.rightwrist.all.activity.all.three.'+str(window_size)+'.secs.'+str(activity)
    users = CC.get_stream(stream_name).select('user').distinct().toPandas()['user'].values
    for i,user_id in enumerate(users):
        data = CC.get_stream(stream_name,user_id=user_id)
        if activity=='stationery':
            # Pass the configured window size so the row cap stays in step
            # with the stream that was actually loaded.
            data = get_shortened_data(data,minutes=3000,window_size=window_size)
        data = data.toPandas()
        data = data.sort_values('timestamp').reset_index(drop=True)
        data['time'] = data['start'].apply(lambda a:a.timestamp())
        # Each window becomes a (1, rows, 3) array.
        data['data'] = data['data'].apply(lambda a:np.array(a).reshape(1,-1,3))
        if activity =='stationery':
            training_data = data
            # Context managers close the file even if pickling fails.
            with open(directory+'training/'+str(user_id),'wb') as out_file:
                pickle.dump(training_data,out_file)
        else:
            split_at = int(data.shape[0]*training_percentage)
            training_data = data[:split_at]
            testing_data = data[split_at:]
            with open(directory+'training/'+str(user_id),'wb') as out_file:
                pickle.dump(training_data,out_file)
            with open(directory+'testing/'+str(user_id),'wb') as out_file:
                pickle.dump(testing_data,out_file)
        print(i,training_data.shape[0],end=',')
    print()

In [None]:
import shutil

In [None]:
# base_name is the archive path without its extension; make_archive appends
# '.zip' itself. A trailing separator here can make the archive land inside the
# directory being zipped (as '.zip'), so name the target './data/20/std.zip'
# explicitly.
shutil.make_archive('./data/20/std','zip','./data/20/std/')

In [None]:
# Add a 'time' column holding 'timestamp' cast to double.
# NOTE(review): the export cells above leave `data` bound to a pandas DataFrame
# (after toPandas), which has no withColumn — this cell presumably expects
# `data` to be re-bound to a Spark-backed stream first; confirm.
data = data.withColumn(
    'time',
    F.col('timestamp').cast('double'),
)

In [None]:
# Collect the distinct values of the 'time' column to the driver.
# Despite its name, `df` ends up holding the column's values, not a frame.
df = (
    data
    .select('time')
    .distinct()
    .toPandas()['time']
    .values
)

In [None]:
# Rebuild the stream name for the first activity and list the distinct users
# present in that stream.
activities = ['stationery']
window_size = 20
stream_name = ''.join([
    'org.md2k.feature.motionsensehrv.decoded.rightwrist.all.activity.all.three.',
    str(window_size),
    '.secs.',
    str(activities[0]),
])
users = (
    CC.get_stream(stream_name)
    .select('user')
    .distinct()
    .toPandas()['user']
    .values
)

In [None]:
# Collect the distinct 'timestamp' values of the whole stream to the driver.
times = (
    CC.get_stream(stream_name)
    .select('timestamp')
    .distinct()
    .toPandas()['timestamp']
    .values
)

In [None]:
import os

In [None]:
# Number of entries in the stationery testing output directory
# (the export cell writes one pickle per user there).
len(
    os.listdir('./data/20/stationery/testing')
)

In [1]:
import shutil

In [5]:
# Destructive: removes the whole './data/10/Sitting/' tree.
# A missing directory is tolerated so the cell can be re-run safely; any other
# failure (e.g. permissions) still surfaces.
try:
    shutil.rmtree('./data/10/Sitting/')
except FileNotFoundError:
    pass