In [1]:
import tensorflow as tf
from tensorflow.keras.models import load_model
import numpy as np
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import StructField, StructType, DoubleType,MapType, StringType,ArrayType, FloatType, TimestampType, IntegerType
from cerebralcortex.core.datatypes import DataStream
from cerebralcortex.core.metadata_manager.stream.metadata import Metadata, DataDescriptor, \
ModuleMetadata
from scipy import signal
from scipy.interpolate import interp1d
from cerebralcortex import Kernel
from scipy.stats import skew,kurtosis,mode
from collections import Counter
import pandas as pd
# Kernel handle for the 'mperf' study; later cells fetch streams through CC.get_stream(...).
# NOTE(review): the configuration directory is a hardcoded absolute path — it will only
# resolve on a machine that has /home/jupyter/cc3_conf/; consider a configurable constant.
CC = Kernel("/home/jupyter/cc3_conf/", study_name='mperf')

In [2]:
def get_data_for_saving(data,
                        data_acl,
                        stream_name = 'org.md2k.feature.motionsensehrv.decoded.rightwrist.all.activity',
                        acl_stream_name = 'org.md2k.feature.motionsensehrv.decoded.rightwrist.all',
                        activities = ['Walking'],
                        window_size = 10,
                        base_window_size  = 10,
                        prediction_name = 'prediction',
                        minutes = 700):
    """Join per-window activity predictions with the raw accelerometer samples of the same window.

    Steps:
      1. Keep only prediction rows whose ``prediction_name`` value is in ``activities``.
      2. Per user, sort those rows by timestamp and keep positions ``n`` .. ``3*n - 1``
         where ``n = int(minutes*60/base_window_size)`` (i.e. skip the first ``n`` rows,
         keep at most the next ``2*n``).
      3. Bucket ``data_acl`` into ``window_size``-second tumbling windows per user/version,
         collecting ``[time, aclx, acly, aclz]`` arrays into a ``data`` list column.
      4. Inner-join both sides on ``user`` and ``window`` and collect to pandas.

    Parameters
    ----------
    data : object exposing a Spark DataFrame as ``data._data`` with columns
        localtime, timestamp, day, <prediction_name>, user, version, start, end.
    data_acl : stream/DataFrame with columns localtime, timestamp, aclx, acly, aclz,
        user, version (only ``select``/``withColumn``/``groupBy`` are called on it).
    stream_name, acl_stream_name : unused by the current implementation; kept so
        existing keyword callers keep working.
    activities : list of prediction labels to keep. The default list is never mutated.
        NOTE(review): keep this a list — older pyspark ``Column.isin`` only unpacks
        list/set arguments.
    window_size : width in seconds of the accelerometer windows.
    base_window_size : seconds per prediction row, used to convert ``minutes`` to rows.
    prediction_name : name of the prediction column in ``data``.
    minutes : size, in minutes, of the leading per-user segment that is skipped.

    Returns
    -------
    pandas.DataFrame
        The joined rows. If fewer than 180 prediction rows survive step 2, an empty
        frame with eight placeholder columns ``a``..``h`` is returned instead; callers
        are expected to look at ``shape[0]`` only.
    """
    # 30*6 rows — presumably 30 minutes at six 10-second windows per minute (confirm).
    min_rows = 30*6

    predictions = data._data.select('localtime','timestamp','day',prediction_name,
                                    'user','version','start', 'end')
    # The original applied this identical filter twice; once is sufficient.
    predictions = predictions.filter(F.col(prediction_name).isin(activities))
    # Numeric copy of the timestamp, used only for sorting inside the grouped UDF.
    predictions = predictions.withColumn('time',F.col('timestamp').cast('double'))

    n = int(minutes*60/base_window_size)
    schema = predictions.schema

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def get_user_data(df):
        # Chronological order, then positional slice [n, 3n) of this user's rows.
        ordered = df.sort_values('time').reset_index(drop=True)
        return ordered[n:3*n]

    data_windowed = predictions.groupBy('user').apply(get_user_data).drop('time')
    # Pack start/end into a struct named 'window' so it can be matched against F.window below.
    data_windowed = data_windowed.select('localtime','timestamp','day',prediction_name,
                                         'user','version',
                                         F.struct('start', 'end').alias('window'))

    # count() triggers a Spark job; bail out before the expensive join when there is too little data.
    if data_windowed.count() < min_rows:
        return pd.DataFrame([],columns=list('abcdefgh'))

    data_acl = data_acl.select('localtime','timestamp','aclx','acly','aclz','user','version')
    data_acl = data_acl.withColumn('time',F.col('timestamp').cast('double'))
    # One array per sample: [time, x, y, z]; the scalar columns are no longer needed afterwards.
    data_acl = data_acl.withColumn('data',F.array('time','aclx','acly','aclz')).drop('time','aclx','acly','aclz')

    groupbycols = ['user','version',
                   F.window('timestamp',windowDuration=str(window_size)+' seconds', startTime='0 seconds')]
    data_acl_windowed = (data_acl.groupBy(groupbycols)
                         .agg(F.collect_list('data'))
                         .withColumnRenamed('collect_list(data)','data'))

    # 'version' is dropped on the accelerometer side so the joined frame has a single version column.
    data_joined = data_windowed.join(data_acl_windowed.drop('version'),on=['user','window'],how='inner')
    return data_joined.toPandas()
#     print(data_joined.count())
#     data_joined.printSchema()
#     def reshape_data(a):
# #         a = np.array([np.array(b) for b in a])
# #         a = a[a[:,0].argsort()]
# #         return list(a.reshape(-1))
#         return [1,2,3,4]
#     qfunction = F.udf(reshape_data,ArrayType(DoubleType()))
#     data_joined_final = data_joined.withColumn('data',qfunction(data_joined['data']))

#     schema = data_joined.schema
#     stream_metadata = Metadata()
#     print(stream_name+'.'+str(window_size)+'.secs.'+str(activities[0]).lower())
#     stream_metadata.set_name(stream_name+'.'+str(window_size)+'.secs.'+str(activities[0]).lower()).set_description("ACL data saving for REID model, window size = "+str(window_size)+' secs')
#     for field in schema.fields:
#         stream_metadata.add_dataDescriptor(
#             DataDescriptor().set_name(str(field.name)).set_type(str(field.dataType))
#         )
#     stream_metadata.add_module(
#         ModuleMetadata().set_name("ACL data saving for REID model, window size = "+str(window_size)+' secs'+str(activities[0]).lower()) \
#         .set_attribute("url", "https://md2k.org").set_author(
#             "Md Azim Ullah", "mullah@memphis.edu"))
#     ds = DataStream(data=data_joined,metadata=stream_metadata)
#     return ds

In [None]:
import os
import pickle

base_window_size = 10
stream_name = 'org.md2k.feature.motionsensehrv.decoded.rightwrist.all.activity'
# activities = ['Driving','Sitting','Stairs','Walking']
activities = ['Sitting']
window_size = 10
acl_stream_name = 'org.md2k.feature.motionsensehrv.decoded.rightwrist.all'
user_id1 = '3ca3dbf5-2390-409e-bd2c-c9f23a255e75'

# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('./data/users.p','rb') as users_file:
    users = pickle.load(users_file)

directory = './data/'
os.makedirs(directory+str(base_window_size), exist_ok=True)

# For every activity and user: build the joined prediction/accelerometer frame and
# pickle it to ./data/<base_window_size>/<activity>_test/<user_id>.p
for activity in activities:
    activity1 = activity+'_test'
    activity_dir = os.path.join(directory, str(base_window_size), activity1)
    os.makedirs(activity_dir, exist_ok=True)
    for user_id in users:
        data = CC.get_stream(stream_name,user_id=user_id)
        data_acl = CC.get_stream(acl_stream_name,user_id=user_id)
        df = get_data_for_saving(data,
                                data_acl,
                                stream_name = stream_name,
                                acl_stream_name = acl_stream_name,
                                activities = [activity],
                                window_size = window_size,
                                base_window_size  = base_window_size)
        # Skip users with fewer than 30*6 rows (also catches the empty placeholder
        # frame get_data_for_saving returns when there is too little data).
        if df.shape[0]/6 < 30:
            continue
        # Context manager guarantees the file is flushed and closed for every user.
        with open(os.path.join(activity_dir, user_id+'.p'),'wb') as out_file:
            pickle.dump(df, out_file)
        print(user_id,df.shape,activity)



0990887a-6163-4c80-9c9e-468ea2598202 (6611, 8) Sitting
61d1a237-d70f-49b0-89ba-cea4d2526832 (8400, 8) Sitting
8aa1bd02-ee43-4e9d-b7f7-7ddc66b607f9 (8400, 8) Sitting
87d70bed-3ed0-455c-a144-9fd955229125 (8400, 8) Sitting
db4a2be2-d180-4fa5-b3b8-41b91c2a641c (8400, 8) Sitting
c7e9149b-94da-4733-92eb-1395c724fc7b (3283, 8) Sitting
08b3a46a-f926-4a57-8723-b78b53c33729 (8400, 8) Sitting
fd36e160-50a0-4dad-9357-65ea218c8d3c (8400, 8) Sitting
95085684-88ec-4d2a-8eba-a38268018193 (8400, 8) Sitting
d83ac187-97cd-4ee0-a35f-5a1ffe6c7885 (8400, 8) Sitting
3b9ff2e4-dfec-4022-8994-1a0c4db7227a (8400, 8) Sitting
072c81f7-4410-4301-8fd0-17337c0ac1e8 (8400, 8) Sitting
0c824653-a13b-4a4e-b907-660f1d8f8981 (8400, 8) Sitting
f244a6e2-97bf-4c57-8fb9-ed1ca1774c37 (8400, 8) Sitting
d1392516-4b33-47c0-81b2-066fa7210135 (8400, 8) Sitting
ac48132f-2c65-4762-bb64-ed8f733a540d (8400, 8) Sitting
a6c16f12-0987-4690-87fe-336710f96398 (8400, 8) Sitting
3c1b90f5-dd19-4872-8175-9dede757c9c6 (8400, 8) Sitting
b7a05945-a

In [None]:
import shutil

In [None]:
# Recursively deletes ./data/10/Sitting_test/ — the directory the export loop writes
# when base_window_size is 10 and the activity is 'Sitting'.
# NOTE(review): destructive and not idempotent; with the default ignore_errors=False this
# raises if the directory does not exist, so a "Run All" wipes the export it just produced.
shutil.rmtree('./data/10/Sitting_test/') 

In [None]:
# Row count of df['data'] divided by 6 — presumably minutes of data at six
# 10-second windows per minute (TODO confirm). Uses whatever `df` the kernel last held.
len(df['data'])/6
org.md2k.feature.motionsensehrv.decoded.rightwrist.all.activity.10.secs.driving
org.md2k.feature.motionsensehrv.decoded.rightwrist.all.activity.10.secs.driving

In [None]:
# Fetch the stream named "<stream_name>.<window_size>.secs" and collect its Spark
# DataFrame (data._data) into pandas.
# NOTE(review): relies on stream_name / window_size left in the kernel by an earlier cell,
# and a later cell reassigns stream_name, so the result depends on execution order.
data = CC.get_stream(stream_name+'.'+str(window_size)+'.secs')
df = data._data.toPandas()

In [None]:
import pickle
# Persist the current `df` under ./data/right_wrist/<user_id>.p.
# NOTE(review): `user_id` is whatever value an earlier loop left in the kernel.
# The context manager closes the handle so the pickle is fully flushed to disk.
with open('./data/right_wrist/'+user_id+'.p','wb') as out_file:
    pickle.dump(df, out_file)

In [None]:
# Show the first row, untruncated, without the 'data' column.
# NOTE(review): `data_final` is not defined in any cell visible here — this cell
# fails on a fresh kernel unless it is defined elsewhere; confirm or remove.
data_final.drop('data').show(1,False)

In [None]:
# Fetch the 10-second activity stream (no user_id filter is passed) and rebind the global `data`.
data = CC.get_stream('org.md2k.feature.motionsensehrv.decoded.rightwrist.all.activity.10.secs')

In [None]:
# Count the rows of whatever `data` currently refers to (depends on which cell ran last).
data.count()

In [None]:
import os
import pickle
filepath = './data/walking_10/right_wrist/'
# NOTE: this rebinds the global stream_name used by earlier cells.
stream_name = 'org.md2k.feature.motionsensehrv.decoded.rightwrist.all.activity.10.secs'
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('./data/users.p','rb') as users_file:
    users = pickle.load(users_file)
# Create the output directory up front so the first dump does not fail on a fresh checkout.
os.makedirs(filepath, exist_ok=True)
# One pickle per user, named by the bare user id (no extension), holding that user's stream as pandas.
for i,user in enumerate(users):
    df = CC.get_stream(stream_name,user_id=user)
    data  = df.toPandas()
    # Context manager closes each file; the original leaked one open handle per user.
    with open(filepath+user,'wb') as out_file:
        pickle.dump(data, out_file)
    print(i,end=',')

In [None]:
import pandas as pd




In [None]:
import shutil

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Annotated heatmap of the scores stored in maxmean_10.json, saved to ./images/.
maxmean_score = pd.read_json('maxmean_10.json')
plt.rcParams.update({'font.size':15})
plt.figure(figsize=(20,10))
# seaborn's parameter is `linewidths`; passing `linewidth` collides with the
# `linewidths` value heatmap itself forwards to matplotlib.
sns.heatmap(maxmean_score,annot=True,fmt='.2f',cmap='Reds',linewidths=1,linecolor='black')
plt.title('Max Mean Boosting')
plt.xlabel('Train data per user, minutes')
plt.ylabel('Test data length, minutes')
plt.tight_layout()
# `dps` was a typo for `dpi` (rejected or ignored depending on the matplotlib version),
# and 1e6 dpi is not renderable for a 20x10 inch figure; 300 dpi gives a 6000x3000 px image.
plt.savefig('./images/maxmean_boosting.png',dpi=300)
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Annotated heatmap of the scores stored in majority_10.json, saved to ./images/.
# NOTE(review): the global keeps the name `maxmean_score` although it now holds the
# majority-voting scores; the name is left unchanged in case other cells read it.
maxmean_score = pd.read_json('majority_10.json')
plt.rcParams.update({'font.size':15})
plt.figure(figsize=(20,10))
# seaborn's parameter is `linewidths`; passing `linewidth` collides with the
# `linewidths` value heatmap itself forwards to matplotlib.
sns.heatmap(maxmean_score,annot=True,fmt='.2f',cmap='Reds',linewidths=1,linecolor='black')
plt.title('Majority Boosting')
plt.xlabel('Train data per user, minutes')
plt.ylabel('Test data length, minutes')
plt.tight_layout()
# `dps` was a typo for `dpi` (rejected or ignored depending on the matplotlib version),
# and 1e6 dpi is not renderable for a 20x10 inch figure; 300 dpi gives a 6000x3000 px image.
plt.savefig('./images/majority_boosting.png',dpi=300)
plt.show()

In [None]:
# The two bare expressions below display nothing unless they end a cell; they only
# require that `df` and `users` already exist in the kernel.
df.loc[0]['data']
users
import shutil
import pickle
import os
# Hours of data per pickled file in ./data/right_wrist/.
# NOTE(review): the factor 20 assumes each row covers 20 seconds, while the export
# cells in this notebook use 10-second windows — confirm which applies to these files.
hours = []
for f in os.listdir('./data/right_wrist/'):
    # Context manager closes each file; the original leaked one handle per file.
    # NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
    with open('./data/right_wrist/'+f,'rb') as in_file:
        data = pickle.load(in_file)
    hours.append(data.shape[0]*20/3600)
# Hours for the last file loaded (no visible effect unless this line ends a cell).
data.shape[0]*20/3600
import matplotlib.pyplot as plt

from collections import Counter

import numpy as np
# Histogram of users by whole hours of data.
Counter(np.floor(hours))