In [2]:
import tensorflow as tf
from tensorflow.keras.models import load_model
import numpy as np
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import StructField, StructType, DoubleType,MapType, StringType,ArrayType, FloatType, TimestampType, IntegerType
from cerebralcortex.core.datatypes import DataStream
from cerebralcortex.core.metadata_manager.stream.metadata import Metadata, DataDescriptor, \
ModuleMetadata
from scipy import signal
from scipy.interpolate import interp1d
from cerebralcortex import Kernel
from scipy.stats import skew,kurtosis,mode
from collections import Counter
import pandas as pd
import numpy as np
# CerebralCortex kernel used by every cell below to read and save streams for the 'moral' study.
# NOTE(review): the configuration directory is a hard-coded absolute path; consider moving it to a config constant.
CC = Kernel("/home/jupyter/cc3_conf/", study_name='moral')

  self.fs = pa.hdfs.connect(self.hdfs_ip, self.hdfs_port)


In [3]:
def get_data_for_saving_moral(data,
                        data_acl,
                        stream_name = 'org.md2k.feature.motionsensehrv.decoded.rightwrist.all.activity',
                        acl_stream_name = 'org.md2k.feature.motionsensehrv.decoded.rightwrist.all',
                        activities = ['Walking'],
                        window_size = 20,
                        base_window_size  = 20,
                        prediction_name = 'prediction'):
    """Pair activity-labelled windows with their raw accelerometer samples.

    Labelled windows whose ``prediction_name`` value is in ``activities`` are
    inner-joined (on user and window bounds) with accelerometer samples grouped
    into ``window_size``-second windows sliding by ``window_size // 2`` seconds.
    Each window's samples are sorted by time, linearly resampled to
    ``window_size * 25`` rows and flattened into a single ``data`` array.

    Args:
        data: stream object whose ``_data`` attribute is a Spark DataFrame with
            columns localtime, timestamp, day, <prediction_name>, user, version,
            start, end and std.
        data_acl: accelerometer stream with columns localtime, timestamp, x, y,
            z, user and version.
        stream_name: prefix of the output stream name; ``.<window_size>.secs.``
            and the lower-cased first activity are appended.
        acl_stream_name: not used by this function; kept so existing calls
            remain valid.
        activities: labels to keep (never mutated here).
        window_size: accelerometer window length in seconds.
        base_window_size: length in seconds of one labelled window; only used
            for the minimum-data check.
        prediction_name: name of the label column.

    Returns:
        A ``DataStream`` wrapping the joined, resampled data and its metadata,
        or an empty pandas DataFrame (columns 'a'..'h') when fewer than
        ``30*60/base_window_size`` labelled windows match. Callers must check
        for that sentinel before saving.
    """
    target_fs = 25  # samples per second of the resampled output
    n_resampled = int(window_size*target_fs)

    labelled = data._data.select('localtime','timestamp','day',prediction_name,'user','version','start', 'end','std')
    labelled = labelled.filter(F.col(prediction_name).isin(activities))
    labelled = labelled.select('localtime','timestamp','day',prediction_name,'user','version',
                               F.struct('start', 'end').alias('window'))
    if labelled.count()<30*60/base_window_size:
        return pd.DataFrame([],columns=list('abcdefgh'))

    # Pack (time, x, y, z) per sample so each window can be collected as one list.
    data_acl = data_acl.select('localtime','timestamp','x','y','z','user','version')
    data_acl = data_acl.withColumn('time',F.col('timestamp').cast('double'))
    data_acl = data_acl.withColumn('data',F.array('time','x','y','z')).drop('time','x','y','z')
    groupbycols = ['user','version',F.window('timestamp',windowDuration=str(window_size)+' seconds',
                                             startTime='0 seconds',slideDuration=str(window_size//2)+' seconds')]
    data_acl_windowed = data_acl.groupBy(groupbycols).agg(F.collect_list('data')).withColumnRenamed('collect_list(data)','data')
    data_joined = labelled.join(data_acl_windowed.drop('version'),on=['user','window'],how='inner')

    def resample_window(samples):
        """Sort one window's (time, x, y, z) rows by time and resample x, y, z."""
        rows = np.array([np.array(s) for s in samples]).reshape(len(samples),4)
        rows = rows[rows[:,0].argsort()]
        xyz = rows[:,1:]
        # The samples are treated as evenly spaced over the window. The output
        # length follows the requested window_size (the previous version always
        # resampled to a 20-second window regardless of the argument).
        x_now = np.linspace(0,window_size,xyz.shape[0])
        x_new = np.linspace(0,window_size,n_resampled)
        return interp1d(x_now,xyz,axis=0,fill_value='extrapolate')(x_new).reshape(-1)

    udf_schema = StructType([
        StructField("version", IntegerType()),
        StructField("user", StringType()),
        StructField("localtime", TimestampType()),
        StructField("timestamp", TimestampType()),
        StructField("start", TimestampType()),
        StructField("end", TimestampType()),
        StructField("data", ArrayType(DoubleType())),
        StructField("day", StringType()),
        # Follow the configured label column instead of a hard-coded name.
        StructField(prediction_name, StringType())
    ])

    @pandas_udf(udf_schema, PandasUDFType.GROUPED_MAP)
    def interpolate_data(df):
        df['data'] = df['data'].apply(resample_window)
        return df

    data_joined = data_joined.withColumn('start',F.col('window').start)
    data_joined = data_joined.withColumn('end',F.col('window').end).drop('window')
    data_joined = data_joined.groupBy(['user','version','day']).apply(interpolate_data)

    activity_tag = str(activities[0]).lower()
    description = "ACL data saving for REID model, window size = "+str(window_size)+' secs'
    stream_name  = stream_name+'.'+str(window_size)+'.secs.'+activity_tag
    print(stream_name)
    stream_metadata = Metadata()
    stream_metadata.set_name(stream_name).set_description(description)
    for field in data_joined.schema.fields:
        stream_metadata.add_dataDescriptor(
            DataDescriptor().set_name(str(field.name)).set_type(str(field.dataType))
        )
    stream_metadata.add_module(
        ModuleMetadata().set_name(description+activity_tag) \
        .set_attribute("url", "https://md2k.org").set_author(
            "Md Azim Ullah", "mullah@memphis.edu"))
    ds = DataStream(data=data_joined,metadata=stream_metadata)
    return ds

In [5]:
base_window_size = 20
import pickle
import os
stream_name = 'accelerometer--org.md2k.motionsense--motion_sense--right_wrist'+'.activity.all.three'
activities =  ['Stationery','Stairs','Exercise','Walking','Sports']
window_size = 20
acl_stream_name = 'accelerometer--org.md2k.motionsense--motion_sense--right_wrist'
# Build and save one windowed accelerometer stream per activity (last label first).
# Best effort: a failure for one activity is reported and the loop moves on.
for activity in activities[::-1]:
    try:
        data = CC.get_stream(stream_name)
        data_acl = CC.get_stream(acl_stream_name)
        df = get_data_for_saving_moral(data,
                                data_acl,
                                stream_name = stream_name,
                                acl_stream_name = acl_stream_name,
                                activities = [activity],
                                window_size = window_size,
                                base_window_size  = base_window_size)
        # The helper returns an empty pandas DataFrame when there is too little
        # labelled data; that sentinel is not a stream and must not be saved.
        if isinstance(df, pd.DataFrame):
            print('skipping '+activity+': not enough labelled windows')
            continue
        CC.save_stream(df,overwrite=True)
    except Exception as e:
        # Name the activity so the printed error can be traced back to its run.
        print(activity, type(e).__name__, e)



accelerometer--org.md2k.motionsense--motion_sense--right_wrist.activity.all.three.20.secs.sports
accelerometer--org.md2k.motionsense--motion_sense--right_wrist.activity.all.three.20.secs.walking
accelerometer--org.md2k.motionsense--motion_sense--right_wrist.activity.all.three.20.secs.exercise
accelerometer--org.md2k.motionsense--motion_sense--right_wrist.activity.all.three.20.secs.stairs
accelerometer--org.md2k.motionsense--motion_sense--right_wrist.activity.all.three.20.secs.stationery


In [6]:
# Read back the 20-second walking stream written by the save loop.
data = CC.get_stream(
    'accelerometer--org.md2k.motionsense--motion_sense--right_wrist.activity.all.three.20.secs.walking'
)

In [None]:
def get_data_for_saving(data,
                        data_acl,
                        stream_name = 'org.md2k.feature.motionsensehrv.decoded.rightwrist.all.activity',
                        acl_stream_name = 'org.md2k.feature.motionsensehrv.decoded.rightwrist.all',
                        activities = ['Walking'],
                        window_size = 20,
                        base_window_size  = 20,
                        prediction_name = 'prediction',
                        minutes = 1500):
    """Pair selected labelled windows with their raw accelerometer samples.

    Windows are selected either by label (``prediction_name`` value in
    ``activities``) or, when the first entry of ``activities`` starts with
    ``'std'``, by a threshold on the ``std`` column: ``'std25'`` keeps windows
    with ``std >= 0.25``. The selected windows are inner-joined (on user and
    window bounds) with accelerometer samples grouped into tumbling
    ``window_size``-second windows. Each window's samples are sorted by time,
    linearly resampled to ``window_size * 25`` rows and flattened into a
    single ``data`` array.

    Args:
        data: stream object whose ``_data`` attribute is a Spark DataFrame with
            columns localtime, timestamp, day, <prediction_name>, user, version,
            start, end and std.
        data_acl: accelerometer stream with columns localtime, timestamp, aclx,
            acly, aclz, user and version.
        stream_name: prefix of the output stream name; ``.<window_size>.secs.``
            and the lower-cased first activity are appended.
        acl_stream_name: not used by this function; kept so existing calls
            remain valid.
        activities: labels to keep, or a single ``'std<NN>'`` threshold code
            (never mutated here).
        window_size: accelerometer window length in seconds.
        base_window_size: length in seconds of one labelled window; only used
            for the minimum-data check.
        prediction_name: name of the label column.
        minutes: currently has no effect. It fed a per-user cap whose
            application was commented out; kept for call compatibility.

    Returns:
        A ``DataStream`` wrapping the joined, resampled data and its metadata,
        or an empty pandas DataFrame (columns 'a'..'h') when fewer than
        ``30*60/base_window_size`` windows are selected. Callers must check for
        that sentinel before saving.
    """
    target_fs = 25  # samples per second of the resampled output
    n_resampled = int(window_size*target_fs)

    data = data._data
    data = data.select('localtime','timestamp','day',prediction_name,'user','version','start', 'end','std')

    selector = activities[0]
    if selector[:3] == 'std':
        # 'std<NN>' encodes a standard-deviation threshold in hundredths.
        threshold = int(selector[3:])/100
        print(threshold)
        selected = data.filter(F.col('std')>=threshold)
    else:
        selected = data.filter(F.col(prediction_name).isin(activities))

    selected = selected.select('localtime','timestamp','day',prediction_name,'user','version',
                               F.struct('start', 'end').alias('window'))
    if selected.count()<30*60/base_window_size:
        return pd.DataFrame([],columns=list('abcdefgh'))

    # Pack (time, x, y, z) per sample so each window can be collected as one list.
    data_acl = data_acl.select('localtime','timestamp','aclx','acly','aclz','user','version')
    data_acl = data_acl.withColumn('time',F.col('timestamp').cast('double'))
    data_acl = data_acl.withColumn('data',F.array('time','aclx','acly','aclz')).drop('time','aclx','acly','aclz')
    groupbycols = ['user','version',F.window('timestamp',windowDuration=str(window_size)+' seconds', startTime='0 seconds')]
    data_acl_windowed = data_acl.groupBy(groupbycols).agg(F.collect_list('data')).withColumnRenamed('collect_list(data)','data')
    data_joined = selected.join(data_acl_windowed.drop('version'),on=['user','window'],how='inner')

    def resample_window(samples):
        """Sort one window's (time, x, y, z) rows by time and resample x, y, z."""
        rows = np.array([np.array(s) for s in samples]).reshape(len(samples),4)
        rows = rows[rows[:,0].argsort()]
        xyz = rows[:,1:]
        # The samples are treated as evenly spaced over the window. The output
        # length follows the requested window_size (the previous version always
        # resampled to a 20-second window regardless of the argument).
        x_now = np.linspace(0,window_size,xyz.shape[0])
        x_new = np.linspace(0,window_size,n_resampled)
        return interp1d(x_now,xyz,axis=0,fill_value='extrapolate')(x_new).reshape(-1)

    udf_schema = StructType([
        StructField("version", IntegerType()),
        StructField("user", StringType()),
        StructField("localtime", TimestampType()),
        StructField("timestamp", TimestampType()),
        StructField("start", TimestampType()),
        StructField("end", TimestampType()),
        StructField("data", ArrayType(DoubleType())),
        StructField("day", StringType()),
        # Follow the configured label column instead of a hard-coded name.
        StructField(prediction_name, StringType())
    ])

    @pandas_udf(udf_schema, PandasUDFType.GROUPED_MAP)
    def interpolate_data(df):
        df['data'] = df['data'].apply(resample_window)
        return df

    data_joined = data_joined.withColumn('start',F.col('window').start)
    data_joined = data_joined.withColumn('end',F.col('window').end).drop('window')
    data_joined = data_joined.groupBy(['user','version','day']).apply(interpolate_data)

    activity_tag = str(selector).lower()
    description = "ACL data saving for REID model, window size = "+str(window_size)+' secs'
    stream_name  = stream_name+'.'+str(window_size)+'.secs.'+activity_tag
    print(stream_name)
    stream_metadata = Metadata()
    stream_metadata.set_name(stream_name).set_description(description)
    for field in data_joined.schema.fields:
        stream_metadata.add_dataDescriptor(
            DataDescriptor().set_name(str(field.name)).set_type(str(field.dataType))
        )
    stream_metadata.add_module(
        ModuleMetadata().set_name(description+activity_tag) \
        .set_attribute("url", "https://md2k.org").set_author(
            "Md Azim Ullah", "mullah@memphis.edu"))
    ds = DataStream(data=data_joined,metadata=stream_metadata)
    return ds

In [None]:
base_window_size = 20
import pickle
import os
stream_name = 'org.md2k.feature.motionsensehrv.decoded.rightwrist.all.activity.all.three'
# Selections used in earlier runs of this cell:
#   ['Driving','Sitting','Stairs','Walking']
#   ['Stationery','Stairs','Exercise','Walking','Sports']
#   ['std25','std15','std10','std30']
# Only the last assignment ever took effect, so it is the only one kept.
activities = ['std5']
window_size = 20
acl_stream_name = 'org.md2k.feature.motionsensehrv.decoded.rightwrist.all'

directory = './data/'

# Build and save one windowed accelerometer stream per selection (last entry first).
for activity in activities[::-1]:
    data = CC.get_stream(stream_name)
    data_acl = CC.get_stream(acl_stream_name)
    df = get_data_for_saving(data,
                            data_acl,
                            stream_name = stream_name,
                            acl_stream_name = acl_stream_name,
                            activities = [activity],
                            window_size = window_size,
                            base_window_size  = base_window_size)
    # get_data_for_saving returns an empty pandas DataFrame when too few
    # windows qualify; that sentinel is not a stream and must not be saved.
    if isinstance(df, pd.DataFrame):
        print('skipping '+activity+': not enough qualifying windows')
        continue
    CC.save_stream(df,overwrite=True)

In [None]:
# Fetch the '<stream_name>.<window_size>.secs' stream and collect it on the driver.
# NOTE(review): the save cells append an activity suffix after '.secs.';
# confirm that a stream with this exact name exists.
windowed_name = '.'.join([stream_name, str(window_size), 'secs'])
data = CC.get_stream(windowed_name)
df = data._data.toPandas()

In [None]:
import pickle
# NOTE(review): `user_id` is not assigned in any visible cell; it must be set before running this.
# The context manager closes the file handle, which the bare open() never did.
with open('./data/right_wrist/'+user_id+'.p','wb') as out_file:
    pickle.dump(df,out_file)

In [None]:
# Show one untruncated row without the bulky 'data' column.
# NOTE(review): `data_final` is not defined in any visible cell — this fails on a fresh kernel.
data_final.drop('data').show(1,False)

In [None]:
data = CC.get_stream('org.md2k.feature.motionsensehrv.decoded.rightwrist.all.activity.all.three').drop('magnitude','start','end')
schema = StructType([
    StructField("timestamp", TimestampType()),
    StructField("localtime", TimestampType()),
    StructField("start", TimestampType()),
    StructField("end", TimestampType()),
    StructField("version", IntegerType()),
    StructField("user", StringType()),
    StructField("prediction", StringType()),
    StructField("std", DoubleType()),
    StructField("day", StringType())
])
from scipy.stats import mode
import pandas as pd
@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def minutewise_data(key,df):
    """Collapse one (user, version, day, 60-second window) group into a single row.

    The row carries the group's first timestamp/localtime/user/day, the most
    frequent prediction, the mean of ``std`` and the window bounds taken from
    the grouping key.
    """
    # Most frequent label. pandas returns the modes sorted, so ties resolve to
    # the smallest label, matching what scipy.stats.mode reported. scipy's mode
    # is avoided because its return shape and its support for non-numeric data
    # changed between releases, which breaks the `[0][0]` indexing.
    label_modes = df['prediction'].mode()
    prediction = label_modes.iloc[0] if len(label_modes) else None
    window = key[3]  # fourth grouping expression: the 60-second window
    summary = {
        'timestamp': df['timestamp'].values[0],
        'localtime': df['localtime'].values[0],
        # NOTE(review): version is fixed to 1 although the groups are keyed by version.
        'version': 1,
        'user': df['user'].values[0],
        'prediction': prediction,
        'std': np.mean(df['std'].values),
        'day': df['day'].values[0],
        'start': window['start'],
        'end': window['end'],
    }
    return pd.DataFrame([summary],columns=['timestamp','localtime','version','user','prediction','std','day','start','end'])
win = F.window("timestamp", windowDuration='60 seconds',slideDuration='60 seconds',startTime='0 seconds')
data_60 = data.groupBy(['user','version','day',win]).apply(minutewise_data)

# Describe every output column in the stream metadata, then save.
schema = data_60.schema
stream_metadata = Metadata()
stream_metadata.set_name('org.md2k.feature.motionsensehrv.decoded.rightwrist.all.activity.all.three.60.secs').set_description("right wrist 60 secs yield")
for field in schema.fields:
    stream_metadata.add_dataDescriptor(
        DataDescriptor().set_name(str(field.name)).set_type(str(field.dataType))
    )
stream_metadata.add_module(
    ModuleMetadata().set_name("right wrist 60 secs yield") \
    .set_attribute("url", "https://md2k.org").set_author(
        "Md Azim Ullah", "mullah@memphis.edu"))
stream_metadata.is_valid()
ds = DataStream(data=data_60,metadata=stream_metadata)
CC.save_stream(ds,overwrite=True)

In [None]:
data = CC.get_stream('org.md2k.feature.motionsensehrv.decoded.rightwrist.all.activity.all.three.60.secs')

schema = data.schema
columns = [field.name for field in schema.fields]

@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def filter_daywise(df):
    """Keep a (user, day) group only when it has more than 120 rows."""
    if df.shape[0] > 120:
        return df
    # Too few rows for this user-day: emit an empty frame with the same columns.
    return pd.DataFrame([],columns=columns)

data_60 = data.groupBy(['user','day']).apply(filter_daywise)

# Register the filtered stream with one descriptor per column and save it.
schema = data_60.schema
stream_metadata = Metadata()
stream_metadata.set_name(
    'org.md2k.feature.motionsensehrv.decoded.rightwrist.all.activity.all.three.60.secs.filtered'
).set_description("right wrist 60 secs yield")
for field in schema.fields:
    descriptor = DataDescriptor().set_name(str(field.name)).set_type(str(field.dataType))
    stream_metadata.add_dataDescriptor(descriptor)
module_metadata = ModuleMetadata().set_name("right wrist 60 secs yield") \
    .set_attribute("url", "https://md2k.org") \
    .set_author("Md Azim Ullah", "mullah@memphis.edu")
stream_metadata.add_module(module_metadata)
stream_metadata.is_valid()
ds = DataStream(data=data_60,metadata=stream_metadata)
CC.save_stream(ds,overwrite=True)

In [None]:
from collections import Counter
data = CC.get_stream('org.md2k.feature.motionsensehrv.decoded.rightwrist.all.activity.all.three.60.secs.filtered')
data.printSchema()
schema =  StructType([
    StructField("timestamp", TimestampType()),
    StructField("localtime", TimestampType()),
    StructField("version", IntegerType()),
    StructField("user", StringType()),
    StructField("prediction_count", ArrayType(DoubleType())),
    StructField("prediction_percentage", ArrayType(DoubleType())),
    StructField("std_count", ArrayType(DoubleType())),
    StructField("std_percentage", ArrayType(DoubleType())),
    StructField("day", StringType())
])

stds = np.arange(.01,.31,.01)
activities = ['Stationery','Stairs','Exercise','Walking','Sports']
@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def daywise_count(df):
    """Summarise one (user, day) group of minute rows into per-activity and per-threshold totals.

    ``prediction_count[i]`` is the number of rows labelled ``activities[i]``
    divided by 60; ``std_count[i]`` is the number of rows with
    ``std >= stds[i]`` divided by 60. The ``*_percentage`` arrays express
    those as percentages of the labelled total and of all rows respectively.
    """
    label_counts = Counter(df['prediction'].values)
    # Float arrays throughout: the previous integer arrays silently truncated
    # fractional values (e.g. 90 rows -> 1 instead of 1.5) and every percentage
    # derived from them.
    prediction_count = np.array([label_counts.get(activity, 0) for activity in activities], dtype=float)/60
    labelled_total = prediction_count.sum()
    if labelled_total > 0:
        prediction_percentage = 100*prediction_count/labelled_total
    else:
        # No row carries a known label: the share is undefined (0/0), made explicit here.
        prediction_percentage = np.full(len(activities), np.nan)

    std_values = df['std'].values
    std_count = np.array([(std_values >= threshold).sum() for threshold in stds], dtype=float)/60
    std_percentage = 100*std_count/(df.shape[0]/60)

    summary = {
        'timestamp': df['timestamp'].values[0],
        'localtime': df['localtime'].values[0],
        'version': 1,
        'user': df['user'].values[0],
        'day': df['day'].values[0],
        'prediction_count': prediction_count,
        'prediction_percentage': prediction_percentage,
        'std_count': std_count,
        'std_percentage': std_percentage,
    }
    return pd.DataFrame([summary],
                       columns = ['timestamp','localtime','version','user','day',
                                  'prediction_count','prediction_percentage','std_count',
                                 'std_percentage'])

data_60 = data.groupBy(['user','day']).apply(daywise_count)
# Describe every output column in the stream metadata, then save.
schema = data_60.schema
stream_metadata = Metadata()
stream_metadata.set_name('org.md2k.feature.motionsensehrv.decoded.rightwrist.all.activity.all.three.60.secs.filtered.count.percentage').set_description("right wrist daywise yield")
for field in schema.fields:
    stream_metadata.add_dataDescriptor(
        DataDescriptor().set_name(str(field.name)).set_type(str(field.dataType))
    )
stream_metadata.add_module(
    ModuleMetadata().set_name("right wrist daywise yield") \
    .set_attribute("url", "https://md2k.org").set_author(
        "Md Azim Ullah", "mullah@memphis.edu"))
stream_metadata.is_valid()
ds = DataStream(data=data_60,metadata=stream_metadata)
CC.save_stream(ds,overwrite=True)

In [None]:
# Read back the per-user-day count/percentage stream.
data = CC.get_stream(
    'org.md2k.feature.motionsensehrv.decoded.rightwrist.all.activity.all.three.60.secs.filtered.count.percentage'
)

In [None]:
# Collect the stream's Spark DataFrame on the driver as a pandas DataFrame.
# Guarded so that re-running the cell (when `data` is already pandas) is a no-op
# instead of failing.
if not isinstance(data, pd.DataFrame):
    data = data._data.toPandas()

In [None]:
import pickle
# Cache the collected frame locally; the context manager closes the file handle.
with open('./data/daywise_count_percentage.p','wb') as out_file:
    pickle.dump(data,out_file)

In [None]:
import pickle
# Reload the cached frame. pickle.load executes arbitrary code from the file,
# so only use this on files this notebook wrote itself.
with open('./data/daywise_count_percentage.p','rb') as in_file:
    data = pickle.load(in_file)

In [None]:
# Sum of each row's prediction_count array.
data['count'] = data['prediction_count'].map(sum)

In [None]:
# Per-user column totals, averaged over users and scaled by 0.8.
# NOTE(review): the reason for the 0.8 factor is not shown in this notebook.
data.groupby('user').sum().mean()*.8

In [None]:
# Grand total of the 'count' column over all rows.
data['count'].sum()

In [None]:
# Key that identifies one user-day: the user id followed directly by the day string.
# Vectorized concatenation replaces the row-by-row apply.
data['user_day'] = data['user'] + data['day']

In [None]:
# Number of distinct user-days, shown next to a hand-entered ratio.
# NOTE(review): 19950 and 353 are hard-coded; their origin is not shown in this notebook.
data['user_day'].unique().shape,19950/353

In [None]:
# Split prediction_percentage into named columns by position (first, last, and a middle slice).
data['stationery'] = data['prediction_percentage'].apply(lambda a:a[0])
data['sports'] = data['prediction_percentage'].apply(lambda a:a[-1])
# NOTE(review): a[1:-2] sums positions 1 and 2 of a five-element array. With the
# activity order used when the stream was built
# (['Stationery','Stairs','Exercise','Walking','Sports']) that is Stairs + Exercise
# and excludes Walking (position 3) — confirm which positions 'walking' should cover.
data['walking'] = data['prediction_percentage'].apply(lambda a:sum(a[1:-2]))

In [None]:
# Mean of the 'stationery' and 'sports' columns over all rows.
data['stationery'].mean(),data['sports'].mean()

In [None]:
# One prediction_count array per row, as a NumPy array of arrays.
a = data['prediction_count'].values

In [None]:
# Total of each row's prediction_count array.
a1 = list(map(sum, a))

In [None]:
# Mean of the per-row totals, ignoring NaN entries.
np.nanmean(a1)

In [None]:
# Keep only rows whose first entry is present and greater than 1.
# NOTE(review): this overwrites `a`, so the cell is not idempotent; re-run the
# cell that defines `a` before running it again.
a = [b for b in a if b[0] is not None and b[0]>1]

In [None]:
import numpy as np

In [None]:
# Mean of the first entry over rows where it exceeds 2.
np.mean([b[0] for b in a if b[0]>2])

In [None]:
# Mean of the third entry (index 2) over all remaining rows.
np.mean([b[2] for b in a])

In [None]:
# Mean of the last entry over rows where it is positive.
np.mean([b[-1] for b in a if b[-1]>0])

In [None]:
# Mean of (index 1 + index 3) over rows where index 3 is positive.
np.mean([b[1]+b[3] for b in a if b[3]>0])

In [None]:
stds = np.arange(.01,.31,.01)
activities = ['Stationery','Stairs','Exercise','Walking','Sports']
column_name='std_count'

# The x labels are activity names for prediction_* columns, otherwise the
# thresholds rounded to two decimals.
is_prediction_column = column_name.split('_')[0]=='prediction'
x = activities if is_prediction_column else [np.round(threshold*100)/100 for threshold in stds]

# Long format: one output row per (source row, x label) pair.
rows = [
    [label, record[column_name][position], record['user'], record['day']]
    for _, record in data.iterrows()
    for position, label in enumerate(x)
]
df = pd.DataFrame(rows,columns=['Activity Type','value','user','day'])

In [None]:
import seaborn as sns

In [None]:
import matplotlib.pyplot as plt

In [None]:
import os

# Bar plot of the long-format frame; axis labels follow the two parts of column_name.
plt.rcParams.update({'font.size':25})
plt.figure(figsize=(20,10))
sns.barplot(x='Activity Type',y='value',data=df)
name_parts = column_name.split('_')
if name_parts[1]=='count':
    plt.ylabel('Hours per user day')
else:
    plt.ylabel('Percentage per user day')
if name_parts[0]=='std':
    plt.xlabel('Accelerometer Standard Deviation')
else:
    plt.xlabel('Activity Type')
plt.xticks(rotation=60)
plt.tight_layout()
# savefig has no `dps` argument; the resolution keyword is `dpi`. The original
# value of 1e6 would be unusable for a 20x10 inch figure, so 300 is used.
os.makedirs('./images', exist_ok=True)
plt.savefig('./images/'+column_name+'.png',dpi=300)
plt.show()

In [None]:
# Mean of the numeric columns for each 'Activity Type' label.
df.groupby('Activity Type').mean()

In [None]:
import pickle
filepath = './data/walking_10/right_wrist/'
stream_name = 'org.md2k.feature.motionsensehrv.decoded.rightwrist.all.activity.10.secs'
# Context managers close each file handle; the bare open() calls never did.
# pickle.load executes arbitrary code from the file: only load trusted files.
with open('./data/users.p','rb') as users_file:
    users = pickle.load(users_file)
# Export each user's stream as its own pickle, printing a running index as progress.
for i,user in enumerate(users):
    df = CC.get_stream(stream_name,user_id=user)
    data  = df.toPandas()
    with open(filepath+user,'wb') as out_file:
        pickle.dump(data,out_file)
    print(i,end=',')

In [None]:
import pandas as pd




In [None]:
import shutil

In [None]:
# Annotated heatmap of the scores stored in maxmean_10.json.
maxmean_score = pd.read_json('maxmean_10.json')
import os
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size':15})
plt.figure(figsize=(20,10))
# seaborn's cell-border parameter is `linewidths`; `linewidth` bypassed it and
# was forwarded to matplotlib next to seaborn's own linewidths=0.
sns.heatmap(maxmean_score,annot=True,fmt='.2f',cmap='Reds',linewidths=1,linecolor='black')
plt.title('Max Mean Boosting')
plt.xlabel('Train data per user, minutes')
plt.ylabel('Test data length, minutes')
plt.tight_layout()
# savefig has no `dps` argument; the resolution keyword is `dpi`. The original
# value of 1e6 would be unusable for a 20x10 inch figure, so 300 is used.
os.makedirs('./images', exist_ok=True)
plt.savefig('./images/maxmean_boosting.png',dpi=300)
plt.show()

In [None]:
# Annotated heatmap of the scores stored in majority_10.json.
# NOTE(review): the variable keeps the name `maxmean_score` although it now
# holds the majority scores; the name is kept so notebook state is unchanged.
maxmean_score = pd.read_json('majority_10.json')
import os
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size':15})
plt.figure(figsize=(20,10))
# seaborn's cell-border parameter is `linewidths`; `linewidth` bypassed it and
# was forwarded to matplotlib next to seaborn's own linewidths=0.
sns.heatmap(maxmean_score,annot=True,fmt='.2f',cmap='Reds',linewidths=1,linecolor='black')
plt.title('Majority Boosting')
plt.xlabel('Train data per user, minutes')
plt.ylabel('Test data length, minutes')
plt.tight_layout()
# savefig has no `dps` argument; the resolution keyword is `dpi`. The original
# value of 1e6 would be unusable for a 20x10 inch figure, so 300 is used.
os.makedirs('./images', exist_ok=True)
plt.savefig('./images/majority_boosting.png',dpi=300)
plt.show()

In [None]:
import shutil
import pickle
import os
import matplotlib.pyplot as plt

from collections import Counter

import numpy as np

# Hours of data per pickled frame in ./data/right_wrist/, computed as rows * 20 / 3600.
# The bare expression statements that sat mid-cell (`df.loc[0]['data']`, `users`,
# `data.shape[0]*20/3600`) displayed nothing and have been dropped.
# NOTE(review): 20 is presumably the window length in seconds — confirm.
seconds_per_row = 20
hours = []
for f in os.listdir('./data/right_wrist/'):
    # pickle.load executes arbitrary code from the file: only load trusted files.
    with open('./data/right_wrist/'+f,'rb') as in_file:
        data = pickle.load(in_file)
    hours.append(data.shape[0]*seconds_per_row/3600)
# How many files fall into each whole-hour bucket.
Counter(np.floor(hours))