In [2]:
import tensorflow as tf
import numpy as np
seed = 100
tf.random.set_seed(seed)
np.random.seed(seed)
import os
import pandas as pd
import pickle
import numpy as np
import tensorflow as tf
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Conv1D,BatchNormalization,Dropout,Input,MaxPooling1D,Flatten,Dense,Input,Activation,GRU
from tensorflow.keras.models import Model, Sequential
from sklearn.metrics import accuracy_score
from joblib import Parallel,delayed
from scipy.stats import mode
from sklearn.metrics import accuracy_score

In [3]:
import pickle
import pandas as pd
def get_participants_df(directory,window_size,min_length):
    """List the participants whose recording is at least ``min_length`` long.

    Every non-hidden file in ``directory`` is unpickled and its row count
    (``.shape[0]``) is converted to whole minutes using ``60 // window_size``
    rows per minute.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only point this
    at files you produced yourself.

    Parameters
    ----------
    directory : str
        Folder holding one pickle per participant; a trailing separator is
        optional.
    window_size : int
        Window length used to derive rows-per-minute (presumably seconds --
        TODO confirm against the preprocessing code).
    min_length : int
        Minimum length, in the same minute unit, a participant must reach.

    Returns
    -------
    pandas.DataFrame
        Columns ``user`` (the file name) and ``total_test_length``, restricted
        to rows with ``total_test_length >= min_length`` and ordered by file
        name.
    """
    windows_per_minute = 60 // window_size
    records = []
    # os.listdir returns entries in arbitrary order; sorting keeps any later
    # "first n_user rows" selection reproducible across machines and runs.
    for file_name in sorted(os.listdir(directory)):
        if file_name.startswith('.'):
            # hidden entries are skipped rather than unpickled
            continue
        with open(os.path.join(directory, file_name), 'rb') as handle:
            data = pickle.load(handle)
        records.append([file_name, data.shape[0] // windows_per_minute])
    participants = pd.DataFrame(records, columns=['user', 'total_test_length'])
    return participants[participants.total_test_length >= min_length]

def get_training_data(directory,
                      train_length,
                      n_user,
                      participant_df,
                      window_size):
    """Assemble the training arrays for the first ``n_user`` participants.

    For each selected participant the pickle ``directory/<user>`` is loaded.
    When ``train_length`` is ``-1``, or the recording has no more than
    ``train_length * (60 // window_size)`` rows, the whole recording is used;
    otherwise one contiguous block of exactly that many rows is drawn at a
    random offset using NumPy's global random state.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only use
    trusted files.

    Parameters
    ----------
    directory : str
        Folder with one pickle per user (trailing separator optional).
    train_length : int
        Requested amount of training data per user, in minutes; ``-1`` means
        "use everything".
    n_user : int
        Number of leading rows of ``participant_df`` to use.
    participant_df : pandas.DataFrame
        Must provide a ``user`` column of file names.
    window_size : int
        Used to derive rows-per-minute as ``60 // window_size``.

    Returns
    -------
    tuple
        ``(X, y, y_dict)``: the concatenated ``data`` entries, the integer
        label of every row, and the ``{user: label}`` mapping (labels follow
        the sorted order of the user names).

    Raises
    ------
    ValueError
        If no participant is selected.
    """
    windows_per_minute = 60 // window_size
    users = participant_df['user'].values[:n_user]
    if len(users) == 0:
        raise ValueError('no participants selected: participant_df is empty or n_user is 0')

    features = []
    labels = []
    for user in users:
        with open(os.path.join(directory, user), 'rb') as handle:
            user_df = pickle.load(handle)
        wanted = windows_per_minute * train_length
        if train_length != -1 and user_df.shape[0] > wanted:
            # randint's upper bound is exclusive; the +1 makes the last valid
            # start offset (the block ending on the final row) reachable.
            start = np.random.randint(0, user_df.shape[0] - wanted + 1)
            user_df = user_df.iloc[start:start + wanted]
        features.append(np.concatenate(list(user_df['data'])))
        labels.extend([user] * user_df.shape[0])

    # plain-str keys keep the pickled mapping independent of numpy scalar types
    y_dict = {str(name): idx for idx, name in enumerate(np.unique(labels))}
    y = np.array([y_dict[name] for name in labels])
    return np.concatenate(features), y, y_dict

def get_trained_model(X_train,y_train,n_timesteps,n_channels,window_size,filepath):
    """Train a fresh network and return it restored to its best checkpoint.

    The data is split 80/20 (stratified on ``y_train``) into training and
    validation parts. Training runs for at most 200 epochs with batch size
    100; it stops early after 40 epochs without ``val_loss`` improvement, and
    the best model by ``val_loss`` is written to ``filepath``. After fitting,
    the weights stored at ``filepath`` are loaded back and the validation
    accuracy is printed (followed by a comma, no newline).

    No learning-rate schedule is applied: only early stopping and
    checkpointing are passed to ``fit``.

    Parameters
    ----------
    X_train, y_train : array-like
        Training samples and their integer class labels.
    n_timesteps, n_channels : int
        Input shape handed to ``get_model``.
    window_size : int
        Unused; kept so existing calls keep working.
    filepath : str
        Where the best checkpoint is saved and later reloaded from.

    Returns
    -------
    The trained Keras model carrying the best-checkpoint weights.
    """
    n_classes = len(np.unique(y_train))
    model = get_model(input_shape=(n_timesteps,n_channels),n_classes=n_classes)
    checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True, mode='min',save_weights_only=False)
    early_stopping = EarlyStopping(monitor='val_loss', mode='min', verbose=0,patience=40)
    train_x,val_x,train_y,val_y = train_test_split(X_train,y_train,test_size=.2,stratify=y_train)
    model.fit(train_x,train_y,
              validation_data=(val_x,val_y),
              epochs=200,
              batch_size=100,
              verbose=0,
              callbacks=[early_stopping,checkpoint],
              shuffle=True)
    # the in-memory model holds the last epoch; go back to the best one on disk
    model.load_weights(filepath)
    print(accuracy_score(val_y,model.predict(val_x).argmax(axis=1)),end=',')
    return model

def get_model(input_shape=(500,3),n_classes=1):
    """Build and compile the Conv1D + GRU classifier.

    Layer stack, in order: two blocks of ``Conv1D(128, kernel 2, relu)``
    followed by ``MaxPooling1D(2)``, a ``tanh`` activation, 40% dropout, a
    128-unit GRU returning only its final state, ``Flatten``, a 350-unit
    dense layer named ``'feature'``, a linear ``Dense(n_classes)`` and a
    softmax ``Dense(n_classes)`` output.

    The model is compiled with sparse categorical cross-entropy, the Adam
    optimizer and accuracy reported under the metric name ``'acc'``.

    Parameters
    ----------
    input_shape : tuple
        ``(timesteps, channels)`` of one sample.
    n_classes : int
        Width of the last two dense layers.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    conv_options = dict(activation='relu',kernel_initializer='normal',padding='same')
    network = Sequential([
        Conv1D(128,2,input_shape=input_shape,**conv_options),
        MaxPooling1D(2),
        Conv1D(128,2,**conv_options),
        MaxPooling1D(2),
        Activation('tanh'),
        Dropout(.4),
        GRU(128,return_sequences=False,activation='tanh'),
        Flatten(),
        Dense(350,name='feature'),
        Dense(n_classes),
        Dense(n_classes,activation='softmax'),
    ])
    network.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                    optimizer='adam',
                    metrics=['acc'])
    return network


# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
window_size = 20
activity = 'walking_moral'
data_directory = './data/'+str(window_size)+'/'+activity+'/'
model_directory = './models/'+str(window_size)+'/'+activity+'/'
min_test_total_length  = 10
fs = 25  # presumably the sampling rate in Hz -- TODO confirm
n_timesteps,n_channels = fs*window_size,3
os.makedirs(model_directory,exist_ok=True)

# Participants are selected from the *testing* folder so that every trained
# user has at least `min_test_total_length` minutes available for evaluation.
participant_df = get_participants_df(data_directory+'testing/',window_size,min_test_total_length)

# Current sweep: every eligible user, all available training data (-1),
# three repetitions. Other cohort sizes / training lengths can be swept by
# editing the three lists below.
n_iters = np.arange(0,3,1)
n_users = [participant_df.shape[0]]
train_lengths = [-1]
print(n_users)

# ---------------------------------------------------------------------------
# Training loop: one model + one user->label mapping per
# (n_user, train_length, iteration), stored under
# <model_directory>/<n_user>/<train_length>/
# ---------------------------------------------------------------------------
for n_user in n_users:
    os.makedirs(model_directory+str(n_user),exist_ok=True)
    print('--'*30)
    print('Starting for no. of training users = ',n_user)
    for train_length in train_lengths:
        print('Training length minutes = ',train_length)
        run_directory = model_directory+str(n_user)+'/'+str(train_length)+'/'
        os.makedirs(run_directory,exist_ok=True)
        for n_iter in n_iters:
            X_train,y_train,user_dict = get_training_data(directory = data_directory+'training/',
                                                          train_length=train_length,
                                                          n_user=n_user,
                                                          participant_df=participant_df,
                                                          window_size=window_size)
            print(X_train.shape)
            # `seed` is defined in the first cell of the notebook
            run_tag = '_seed_'+str(seed)+'_iteration_'+str(n_iter)
            with open(run_directory+'userdict'+run_tag+'.p','wb') as handle:
                pickle.dump(user_dict,handle)
            model_filepath = run_directory+'trainedmodel'+run_tag+'.hdf5'
            # get_trained_model builds its own network, so none is created here
            model = get_trained_model(X_train,y_train,n_timesteps,n_channels,window_size,model_filepath)
            print('iteration = ',n_iter)
        print(train_length, '--Done')

[22]
------------------------------------------------------------
Starting for no. of training users =  22
Training length minutes =  -1
(20949, 500, 3)
0.6410501193317423,iteration =  0
(20949, 500, 3)
0.6343675417661098,iteration =  1
(20949, 500, 3)
0.6257756563245823,iteration =  2
-1 --Done


In [None]:
import tensorflow as tf
import os
import pickle
import pandas as pd
import numpy as np

def get_testing_data(directory,min_length,window_size):
    """Load the test recordings of every sufficiently long participant.

    Each non-hidden file in ``directory`` is unpickled. Files whose row count
    divided by ``60 // window_size`` is below ``min_length`` are skipped; for
    the others the entries of the ``data`` column are concatenated and every
    row is labelled with the file name.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only use
    trusted files.

    Parameters
    ----------
    directory : str
        Folder with one pickle per user (trailing separator optional).
    min_length : int
        Minimum recording length, in minutes, for a user to be included.
    window_size : int
        Used to derive rows-per-minute as ``60 // window_size``.

    Returns
    -------
    tuple
        ``(X, y)``: the concatenated samples and an array holding the file
        name of the user each row came from, ordered by file name.

    Raises
    ------
    ValueError
        If no file in ``directory`` satisfies ``min_length``.
    """
    windows_per_minute = 60 // window_size
    features = []
    labels = []
    # sorted: os.listdir order is arbitrary and would make row order vary
    for file_name in sorted(os.listdir(directory)):
        if file_name.startswith('.'):
            continue
        with open(os.path.join(directory, file_name), 'rb') as handle:
            data = pickle.load(handle)
        if data.shape[0] // windows_per_minute < min_length:
            continue
        features.append(np.concatenate(list(data['data'])))
        labels.extend([file_name] * data.shape[0])
    if not features:
        raise ValueError('no test recording in %r reaches min_length=%r' % (directory, min_length))
    return np.concatenate(features), np.array(labels)



def _predict_with_saved_models(run_directory,X,y):
    """Run every ``*.hdf5`` model in ``run_directory`` over the test windows.

    For each model the matching ``userdict`` pickle (same file name with
    ``trainedmodel`` -> ``userdict`` and ``hdf5`` -> ``p``) supplies the
    user -> label mapping; only rows of ``X``/``y`` belonging to users in that
    mapping are predicted. Returns one DataFrame per model with the columns
    ``user``, ``y``, ``index``, ``y_prob``, ``y_pred`` and ``iteration``.
    """
    # sorted so that iteration i is the same file on every run/machine
    model_names = sorted(a for a in os.listdir(run_directory) if a.endswith('.hdf5'))
    frames = []
    for i,model_name in enumerate(model_names):
        print(i,end=',')
        dict_name = model_name.replace('trainedmodel','userdict').replace('hdf5','p')
        with open(os.path.join(run_directory,dict_name),'rb') as handle:
            user_dict = pickle.load(handle)
        known = np.isin(y,list(user_dict.keys()))
        if not known.any():
            # none of the test users were seen by this model
            continue
        y_temp = y[known]
        X_temp = X[known]
        model = tf.keras.models.load_model(os.path.join(run_directory,model_name))
        # Predict on the *filtered* windows so that row k of the output really
        # belongs to y_temp[k], also when the model knows only some users.
        y_prob = model.predict(X_temp)
        frames.append(pd.DataFrame({'user':y_temp,
                                    'y':np.array([user_dict[a] for a in y_temp]),
                                    'index':np.arange(len(y_temp)),
                                    'y_prob':list(y_prob),
                                    'y_pred':y_prob.argmax(axis=1),
                                    'iteration':i}))
    return frames


def _score_user_iteration(df,test_lengths,n_iter,windows_per_minute,train_length):
    """Score the window-level predictions of one (user, iteration) group.

    The first output row (``test_length`` 0) is the plain per-window accuracy.
    For every ``t`` in ``test_lengths``, ``n_iter`` random subsets of
    ``t * windows_per_minute`` windows are drawn without replacement and each
    subset is reduced to one decision in two ways: the most frequent
    predicted label (``majority_score``) and the arg-max of the mean
    probability vector (``maxmean_score``). When the group has fewer windows
    than a subset needs, both scores are recorded as NaN.
    """
    from sklearn.metrics import accuracy_score

    user = df['user'].values[0]
    iteration = df['iteration'].values[0]
    window_accuracy = accuracy_score(df['y'],df['y_pred'])
    rows = [[0,window_accuracy,window_accuracy,train_length,user,iteration]]
    for t in test_lengths:
        n = int(t)*windows_per_minute
        if n>df.shape[0]:
            rows.append([t,np.nan,np.nan,train_length,user,iteration])
            continue
        y_true,y_majority,y_maxmean = [],[],[]
        for _ in range(n_iter):
            subset = df.sample(n,replace=False)
            y_true.append(subset['y'].values[0])
            # bincount().argmax() = most frequent label, smallest label on ties
            y_majority.append(np.bincount(subset['y_pred'].to_numpy().astype(np.int64)).argmax())
            y_maxmean.append(np.stack(list(subset['y_prob'])).mean(axis=0).argmax())
        rows.append([t,accuracy_score(y_true,y_majority),accuracy_score(y_true,y_maxmean),train_length,user,iteration])
    return pd.DataFrame(rows,columns=['test_length','majority_score',
                                      'maxmean_score','train_length',
                                      'user','iteration'])


def get_test_results_alll(window_size=20,activity='sports',n_user=50,min_length=100):
    """Evaluate all saved models of one activity / cohort size and report.

    Three stages, each persisted on disk:

    1. For every training-length folder below
       ``./models/<window_size>/<activity>/<n_user>/`` that has no prediction
       file yet, predict the test windows with every saved model and pickle
       the result to ``./predictions/<window_size>/<activity>/<n_user>/``.
    2. For every prediction file, compute the majority-vote and max-mean
       accuracies per user and iteration for a range of test durations
       (in parallel) and pickle the running table to
       ``./results/<activity><n_user>.p``.
    3. Average over users, then over iterations; draw one line plot per
       score and write both pivot tables as CSV to ``./final_results/``.

    NOTE(review): ``pickle.load`` can execute arbitrary code; all pickles
    read here are expected to be produced by this notebook.

    Parameters
    ----------
    window_size : int
        Window length; also fixes windows-per-minute as ``60 // window_size``.
    activity : str
        Activity folder name.
    n_user : int
        Cohort-size folder name below the activity.
    min_length : int
        Minimum test recording length (minutes) for a user to be evaluated.

    Returns
    -------
    None
    """
    from joblib import Parallel,delayed
    import matplotlib.pyplot as plt
    import seaborn as sns

    data_directory = './data/'+str(window_size)+'/'+activity+'/testing/'
    model_directory = './models/'+str(window_size)+'/'+activity+'/'+str(n_user)+'/'
    save_directory = './predictions/'+str(window_size)+'/'+activity+'/'+str(n_user)+'/'
    result_directory = './results/'
    final_result_directory = './final_results/'
    for folder in (save_directory,result_directory,final_result_directory):
        os.makedirs(folder,exist_ok=True)

    X,y = get_testing_data(data_directory,min_length,window_size)

    # ---- stage 1: window-level predictions ------------------------------
    already_predicted = set(os.listdir(save_directory))
    for f in sorted(os.listdir(model_directory)):
        run_directory = model_directory+f+'/'
        if f in already_predicted or f.startswith('.') or not os.path.isdir(run_directory):
            continue
        print(f)
        predictions_all = _predict_with_saved_models(run_directory,X,y)
        if not predictions_all:
            print(f,'skipped: no model with matching test users')
            continue
        with open(save_directory+f,'wb') as handle:
            pickle.dump(pd.concat(predictions_all),handle)
        print(f,'done')

    # ---- stage 2: accuracy as a function of test duration ---------------
    test_lengths = list(np.arange(1,10,1))+list(np.arange(10,130,10))+list(np.arange(150,750,50))
    n_iter = 100
    windows_per_minute = 60//window_size
    results_path = result_directory+activity+str(n_user)+'.p'
    per_file_results = []
    for f in sorted(os.listdir(save_directory)):
        if f.startswith('.'):
            continue
        print(f)
        with open(save_directory+f,'rb') as handle:
            dd = pickle.load(handle)
        # prediction files are named after their training-length folder
        train_length = np.int64(f)
        groups = [group for _,group in dd.groupby(['user','iteration'])]
        all_results = Parallel(n_jobs=40,verbose=2)(
            delayed(_score_user_iteration)(group,test_lengths,n_iter,windows_per_minute,train_length)
            for group in groups)
        per_file_results.append(pd.concat(all_results))
        # rewritten after every file so a crash keeps the finished part
        with open(results_path,'wb') as handle:
            pickle.dump(pd.concat(per_file_results),handle)
        print(f,'done')

    if not per_file_results:
        print('No prediction files found in',save_directory)
        return
    results = pd.concat(per_file_results)

    # ---- stage 3: aggregate, plot, export -------------------------------
    score_columns = ['majority_score','maxmean_score']
    # Only the score columns are averaged: the string `user` column cannot be
    # averaged and makes a bare .mean() raise on recent pandas versions.
    final_results = (results
                     .groupby(['test_length','iteration','train_length'],as_index=False)[score_columns].mean()
                     .groupby(['test_length','train_length'],as_index=False)[score_columns].mean())

    plt.rcParams.update({'font.size':40})
    for score in score_columns:
        plt.figure(figsize=(30,20))
        sns.lineplot(x='test_length',y=score,hue='train_length',data=final_results)
        plt.ylim([0,1])
        plt.show()

    activity1 = activity
    if activity=='std':
        activity1 += '20'
    activity1 += '_'+str(n_user)
    for score,suffix in (('maxmean_score','_maxmean.csv'),('majority_score','_majority.csv')):
        table = pd.pivot_table(final_results,columns='train_length',index='test_length',values=score,aggfunc='mean')
        table.to_csv(final_result_directory+activity1+suffix)
    return

In [None]:
# Evaluate the 'sports' models for every cohort size: three small cohorts
# followed by 50, 100, ..., 300.
n_users = [10,20,40]+list(np.arange(50,350,50))
for n_user in n_users:
    get_test_results_alll(n_user=n_user,
                          activity='sports',
                          window_size=20,
                          min_length=100)

In [None]:
import pickle
import pandas as pd
def get_participants_df(directory,window_size,min_length):
    """List the participants whose recording is at least ``min_length`` long.

    Every non-hidden file in ``directory`` is unpickled and its row count
    (``.shape[0]``) is converted to whole minutes using ``60 // window_size``
    rows per minute. Hidden entries (names starting with ``.``) are skipped
    instead of being unpickled.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only point this
    at files you produced yourself.

    Parameters
    ----------
    directory : str
        Folder holding one pickle per participant; a trailing separator is
        optional.
    window_size : int
        Window length used to derive rows-per-minute (presumably seconds --
        TODO confirm against the preprocessing code).
    min_length : int
        Minimum length, in the same minute unit, a participant must reach.

    Returns
    -------
    pandas.DataFrame
        Columns ``user`` (the file name) and ``total_test_length``, restricted
        to rows with ``total_test_length >= min_length`` and ordered by file
        name.
    """
    windows_per_minute = 60 // window_size
    records = []
    # os.listdir returns entries in arbitrary order; sorting keeps any later
    # "first n_user rows" selection reproducible across machines and runs.
    for file_name in sorted(os.listdir(directory)):
        if file_name.startswith('.'):
            continue
        with open(os.path.join(directory, file_name), 'rb') as handle:
            data = pickle.load(handle)
        records.append([file_name, data.shape[0] // windows_per_minute])
    participants = pd.DataFrame(records, columns=['user', 'total_test_length'])
    return participants[participants.total_test_length >= min_length]

def get_training_data(directory,
                      train_length,
                      n_user,
                      participant_df,
                      window_size):
    """Assemble the training arrays for the first ``n_user`` participants.

    For each selected participant the pickle ``directory/<user>`` is loaded.
    When ``train_length`` is ``-1``, or the recording has no more than
    ``train_length * (60 // window_size)`` rows, the whole recording is used;
    otherwise one contiguous block of exactly that many rows is drawn at a
    random offset using NumPy's global random state.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only use
    trusted files.

    Parameters
    ----------
    directory : str
        Folder with one pickle per user (trailing separator optional).
    train_length : int
        Requested amount of training data per user, in minutes; ``-1`` means
        "use everything".
    n_user : int
        Number of leading rows of ``participant_df`` to use.
    participant_df : pandas.DataFrame
        Must provide a ``user`` column of file names.
    window_size : int
        Used to derive rows-per-minute as ``60 // window_size``.

    Returns
    -------
    tuple
        ``(X, y, y_dict)``: the concatenated ``data`` entries, the integer
        label of every row, and the ``{user: label}`` mapping (labels follow
        the sorted order of the user names).

    Raises
    ------
    ValueError
        If no participant is selected.
    """
    windows_per_minute = 60 // window_size
    users = participant_df['user'].values[:n_user]
    if len(users) == 0:
        raise ValueError('no participants selected: participant_df is empty or n_user is 0')

    features = []
    labels = []
    for user in users:
        with open(os.path.join(directory, user), 'rb') as handle:
            user_df = pickle.load(handle)
        wanted = windows_per_minute * train_length
        # Sub-sample only when a finite length was requested and the recording
        # is longer than it; otherwise randint would get an empty range.
        if train_length != -1 and user_df.shape[0] > wanted:
            # randint's upper bound is exclusive; the +1 makes the last valid
            # start offset reachable.
            start = np.random.randint(0, user_df.shape[0] - wanted + 1)
            user_df = user_df.iloc[start:start + wanted]
        features.append(np.concatenate(list(user_df['data'])))
        labels.extend([user] * user_df.shape[0])

    # plain-str keys keep the pickled mapping independent of numpy scalar types
    y_dict = {str(name): idx for idx, name in enumerate(np.unique(labels))}
    y = np.array([y_dict[name] for name in labels])
    return np.concatenate(features), y, y_dict

def get_trained_model(X_train,y_train,n_timesteps,n_channels,window_size,filepath):
    """Train a fresh network and return it restored to its best checkpoint.

    The data is split 80/20 (stratified on ``y_train``) into training and
    validation parts. Training runs for at most 200 epochs with batch size
    300; it stops early after 40 epochs without ``val_acc`` improvement, and
    the best model by ``val_acc`` is written to ``filepath``. After fitting,
    the weights stored at ``filepath`` are loaded back and the validation
    accuracy is printed (followed by a comma, no newline).

    No learning-rate schedule is applied: only early stopping and
    checkpointing are passed to ``fit``.

    Parameters
    ----------
    X_train, y_train : array-like
        Training samples and their integer class labels.
    n_timesteps, n_channels : int
        Input shape handed to ``get_model``.
    window_size : int
        Unused; kept so existing calls keep working.
    filepath : str
        Where the best checkpoint is saved and later reloaded from.

    Returns
    -------
    The trained Keras model carrying the best-checkpoint weights.
    """
    n_classes = len(np.unique(y_train))
    model = get_model(input_shape=(n_timesteps,n_channels),n_classes=n_classes)
    # 'val_acc' exists because get_model compiles with metrics=['acc']
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=0, save_best_only=True, mode='max',save_weights_only=False)
    early_stopping = EarlyStopping(monitor='val_acc', mode='max', verbose=0,patience=40)
    train_x,val_x,train_y,val_y = train_test_split(X_train,y_train,test_size=.2,stratify=y_train)
    model.fit(train_x,train_y,
              validation_data=(val_x,val_y),
              epochs=200,
              batch_size=300,
              verbose=0,
              callbacks=[early_stopping,checkpoint],
              shuffle=True)
    # the in-memory model holds the last epoch; go back to the best one on disk
    model.load_weights(filepath)
    print(accuracy_score(val_y,model.predict(val_x).argmax(axis=1)),end=',')
    return model

def get_model(input_shape=(500,3),n_classes=1):
    """Create the compiled Conv1D -> GRU -> Dense classifier.

    Layers, in order:
      * ``Conv1D(128, 2, relu, same padding)`` + ``MaxPooling1D(2)``, twice
      * ``Activation('tanh')`` and ``Dropout(0.4)``
      * ``GRU(128)`` returning only the last state, then ``Flatten``
      * ``Dense(350)`` named ``'feature'``
      * ``Dense(n_classes)`` (linear) and ``Dense(n_classes, softmax)``

    Compiled with sparse categorical cross-entropy, Adam, and accuracy
    tracked under the name ``'acc'``.

    Parameters
    ----------
    input_shape : tuple
        ``(timesteps, channels)`` of one sample.
    n_classes : int
        Width of the last two dense layers.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    def conv_block(**extra):
        # one convolution; `extra` carries input_shape for the first layer
        return Conv1D(128,2,activation='relu',kernel_initializer='normal',padding='same',**extra)

    layers = [
        conv_block(input_shape=input_shape),
        MaxPooling1D(2),
        conv_block(),
        MaxPooling1D(2),
        Activation('tanh'),
        Dropout(.4),
        GRU(128,return_sequences=False,activation='tanh'),
        Flatten(),
        Dense(350,name='feature'),
        Dense(n_classes),
        Dense(n_classes,activation='softmax'),
    ]
    classifier = Sequential(layers)
    classifier.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                       optimizer='adam',
                       metrics=['acc'])
    return classifier


# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
window_size = 20
activity = 'sports'
data_directory = './data/'+str(window_size)+'/'+activity+'/'
model_directory = './models/'+str(window_size)+'/'+activity+'/'
min_test_total_length  = 100
fs = 25  # presumably the sampling rate in Hz -- TODO confirm
n_timesteps,n_channels = fs*window_size,3
os.makedirs(model_directory,exist_ok=True)

# Participants are selected from the *testing* folder so that every trained
# user has at least `min_test_total_length` minutes available for evaluation.
participant_df = get_participants_df(data_directory+'testing/',window_size,min_test_total_length)

# Sweep: training lengths 10..50 (step 10) and 60..180 (step 30) minutes,
# three repetitions each, for the full cohort.
train_lengths = list(np.arange(10,60,10))+list(np.arange(60,210,30))
n_iters = np.arange(3)
n_users = [participant_df.shape[0]]

# ---------------------------------------------------------------------------
# Training loop: one model + one user->label mapping per
# (n_user, train_length, iteration), stored under
# <model_directory>/<n_user>/<train_length>/
# ---------------------------------------------------------------------------
for n_user in n_users[::-1]:
    os.makedirs(model_directory+str(n_user),exist_ok=True)
    print('--'*30)
    print('Starting for no. of training users = ',n_user)
    for train_length in train_lengths:
        print('Training length minutes = ',train_length)
        run_directory = model_directory+str(n_user)+'/'+str(train_length)+'/'
        os.makedirs(run_directory,exist_ok=True)
        for n_iter in n_iters:
            X_train,y_train,user_dict = get_training_data(directory = data_directory+'training/',
                                                          train_length=train_length,
                                                          n_user=n_user,
                                                          participant_df=participant_df,
                                                          window_size=window_size)
            print(X_train.shape)
            # `seed` is defined in the first cell of the notebook
            run_tag = '_seed_'+str(seed)+'_iteration_'+str(n_iter)
            with open(run_directory+'userdict'+run_tag+'.p','wb') as handle:
                pickle.dump(user_dict,handle)
            model_filepath = run_directory+'trainedmodel'+run_tag+'.hdf5'
            # get_trained_model builds its own network, so none is created here
            model = get_trained_model(X_train,y_train,n_timesteps,n_channels,window_size,model_filepath)
            print('iteration = ',n_iter)
        print(train_length, '--Done')

In [None]:
# Sanity check: draw the first training sample (X_train[0]); a 2-D sample is
# drawn as one line per column.
fig, ax = plt.subplots()
ax.plot(X_train[0])
ax.set(title='First training window',
       xlabel='Sample index within window',
       ylabel='Signal value')
plt.show()

In [None]:
import tensorflow as tf
import os

import pickle
# Load one trained 'walking' model (seed 100, iteration 1) together with its
# user -> label mapping, and point `data_directory` at the matching test set.
window_size = 20
train_length = 120
activity = 'walking'
n_user  = 333
model_directory = './models/'+str(window_size)+'/'+activity+'/'+str(n_user)+'/'+str(train_length)+'/'

# The names originally hard-coded here ('trained_model_...', 'user_dict_...')
# are tried first; the training cells of this notebook write
# 'trainedmodel_...' / 'userdict_...', so those are used as a fallback.
# If neither exists the original name is kept and loading fails as before.
model_candidates = [model_directory+'trained_model_seed_100_iteration_1.hdf5',
                    model_directory+'trainedmodel_seed_100_iteration_1.hdf5']
model_path = next((path for path in model_candidates if os.path.exists(path)),model_candidates[0])
model = tf.keras.models.load_model(model_path)

user_dict_candidates = [model_directory+'user_dict_seed_100_iteration_1.p',
                        model_directory+'userdict_seed_100_iteration_1.p']
user_dict_path = next((path for path in user_dict_candidates if os.path.exists(path)),user_dict_candidates[0])
# NOTE(review): pickle.load can execute arbitrary code; trusted files only.
with open(user_dict_path,'rb') as handle:
    user_dict = pickle.load(handle)

data_directory = './data/'+str(window_size)+'/'+activity+'/testing/'

from sklearn.metrics import accuracy_score
import numpy as np
def get_training_data(directory,
                      test_length,
                      user_dict,
                      window_size,
                      model):
    """Evaluate ``model`` on the recording of every user in ``user_dict``.

    Despite its name this function does not build training data: for each
    user it loads ``directory/<user>``, predicts all of its windows and
    compares the arg-max class with the user's label from ``user_dict``.

    NOTE(review): this redefines the ``get_training_data`` used by the
    training cells with a different signature; re-run the training cell's
    definition before training again.
    NOTE(review): ``pickle.load`` can execute arbitrary code; trusted files
    only.

    Parameters
    ----------
    directory : str
        Folder with one pickle per user (trailing separator optional).
    test_length : int
        Unused; kept so existing calls keep working.
    user_dict : dict
        Maps user file name to integer class label.
    window_size : int
        Unused; kept so existing calls keep working.
    model
        Object whose ``predict(X)`` returns per-class scores for each row.

    Returns
    -------
    tuple
        ``(results, df)``: the list of per-user accuracies (in ``user_dict``
        order) and a DataFrame with one row per window and the columns
        ``original`` (true label) and ``prediction``.
    """
    per_user_accuracy = []
    y_orig = []
    y_pred = []
    for user,label in user_dict.items():
        with open(os.path.join(directory,user),'rb') as handle:
            df = pickle.load(handle)
        X = np.concatenate(list(df['data']))
        pred = model.predict(X).argmax(axis=1)
        truth = [label]*df.shape[0]
        per_user_accuracy.append(accuracy_score(truth,pred))
        y_orig.extend(truth)
        y_pred.extend(list(pred))
    return per_user_accuracy,pd.DataFrame({'original':y_orig,'prediction':y_pred})

# Score the loaded model on the test recording of every user it was trained on.
results,df = get_training_data(directory=data_directory,
                               test_length=1,
                               user_dict=user_dict,
                               window_size=window_size,
                               model=model)

# Mean of the per-user accuracies (every user weighs the same).
np.mean(results)

In [None]:
# Distribution of the per-user accuracies returned by the evaluation above.
fig, ax = plt.subplots()
ax.hist(results)
ax.set(title='Per-user accuracy distribution',
       xlabel='Accuracy',
       ylabel='Number of users')
plt.show()

In [None]:
# Accuracy pooled over all windows of all users (users with more windows
# weigh more than in the per-user mean).
accuracy_score(y_true=df['original'],y_pred=df['prediction'])

In [None]:
# Accuracy per true label: the share of each label's windows that were
# predicted correctly. Computed as a group mean of a boolean column, which
# avoids reading the grouping column inside groupby.apply.
(df.assign(accuracy=df['original'].eq(df['prediction']))
   .groupby('original',as_index=False)['accuracy']
   .mean())

In [None]:
import shutil

# base_name is the archive path *without* extension. It must not end with a
# separator: the archiver appends '.zip', so './models/' can yield
# './models/.zip' inside the very folder being archived. './models' always
# produces './models.zip' next to the folder.
shutil.make_archive('./models','zip','./models/')