In [12]:
import tensorflow as tf
import os
import pickle
import pandas as pd
import numpy as np

def get_testing_data(directory,min_length,window_size):
    """Load every pickled testing frame in ``directory`` into flat arrays.

    Parameters
    ----------
    directory : str
        Folder holding one pickle per user. Each pickle must unpickle to an
        object exposing ``.shape`` and a ``'data'`` column of arrays.
    min_length : int
        Files with ``rows // (60 // window_size) < min_length`` are skipped.
    window_size : int
        Window size used to derive the rows-per-unit divisor ``60 // window_size``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X``: the ``'data'`` entries of all kept files concatenated along
        axis 0. ``y``: the file name repeated once per row of its frame.

    Raises
    ------
    ValueError
        If no file in ``directory`` passes the ``min_length`` filter.
    """
    X = []
    y = []
    # Rows per unit of length; presumably windows per minute - TODO confirm.
    n = 60//window_size
    # Sorted so that row positions in X/y are reproducible across runs
    # (os.listdir order is arbitrary).
    for f in sorted(os.listdir(directory)):
        if f[0]=='.':
            continue  # hidden files such as .DS_Store
        # NOTE(review): pickle.load executes arbitrary code; only use on trusted files.
        with open(os.path.join(directory,f),'rb') as handle:
            data = pickle.load(handle)
        if data.shape[0]//n<min_length:
            continue
        X.append(np.concatenate(list(data['data'])))
        # assumes each 'data' entry contributes exactly one row to X so that
        # X and y stay aligned - TODO confirm
        y.extend([f]*data.shape[0])
    if not X:
        raise ValueError('no testing file in '+repr(directory)+' satisfies min_length='+str(min_length))
    return np.concatenate(X),np.array(y)

# --- Experiment configuration ---------------------------------------------
window_size = 20
activity = 'std25'
n_user  = 315
min_length = 100
data_directory = './data/'+str(window_size)+'/'+activity+'/testing/'
model_directory = './models/'+str(window_size)+'/'+activity+'/'+str(n_user)+'/'
# One entry per sub-directory of model_directory.
train_lengths = os.listdir(model_directory)
# NOTE(review): the prediction cell reads the globals X and y, which are only
# defined when the following line is executed. It is left disabled as in the
# original; re-enable it before running predictions on a fresh kernel.
# X,y = get_testing_data(data_directory,min_length,window_size)
save_directory = './predictions/'+str(window_size)+'/'+activity+'/'+str(n_user)+'/'
result_directory = './results/'
# Create both output folders up front: the evaluation cell pickles into
# result_directory, which was previously never created.
os.makedirs(save_directory,exist_ok=True)
os.makedirs(result_directory,exist_ok=True)

In [2]:
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Restrict TensorFlow to a single physical GPU. Index 2 (the third device)
    # is preferred; on machines with fewer GPUs fall back to the first one
    # instead of raising an uncaught IndexError.
    gpu_index = 2 if len(gpus) > 2 else 0
    if gpu_index != 2:
        print("GPU index 2 not available, falling back to GPU 0")
    try:
        tf.config.experimental.set_visible_devices(gpus[gpu_index], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        print(e)

4 Physical GPUs, 1 Logical GPU


In [3]:
from sklearn.metrics import accuracy_score
def get_dictfilename(a):
    """Derive the user-dictionary pickle path from a trained-model path.

    Every occurrence of 'trainedmodel' is replaced by 'userdict', then every
    occurrence of 'hdf5' is replaced by 'p'. The substitutions apply to the
    whole string, not only to the file name.
    """
    for old,new in (('trainedmodel','userdict'),('hdf5','p')):
        a = a.replace(old,new)
    return a

def get_predictions(df):
    """Attach model outputs to one group of rows and return the same frame.

    Reads the module-level globals ``model`` and ``X``: the values of the
    'index' column select rows of ``X``, which are passed to
    ``model.predict``. Two columns are written onto ``df`` in place:
    'y_prob' (one prediction row per entry) and 'y_pred' (argmax along axis 1).
    """
    row_positions = np.array([position for position in df['index']])
    probabilities = model.predict(X[row_positions])
    df['y_prob'] = [row for row in probabilities]
    df['y_pred'] = probabilities.argmax(axis=1)
    return df
    

# For every training-length folder: load each (model, user-dict) pair, predict
# on the whole test set grouped by user, and pickle the stacked predictions.
for f in list(os.listdir(model_directory)):
    # Skip hidden entries and stray files; only sub-directories hold models.
    if f[0]=='.' or not os.path.isdir(model_directory+f):
        continue
    run_directory = model_directory+f+'/'
    # Files whose name ends in '5' are treated as saved models (presumably the
    # *.hdf5 checkpoints - confirm). Sorted so that the 'iteration' number
    # maps to the same model file on every run.
    model_files = [run_directory+name for name in sorted(os.listdir(run_directory)) if name[-1]=='5']
    dict_files = [get_dictfilename(path) for path in model_files]
    pairs = list(zip(model_files,dict_files))
    predictions_all = []
    for i,(m_name,d_name) in enumerate(pairs):
        # NOTE(review): pickle.load executes arbitrary code; trusted files only.
        with open(d_name,'rb') as dict_file:
            user_dict = pickle.load(dict_file)
        # Translate each row's user label into this model's class id.
        y_final = np.array([user_dict[user] for user in y])
        index_df = pd.DataFrame({'user':y,'y':y_final,'index':np.arange(len(y))})
        # get_predictions reads `model` as a global, so the name must stay `model`.
        model = tf.keras.models.load_model(m_name)
        predictions = index_df.groupby('user',as_index=False).apply(get_predictions)
        predictions['iteration'] = i
        predictions_all.append(predictions)
    if not predictions_all:
        # pd.concat([]) would raise; nothing to save for this folder.
        print(f,'skipped: no model files found')
        continue
    predictions_all = pd.concat(predictions_all)
    with open(save_directory+f,'wb') as out_file:
        pickle.dump(predictions_all,out_file)
    print(f,'done')

40 done
10 done
60 done
20 done
50 done
30 done
120 done
90 done
150 done
180 done


In [19]:
from sklearn.metrics import accuracy_score
from scipy.stats import mode

def get_results(df):
    """Score one (user, iteration) group of predictions at several test lengths.

    Reads the module-level globals ``f`` (current prediction file name, parsed
    as the integer train length), ``test_lengths`` and ``n_iter``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with columns 'y', 'y_pred', 'y_prob', 'user' and 'iteration'.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row for test length 0 (plain per-row accuracy in both score
        columns) plus one row per entry of ``test_lengths``. For each test
        length ``t``, ``n_iter`` random subsets of ``t*3`` rows are drawn
        without replacement and classified by majority vote over 'y_pred'
        ('majority_score') and by the argmax of the mean 'y_prob' vector
        ('maxmean_score'). Test lengths needing more rows than ``df`` has get
        NaN scores.
    """
    train_length = np.int64(f)
    user = df['user'].values[0]
    iteration = df['iteration'].values[0]
    # Baseline: accuracy over individual rows, stored in both score columns.
    baseline = accuracy_score(df['y'],df['y_pred'])
    rows = [[0,baseline,baseline,train_length,user,iteration]]
    for t in test_lengths:
        # 3 rows per unit of test length; presumably 60 // base_window_size - TODO confirm.
        n = t*3
        if n>df.shape[0]:
            # Not enough rows to draw a sample: record NaN rather than score empty lists.
            rows.append([t,np.nan,np.nan,train_length,user,iteration])
            continue
        y_true = []
        y_pred = []
        y_pred_maxmean = []
        for _ in range(n_iter):
            temp_df = df.sample(n,replace=False)
            y_true.append(temp_df['y'].values[0])
            # Series.mode() is sorted, so .iloc[0] is the smallest most-frequent
            # label, matching scipy.stats.mode without depending on its
            # version-specific return shape.
            y_pred.append(temp_df['y_pred'].mode().iloc[0])
            # Flatten each probability vector and stack to (n, classes) without
            # rewriting the caller's 'y_prob' column.
            stacked = np.stack([np.ravel(p) for p in temp_df['y_prob']])
            y_pred_maxmean.append(stacked.mean(axis=0).argmax())
        rows.append([t,accuracy_score(y_true,y_pred),accuracy_score(y_true,y_pred_maxmean),train_length,user,iteration])
    return pd.DataFrame(rows,columns=['test_length','majority_score',
                                      'maxmean_score','train_length',
                                      'user','iteration'])


import sys
from joblib import Parallel,delayed
# Test lengths to evaluate; the 'stationery' activity gets a longer grid.
if activity=='stationery':
    test_lengths = list(np.arange(1,10,1))+list(np.arange(10,120,10))+list(np.arange(120,240,20))+list(np.arange(240,480,40))
else:
    test_lengths = list(np.arange(1,10,1))+list(np.arange(10,60,5))
n_iter = 100
base_window_size = 20
final_results = []
# Train lengths already processed; entries listed here are skipped below.
train_lengths = []

# if activity+'.p' in os.listdir(result_directory):
#     df = pickle.load(open(result_directory+activity+'.p','rb'))
#     final_results.append(df)
#     train_lengths = [str(a) for a in df['train_length'].unique()]

# The result pickle is written inside the loop, so its folder must exist.
os.makedirs(result_directory,exist_ok=True)
# The loop variable must be named `f`: get_results reads it as a global.
for f in os.listdir(save_directory):
    # Hidden entries (e.g. .ipynb_checkpoints) are not prediction files and
    # would break np.int64(f) inside get_results.
    if f[0]=='.' or f in train_lengths:
        continue
    print(f)
    # NOTE(review): pickle.load executes arbitrary code; trusted files only.
    with open(save_directory+f,'rb') as prediction_file:
        dd = pickle.load(prediction_file)
    all_dfs = list(dd.groupby(['user','iteration'],as_index=False))
    all_results = Parallel(n_jobs=30,verbose=2)(delayed(get_results)(group) for _,group in all_dfs)
    results = pd.concat(all_results)
    final_results.append(results)
    # Checkpoint everything computed so far after each train length.
    with open(result_directory+activity+'.p','wb') as result_file:
        pickle.dump(pd.concat(final_results),result_file)
    print(f,'done')

# Reload the checkpointed per-user results (trusted file: pickle executes code).
with open(result_directory+activity+'.p','rb') as result_file:
    results = pickle.load(result_file)

# Average over users within each iteration, then over iterations.
# numeric_only=True keeps the string 'user' column out of the mean, which
# raises TypeError on pandas >= 2.0 otherwise.
final_results = (
    results
    .groupby(['test_length','iteration','train_length'],as_index=False).mean(numeric_only=True)
    .groupby(['test_length','train_length'],as_index=False).mean(numeric_only=True)
)

import seaborn as sns

import matplotlib.pyplot as plt

# One large line plot per score column: accuracy against test length, with one
# line per train length.
for score_column in ('majority_score','maxmean_score'):
    plt.rcParams.update({'font.size':40})
    plt.figure(figsize=(30,20))
    sns.lineplot(data=final_results,x='test_length',y=score_column,hue='train_length')
    plt.ylim([0,1])
    plt.show()

def save_data_final():
    """Write the aggregated scores as test_length x train_length CSV tables.

    Reads the module-level globals ``final_results`` and ``activity``. Creates
    './final_results/' if it is missing, then writes one CSV for the maxmean
    score and one for the majority score. The file prefix is the activity
    name, with '20' appended when the activity is exactly 'std'.
    """
    final_result_directory = './final_results/'
    if not os.path.isdir(final_result_directory):
        os.makedirs(final_result_directory)
    activity1 = activity+'20' if activity=='std' else activity
    outputs = (('maxmean_score','_maxmean.csv'),('majority_score','_majority.csv'))
    for score_column,suffix in outputs:
        table = final_results.pivot_table(index='test_length',columns='train_length',values=score_column,aggfunc='mean')
        table.to_csv(final_result_directory+activity1+suffix)

save_data_final()