In [69]:
import os
import pickle
import warnings
from collections import Counter
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import tensorflow as tf
from dateutil import parser
from joblib import Parallel, delayed
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import (
    Activation,
    BatchNormalization,
    Conv1D,
    Dense,
    Dropout,
    Flatten,
    GRU,
    Input,
    LSTM,
    MaxPooling1D,
)
from tensorflow.keras.models import Model, Sequential
warnings.filterwarnings('ignore')

def select_categories(data):
    """Return the activity categories to keep, in order of first appearance.

    Every distinct value of ``data.category`` is returned. A stricter rule
    (keep only categories observed for every user) exists in the history of
    this function but is currently disabled, so no filtering happens here.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``category`` column.

    Returns
    -------
    list
        The unique category values as a plain Python list.
    """
    return list(data.category.unique())

def remove_nonusers(df, min_days=None):
    """Drop a user's rows when they span too few distinct days.

    Intended for ``groupby('user').apply(...)``: each call receives the rows
    of a single user.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of one user; must have a ``day`` column.
    min_days : int, optional
        Minimum number of distinct ``day`` values required to keep the user.
        When omitted, the module-level ``minimum_day_per_user`` is read at
        call time, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when the user has at least ``min_days`` distinct days,
        otherwise an empty frame with the same columns.
    """
    if min_days is None:
        # Looked up lazily so the function can be defined before the constant.
        min_days = minimum_day_per_user
    distinct_days = df.day.unique().shape[0]
    if distinct_days < min_days:
        return pd.DataFrame([], columns=df.columns)
    return df

# --- Preprocessing configuration -------------------------------------------
unknown_category = 'Unknown'          # filler label for minutes not covered by any event
start_hour = 6                        # first hour of the per-day window
end_hour  = 24                        # hour at which the per-day window ends
step = 1                              # not referenced by the visible cells
one_minute = 60                       # used both as seconds-per-minute and minutes-per-hour
day_minimum_threshold = 100           # minimum summed `duration` for a day to be kept
maximum_unknown_percentage = .3       # max fraction of the window allowed to be uncovered
minimum_day_per_user = 30             # minimum distinct days for a user to be kept

# --- Load raw activity data -------------------------------------------------
# SECURITY NOTE: unpickling executes arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('../data/activity_mperf_smartphone.p', 'rb') as pickle_file:
    data = pickle.load(pickle_file)
print(data.start_hour.unique())

# --- Fit the category encoder and filter users ------------------------------
selected_categories = select_categories(data)
# The encoder is fitted on the unknown label plus every selected category so
# that gap minutes can be encoded as well.
onehotencoder  = OneHotEncoder().fit(np.array([unknown_category]+selected_categories).reshape(-1,1))
category_dict = {a:i for i,a in enumerate(onehotencoder.categories_[0])}
data = data[data.category.isin(selected_categories)]
# Users with fewer than `minimum_day_per_user` distinct days are removed.
data = data.groupby('user',as_index=False).apply(remove_nonusers)

def parse_day_data(df):
    """Encode one (user, day) group of activity events as a per-minute one-hot array.

    The day window runs from ``start_hour`` to ``end_hour``. Each minute of the
    window is labelled with the category of the event covering it, or with
    ``unknown_category`` when no event covers it, and the label sequence is
    one-hot encoded with the module-level ``onehotencoder``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single user and day. Columns read here: ``user``, ``day``,
        ``duration``, ``start_hour``, ``start``, ``end`` and ``category``.
        ``duration`` is compared against minute counts, so it is presumably
        expressed in minutes (TODO confirm against the data source).

    Returns
    -------
    pandas.DataFrame
        A single row with columns ``user``, ``day`` and ``data``, where
        ``data`` has shape ``(1, n_minutes, len(category_dict))``. An empty
        frame with the same columns is returned when the day is rejected:
        the summed duration is below either threshold, or the generated
        timeline does not have exactly ``(end_hour - start_hour) * one_minute``
        entries.
    """
    # This function is dispatched through joblib.Parallel; the filter is
    # installed again here, presumably because worker processes do not inherit
    # the notebook's warning configuration.
    import warnings
    warnings.filterwarnings('ignore')

    output_columns = ['user', 'day', 'data']
    window_minutes = (end_hour-start_hour)*one_minute

    total_duration = df['duration'].sum()
    if total_duration < day_minimum_threshold or total_duration < window_minutes*(1-maximum_unknown_percentage):
        return pd.DataFrame([], columns=output_columns)

    user = df.user.values[0]
    day = df.day.values[0]

    # Keep only events that start inside the window. The upper bound is
    # exclusive: an event starting in hour `end_hour` begins at or after the
    # window end and would overflow the timeline. `.copy()` makes the
    # column assignments below operate on an independent frame rather than on
    # a slice of the caller's data.
    df = df[(df.start_hour>=start_hour) & (df.start_hour<end_hour)].copy()

    midnight = pd.to_datetime(parser.parse(day))
    start_time = midnight+timedelta(hours=start_hour)
    end_time = midnight+timedelta(hours=end_hour)

    def _truncate_to_minute(value):
        # Round-trip through a minute-resolution string to drop seconds.
        return parser.parse(pd.to_datetime(value).strftime("%m/%d/%Y, %H:%M"))

    df['start'] = df['start'].apply(_truncate_to_minute)
    df['end'] = df['end'].apply(_truncate_to_minute)
    df = df.sort_values('start').reset_index(drop=True)

    input_data = []
    initial = start_time  # cursor: first minute not yet labelled
    for _, row in df.iterrows():
        if row['start']!=initial:
            # Gap before this event -> fill with the unknown label.
            # NOTE(review): when an event starts before the cursor (overlap),
            # n is negative: nothing is appended but the cursor moves back, so
            # the timeline ends up too long and the day is rejected by the
            # length check below.
            n = (row['start']-initial).total_seconds()//one_minute
            input_data.extend([unknown_category]*int(n))
            initial += timedelta(minutes=n)
        n = (row['end']-initial).total_seconds()//one_minute
        input_data.extend([row['category']]*int(n))
        initial += timedelta(minutes=n)
    # Pad the remainder of the window after the last event.
    n = (end_time-initial).total_seconds()//one_minute
    input_data.extend([unknown_category]*int(n))

    input_data_final = onehotencoder.transform(np.array(input_data).reshape(-1,1)).toarray().reshape(1,-1,len(category_dict))
    # Guard: discard days whose timeline does not match the window length.
    if input_data_final.shape[1] != window_minutes:
        return pd.DataFrame([], columns=output_columns)
    return pd.DataFrame([[user,day,input_data_final]],columns=output_columns)

# Encode every (user, day) group in parallel; rejected days come back as
# empty frames and vanish in the concatenation.
encoded_days = Parallel(n_jobs=-1, verbose=2)(
    delayed(parse_day_data)(day_df)
    for _, day_df in data.groupby(['user', 'day'], as_index=False)
)
data_final = pd.concat(encoded_days)
# Re-apply the per-user day-count filter, since some days were just rejected.
data_final = data_final.groupby('user', as_index=False).apply(remove_nonusers)

[ 9 12 13 14 15  7  8 10 11 16 17  6 18 20 19  0 21 22 23  1  5  2  3  4]


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 586 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 1152 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 1882 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 2772 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 3826 tasks      | elapsed:   14.0s
[Parallel(n_jobs=-1)]: Done 5040 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done 6418 tasks      | elapsed:   23.5s
[Parallel(n_jobs=-1)]: Done 7956 tasks      | elapsed:   29.0s
[Parallel(n_jobs=-1)]: Done 9658 tasks      | elapsed:   35.3s
[Parallel(n_jobs=-1)]: Done 11520 tasks      | elapsed:   42.1s
[Parallel(n_jobs=-1)]: Done 13546 tasks      | elapsed:   49.4s
[Parallel(n_jobs=-1)]: Done 15732 tasks      | elapsed:   57.6s
[Parallel(n_jobs=-1)]: Done 18082 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 20592 tasks

In [70]:
def get_model(input_shape=(500,3),n_classes=1):
    """Build and compile the 1-D convolutional classifier.

    Architecture: four Conv1D layers (128 filters, kernel size 2, ReLU,
    'same' padding, 'normal' initializer), each followed by max pooling with
    pool sizes 2, 2, 4 and 4; then Dropout(0.4), Flatten, a linear
    Dense(n_classes) and a softmax Dense(n_classes).

    The model is compiled with SGD, sparse categorical cross-entropy and the
    'acc' metric.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample, passed to the first convolution.
    n_classes : int
        Width of both dense layers.

    Returns
    -------
    tensorflow.keras.models.Sequential
        The compiled model.
    """
    pool_sizes = (2, 2, 4, 4)

    model = Sequential()
    for position, pool_size in enumerate(pool_sizes):
        # Only the first layer declares the input shape.
        extra = {'input_shape': input_shape} if position == 0 else {}
        model.add(Conv1D(128, 2, activation='relu', kernel_initializer='normal',
                         padding='same', **extra))
        model.add(MaxPooling1D(pool_size))

    # Disabled experiments: a tanh Activation here, an LSTM(200) head after
    # the dropout, and a Dense(350, name='feature') layer before the output.
    model.add(Dropout(.4))
    model.add(Flatten())
    model.add(Dense(n_classes))
    model.add(Dense(n_classes, activation='softmax'))

    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        optimizer='sgd',
        metrics=['acc'],
    )
    return model
# Stack the per-day arrays (each with a leading axis of 1) into one tensor.
X = np.concatenate(data_final['data'].tolist())
# Map every user id to a consecutive integer label.
users = data_final['user'].unique()
user_dict = dict(zip(users, range(len(users))))
y = np.array([user_dict[user_id] for user_id in data_final['user'].values])
y_days = data_final['day'].values
# Shape of a single sample, used to build the model.
input_shape = X[0].shape

In [71]:
# Display the distinct integer user labels present in y.
np.unique(y)

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [72]:

# Fixed seed so the train/validation/test partition is reproducible across
# kernel restarts; without it every run evaluates on a different test set.
split_seed = 42

# 80/20 split into train+val and test, stratified by user label.
X_train,X_test,y_train,y_test,days_train,days_test = train_test_split(
    X,y,y_days,test_size=.2,stratify=y,random_state=split_seed)
# 80/20 split of the remainder into train and validation, again stratified.
X_train,X_val,y_train,y_val,days_train,days_val = train_test_split(
    X_train,y_train,days_train,test_size=.2,stratify=y_train,random_state=split_seed)
model  = get_model(input_shape=input_shape,n_classes=len(np.unique(y)))
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_40 (Conv1D)           (None, 1080, 128)         1920      
_________________________________________________________________
max_pooling1d_40 (MaxPooling (None, 540, 128)          0         
_________________________________________________________________
conv1d_41 (Conv1D)           (None, 540, 128)          32896     
_________________________________________________________________
max_pooling1d_41 (MaxPooling (None, 270, 128)          0         
_________________________________________________________________
conv1d_42 (Conv1D)           (None, 270, 128)          32896     
_________________________________________________________________
max_pooling1d_42 (MaxPooling (None, 67, 128)           0         
_________________________________________________________________
conv1d_43 (Conv1D)           (None, 67, 128)         

In [73]:
filepath = './models/first_model.h5'
# The checkpoint callback writes to this path; make sure the target directory
# exists so the first save does not fail on a fresh checkout.
os.makedirs(os.path.dirname(filepath), exist_ok=True)

# Keep only the model with the best validation accuracy seen so far.
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max',save_weights_only=False)
# Stop when validation accuracy has not improved for 40 consecutive epochs.
es = EarlyStopping(monitor='val_acc', mode='max', verbose=0,patience=40)
callbacks_list = [es,checkpoint]
history = model.fit(X_train,y_train,validation_data=(X_val,y_val), epochs=200, batch_size=25,verbose=1,callbacks=callbacks_list,shuffle=True)

Epoch 1/200

Epoch 00001: val_acc improved from -inf to 0.00520, saving model to ./models/first_model.h5
Epoch 2/200

Epoch 00002: val_acc improved from 0.00520 to 0.00669, saving model to ./models/first_model.h5
Epoch 3/200

Epoch 00003: val_acc did not improve from 0.00669
Epoch 4/200

Epoch 00004: val_acc did not improve from 0.00669
Epoch 5/200

Epoch 00005: val_acc did not improve from 0.00669
Epoch 6/200

Epoch 00006: val_acc did not improve from 0.00669
Epoch 7/200

Epoch 00007: val_acc did not improve from 0.00669
Epoch 8/200

Epoch 00008: val_acc improved from 0.00669 to 0.00743, saving model to ./models/first_model.h5
Epoch 9/200

Epoch 00009: val_acc improved from 0.00743 to 0.01189, saving model to ./models/first_model.h5
Epoch 10/200

Epoch 00010: val_acc improved from 0.01189 to 0.01263, saving model to ./models/first_model.h5
Epoch 11/200

Epoch 00011: val_acc improved from 0.01263 to 0.01524, saving model to ./models/first_model.h5
Epoch 12/200

Epoch 00012: val_acc imp

In [74]:
# Restore the best checkpoint written during training before scoring, so the
# reported number reflects the best validation epoch, not the last one.
model.load_weights(filepath)
# The predicted class is the index of the highest softmax output.
y_pred = model.predict(X_test).argmax(axis=1)
accuracy_score(y_test, y_pred)

0.20243757431629014

In [34]:
# NOTE(review): leftover scratch cell. `start_time` is only assigned inside
# parse_day_data, so this raises NameError at notebook scope (see the recorded
# output below); its execution count is also out of sequence. Candidate for removal.
(-start_time).total_seconds()/60

NameError: name 'start_time' is not defined

In [None]:
# NOTE(review): unexecuted scratch cell. No notebook-level `df` is assigned in
# the visible cells (it only appears as a function parameter / generator
# variable), so this presumably fails on a fresh kernel. Candidate for removal.
pd.to_datetime(df['start'].values[0])

In [62]:
# Number of distinct user labels, i.e. the number of output classes.
np.unique(y).size

193