In [92]:
import os
import pandas as pd
import numpy as np
import datetime
import scipy
from scipy.stats import skew, kurtosis

In [93]:
import tensorflow as tf

In [94]:
# Read Accelerometer Data
acc_data = pd.read_csv('../data/all_accelerometer_data_pids_13.csv')

def get_time_value(x):
  # x is ms. it is divided by 1000 to get microservond
  t = datetime.datetime.fromtimestamp(x/1000.0)
  t = t.replace(microsecond = 0)
  return int(t.timestamp())

acc_data['window10'] = acc_data['time'].apply(get_time_value)
acc_data = acc_data.drop(columns="time")
acc_data = acc_data.rename(columns = {"window10": "time"})

acc_data.head()

Unnamed: 0,pid,x,y,z,time
0,JB3156,0.0,0.0,0.0,0
1,CC6740,0.0,0.0,0.0,0
2,SA0297,0.0758,0.0273,-0.0102,1493733882
3,SA0297,-0.0359,0.0794,0.0037,1493733882
4,SA0297,-0.2427,-0.0861,-0.0163,1493733882


In [95]:
acc_data['pid'].unique()

array(['JB3156', 'CC6740', 'SA0297', 'PC6771', 'BK7610', 'DC6359',
       'MC7070', 'MJ8002', 'BU4707', 'JR8022', 'HV0618', 'SF3079',
       'DK3500'], dtype=object)

In [96]:

# Read clean tac data for pid = BK7610
clean_tac_data = pd.read_csv('../data/clean_tac/BK7610_clean_TAC.csv')
clean_tac_data["tac"] = np.where(clean_tac_data["TAC_Reading"] > 0.08, 1, 0)
clean_tac_data = clean_tac_data.drop(columns="TAC_Reading")
clean_tac_data = clean_tac_data.rename(columns={"tac": "TAC_Reading"})
clean_tac_data.describe()


Unnamed: 0,timestamp,TAC_Reading
count,57.0,57.0
mean,1493758000.0,0.22807
std,28415.95,0.423318
min,1493719000.0,0.0
25%,1493729000.0,0.0
50%,1493756000.0,0.0
75%,1493782000.0,0.0
max,1493808000.0,1.0


In [97]:
# Filtering for specific PID (temps)
acc_data_pid = acc_data[acc_data.pid == "BK7610"]
                        # | (acc_data.pid == "JB3156")]
acc_data_pid['pid'].unique()

array(['BK7610'], dtype=object)

In [98]:
acc_data_pid.describe()

Unnamed: 0,x,y,z,time
count,1225727.0,1225727.0,1225727.0,1225727.0
mean,-0.006497703,0.007507374,0.002747567,1493752000.0
std,0.1380473,0.1387602,0.1279124,9276.766
min,-4.2748,-6.9489,-5.2772,1493736000.0
25%,-0.0094,-0.006,-0.0073,1493744000.0
50%,0.0001,0.0001,0.0057,1493752000.0
75%,0.0083,0.0094,0.0114,1493760000.0
max,6.4503,5.3441,4.6565,1493768000.0


In [99]:
acc_data_pid.shape

(1225727, 5)

In [100]:
# Up sampling tac data to match acc data
clean_ts = clean_tac_data['timestamp'] 
acc_ts = acc_data_pid['time']
all_labels = list()
offset_tac, offset_acc = 0, 0
# print(acc_ts.iloc[0])
# print(clean_ts.loc[0])
# print(clean_tac_data.loc[0]['TAC_Reading'])
# # acc_ts.iloc[0] #1493735870653
while offset_tac < len(clean_ts) and offset_acc < len(acc_ts):
  
  while acc_ts.iloc[offset_acc] < clean_ts.iloc[offset_tac]:
    all_labels.append([clean_tac_data.iloc[offset_tac]['TAC_Reading'], acc_ts.iloc[offset_acc]])
    offset_acc += 1
    if offset_acc >= len(acc_ts):
      break

  offset_tac += 1

all_labels


[[0, 1493735870],
 [0, 1493735870],
 [0, 1493735870],
 [0, 1493735870],
 [0, 1493735870],
 [0, 1493735870],
 [0, 1493735870],
 [0, 1493735870],
 [0, 1493735870],
 [0, 1493735870],
 [0, 1493735870],
 [0, 1493735870],
 [0, 1493735870],
 [0, 1493735870],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735871],
 [0, 1493735872],
 [0, 14937

In [101]:
all_labels_df = pd.DataFrame(all_labels, columns = ["tac", "time"])
all_labels_df.shape, acc_data_pid.shape

# merged = merged.drop_duplicates().reset_index(drop=True)
# merged.to_csv("../data/BK7610_final_final.csv")

((1225727, 2), (1225727, 5))

In [102]:
# merged = acc_data_pid.head(10).merge(all_labels_df.head(10), on = 'time', how='inner')
# merged['time'].value_counts()

In [103]:
# len(all_labels_df['time'].unique()), len(acc_data_pid['time'].unique())

In [104]:
# acc_data_pid['tac_reading'] = 
# TODO: Make sure tac data is sorted on timestamp
clean_tac_data["from"] = clean_tac_data["timestamp"].shift(1, fill_value=-1) + 1


In [105]:
clean_tac_data.index = pd.IntervalIndex.from_arrays(clean_tac_data["from"], clean_tac_data["timestamp"], closed = "both")


In [106]:
acc_data_pid['tac'] = acc_data_pid["time"].apply(lambda x: clean_tac_data.iloc[clean_tac_data.index.get_loc(x)]["TAC_Reading"])
acc_data_pid

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  acc_data_pid['tac'] = acc_data_pid["time"].apply(lambda x: clean_tac_data.iloc[clean_tac_data.index.get_loc(x)]["TAC_Reading"])


Unnamed: 0,pid,x,y,z,time,tac
47136,BK7610,0.1261,-0.0078,-0.0243,1493735870,0
47138,BK7610,0.1336,-0.0697,-0.0446,1493735870,0
47140,BK7610,0.1443,-0.0474,-0.0447,1493735870,0
47142,BK7610,0.1255,-0.0038,0.0111,1493735870,0
47144,BK7610,0.1076,0.0032,0.0276,1493735870,0
...,...,...,...,...,...,...
6071104,BK7610,-0.0784,-0.0161,0.1719,1493767770,1
6071108,BK7610,-0.0395,-0.0816,0.1634,1493767770,1
6071112,BK7610,0.0160,-0.0853,0.0906,1493767770,1
6071117,BK7610,0.0901,-0.0767,0.0162,1493767770,1


In [107]:
# print(acc_data_pid["tac"].unique().sort())
# clean_tac_data
# min: 1,493,718,714
# max: 1,493,807,899
# 
# acc_data_pid
# min: 1,493,735,870
# max: 1,493,767,770
# acc_data_pid["time"].max()


In [108]:
# TODO: Make n = 10 after either removing one record which has 7 records for a second or by adding 3 dummy values to it (latter is better)
# frame_temp.groupby([ "pid", "window10"]).count().describe()
# We are sampling with replacement, which should be okay since it is within a second
acc_data_pid_20s = acc_data_pid.groupby([ "pid", "time"]).sample(n = 20, replace=True)

In [216]:
# TODO: Make this as an assert statement in the begininng for both tac and accelerometer data
acc_data_pid_20s["time"].is_monotonic_increasing

True

In [294]:

acc_data_sliding = acc_data_pid_20s.copy()
cols = ["x", "y", "z"]
window_size = 10 # including current

for col in cols:
    cols_to_append = []
    for i in range(0, window_size):
        shifted_col_name = str(col) + "_" + str(i)
        acc_data_sliding[shifted_col_name] = acc_data_sliding[col].shift(i, fill_value = 0)
        cols_to_append.append(shifted_col_name)
    
    # we have (windpw_size ) columsn for col
    # Uncomment to keep original columns
    acc_data_sliding = acc_data_sliding.drop(columns=[col])    
    acc_data_sliding[str(col)] = acc_data_sliding[cols_to_append].values.tolist()
    
    # acc_data_sliding = acc_data_sliding.drop(columns=cols_to_append)

In [308]:

# # x_sliding_window.shape
pids = ["BK7610"]
final = []
labels = []
for pid in pids:
  # temptemp = acc_data_pid_20s[acc_data_pid_20s['pid'] == pid]
  temptemp = acc_data_sliding[acc_data_sliding['pid'] == pid]
  times = temptemp.time.unique()
  final_temp =[]
  labels_temp = []
  for time in times:
    # x = np.lib.stride_tricks.sliding_window_view(frame_temp2[frame_temp2.pid == pid and frame_temp2.window10 == time], window_shape = 10)
    temptemptemp = temptemp[temptemp['time'] == time]
    # TODO: Create x y z sliding windows
    # x = np.lib.stride_tricks.sliding_window_view(temptemptemp["x"], window_shape = 10)
    # y = np.lib.stride_tricks.sliding_window_view(temptemptemp["y"], window_shape = 10)
    # z = np.lib.stride_tricks.sliding_window_view(temptemptemp["z"], window_shape = 10)
    x_dash = np.array(temptemptemp[["x_"+str(i) for i in range(window_size)]]).flatten()
    y_dash = np.array(temptemptemp[["y_"+str(i) for i in range(window_size)]]).flatten()
    z_dash = np.array(temptemptemp[["z_"+str(i) for i in range(window_size)]]).flatten()
    # a = np.vstack((temptemptemp["x"].apply(lambda x: np.array(x, dtype="float32")), temptemptemp["y"].apply(lambda x: np.array(x, dtype="float32")), temptemptemp["z"].apply(lambda x: np.array(x, dtype="float32"))))
    a = np.vstack((x_dash, y_dash, z_dash))
    final_temp.append(a)
    labels_temp.append(temptemptemp.head(1)["tac"])
  final.append(np.array(final_temp))
  labels.append(np.array(labels_temp))
  # print(final)
  
  # break
# print(np.array(final,dtype=object).shape)


In [310]:

# [-0.1331,  0.026 ,  0.1165, -0.0039, -0.1326, -0.1095,  0.1165,
        #  0.1467, -0.1494, -0.0233]
a.shape

(3, 200)

In [311]:
final_arr = np.asarray(final).astype('float32')
labels_arr = np.asarray(labels).astype('float32')
print(final_arr.shape)
print(labels_arr.shape)

(1, 30735, 3, 200)
(1, 30735, 1)


In [312]:
final_arr_reshape = np.reshape(final_arr, (30735, 3, 200))
labels_arr_reshape = np.reshape(labels_arr, (30735,1))
final_arr_reshape.shape, labels_arr_reshape.shape 

((30735, 3, 200), (30735, 1))

In [354]:
batch_size = 30735
epochs = 100
# prep_data = pd.read_csv('../data/good_again_bhas.csv')
flatten = tf.keras.layers.Flatten()
conv_layer1 = tf.keras.layers.Conv1D(filters = 64, kernel_size = 10,padding='SAME')
conv_layer2 = tf.keras.layers.Conv1D(filters = 64, kernel_size = 10, padding='SAME')
dropout = tf.keras.layers.Dropout(0.5)
max_pooling = tf.keras.layers.MaxPool1D(pool_size=2)
# fc - fully connected layer
fc_layer = tf.keras.layers.Dense(units=128, activation = 'relu')
fc_layer2 = tf.keras.layers.Dense(units=1, activation = 'softmax') 
base_model = tf.keras.Sequential([
                                  conv_layer1, 
                                  conv_layer2, 
                                  dropout, 
                                  max_pooling, 
                                  flatten, 
                                  fc_layer, 
                                  fc_layer2
                                ])

base_model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), \
                   optimizer=tf.keras.optimizers.Adam(0.001), \
                   metrics=[tf.keras.metrics.Accuracy()]) 

In [355]:
# loss_func = tf.keras.losses.BinaryCrossentropy(from_logits=False)
# optimizer = tf.keras.optimizers.Adam(0.001)

# for e in range(0, 5000):
#     with tf.GradientTape() as tape:
#         preds = base_model(final_arr_reshape)
#         print(preds)
        
#         # loss = loss_func(y_true=labels_arr_reshape, y_pred=preds)
#         # acc = np.sum(np.equal(labels, preds)) / 30735
#     #     print(acc)
#     # gradients = tape.gradient(loss, base_model.trainable_variables)
#     # optimizer.apply_gradients(zip(gradients, base_model.trainable_variables))
#     # print(f"Epoch: {e} | LOSS : {loss} | acc {acc}")

# # loss
            
# # total_loss += loss
# # gradients = tape.gradient(loss, model.trainable_variables)
# #     optimizer.apply_gradients(zip(gradients, model.trainable_variables))
# #     return total_loss


In [356]:
base_model.build((30735, 3, 200))

In [357]:
base_model.summary()

Model: "sequential_21"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_42 (Conv1D)          (30735, 3, 64)            38464     
                                                                 
 conv1d_43 (Conv1D)          (30735, 3, 64)            12352     
                                                                 
 dropout_20 (Dropout)        (30735, 3, 64)            0         
                                                                 
 max_pooling1d_21 (MaxPoolin  (30735, 1, 64)           0         
 g1D)                                                            
                                                                 
 flatten_21 (Flatten)        (30735, 64)               0         
                                                                 
 dense_42 (Dense)            (30735, 128)              8320      
                                                     

In [358]:
np.unique(labels_arr, return_counts=True)

(array([0., 1.], dtype=float32), array([11229, 19506]))

In [359]:
print(all_labels_df.shape)
all_labels_df['tac'].value_counts()


(1225727, 2)


tac
1    778034
0    447693
Name: count, dtype: int64

In [360]:
base_model.fit(final_arr_reshape, labels_arr_reshape, epochs = 1,
            #    batch_size = batch_size, 
               verbose=1)



<keras.callbacks.History at 0x357fbd0f0>

In [361]:
loss, accuracy = base_model.evaluate(final_arr_reshape, labels_arr_reshape,
                                    #  batch_size = batch_size, 
                                     verbose=0)

round(loss,4),accuracy

(0.6229, 0.6346510648727417)

In [122]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
clf = MLPClassifier(solver='sgd', random_state=1)

In [123]:
final_arr_mlp = np.reshape(final_arr, (30735, 60))
labels_arr_mlp = np.reshape(labels_arr, (30735,1))

clf.fit(final_arr_mlp, labels_arr_mlp)
# print('Accuracy ', accuracy_score(y_test, clf.predict(x_test)))

  y = column_or_1d(y, warn=True)


In [127]:
print('Accuracy ', round(accuracy_score(labels_arr_mlp, clf.predict(final_arr_mlp)),4))

Accuracy  0.6348
