# 0. Imports and initializations

In [1]:
# for data manipulation
import numpy as np
import pandas as pd

# for data normalization and splitting
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# for models
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import losses
from tensorflow.keras import metrics
from tensorflow.keras import callbacks
from tensorflow.keras import preprocessing
from keras.utils.vis_utils import plot_model
import keras_tuner

# for plots
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# for file management
from os.path import exists
from os import remove

# for sequences
# WINDOW_SIZE = 30  # 10 seconds sequences in 5 minutes
# WINDOW_SHIFT = 6  # 10 seconds sequences in 1 minute

# for models fit
EPOCHS = 25
BATCH_SIZE = 32

plt.rcParams['figure.figsize'] = plt.rcParamsDefault['figure.figsize']

# 1. Anomaly Detection Task

## 1.1 Utility functions

## 1.2 Il task in se

Given data from wearable, the goal is to identify anomalous events, i.e. tremors, in a set of observations. Data are collected by patients with and without Parkinson’s Desease. Features include:
- Identification of patient
- Accelerometer readings in the three axes (x, y, z)
- Heart Rate
- Date and timestamp

The training set is composed by control patient, i.e. volunteers without Parkinson’s Desease. Each 1 seconds there is a record and there are missing value on heart rate attribute (labeled with -1). The test set is composed by patient with Parkinson’s Desease, each 10 seconds there is a record.

### 1.2.1 Loading the dataset

In [2]:
data = pd.read_csv('dataset_task_2/train.csv')

data

Unnamed: 0,patient,x,y,z,heartRate,timestamp,tsDate
0,1502,23,569,878,-1,1568073600000,2019-09-10 00:00:00.003
1,1502,23,571,878,-1,1568073601000,2019-09-10 00:00:01.014
2,1502,23,570,878,-1,1568073602000,2019-09-10 00:00:02.025
3,1502,23,570,878,-1,1568073603000,2019-09-10 00:00:03.035
4,1502,23,570,878,-1,1568073604000,2019-09-10 00:00:04.046
...,...,...,...,...,...,...,...
943517,4506,-636,-399,-654,57,1572479994000,2019-10-30 23:59:54.315
943518,4506,-639,-396,-654,57,1572479995000,2019-10-30 23:59:55.316
943519,4506,-638,-396,-655,57,1572479996000,2019-10-30 23:59:56.336
943520,4506,-637,-396,-655,58,1572479997000,2019-10-30 23:59:57.337


### 1.2.2 Dropping unuseful columns

In [3]:
data_after_drop = data.drop(labels=['timestamp', 'tsDate'], axis=1)

data_after_drop.head()

Unnamed: 0,patient,x,y,z,heartRate
0,1502,23,569,878,-1
1,1502,23,571,878,-1
2,1502,23,570,878,-1
3,1502,23,570,878,-1
4,1502,23,570,878,-1


### 1.2.3 Change granularity

We are going to have ten 1 second sequences in one single 10 seconds sequence.

In [4]:
data_after_shirnk = pd.DataFrame(columns=['patient', 'x', 'y', 'z', 'heartRate'])

for patient in data_after_drop.patient.unique():
    rows = data_after_drop.loc[data_after_drop.patient == patient]
    for i in range(10, len(rows), 10):
        elements = rows[i - 10:i]
        temp = pd.DataFrame(
            data=[[patient, elements.x.mean(), elements.y.mean(), elements.z.mean(), elements.heartRate.mean()]],
            columns=['patient', 'x', 'y', 'z', 'heartRate']
        )
        data_after_shirnk = data_after_shirnk.append(temp, ignore_index=True)
        
data_after_shirnk

Unnamed: 0,patient,x,y,z,heartRate
0,1502,23.0,569.9,878.4,-1.0
1,1502,23.3,570.1,878.0,-1.0
2,1502,23.4,570.3,878.0,-1.0
3,1502,23.1,570.2,878.3,-1.0
4,1502,23.2,570.1,878.0,-1.0
...,...,...,...,...,...
94341,4506,-635.3,-395.5,-657.0,57.0
94342,4506,-639.7,-395.1,-653.3,56.1
94343,4506,-641.2,-393.7,-653.3,56.0
94344,4506,-640.2,-394.0,-653.6,56.2


### 1.2.4 Pad sequences

We add a padding in order to have seequences of same length.

In [5]:
maxlen = 0
for patient in data_after_shirnk.patient.unique():
    rows = data_after_shirnk.loc[data_after_shirnk.patient == patient]
    if len(rows) > maxlen:
        maxlen = len(rows)
    
data_after_pad = pd.DataFrame(columns=['patient', 'x', 'y', 'z', 'heartRate'])
for patient in data_after_shirnk.patient.unique():
    rows = data_after_shirnk.loc[data_after_shirnk.patient == patient]
    rows = preprocessing.sequence.pad_sequences([rows.to_numpy()], maxlen=maxlen, padding='post')
    temp = pd.DataFrame(
        data=rows[0],
        columns=['patient', 'x', 'y', 'z', 'heartRate']
    )
    data_after_pad = data_after_pad.append(temp, ignore_index=True)
    
data_after_pad

Unnamed: 0,patient,x,y,z,heartRate
0,1502,23,569,878,-1
1,1502,23,570,878,-1
2,1502,23,570,878,-1
3,1502,23,570,878,-1
4,1502,23,570,878,-1
...,...,...,...,...,...
111743,0,0,0,0,0
111744,0,0,0,0,0
111745,0,0,0,0,0
111746,0,0,0,0,0
