In [None]:
%config IPCompleter.greedy=True

%matplotlib inline
%load_ext autoreload
%autoreload 2

import os
import argparse
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms, utils
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np

from torch.autograd import Variable

import seaborn as sns

import h5py
#import pywt

%pip install pingouin

import pingouin as pg
%pip install matplotlib -U
%pip install Pillow -U

import scipy

In [None]:
# Show the kernel's working directory; the dataset paths used below are relative to it.
print(os.getcwd())

In [None]:
# Root folder that holds one sub-folder of CSV exports per behaviour class.
all_data = './datasets/all_data'

# Path prefix of each class's CSV files; the experiment number and '.csv'
# are appended when the files are read.
paths = [
    f'{all_data}/{folder}/csv/{stem}_25Hz_'
    for folder, stem in (('feeding', 'Feeding'),
                         ('swimming', 'Swimming'),
                         ('resting', 'Resting'),
                         ('ndm', 'NDM'))
]

In [None]:
# Load datasets into pandas: one DataFrame per experiment (1..7), each built by
# stacking that experiment's four per-class CSV files and indexed by Date_Time.
# NOTE(review): `infer_datetime_format` is deprecated in pandas 2.x - confirm the
# pandas version in use before dropping it.
dfs = []

for exp_no in range(1, 8):
    class_frames = [
        pd.read_csv(f'{path}{exp_no}.csv',
                    index_col=['Date_Time'],
                    parse_dates=['Date_Time'],
                    infer_datetime_format=True)
        for path in paths
    ]

    # Keep columns 1..8 by position after stacking the class files.
    exp_df = pd.concat(class_frames, ignore_index=False, sort=False).iloc[:, 1:9]

    # Use the short class name everywhere.
    dfs.append(exp_df.replace(to_replace={"Non directed motion": "NDM"}))

In [None]:
STAT_KEYS = ['Max', 'Min', 'Mean', 'Std']
static_axes = ['X_static', 'Y_static', 'Z_static']
dynamic_axes = ['X_dynamic', 'Y_dynamic', 'Z_dynamic']


def _per_axis_stats(axes_df):
    """Stack the per-column max, min, mean and std of `axes_df` under STAT_KEYS."""
    return pd.concat([axes_df.max(), axes_df.min(), axes_df.mean(), axes_df.std()],
                     keys=STAT_KEYS)


def _stat_vector_norms(axes_df):
    """Vector norm (np.linalg.norm) of each per-column statistic vector, under STAT_KEYS."""
    reductions = (axes_df.max(), axes_df.min(), axes_df.mean(), axes_df.std())
    return pd.concat([pd.Series(np.linalg.norm(vec)) for vec in reductions],
                     keys=STAT_KEYS)


# One column per experiment ('Exp 1' ... ) in each summary table.
static_stats = pd.DataFrame()
dynamic_stats = pd.DataFrame()

static_norms = pd.DataFrame()
dynamic_norms = pd.DataFrame()

for exp_no, df in enumerate(dfs, start=1):
    column = 'Exp ' + str(exp_no)

    static_stats[column] = _per_axis_stats(df[static_axes])
    dynamic_stats[column] = _per_axis_stats(df[dynamic_axes])

    static_norms[column] = _stat_vector_norms(df[static_axes])
    dynamic_norms[column] = _stat_vector_norms(df[dynamic_axes])

    
    

In [None]:
# Per-experiment max / min / mean / std of the three static-acceleration columns.
static_stats

In [None]:
# Per-experiment max / min / mean / std of the three dynamic-acceleration columns.
dynamic_stats

In [None]:
# Per-experiment norm of each static statistic vector (max, min, mean, std across X/Y/Z).
static_norms

In [None]:
# Per-experiment norm of each dynamic statistic vector (max, min, mean, std across X/Y/Z).
dynamic_norms

## Density Estimation for ODBA Distributions

In [None]:
def kde_plot(data_df, feature='ODBA', log_scale=False):
    """Overlay one kernel-density curve of `feature` per class in `data_df['Label']`.

    Args:
        data_df: DataFrame with a 'Label' column and a `feature` column.
        feature: name of the column whose distribution is estimated.
        log_scale: forwarded unchanged to `sns.kdeplot`.

    Returns:
        None. The figure is shown and then closed. Nothing is drawn when
        `data_df` contains no labels.
    """
    label_list = data_df['Label'].unique().tolist()
    if not label_list:
        # No classes, so no curves: avoid touching an axes that was never created.
        return

    # Draw on an explicit axes so every curve lands on the same figure.
    fig, ax = plt.subplots()

    for label in label_list:
        class_data = data_df.loc[data_df['Label'] == label, feature]

        # Label each curve as it is drawn; a positional legend would mislabel
        # the curves if seaborn skips a class (e.g. zero-variance data).
        sns.kdeplot(data=class_data, ax=ax, log_scale=log_scale, label=str(label))

    ax.legend()

    plt.show()
    # Close this figure instead of plt.clf(), which would create (and leave
    # behind) a new empty figure once show() has released the current one.
    plt.close(fig)

In [None]:
# Concatenate the experiments once; rebuilding the combined frame for every
# plot repeats the same copy six times.
all_experiments = pd.concat(dfs)

# Static components on a linear axis.
for axis_feature in ('X_static', 'Y_static', 'Z_static'):
    kde_plot(all_experiments, feature=axis_feature)

# Dynamic components on a log-scaled axis.
for axis_feature in ('X_dynamic', 'Y_dynamic', 'Z_dynamic'):
    kde_plot(all_experiments, feature=axis_feature, log_scale=True)

In [None]:
# Run scipy's normaltest on the X_dynamic values of the 'NDM' class,
# one experiment at a time. The commented lines are alternative checks
# (histograms, Q-Q plots, pingouin tests) kept for reference.
for exp_no, df in enumerate(dfs, start=1):
    print("Normality test for Exp " + str(exp_no))
    is_ndm = df['Label'] == 'NDM'
    f_data = df.loc[is_ndm, 'X_dynamic']
#     f_data = df[['ODBA']]
#     log_f = np.log10(f_data)

#     plt.hist(f_data, bins='auto')
#     plt.show()

#     plt.hist(log_f, bins='auto')
#     plt.show()

#     sm.qqplot(f_data, line ='r')
#     plt.show()

#     sm.qqplot(log_f, line ='r')
#     plt.show()

    test_result = scipy.stats.normaltest(f_data)
    print(test_result)
#     print(scipy.stats.normaltest(log_f))

#     print(pg.normality(f_data, alpha=0.05))
#     print(pg.normality(log_f, alpha=0.05))

#     rand_idx = np.random.randint(0,len(df)-5000)
#     print(pg.normality(log_f[rand_idx:rand_idx+5000]), alpha=0.05)

#     print(pg.multivariate_normality(df[['X_dynamic',
#                                         'Y_dynamic',
#                                         'Z_dynamic']][rand_idx:rand_idx+5000], alpha=0.05))

## Normality Tests

In [None]:
# Synthetic reference data: n correlated 2-D Gaussian draws, i.e. samples that
# are normal by construction.
mean = [4, 5]
cov = [(0.6, 0.4), (0.4, 0.6)]
n = 5000

samples = np.random.multivariate_normal(mean, cov, n)
x, y = samples[:, 0], samples[:, 1]

In [None]:
# Univariate normality of each marginal, then joint normality of the pair.
for marginal in (x, y):
    print(pg.normality(marginal))

paired = np.column_stack((x, y))
print(pg.multivariate_normality(paired))

In [None]:
# Per-variable histograms, kept commented out for reference.
# plt.hist(x, bins='auto')
# plt.show()
# plt.hist(y, bins='auto')
# plt.show()

# 2-D histogram of the paired (x, y) samples on a 69 x 69 bin grid.
plt.hist2d(x, y, bins=(69, 69), cmap='gray')
plt.show()

In [None]:
%pip install statsmodels
import statsmodels.api as sm

In [None]:
# Q-Q plot of y against a normal distribution. fit=True standardises the sample
# with the fitted loc/scale first; without it the raw values (mean 5 here) are
# compared to standard-normal quantiles and the 45-degree reference line is not
# a meaningful target.
sm.qqplot(y, line='45', fit=True)
plt.show()

## Data split

In [None]:
# Split by experiment (1-based numbering):
#   Train: 1, 2, 3, 4, 7
#   Val:   6
#   Test:  5
train_positions = (0, 1, 2, 3, 6)  # zero-based positions in dfs
train_df = pd.concat([dfs[pos] for pos in train_positions])
val_df, test_df = dfs[5], dfs[4]

In [None]:
# (rows, columns) of the train, validation and test frames, in that order.
for split_df in (train_df, val_df, test_df):
    print(split_df.shape)

In [None]:
# The six accelerometer channels used as model inputs.
features = ['X_static', 'Y_static', 'Z_static', 'X_dynamic', 'Y_dynamic', 'Z_dynamic']

model_columns = features + ['Label']

# Take explicit copies: these frames are modified in later cells, and writing
# into a bare column selection of train_df / val_df / test_df raises
# SettingWithCopyWarning.
train_data = train_df[model_columns].copy()
val_data = val_df[model_columns].copy()
test_data = test_df[model_columns].copy()

In [None]:
# True means at least one NaN among the feature columns.
# First per experiment, then per split (train, val, test).
print([exp_df[features].isna().values.any() for exp_df in dfs])

for split in (train_data, val_data, test_data):
    print(split[features].isna().values.any())

In [None]:
# Standardise every feature with the TRAINING split's mean and population
# standard deviation (ddof=0, which is what np.std uses), and apply those same
# statistics to the validation and test splits.
train_mean = train_data[features].mean()
train_std = train_data[features].std(ddof=0)

# Vectorised column arithmetic replaces the per-element Python lambda, and
# .assign() returns new frames instead of writing into column selections.
train_data, val_data, test_data = (
    split.assign(**{column: (split[column] - train_mean[column]) / train_std[column]
                    for column in features})
    for split in (train_data, val_data, test_data)
)

In [None]:
# Per-feature mean and population std of each split after standardisation
# (train should be ~0 and ~1). The pandas reductions are called directly:
# np.mean / np.std on a DataFrame forward axis=None to pandas, and how pandas
# interprets that has changed between versions.
for split in (train_data, val_data, test_data):
    print(split[features].mean())
    print(split[features].std(ddof=0))

In [None]:
# Class-wise densities of the training split: static axes on a linear scale,
# dynamic axes on a log scale.
# NOTE(review): these columns were mean-centred above, so they contain
# non-positive values - confirm log_scale=True is still meaningful here.
for axis_feature in ('X_static', 'Y_static', 'Z_static'):
    kde_plot(train_data, feature=axis_feature)

for axis_feature in ('X_dynamic', 'Y_dynamic', 'Z_dynamic'):
    kde_plot(train_data, feature=axis_feature, log_scale=True)

In [None]:
# Class-wise densities of the validation split: static axes on a linear scale,
# dynamic axes on a log scale.
# NOTE(review): these columns were shifted by the training mean above, so they
# can contain non-positive values - confirm log_scale=True is still meaningful.
for axis_feature in ('X_static', 'Y_static', 'Z_static'):
    kde_plot(val_data, feature=axis_feature)

for axis_feature in ('X_dynamic', 'Y_dynamic', 'Z_dynamic'):
    kde_plot(val_data, feature=axis_feature, log_scale=True)

In [None]:
# Class-wise densities of the test split: static axes on a linear scale,
# dynamic axes on a log scale.
# NOTE(review): these columns were shifted by the training mean above, so they
# can contain non-positive values - confirm log_scale=True is still meaningful.
for axis_feature in ('X_static', 'Y_static', 'Z_static'):
    kde_plot(test_data, feature=axis_feature)

for axis_feature in ('X_dynamic', 'Y_dynamic', 'Z_dynamic'):
    kde_plot(test_data, feature=axis_feature, log_scale=True)

## Group contiguous time intervals

In [None]:
def group_times(df, max_gap='1s'):
    """Number the contiguous time runs of `df` in a new 'Group' column.

    Consecutive rows belong to the same group while the step between their
    index timestamps is at most `max_gap` in either direction. A larger jump,
    forwards or backwards, starts a new group. Backward jumps matter because
    the frames are concatenations of several files, so the index is not
    guaranteed to be sorted.

    Args:
        df: DataFrame indexed by timestamps. It is modified in place.
        max_gap: largest step still treated as contiguous; anything accepted
            by `pd.Timedelta` (default '1s').

    Returns:
        The same `df` object, with an integer 'Group' column starting at 0.
    """
    time_diff = df.index.to_series().diff()

    # The first row has no predecessor (NaT), which compares False: no break.
    breaks = time_diff.abs() > pd.Timedelta(max_gap)

    # Assign positionally so a non-unique timestamp index cannot trigger
    # index alignment.
    df['Group'] = breaks.cumsum().to_numpy()

    return df

In [None]:
# Tag every split with its contiguous-run id ('Group' column).
train_data, val_data, test_data = (
    group_times(split) for split in (train_data, val_data, test_data)
)

In [None]:
# Number of contiguous runs in the training split (group ids start at 0).
max_group = train_data['Group'].max() + 1

# One sub-frame per group id, in ascending id order. A single groupby pass
# replaces one full-frame boolean scan per group.
groups = [chunk for _, chunk in train_data.groupby('Group', sort=True)]

assert len(groups) == max_group, "group ids are expected to be 0..max without holes"



In [None]:
# Defined here because this cell runs before the later cell that also sets it;
# without it a fresh Restart & Run All fails with NameError.
label_list = ['Feeding', 'Swimming', 'Resting', 'NDM']

# For every contiguous group: number of rows carrying each label.
counts = [
    {label: int((group['Label'] == label).sum()) for label in label_list}
    for group in groups
]

In [None]:
# Number of groups, followed by the per-label row counts of each group.
print(len(counts))

for count_dict in counts:
    print(count_dict)

In [None]:
label_list = ['Feeding', 'Swimming', 'Resting', 'NDM']

# Report every (group, label) run of at most 50 rows.
for group in groups:
    for label in label_list:
        # Filter into a fresh name: rebinding `group` here would make the
        # remaining labels filter an already-filtered (empty) frame.
        label_rows = group[group['Label'] == label]

        # The cell previously held an unfinished `if()` at this point (a
        # SyntaxError). Skipping labels absent from the group is the assumed
        # intent - TODO confirm.
        if label_rows.empty:
            continue

        if len(label_rows) <= 50:
            print(label + ": " + str(label_rows['Group'].to_numpy()))

In [None]:
# Draw random group ids until one contains 'Feeding' rows; `i` counts the draws.
num_groups = train_data['Group'].max() + 1
is_feeding = train_data['Label'] == 'Feeding'

i = 0
data = []
while len(data) == 0:
    i += 1
    chunk_idx = np.random.randint(num_groups)
    in_chunk = train_data['Group'] == chunk_idx

    data = train_data.loc[is_feeding & in_chunk]

In [None]:
# Number of random draws needed before a group with 'Feeding' rows was hit.
i

In [None]:
# The 'Feeding' rows of the randomly selected group.
data

In [None]:
def _contiguous_segments(df, label, columns, seq_len):
    """Feature arrays of every 'Group' chunk of `label` holding at least `seq_len` rows."""
    label_rows = df.loc[df['Label'] == label]
    return [chunk[columns].to_numpy()
            for _, chunk in label_rows.groupby('Group')
            if len(chunk) >= seq_len]


def sample_sequences(df, num_samples=None, seq_len=50, dims=1, train=True, feature_cols=None):
    """Cut fixed-length windows out of `df`, separately for each behaviour class.

    Classes are 'Feeding', 'Swimming', 'Resting', 'NDM'; the class position in
    that list is used as the integer target.

    Args:
        df: DataFrame with a 'Label' column and the feature column(s). A 'Group'
            column (contiguous-run id) is also required when `train` is True.
        num_samples: windows drawn per class when `train` is True (required
            then; ignored otherwise).
        seq_len: number of consecutive rows per window.
        dims: number of feature channels per row; must match `feature_cols`.
        train: True  -> random windows, each taken from inside one contiguous
                        group of a single class.
               False -> all rows of a class are laid end to end and cut into
                        back-to-back windows. These windows ignore 'Group', so
                        they can straddle time gaps.
        feature_cols: column name or list of column names to extract. Defaults
            to the notebook-level `features`.

    Returns:
        (X, Y): two lists with one entry per class. X[k] is a float32 array of
        shape (n_windows, seq_len, dims) and Y[k] an int64 array of shape
        (n_windows, 1) filled with k.

    Raises:
        ValueError: if `train` is True and `num_samples` is None, or a class has
            no contiguous group of at least `seq_len` rows.
    """
    if feature_cols is None:
        feature_cols = features  # notebook-level default
    if train and num_samples is None:
        raise ValueError("num_samples is required when train=True")

    X = []
    Y = []

    label_list = ['Feeding', 'Swimming', 'Resting', 'NDM']

    for idx, label in enumerate(label_list):
        print(str(idx) + ": " + label)

        if train:
            # Build the candidate chunks once per class instead of re-filtering
            # the whole frame for every sample. Chunks shorter than seq_len
            # cannot supply a full window and are left out.
            segments = _contiguous_segments(df, label, feature_cols, seq_len)
            if not segments:
                raise ValueError("no contiguous group of at least %d rows for label %r"
                                 % (seq_len, label))

            n_windows = num_samples
            X_class = np.zeros((n_windows, seq_len, dims), dtype=np.float32)

            for i in range(n_windows):
                segment = segments[np.random.randint(len(segments))]
                # +1 makes the last valid start position reachable.
                start = np.random.randint(len(segment) - seq_len + 1)
                # reshape covers both a single 1-D column and a 2-D column block.
                X_class[i] = segment[start:start + seq_len].reshape(seq_len, dims)

        else:
            data = df.loc[df['Label'] == label, feature_cols].to_numpy()

            # Use seq_len rather than a literal 50, and a local name so the
            # `num_samples` argument is not overwritten between classes.
            n_windows = len(data) // seq_len
            print(n_windows)

            X_class = (data[:n_windows * seq_len]
                       .reshape(n_windows, seq_len, dims)
                       .astype(np.float32))

        Y_class = np.full((n_windows, 1), idx, dtype=np.int64)

        X.append(X_class)
        Y.append(Y_class)

    return X, Y

In [None]:
# Randomly sample 10000 training windows per class over the six feature channels
# (the result is one array per class).
X_train, Y_train = sample_sequences(train_data, num_samples=10000, dims=6)

In [None]:
# Merge the per-class arrays into one array along the sample axis.
# Not idempotent: re-running this cell without re-running the sampling cell
# would concatenate the already-merged array along its first axis.
X_train, Y_train = np.concatenate(X_train, axis=0), np.concatenate(Y_train, axis=0)

In [None]:
from sklearn.utils import shuffle

# Shuffle windows and targets together; random_state pins the permutation.
X_train, Y_train = shuffle(X_train, Y_train, random_state=33)

In [None]:
# Randomly sample 2000 validation windows per class over the six feature channels
# (the result is one array per class).
X_val, Y_val = sample_sequences(val_data, num_samples=2000, dims=6)

In [None]:
# Merge the per-class arrays into one array along the sample axis.
# Not idempotent: re-running this cell without re-running the sampling cell
# would concatenate the already-merged array along its first axis.
X_val, Y_val = np.concatenate(X_val, axis=0), np.concatenate(Y_val, axis=0)

In [None]:
# Shuffle windows and targets together; `shuffle` is imported in the earlier
# training-shuffle cell, which must have run first.
X_val, Y_val = shuffle(X_val, Y_val, random_state=33)

In [None]:
# Overall mean and std across all values of the sampled windows: train, then val.
for windows in (X_train, X_val):
    print(windows.mean())
    print(windows.std())

In [None]:
from collections import Counter

# Number of windows per class id in the train and validation targets.
print(Counter(np.squeeze(Y_train).tolist()))
print(Counter(np.squeeze(Y_val).tolist()))

In [None]:
# Non-null value count of every column in the test split.
test_data.count()

In [None]:
# train=False: cut each class's test rows into back-to-back windows instead of
# sampling at random (the result is one array per class).
X_test, Y_test = sample_sequences(test_data, train=False, dims=6)

In [None]:
# Merge the per-class arrays into one array along the sample axis.
# Not idempotent: re-running this cell without re-running the sampling cell
# would concatenate the already-merged array along its first axis.
X_test, Y_test = np.concatenate(X_test, axis=0), np.concatenate(Y_test, axis=0)

In [None]:
# Shapes of the test windows and of their targets.
for test_array in (X_test, Y_test):
    print(test_array.shape)

In [None]:
# Expected number of full 50-row windows per class in the test split.
# Uses `test_data`: the `test_odba` name this cell referred to is not defined
# anywhere in the notebook.
test_data['Label'].value_counts() // 50

In [None]:
# Expected total number of test windows; compare with X_test.shape[0].
# Uses `test_data`: the `test_odba` name this cell referred to is not defined
# anywhere in the notebook.
(test_data['Label'].value_counts() // 50).sum()

In [None]:
## This estimates the distribution of each sequence (40,000/2,000 distributions, each of 50 samples)
# NOTE(review): the windows above are sampled with dims=6, so np.squeeze leaves
# X_train / X_val three-dimensional here. This cell looks written for
# single-feature (dims=1) windows - confirm before running.
ax = sns.kdeplot(data=np.squeeze(X_train).T, legend=False)

plt.show()
plt.clf()

ax = sns.kdeplot(data=np.squeeze(X_val).T, legend=False)

plt.show()
plt.clf()

In [None]:
## This estimates the distribution of each sequence element (50 distributions, each of 40,000/2,000 samples)
# NOTE(review): the windows above are sampled with dims=6, so np.squeeze leaves
# X_train / X_val three-dimensional here. This cell looks written for
# single-feature (dims=1) windows - confirm before running.
ax = sns.kdeplot(data=np.squeeze(X_train), legend=False)

plt.show()
plt.clf()

ax = sns.kdeplot(data=np.squeeze(X_val), legend=False)

plt.show()
plt.clf()

In [None]:
## This estimates the distribution over the train and val sets, respectively
# NOTE(review): `train_odba` / `val_odba` are not defined anywhere in the
# visible notebook, and train_data / val_data keep only the six axis features
# plus 'Label'. Presumably a leftover from an ODBA-only version - confirm
# which frame should be plotted.
ax = sns.kdeplot(data=train_odba['ODBA'], legend=False)

plt.show()
plt.clf()

ax = sns.kdeplot(data=val_odba['ODBA'], legend=False)

plt.show()
plt.clf()

In [None]:
# Pool every value of every window into one flat sample and estimate a single
# density per split (train first, then val).
for windows in (X_train, X_val):
    ax = sns.kdeplot(data=windows.flatten(), legend=False)

    plt.show()
    plt.clf()

In [None]:
def write(data, gts, outfile):
    '''
        This function writes a feature array and its ground-truth labels to a HDF5 file
        Args:
          data: numpy.array, stored as the "features" dataset
          gts: numpy.array, stored as the "gts" dataset
          outfile: string, path to write file to; missing parent directories
                   are created and an existing file is overwritten
    '''
    print("---------------------------------------")
    print("Saving data")
    print("---------------------------------------\n")

    # h5py does not create intermediate directories; make sure the target
    # folder exists. dirname is '' for a bare file name, hence the guard.
    parent_dir = os.path.dirname(outfile)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with h5py.File(outfile, "w") as f:
        f.create_dataset("features", data=data, dtype=data.dtype)
        f.create_dataset("gts", data=gts, dtype=gts.dtype)

def load(infile, dataset):
    '''
        This function reads one dataset from a HDF5 file
        Args:
          infile: string, path to read file from
          dataset: string, name of the dataset inside the file

        Returns:
          the full contents of f[dataset], read while the file is open
    '''
    banner = "---------------------------------------"
    print(banner)
    print("Loading data")
    print(banner + "\n")

    with h5py.File(infile, "r") as hdf_file:
        contents = hdf_file[dataset][()]
    return contents

In [None]:
# Persist each split (windows + targets) to its own HDF5 file.
split_outputs = (
    (X_train, Y_train, './datasets/data/2d/train/data.hdf5'),
    (X_val, Y_val, './datasets/data/2d/val/data.hdf5'),
    (X_test, Y_test, './datasets/data/2d/test/orig/data.hdf5'),
)

for split_windows, split_targets, target_path in split_outputs:
    write(split_windows, split_targets, target_path)

In [None]:
# Display the merged, shuffled training windows.
X_train