In [48]:
# basic
import os 
import sys
import math
from time import time
import pickle

# general
import warnings
import numpy as np
import scipy as sp
import pandas as pd

# visual
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt

# notebook
from IPython.display import display

# Sklean
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , roc_auc_score , confusion_matrix , f1_score
from sklearn import metrics

from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder

# torch
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn import Sequential
from torch.nn import Sigmoid,ReLU
from torch.nn import Linear
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam,SGD , RMSprop

import gc


from tqdm import tqdm
import copy  # to save best model parameters


In [49]:
%%time
load_activity_map = {}
load_activity_map[0] = 'transient'
load_activity_map[1] = 'lying'
load_activity_map[2] = 'sitting'
load_activity_map[3] = 'standing'
load_activity_map[4] = 'walking'
load_activity_map[5] = 'running'
load_activity_map[6] = 'cycling'
load_activity_map[7] = 'Nordic_walking'
load_activity_map[9] = 'watching_TV'
load_activity_map[10] = 'computer_work'
load_activity_map[11] = 'car driving'
load_activity_map[12] = 'ascending_stairs'
load_activity_map[13] = 'descending_stairs'
load_activity_map[16] = 'vacuum_cleaning'
load_activity_map[17] = 'ironing'
load_activity_map[18] = 'folding_laundry'
load_activity_map[19] = 'house_cleaning'
load_activity_map[20] = 'playing_soccer'
load_activity_map[24] = 'rope_jumping'

CPU times: total: 0 ns
Wall time: 1.04 ms


In [50]:
Data_dir = "./PAMAP2_Dataset/Protocol/"

In [51]:
def generate_three_IMU(name):
    x = name +'_x'
    y = name +'_y'
    z = name +'_z'
    return [x,y,z]

def generate_four_IMU(name):
    x = name +'_x'
    y = name +'_y'
    z = name +'_z'
    w = name +'_w'
    return [x,y,z,w]

def generate_cols_IMU(name):
    # temp
    temp = name+'_temperature'
    output = [temp]
    # acceleration 16
    acceleration16 = name+'_3D_accelerometer_16'
    acceleration16 = generate_three_IMU(acceleration16)
    output.extend(acceleration16)
    # acceleration 6
    acceleration6 = name+'_3D_accelerometer_6'
    acceleration6 = generate_three_IMU(acceleration6)
    output.extend(acceleration6)
    # gyroscope
    gyroscope = name+'_3D_gyroscope'
    gyroscope = generate_three_IMU(gyroscope)
    output.extend(gyroscope)
    # magnometer
    magnometer = name+'_3D_magnetometer'
    magnometer = generate_three_IMU(magnometer)
    output.extend(magnometer)
    # oreintation
    oreintation = name+'_4D_orientation'
    oreintation = generate_four_IMU(oreintation)
    output.extend(oreintation)
    return output

def load_IMU():
    output = ['time_stamp','activity_id', 'heart_rate']
    hand = 'hand'
    hand = generate_cols_IMU(hand)
    output.extend(hand)
    chest = 'chest'
    chest = generate_cols_IMU(chest)
    output.extend(chest)
    ankle = 'ankle'
    ankle = generate_cols_IMU(ankle)
    output.extend(ankle)
    return output
    
def load_subjects(root=Data_dir+'subject'):
    output = pd.DataFrame()
    cols = load_IMU()
    
    for i in range(101,110):
        print ("Loading subject number: {}",format(i))
        path = root + str(i) +'.dat'
        subject = pd.read_table(path, header=None, sep='\s+')
        subject.columns = cols 
        subject['id'] = i
        output = output.append(subject, ignore_index=True)
    output.reset_index(drop=True, inplace=True)
    return output

data = load_subjects()

Loading subject number: {} 101


  output = output.append(subject, ignore_index=True)


Loading subject number: {} 102


  output = output.append(subject, ignore_index=True)


Loading subject number: {} 103


  output = output.append(subject, ignore_index=True)


Loading subject number: {} 104


  output = output.append(subject, ignore_index=True)


Loading subject number: {} 105


  output = output.append(subject, ignore_index=True)


Loading subject number: {} 106


  output = output.append(subject, ignore_index=True)


Loading subject number: {} 107


  output = output.append(subject, ignore_index=True)


Loading subject number: {} 108


  output = output.append(subject, ignore_index=True)


Loading subject number: {} 109


  output = output.append(subject, ignore_index=True)


In [52]:
data.head(10)

Unnamed: 0,time_stamp,activity_id,heart_rate,hand_temperature,hand_3D_accelerometer_16_x,hand_3D_accelerometer_16_y,hand_3D_accelerometer_16_z,hand_3D_accelerometer_6_x,hand_3D_accelerometer_6_y,hand_3D_accelerometer_6_z,...,ankle_3D_gyroscope_y,ankle_3D_gyroscope_z,ankle_3D_magnetometer_x,ankle_3D_magnetometer_y,ankle_3D_magnetometer_z,ankle_4D_orientation_x,ankle_4D_orientation_y,ankle_4D_orientation_z,ankle_4D_orientation_w,id
0,8.38,0,104.0,30.0,2.37223,8.60074,3.51048,2.43954,8.76165,3.35465,...,0.00925,-0.01758,-61.1888,-38.9599,-58.1438,1.0,0.0,0.0,0.0,101
1,8.39,0,,30.0,2.18837,8.5656,3.66179,2.39494,8.55081,3.64207,...,-0.004638,0.000368,-59.8479,-38.8919,-58.5253,1.0,0.0,0.0,0.0,101
2,8.4,0,,30.0,2.37357,8.60107,3.54898,2.30514,8.53644,3.7328,...,0.000148,0.022495,-60.7361,-39.4138,-58.3999,1.0,0.0,0.0,0.0,101
3,8.41,0,,30.0,2.07473,8.52853,3.66021,2.33528,8.53622,3.73277,...,-0.020301,0.011275,-60.4091,-38.7635,-58.3956,1.0,0.0,0.0,0.0,101
4,8.42,0,,30.0,2.22936,8.83122,3.7,2.23055,8.59741,3.76295,...,-0.014303,-0.002823,-61.5199,-39.3879,-58.2694,1.0,0.0,0.0,0.0,101
5,8.43,0,,30.0,2.29959,8.82929,3.5471,2.26132,8.65762,3.77788,...,-0.016024,0.00105,-60.2954,-38.8778,-58.3977,1.0,0.0,0.0,0.0,101
6,8.44,0,,30.0,2.33738,8.829,3.54767,2.27703,8.77828,3.7323,...,-0.053934,0.015594,-60.6307,-38.8676,-58.2711,1.0,0.0,0.0,0.0,101
7,8.45,0,,30.0,2.37142,9.055,3.39347,2.39786,8.89814,3.64131,...,-0.039937,-0.000785,-60.5171,-38.9819,-58.2733,1.0,0.0,0.0,0.0,101
8,8.46,0,,30.0,2.33951,9.13251,3.54668,2.44371,8.98841,3.62596,...,-0.010042,0.017701,-61.2916,-39.6182,-58.1499,1.0,0.0,0.0,0.0,101
9,8.47,0,,30.0,2.25966,9.09415,3.43015,2.42877,9.01871,3.61081,...,-0.013923,0.014498,-60.8509,-39.0821,-58.1478,1.0,0.0,0.0,0.0,101


In [53]:
# percentage of lables that are not interesting 
Interesting_lables_per = np.count_nonzero(data['activity_id'] == 0) / data.shape[0]
print (Interesting_lables_per)

0.32363805742179463


In [None]:
# Dealing with NAN and removing activity = 0
def fix_data(data):
    data = data.drop(data[data['activity_id']==0].index)
    data = data.interpolate()
    # fill all the NaN values in a coulmn with the mean values of the column
    # for colName in data.columns:
    #     data[colName] = data[colName].fillna(data[colName].mean())
    # activity_mean = data.groupby(['activity_id']).mean().reset_index()
    return data

data = fix_data(data)

In [None]:
print('total number of NaN values after interpolation',data.isna().sum().sum())
print('Number of NaN values in heart_rate column',data['heart_rate'].isna().sum())
print('So all NaNs are in heart_rate')

total number of NaN values after interpolation 4
Number of NaN values in heart_rate column 4
So all NaNs are in heart_rate


In [None]:
data.describe()

Unnamed: 0,time_stamp,activity_id,heart_rate,hand_temperature,hand_3D_accelerometer_16_x,hand_3D_accelerometer_16_y,hand_3D_accelerometer_16_z,hand_3D_accelerometer_6_x,hand_3D_accelerometer_6_y,hand_3D_accelerometer_6_z,...,ankle_3D_gyroscope_y,ankle_3D_gyroscope_z,ankle_3D_magnetometer_x,ankle_3D_magnetometer_y,ankle_3D_magnetometer_z,ankle_4D_orientation_x,ankle_4D_orientation_y,ankle_4D_orientation_z,ankle_4D_orientation_w,id
count,1942872.0,1942872.0,1942868.0,1942872.0,1942872.0,1942872.0,1942872.0,1942872.0,1942872.0,1942872.0,...,1942872.0,1942872.0,1942872.0,1942872.0,1942872.0,1942872.0,1942872.0,1942872.0,1942872.0,1942872.0
mean,1705.202,8.08183,107.4879,32.75126,-4.952973,3.585079,3.602232,-4.886286,3.573597,3.786931,...,-0.03652767,0.006658552,-31.58971,1.39696,17.25039,0.3812299,-0.008998437,0.3023855,-0.05698127,104.5664
std,1093.463,6.174789,26.99218,1.794162,6.239663,6.893705,3.956957,6.245972,6.589666,3.94286,...,0.6383285,2.012898,18.34639,21.68678,19.70652,0.3036803,0.5714335,0.332673,0.4800388,2.333052
min,31.2,1.0,57.0,24.875,-145.367,-104.301,-101.452,-61.2147,-61.8417,-61.9347,...,-18.1269,-14.0196,-172.865,-137.908,-102.716,1.52128e-06,-0.956876,-0.876838,-0.997281,101.0
25%,744.54,3.0,86.0,31.6875,-8.96976,1.05821,1.16147,-8.86649,1.05769,1.36337,...,-0.1066082,-0.4416998,-41.71632,-12.48465,3.794668,0.142634,-0.612958,0.004914657,-0.537871,102.0
50%,1480.33,6.0,104.0,33.125,-5.45028,3.52655,3.43141,-5.378515,3.56785,3.66264,...,-0.003950165,-0.00232757,-34.0082,0.776937,18.76755,0.283839,0.0,0.305533,0.0,105.0
75%,2663.61,13.0,124.0,34.0625,-0.9577087,6.453505,6.531523,-0.905887,6.45867,6.77598,...,0.116257,0.09181807,-17.906,17.83892,31.2116,0.5600202,0.6153723,0.5960633,0.438287,107.0
max,4245.68,24.0,202.0,35.5,62.8596,155.699,157.76,52.8214,62.2598,61.9234,...,13.5882,16.5288,91.5516,94.2478,146.9,1.0,0.959538,0.951482,0.996105,109.0


In [None]:
# GILR Based on Documentation Orientation data is not valid
# This means need to remove 4*3 = 12 features 
remove_features = ['hand_4D_orientation_x'  , 'hand_4D_orientation_y'  , 'hand_4D_orientation_z' , 'hand_4D_orientation_w' , \
                   'chest_4D_orientation_x' , 'chest_4D_orientation_y' , 'chest_4D_orientation_z', 'chest_4D_orientation_w', \
                   'ankle_4D_orientation_x' , 'ankle_4D_orientation_y' , 'ankle_4D_orientation_z', 'ankle_4D_orientation_w',\
                    'hand_3D_accelerometer_6_x',	'hand_3D_accelerometer_6_y',	'hand_3D_accelerometer_6_z',\
                    'chest_3D_accelerometer_6_x',	'chest_3D_accelerometer_6_y',	'chest_3D_accelerometer_6_z',\
                    'ankle_3D_accelerometer_6_x',	'ankle_3D_accelerometer_6_y',	'ankle_3D_accelerometer_6_z',\
                    'hand_temperature', 'chest_temperature', 'ankle_temperature', 'heart_rate']
                  
data_lean = data.drop(remove_features, axis=1, inplace = False).reset_index(drop = True)


In [None]:
# # GILR Based on Documentation Orientation data is not valid
# # This means need to remove 4*3 = 12 features 
# remove_features = ['hand_4D_orientation_x'  , 'hand_4D_orientation_y'  , 'hand_4D_orientation_z' , 'hand_4D_orientation_w' , \
#                    'chest_4D_orientation_x' , 'chest_4D_orientation_y' , 'chest_4D_orientation_z', 'chest_4D_orientation_w', \
#                    'ankle_4D_orientation_x' , 'ankle_4D_orientation_y' , 'ankle_4D_orientation_z', 'ankle_4D_orientation_w',\
#                     'hand_3D_accelerometer_6_x',	'hand_3D_accelerometer_6_y',	'hand_3D_accelerometer_6_z',\
#                     'chest_3D_accelerometer_6_x',	'chest_3D_accelerometer_6_y',	'chest_3D_accelerometer_6_z',\
#                     'ankle_3D_accelerometer_6_x',	'ankle_3D_accelerometer_6_y',	'ankle_3D_accelerometer_6_z',\
#                     'hand_3D_magnetometer_x',	'hand_3D_magnetometer_y',	'hand_3D_magnetometer_z',\
#                     'chest_3D_magnetometer_x',	'chest_3D_magnetometer_y',	'chest_3D_magnetometer_z',\
#                     'ankle_3D_magnetometer_x',	'ankle_3D_magnetometer_y',	'ankle_3D_magnetometer_z',\
#                     'hand_temperature', 'chest_temperature', 'ankle_temperature', 'heart_rate']
                  
# data_lean = data.drop(remove_features, axis=1, inplace = False).reset_index(drop = True)

In [None]:
data_lean.shape

(1942872, 30)

In [None]:
data_lean.head()

Unnamed: 0,time_stamp,activity_id,hand_3D_accelerometer_16_x,hand_3D_accelerometer_16_y,hand_3D_accelerometer_16_z,hand_3D_gyroscope_x,hand_3D_gyroscope_y,hand_3D_gyroscope_z,hand_3D_magnetometer_x,hand_3D_magnetometer_y,...,ankle_3D_accelerometer_16_x,ankle_3D_accelerometer_16_y,ankle_3D_accelerometer_16_z,ankle_3D_gyroscope_x,ankle_3D_gyroscope_y,ankle_3D_gyroscope_z,ankle_3D_magnetometer_x,ankle_3D_magnetometer_y,ankle_3D_magnetometer_z,id
0,37.66,1,2.2153,8.27915,5.58753,-0.00475,0.037579,-0.011145,8.932,-67.9326,...,9.73855,-1.84761,0.095156,0.002908,-0.027714,0.001752,-61.1081,-36.8636,-58.3696,101
1,37.67,1,2.29196,7.67288,5.74467,-0.17171,0.025479,-0.009538,9.583,-67.9584,...,9.69762,-1.88438,-0.020804,0.020882,0.000945,0.006007,-60.8916,-36.3197,-58.3656,101
2,37.68,1,2.2909,7.1424,5.82342,-0.238241,0.011214,0.000831,9.05516,-67.4017,...,9.69633,-1.92203,-0.059173,-0.035392,-0.052422,-0.004882,-60.3407,-35.7842,-58.6119,101
3,37.69,1,2.218,7.14365,5.8993,-0.192912,0.019053,0.013374,9.92698,-67.4387,...,9.6637,-1.84714,0.094385,-0.032514,-0.018844,0.02695,-60.7646,-37.1028,-57.8799,101
4,37.7,1,2.30106,7.25857,6.09259,-0.069961,-0.018328,0.004582,9.15626,-67.1825,...,9.77578,-1.88582,0.095775,0.001351,-0.048878,-0.006328,-60.204,-37.1225,-57.8847,101


### Scaling

In [None]:
# max(data['ankle_3D_gyroscope_x'].values)
# max(data['ankle_3D_magnetometer_x'].values)
max_mag = 0
for column in data.columns.values:
    if column.find('accelerometer') != -1:
        # print(column)
        max_temp = max(abs(data[column].values))
        max_mag = max((max_mag, max_temp))

print(max_mag)
max_mag = 0
for column in data.columns.values:
    if column.find('gyroscope') != -1:
        # print(column)
        max_temp = max(abs(data[column].values))
        max_mag = max((max_mag, max_temp))

print(max_mag)
max_mag = 0
for column in data.columns.values:
    if column.find('magnetometer') != -1:
        # print(column)
        max_temp = max(abs(data[column].values))
        max_mag = max((max_mag, max_temp))

print(max_mag)

158.926
28.1354
200.043


In [None]:
for column in data_lean.columns.values:
    if column.find('accelerometer') != -1:
        # print(column)
        data_lean[column] = data_lean[column]/160

    if column.find('gyroscope') != -1:
        # print(column)
        data_lean[column] = data_lean[column]/28

    if column.find('magnetometer') != -1:
        # print(column)
        data_lean[column] = data_lean[column]/200

In [None]:
stats = data_lean.describe()

In [None]:
stats

Unnamed: 0,time_stamp,activity_id,hand_3D_accelerometer_16_x,hand_3D_accelerometer_16_y,hand_3D_accelerometer_16_z,hand_3D_gyroscope_x,hand_3D_gyroscope_y,hand_3D_gyroscope_z,hand_3D_magnetometer_x,hand_3D_magnetometer_y,...,ankle_3D_accelerometer_16_x,ankle_3D_accelerometer_16_y,ankle_3D_accelerometer_16_z,ankle_3D_gyroscope_x,ankle_3D_gyroscope_y,ankle_3D_gyroscope_z,ankle_3D_magnetometer_x,ankle_3D_magnetometer_y,ankle_3D_magnetometer_z,id
count,1942872.0,1942872.0,1942872.0,1942872.0,1942872.0,1942872.0,1942872.0,1942872.0,1942872.0,1942872.0,...,1942872.0,1942872.0,1942872.0,1942872.0,1942872.0,1942872.0,1942872.0,1942872.0,1942872.0,1942872.0
mean,1705.202,8.08183,-0.03095608,0.02240674,0.02251395,5.289203e-05,0.001419494,-0.0001136733,0.1057349,-0.06984156,...,0.05880242,-0.0009574616,-0.0162027,0.0003557253,-0.001304559,0.0002378054,-0.1579485,0.006984798,0.08625196,104.5664
std,1093.463,6.174789,0.03899789,0.04308566,0.02473098,0.04758326,0.03415111,0.05710064,0.119042,0.121159,...,0.04080089,0.04837145,0.02460096,0.04024652,0.02279745,0.07188921,0.09173196,0.1084339,0.09853262,2.333052
min,31.2,1.0,-0.9085437,-0.6518812,-0.634075,-1.004836,-0.6374821,-0.5094536,-0.519705,-1.000215,...,-0.969175,-0.9840188,-0.9932875,-0.8569643,-0.6473893,-0.5007,-0.864325,-0.68954,-0.51358,101.0
25%,744.54,3.0,-0.056061,0.006613813,0.007259188,-0.01351494,-0.008002357,-0.01377612,0.02426888,-0.1453945,...,0.05266581,-0.01358144,-0.02381881,-0.007453411,-0.003807438,-0.01577499,-0.2085816,-0.06242325,0.01897334,102.0
50%,1480.33,6.0,-0.03406425,0.02204094,0.02144631,-0.0002140586,0.0002118809,-0.0002012596,0.114762,-0.0806415,...,0.05961481,-0.001816787,-0.01511931,0.0001645075,-0.0001410773,-8.31275e-05,-0.170041,0.003884685,0.09383775,105.0
75%,2663.61,13.0,-0.00598568,0.04033441,0.04082202,0.01200273,0.009555938,0.01314512,0.197023,0.01073315,...,0.06433328,0.01112645,-0.006518187,0.00468,0.004152036,0.003279217,-0.08953,0.08919462,0.156058,107.0
max,4245.68,24.0,0.3928725,0.9731188,0.986,0.9434214,0.8242107,0.5120857,0.68772,0.547435,...,0.9827,0.9830813,0.99295,0.6221571,0.4852929,0.5903143,0.457758,0.471239,0.7345,109.0


In [None]:
# print(stats.loc['std'])

## Preparing Samples

In [None]:
# labeling by unique labels
u, labels = np.unique(data_lean['activity_id'].values, return_inverse=True)
print(u)
activity_names = []
for i in u:
    activity_names.append(load_activity_map[i])
print(activity_names)
data_lean['activity_id'] = labels

[ 1  2  3  4  5  6  7 12 13 16 17 24]
['lying', 'sitting', 'standing', 'walking', 'running', 'cycling', 'Nordic_walking', 'ascending_stairs', 'descending_stairs', 'vacuum_cleaning', 'ironing', 'rope_jumping']


#### TRAIN-TEST Split by user 6

In [None]:
train_df = data_lean[data_lean['id'] != 106]
test_df = data_lean[data_lean['id'] == 106]
train_labels = train_df['activity_id'].values
test_labels = test_df['activity_id'].values

train_df.drop(['activity_id', 'id', 'time_stamp'], axis=1, inplace = True)
test_df.drop(['activity_id', 'id', 'time_stamp'], axis=1, inplace = True)
final_features = train_df.columns

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.drop(['activity_id', 'id', 'time_stamp'], axis=1, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.drop(['activity_id', 'id', 'time_stamp'], axis=1, inplace = True)


In [None]:
from scipy.spatial.transform import Rotation as R

In [None]:
def add_rotation(df: pd.DataFrame, scale):
    df_to_rotate = df.copy()

    acc_hand = df_to_rotate[['hand_3D_accelerometer_16_x', 'hand_3D_accelerometer_16_y', 'hand_3D_accelerometer_16_z']].to_numpy()
    # angles = np.random.normal(loc=0.0, scale=10, size=3) # 3 angles with std = 10 degrees here 0.84 for test
    angles = np.random.normal(loc=0.0, scale=scale, size=3) # 3 angles with std = 10 degrees here 0.84 for test

    r = R.from_euler('zyx', [angles], degrees=True)
    rotated = r.apply(acc_hand)
    
    df_to_rotate['hand_3D_accelerometer_16_x'] = rotated[:, 0]
    df_to_rotate['hand_3D_accelerometer_16_y'] = rotated[:, 1]
    df_to_rotate['hand_3D_accelerometer_16_z'] = rotated[:, 2]

    acc_chest = df_to_rotate[['chest_3D_accelerometer_16_x', 'chest_3D_accelerometer_16_y', 'chest_3D_accelerometer_16_z']].to_numpy()
    angles = np.random.normal(loc=0.0, scale=scale, size=3) # 3 angles with std = 10 degrees here

    r = R.from_euler('zyx', [angles], degrees=True)
    rotated = r.apply(acc_chest)
    
    df_to_rotate['chest_3D_accelerometer_16_x'] = rotated[:, 0]
    df_to_rotate['chest_3D_accelerometer_16_y'] = rotated[:, 1]
    df_to_rotate['chest_3D_accelerometer_16_z'] = rotated[:, 2]

    acc_ankle = df_to_rotate[['ankle_3D_accelerometer_16_x', 'ankle_3D_accelerometer_16_y', 'ankle_3D_accelerometer_16_z']].to_numpy()
    angles = np.random.normal(loc=0.0, scale=scale, size=3) # 3 angles with std = 10 degrees here

    r = R.from_euler('zyx', [angles], degrees=True)
    rotated = r.apply(acc_ankle)
    
    df_to_rotate['ankle_3D_accelerometer_16_x'] = rotated[:, 0]
    df_to_rotate['ankle_3D_accelerometer_16_y'] = rotated[:, 1]
    df_to_rotate['ankle_3D_accelerometer_16_z'] = rotated[:, 2]
            
    return df_to_rotate.to_numpy()

In [None]:
def add_noise(df: pd.DataFrame):
    df_noise = df.copy()
    for col_name in df:
        if 'accelerometer' in col_name: 
            # noise = np.random.normal(loc=0.0, scale=0.008, size=df_noise.shape[0])
            # noise = np.random.normal(loc=0.0, scale=0.08, size=df_noise.shape[0])
            noise = np.random.normal(loc=0.0, scale=0.016, size=df_noise.shape[0])
            df_noise[col_name] = df_noise[col_name] + noise
        elif 'gyroscope' in col_name:
            # noise = np.random.normal(loc=0.0, scale=0.005, size=df_noise.shape[0])
            # noise = np.random.normal(loc=0.0, scale=0.05, size=df_noise.shape[0])
            noise = np.random.normal(loc=0.0, scale=0.01, size=df_noise.shape[0])
            df_noise[col_name] = df_noise[col_name] + noise
            
    return df_noise.to_numpy()

In [None]:
def add_noise_2(df: pd.DataFrame):
    df_noise = df.copy()
    for col_name in df:
        if 'accelerometer' in col_name: 
            # noise = np.random.normal(loc=0.0, scale=0.008, size=df_noise.shape[0])
            # noise = np.random.normal(loc=0.0, scale=0.08, size=df_noise.shape[0])
            noise = np.random.normal(loc=0.0, scale=0.016, size=df_noise.shape[0])
            df_noise[col_name] = df_noise[col_name] + noise
        elif 'gyroscope' in col_name:
            # noise = np.random.normal(loc=0.0, scale=0.005, size=df_noise.shape[0])
            # noise = np.random.normal(loc=0.0, scale=0.05, size=df_noise.shape[0])
            noise = np.random.normal(loc=0.0, scale=0.01, size=df_noise.shape[0])
            df_noise[col_name] = df_noise[col_name] + noise
            
    return df_noise

In [None]:
def add_augmentation(X_train_df: pd.DataFrame, y_train, aug_factor):
    
    orig_size = X_train_df.shape[0]
    
    for m in range(aug_factor):
        X_train_df_noise = X_train_df[0:orig_size].copy()
    
        for col_name in X_train_df:
            if 'accelerometer' in col_name: 
                noise = np.random.normal(loc=0.0, scale=0.5, size=X_train_df_noise.shape[0])
                X_train_df_noise[col_name] = X_train_df_noise[col_name] + noise
            elif 'gyroscope' in col_name:
                noise = np.random.normal(loc=0.0, scale=0.2, size=X_train_df_noise.shape[0])
                X_train_df_noise[col_name] = X_train_df_noise[col_name] + noise
            
        X_train_df = pd.concat( [X_train_df , X_train_df_noise], axis = 0 )
        y_train = np.concatenate((y_train,y_train[0:orig_size]), axis = 0)
    
    return X_train_df.to_numpy(), y_train

In [None]:
# train_df, train_labels = add_augmentation(train_df , train_labels , 5)

In [None]:
from scipy.fft import fft

def prepare_segments(data_df, labels, seq_len = 20, needs_split = False, ratio = None, augment_factor = 1):

    time_step_samples = 25
    # seq_len = 20
    win_size = time_step_samples*seq_len
    overlap_factor = 0.25
    count_non_homogenous = 0

    skip = int(win_size * (1-overlap_factor))
    ## window size is time_step_samples x seq_len
    len_data = len(data_df)
    samples = []
    dev_samples = []
    current_index = 0
    while current_index + win_size < len_data: #get another window
        final_index = current_index + win_size
        win_labels = labels[current_index:final_index]
        if sum(win_labels - win_labels[0]) == 0: #all the same label
            label = win_labels[0]
            window_data = data_df.iloc[current_index:final_index, :]
            if needs_split:
                random_number = np.random.uniform(size = 1)
                to_train = random_number > ratio
                if to_train and augment_factor>1:
                    for aug in range(augment_factor-1):
                        # aug_window_data = add_noise(window_data)
                        aug_window_data = add_rotation(window_data, 10) #second parameter is the angle STD
                        window_data_reshaped = aug_window_data.reshape((seq_len, time_step_samples, -1))
                        window_data_reshaped_T = np.transpose(window_data_reshaped, (0,2,1))
                        yf = fft(window_data_reshaped_T)
                        yf_trimmed_abs = torch.tensor(np.abs(yf[:, :, 0:time_step_samples//2+1]), dtype=torch.float32)
                        sample = (yf_trimmed_abs, label)
                        samples.append(sample)

            window_data_reshaped = window_data.to_numpy().reshape((seq_len, time_step_samples, -1))
            window_data_reshaped_T = np.transpose(window_data_reshaped, (0,2,1))

            yf = fft(window_data_reshaped_T)
            yf_trimmed_abs = torch.tensor(np.abs(yf[:, :, 0:time_step_samples//2+1]), dtype=torch.float32)
            sample = (yf_trimmed_abs, label)
            if needs_split:
                if to_train:
                    samples.append(sample)
                else:
                    dev_samples.append(sample)
            else:
                samples.append(sample)
        else:
            count_non_homogenous+= 1
        current_index = current_index + skip

    print('Number of sequences not used for being non pure = ', count_non_homogenous)
    return (samples, dev_samples) if needs_split else samples


In [None]:
# print(window_data_reshaped_T.shape)
# print(samples[0][0].shape)

In [None]:
# print(count_non_homogenous)

In [None]:
print(len(train_labels))

1692776


In [None]:
# train, dev = prepare_segments(train_df, train_labels, seq_len = 20, needs_split = True, ratio = 0.1, augment_factor = 1)
# test = prepare_segments(test_df, test_labels, seq_len = 20)

# train, dev = prepare_segments(train_df, train_labels, seq_len = 5, needs_split = True, ratio = 0.1, augment_factor = 1)
# test = prepare_segments(test_df, test_labels, seq_len = 5)

In [None]:
# print('size of training set is ', len(train))
# print('size of dev set is ', len(dev))
# print('size of test set is ', len(test))

In [None]:
# path = './train_dev_test_aug1_n30'
# with open(path , 'wb') as f:
#     pickle.dump((train, dev, test), f)

### Augmented data

In [None]:
train, dev = prepare_segments(train_df, train_labels, seq_len = 20, needs_split = True, ratio = 0.1, augment_factor = 5)
test = prepare_segments(test_df, test_labels, seq_len = 20)

# train, dev = prepare_segments(train_df, train_labels, seq_len = 10, needs_split = True, ratio = 0.1, augment_factor = 5)
# test = prepare_segments(test_df, test_labels, seq_len = 10)

Number of sequences not used for being non pure =  123
Number of sequences not used for being non pure =  16


In [None]:
print('size of training set is ', len(train))
print('size of dev set is ', len(dev))
print('size of test set is ', len(test))

size of training set is  19800
size of dev set is  430
size of test set is  650


In [None]:
path = './train_dev_test_aug5_rot_10'
with open(path , 'wb') as f:
    pickle.dump((train, dev, test), f)

In [None]:
# from scipy.fft import fftfreq
# # Number of sample points
# N = 25
# # sample spacing
# T = 1.0 / 100.0
# x = np.linspace(0.0, N*T, N, endpoint=False)
# trimmed_yf = train[100][0][0]
# xf = fftfreq(N, T)[:N//2+1]
# plt.plot( 2.0/N * trimmed_yf[0])
# plt.grid()
# plt.show()

In [None]:
# fig1 = plt.figure(figsize = (12,6))
# ax1 = fig1.gca()
# ax1.imshow(trimmed_yf[:, 1:], extent=[0, 1, 0, 1]) # not plotting 0 term

In [None]:
# final_features

In [None]:
# from torch.utils.data import DataLoader
# traindl = DataLoader(train, batch_size=8, shuffle=True)
# testdl = DataLoader(test, batch_size=8, shuffle=True)

In [None]:
# dl_iter = iter(traindl)
# batch = next(dl_iter)

In [None]:
# len(batch[1])

In [None]:
# batch[0].shape