# Data Preprocessing

This Jupyter demonstrates different method of manipulating the data before training

In [1]:
#Imports
import math

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from constants import *
from data_loader import load_data_from_file
from datetime import timedelta
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.decomposition import PCA
from utils import print_line_divider

# from datahandler.constants import all_features, data_version, acc_features, tensorboard_dir, location_labels


In [2]:
test_file_path = os.path.join(train_folder, "op1_datacollection.csv")
window_length_in_seconds = 1
window_size = 20

In [3]:
# Step 0: Data loading
df = load_data_from_file(test_file_path)
df

Unnamed: 0_level_0,accelerometerX,accelerometerY,accelerometerZ,magnetometerX,magnetometerY,magnetometerZ,gyroscopeX,gyroscopeY,gyroscopeZ,labelPhone,labelActivity
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2022-05-24 15:31:22.641000+02:00,-0.483794,5.170847,7.762258,193.24768,-48.899918,118.240080,-0.173468,0.031127,-0.269560,beingusedinhand,walking
2022-05-24 15:31:22.648000+02:00,-0.464634,5.144502,7.618557,193.24768,-48.899918,118.240080,-0.095703,-0.018408,-0.229080,beingusedinhand,walking
2022-05-24 15:31:22.652000+02:00,-0.567620,5.192402,7.539522,193.34616,-49.615753,120.270350,-0.073333,-0.049833,-0.217362,beingusedinhand,walking
2022-05-24 15:31:22.657000+02:00,-0.598755,5.283413,7.403006,193.34616,-49.615753,120.270350,-0.035516,-0.116945,-0.183806,beingusedinhand,walking
2022-05-24 15:31:22.678000+02:00,-0.222737,5.501360,6.849756,193.47401,-48.607376,120.840996,0.018280,-0.285257,-0.095922,beingusedinhand,walking
...,...,...,...,...,...,...,...,...,...,...,...
2022-05-24 15:36:24.268000+02:00,-0.605940,7.082073,8.042476,182.12231,-29.269182,109.225770,-0.414750,-0.445046,-0.087933,beingusedinhand,walking
2022-05-24 15:36:24.270000+02:00,-0.605940,7.082073,8.042476,182.12231,-29.269182,109.225770,-0.449371,-0.474874,-0.105510,beingusedinhand,walking
2022-05-24 15:36:24.271000+02:00,-0.519719,6.986272,8.301138,183.10033,-30.274485,109.563500,-0.449371,-0.474874,-0.105510,beingusedinhand,walking
2022-05-24 15:36:24.272000+02:00,-0.443079,6.852151,8.672366,183.10033,-30.274485,109.563500,-0.476535,-0.521745,-0.131608,beingusedinhand,walking


In [4]:
# Step 1: Divide the collected data into fixed-size chunk
fixed_size_data = []
fixed_size_indexes = []

current_timestamp = df.index[0].to_pydatetime()
last_timestamp_raw = df.index[df.shape[0] - 1].to_pydatetime()
current_timestamp_raw_index = 0
one_window_length_in_millis = window_length_in_seconds * 1000 / window_size

while True:
    current_timestamp = current_timestamp + timedelta(milliseconds=one_window_length_in_millis)
    if current_timestamp > last_timestamp_raw:
        break

    while current_timestamp_raw_index < df.shape[0] - 1:
        next_timestamp_raw = df.index[current_timestamp_raw_index + 1].to_pydatetime()
        if next_timestamp_raw < current_timestamp:
            current_timestamp_raw_index += 1
        else:
            break

    fixed_size_data.append(df.iloc[current_timestamp_raw_index])
    fixed_size_indexes.append(current_timestamp)

fixed_size_df = pd.DataFrame(
    data=fixed_size_data,
    index=fixed_size_indexes,
    columns=df.columns
)
fixed_size_df

Unnamed: 0,accelerometerX,accelerometerY,accelerometerZ,magnetometerX,magnetometerY,magnetometerZ,gyroscopeX,gyroscopeY,gyroscopeZ,labelPhone,labelActivity
2022-05-24 15:31:22.691000+02:00,0.064666,5.621111,6.749166,193.47401,-48.607376,120.840996,-0.029124,-0.233059,-0.082606,beingusedinhand,walking
2022-05-24 15:31:22.741000+02:00,0.112566,6.035450,7.798184,193.61359,-49.902206,117.832910,-0.306093,0.100902,-0.189133,beingusedinhand,walking
2022-05-24 15:31:22.791000+02:00,-0.210762,5.786367,8.255632,193.97388,-49.087320,119.856590,0.122676,-0.154762,-0.032539,beingusedinhand,walking
2022-05-24 15:31:22.841000+02:00,0.390388,6.643784,8.622070,193.95080,-51.130830,120.310700,-0.058419,0.089716,0.004745,beingusedinhand,walking
2022-05-24 15:31:22.891000+02:00,0.014370,6.725215,9.457932,194.85138,-49.879868,117.881160,0.073141,-0.020006,0.056411,beingusedinhand,walking
...,...,...,...,...,...,...,...,...,...,...,...
2022-05-24 15:36:24.041000+02:00,-0.756826,6.861731,7.161109,183.85406,-30.071634,108.581540,-0.189447,-0.037583,0.065998,beingusedinhand,walking
2022-05-24 15:36:24.091000+02:00,-1.037044,6.591094,8.087981,184.47568,-30.054789,108.317345,-0.091442,-0.096705,0.161339,beingusedinhand,walking
2022-05-24 15:36:24.141000+02:00,-0.467029,7.094048,8.883128,184.34580,-30.568605,108.462090,-0.074931,-0.167012,0.293432,beingusedinhand,walking
2022-05-24 15:36:24.191000+02:00,-0.064666,7.192244,10.020762,183.71968,-29.929827,109.872340,0.105632,-0.308692,0.109141,beingusedinhand,walking


In [5]:
raw_features = ['accelerometerX', 'accelerometerY', 'accelerometerZ', 'magnetometerX', 'magnetometerY', 'magnetometerZ',
                'gyroscopeX', 'gyroscopeY', 'gyroscopeZ']

In [6]:
# Step 3: Adding extra feature - Magnitude and Angle
added_features = ["accMag", "gyroMag", "magMag", "accAng", "gyroAng", "magAng"]
## MAGNITUDE
accMag = []
gyroMag = []
magMag = []
for index, row in fixed_size_df.iterrows():
    accMag.append(math.sqrt(row['accelerometerX'] ** 2 + row['accelerometerY'] ** 2 + row['accelerometerZ'] ** 2))
    gyroMag.append(math.sqrt(row['gyroscopeX'] ** 2 + row['gyroscopeY'] ** 2 + row['gyroscopeZ'] ** 2))
    magMag.append(math.sqrt(row['magnetometerX'] ** 2 + row['magnetometerY'] ** 2 + row['magnetometerZ'] ** 2))
fixed_size_df["accMag"] = accMag
fixed_size_df["gyroMag"] = gyroMag
fixed_size_df["magMag"] = magMag


## ANGLE - angle from unit vector (1,1,1)
## Formula: angle = arccos([xa * xb + ya * yb + za * zb] / (√(xa2 + ya2 + za2) * √(xb2 + yb2 + zb2))]
def calculate_angle(input_x, input_y, input_z):
    dividend = input_x * 1 + input_y * 1 + input_z * 1
    divisor = math.sqrt(1 + 1 + 1) * math.sqrt(input_x ** 2 + input_y ** 2 + input_z ** 2)
    return math.acos(dividend / divisor)


accAng = []
gyroAng = []
magAng = []
for index, row in fixed_size_df.iterrows():
    accAng.append(calculate_angle(row["accelerometerX"], row["accelerometerY"], row["accelerometerZ"]))
    gyroAng.append(calculate_angle(row['gyroscopeX'], row['gyroscopeY'], row['gyroscopeZ']))
    magAng.append(calculate_angle(row["magnetometerX"], row["magnetometerY"], row["magnetometerZ"]))
fixed_size_df['accAng'] = accAng
fixed_size_df['gyroAng'] = gyroAng
fixed_size_df['magAng'] = magAng

fixed_size_df.head()

Unnamed: 0,accelerometerX,accelerometerY,accelerometerZ,magnetometerX,magnetometerY,magnetometerZ,gyroscopeX,gyroscopeY,gyroscopeZ,labelPhone,labelActivity,accMag,gyroMag,magMag,accAng,gyroAng,magAng
2022-05-24 15:31:22.691000+02:00,0.064666,5.621111,6.749166,193.47401,-48.607376,120.840996,-0.029124,-0.233059,-0.082606,beingusedinhand,walking,8.783639,0.248975,233.232536,0.613998,2.497318,0.852982
2022-05-24 15:31:22.741000+02:00,0.112566,6.03545,7.798184,193.61359,-49.902206,117.83291,-0.306093,0.100902,-0.189133,beingusedinhand,walking,9.861592,0.373691,232.079829,0.6155,2.225882,0.862357
2022-05-24 15:31:22.791000+02:00,-0.210762,5.786367,8.255632,193.97388,-49.08732,119.85659,0.122676,-0.154762,-0.032539,beingusedinhand,walking,10.083746,0.200148,233.240291,0.656858,1.75831,0.856176
2022-05-24 15:31:22.841000+02:00,0.390388,6.643784,8.62207,193.9508,-51.13083,120.3107,-0.058419,0.089716,0.004745,beingusedinhand,walking,10.891849,0.107165,233.893008,0.591869,1.375375,0.863841
2022-05-24 15:31:22.891000+02:00,0.01437,6.725215,9.457932,194.85138,-49.879868,117.88116,0.073141,-0.020006,0.056411,beingusedinhand,walking,11.605223,0.094509,233.133072,0.633749,0.837654,0.861961


In [7]:
# Step 4: Extracting mean/std/min/max (time-domain features)
window_index_start = 0
window_index_increasing_size = int(window_size / 4)
feature_columns = []
feature_data = []

domain_types = raw_features + added_features
feature_types = ["mean", "std", "min", "max"]
for domain_type in domain_types:
    for feature_type in feature_types:
        feature_columns.append(feature_type + domain_type)

feature_columns.append("labelPhone")

while window_index_start + window_size < fixed_size_df.shape[0]:
    # Iteration per large sliding window of 20 windows.
    first_index = window_index_start
    last_index = window_index_start + window_size
    sub_df = fixed_size_df[first_index:last_index]

    # Feature extraction from mean/max/min/std
    feature = []
    for domain_type in domain_types:
        raw_feature_series_describe = sub_df[domain_type].describe()
        for feature_type in feature_types:
            feature.append(raw_feature_series_describe[feature_type])
    feature_data.append(feature)

    # Final: Label
    feature.append(sub_df["labelPhone"][0])
    window_index_start += window_index_increasing_size

features_df = pd.DataFrame(
    data=feature_data,
    columns=feature_columns
)
features_df.head()

Unnamed: 0,meanaccelerometerX,stdaccelerometerX,minaccelerometerX,maxaccelerometerX,meanaccelerometerY,stdaccelerometerY,minaccelerometerY,maxaccelerometerY,meanaccelerometerZ,stdaccelerometerZ,...,maxaccAng,meangyroAng,stdgyroAng,mingyroAng,maxgyroAng,meanmagAng,stdmagAng,minmagAng,maxmagAng,labelPhone
0,-0.049337,0.549471,-1.257385,1.341211,5.972101,0.84463,4.196074,7.33834,8.327603,1.213075,...,0.778359,1.901229,0.778735,0.43488,2.969028,0.860412,0.003969,0.852761,0.867262,beingusedinhand
1,0.046703,0.708177,-1.257385,1.631008,5.777745,0.885845,4.196074,7.33834,8.339578,1.428087,...,0.778359,2.061527,0.760651,0.43488,2.969028,0.860356,0.003755,0.852761,0.867262,beingusedinhand
2,0.143342,0.636278,-0.689766,1.631008,5.771997,0.642066,4.802014,6.868917,8.020681,1.110337,...,0.702157,2.215729,0.635894,0.677052,2.969028,0.858446,0.004345,0.848736,0.867262,beingusedinhand
3,0.145497,0.677149,-1.130449,1.631008,6.025989,0.768714,4.802014,7.129973,8.661589,1.280769,...,0.72186,1.7659,0.853068,0.206821,2.835709,0.855757,0.006717,0.844051,0.867262,beingusedinhand
4,0.078916,0.659926,-1.130449,1.631008,5.663742,0.788709,4.802014,7.129973,8.212882,1.484548,...,0.72509,1.405452,0.861323,0.206821,2.835709,0.850104,0.008087,0.835524,0.864769,beingusedinhand


In [8]:
# Step 5.1: Get a normalized df
feature_count = features_df.shape[1] - 1
values = features_df.iloc[:, 0:feature_count]
labels = features_df.iloc[:, feature_count: feature_count + 1]

normalizer = Normalizer()
normalized_values = normalizer.fit_transform(values)
normalized_df = pd.DataFrame(data=normalized_values, columns=values.columns)
normalized_df["labelPhone"] = labels

# Step 5.2 - Get a standardized_df
standard_scaler = StandardScaler()
standardized_values = standard_scaler.fit_transform(values)
standard_df = pd.DataFrame(data=standardized_values, columns=values.columns)
standard_df["labelPhone"] = labels

# Results
column_count = 4
print("Original data: ")
print(features_df.head().iloc[:,0:column_count])
print_line_divider()

print("Normalized data - Scale to range 0 or 1: ")
print(normalized_df.head().iloc[:,0:column_count])
print_line_divider()
print("Standardized data - Scale to mean 0 and std 1 (Normal distribution): ")
print(standard_df.head().iloc[:,0:column_count])

Original data: 
   meanaccelerometerX  stdaccelerometerX  minaccelerometerX  maxaccelerometerX
0           -0.049337           0.549471          -1.257385           1.341211
1            0.046703           0.708177          -1.257385           1.631008
2            0.143342           0.636278          -0.689766           1.631008
3            0.145497           0.677149          -1.130449           1.631008
4            0.078916           0.659926          -1.130449           1.631008
****************************************************
Normalized data - Scale to range 0 or 1: 
   meanaccelerometerX  stdaccelerometerX  minaccelerometerX  maxaccelerometerX
0           -0.000086           0.000953          -0.002181           0.002326
1            0.000080           0.001217          -0.002161           0.002804
2            0.000245           0.001087          -0.001179           0.002787
3            0.000247           0.001151          -0.001922           0.002773
4            0.00013

In [9]:
# Step 6 - Use PCA from standardized df
pca = PCA()
standard_values = standard_df.iloc[:, 0:feature_count]
labels = standard_df.iloc[:, feature_count: feature_count + 1]
pca_values = pca.fit_transform(standard_values)
pca_df = pd.DataFrame(data=pca_values, columns=standard_values.columns)
pca_df["labelPhone"] = labels
pca_df.head()

Unnamed: 0,meanaccelerometerX,stdaccelerometerX,minaccelerometerX,maxaccelerometerX,meanaccelerometerY,stdaccelerometerY,minaccelerometerY,maxaccelerometerY,meanaccelerometerZ,stdaccelerometerZ,...,maxaccAng,meangyroAng,stdgyroAng,mingyroAng,maxgyroAng,meanmagAng,stdmagAng,minmagAng,maxmagAng,labelPhone
0,-1.724499,6.496717,0.260324,1.38911,-0.229724,-1.688471,0.819361,-2.140537,1.119099,-1.186147,...,-0.012383,-0.00536,-0.052039,-0.021557,-0.026148,-0.012432,-0.011175,0.019767,-0.016528,beingusedinhand
1,-0.514967,8.888951,1.509406,1.350561,0.443643,-0.899922,0.285086,-0.725952,0.948603,-0.392928,...,-0.064847,-0.080633,0.047651,-0.025666,-0.005524,-0.020841,-0.015117,0.033187,-0.018931,beingusedinhand
2,-2.97291,6.775822,3.78253,1.372185,1.487733,-3.028354,-0.777782,-1.755839,0.659757,1.046754,...,0.013745,0.059888,-0.050403,-0.019747,-0.000709,-0.013703,0.022278,0.008282,-0.014047,beingusedinhand
3,-0.001303,8.822797,0.221553,1.198139,2.282421,0.63734,-0.398966,-3.623822,-0.019194,-1.063098,...,0.01169,0.041518,-0.075938,-0.023889,-0.013942,-0.006964,0.022083,-0.000311,0.001646,beingusedinhand
4,-0.112264,6.872948,-1.731081,1.647119,1.985916,0.566754,1.400854,-2.223861,-1.541188,0.547376,...,-0.009288,0.026922,-0.027596,-0.02404,-0.001764,-0.014313,0.024721,-0.004941,-0.005386,beingusedinhand


In [None]:
# FINAL explaination
# df - Raw data from the beginnning
# fixed_size_df - Data transformed into fixed length window + Added features
# features_df - Data with features extracted (min, max, std, mean) from fixed_size_df
# standard_df - Standardized from features_df
# normalized_df - Normalized from features_df
# pca_df - PCA from standardized df