In [78]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
from scipy import stats
import time
from src.OutlierDetection import DistanceBasedOutlierDetection
from sklearn.neighbors import LocalOutlierFactor
from matplotlib.legend_handler import HandlerPathCollection
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from util.plot_util import plot_lof_2d_grad, plot_lof_2d_circle, plot_lof_3d_grad ,plot_lof_3d_circle

In [79]:
def remove_start_end(df, secs=5):
    """Trim the first and last ``secs`` seconds from a recording.

    Keeps only the rows whose ``"Time (s)"`` value is strictly greater than
    ``secs`` and strictly less than the largest timestamp minus ``secs``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``"Time (s)"`` column.
    secs : float, default 5
        Number of seconds to cut from each end of the recording.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that fall inside the kept window (original index
        labels are preserved). Empty when ``df`` has no rows or no valid
        timestamps.

    Raises
    ------
    KeyError
        If ``df`` has no ``"Time (s)"`` column.
    """
    time_s = df["Time (s)"]
    # Series.max() skips NaN and yields NaN for an empty column; the builtin
    # max() raises on empty input and gives an order-dependent result when the
    # column contains NaN (a leading NaN would silently drop every row).
    last_time = time_s.max()
    return df[(time_s > secs) & (time_s < last_time - secs)]


def calculate_lof(df, cols, label, n_neighbors=20, contamination=0.2):
    """Flag outliers in ``df[cols]`` with scikit-learn's Local Outlier Factor.

    ``df`` is modified in place: NaNs in ``cols`` are replaced by the column
    median, and two columns are added:

    * ``lof_<label>``: the ``fit_predict`` result (``-1`` for outliers,
      ``1`` for inliers).
    * ``lof_factor_<label>``: the negated ``negative_outlier_factor_``, so
      larger values mean "more abnormal".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns.
    cols : list of str
        Names of the columns used as LOF features.
    label : str
        Suffix used in the names of the two added columns.
    n_neighbors : int, default 20
        Passed through to ``LocalOutlierFactor``.
    contamination : float or "auto", default 0.2
        Passed through to ``LocalOutlierFactor``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. If any of ``cols`` is missing, a message is
        printed and ``df`` is returned untouched (no LOF columns are added).
    """
    missing_cols = [col for col in cols if col not in df.columns]
    if missing_cols:
        print(f"Missing columns for LOF calculation: {missing_cols}")
        return df

    # The estimator rejects NaN input, so gaps are filled with each column's
    # own median before fitting.
    df[cols] = df[cols].fillna(df[cols].median())
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)
    df[f'lof_{label}'] = lof.fit_predict(df[cols])
    # scikit-learn stores the *opposite* of the LOF score; flip the sign so the
    # stored factor is positive and grows with abnormality.
    df[f'lof_factor_{label}'] = -lof.negative_outlier_factor_
    return df

def remove_outliers_and_impute(df, cols, label):
    """Blank out LOF-flagged rows in ``cols`` and fill them by interpolation.

    Rows where ``df["lof_<label>"] == -1`` have their ``cols`` values replaced
    by NaN; the gaps are then filled with pandas' default (linear)
    interpolation in both directions. ``df`` is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``cols`` and the ``lof_<label>`` flag column.
    cols : list of str
        Columns whose flagged values are replaced.
    label : str
        Suffix identifying which ``lof_<label>`` flag column to use.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. If the flag column is absent, a message is
        printed and ``df`` is returned unchanged.
    """
    flag_col = f'lof_{label}'
    if flag_col not in df.columns:
        # calculate_lof returns early without adding the flag column when its
        # feature columns are missing; mirror that instead of raising KeyError.
        print(f"Missing LOF column for outlier removal: {flag_col}")
        return df

    # np.nan is the native missing-value marker for float columns, so the
    # columns stay numeric for interpolate().
    df.loc[df[flag_col] == -1, cols] = np.nan
    # The default interpolation only fills forward, which would leave outliers
    # at the very start of the frame as NaN; "both" fills those as well.
    df[cols] = df[cols].interpolate(limit_direction="both")
    return df

def correct_column_names(df, corrections):
    """Return a new DataFrame with column labels renamed via ``corrections``.

    ``corrections`` maps existing column names to their replacements; column
    names that do not appear in the mapping are left as they are.
    """
    renamed = df.rename(mapper=corrections, axis="columns")
    return renamed

# Per-sensor column renames, keyed by sensor name. Each inner mapping goes from
# the column header found in that sensor's CSV to the name used after merging,
# so the generic X/Y/Z headers of different sensors no longer collide.
column_corrections = {
    "Accelerometer": {
        "X (m/s^2)": "Acceleration x (m/s^2)",
        "Y (m/s^2)": "Acceleration y (m/s^2)",
        "Z (m/s^2)": "Acceleration z (m/s^2)",
    },
    "Gyroscope": {
        "X (rad/s)": "Gyroscope x (rad/s)",
        "Y (rad/s)": "Gyroscope y (rad/s)",
        "Z (rad/s)": "Gyroscope z (rad/s)",
    },
    "Linear Acceleration": {
        "X (m/s^2)": "Linear Acceleration x (m/s^2)",
        "Y (m/s^2)": "Linear Acceleration y (m/s^2)",
        "Z (m/s^2)": "Linear Acceleration z (m/s^2)",
    },
    # Only the unit shown in the header changes for this column.
    "Location": {
        "Vertical Accuracy (°)": "Vertical Accuracy (m)",
    },
}

In [80]:
# Batch pre-processing: for every recording folder under data/<activity folder>/,
# load the four sensor CSVs, trim the start/end, merge them on "Time (s)",
# flag outliers per sensor with LOF, impute the flagged values and write the
# result to data_processed/<activity folder>/<recording>/<recording>.csv.

# Column groups do not change between recordings, so they are defined once
# instead of being rebuilt on every iteration.
cols = ["Acceleration x (m/s^2)", "Acceleration y (m/s^2)", "Acceleration z (m/s^2)",
        "Gyroscope x (rad/s)", "Gyroscope y (rad/s)", "Gyroscope z (rad/s)",
        "Velocity (m/s)", "Height (m)", "Linear Acceleration x (m/s^2)",
        "Linear Acceleration y (m/s^2)", "Linear Acceleration z (m/s^2)"]

acc_cols = ["Acceleration x (m/s^2)", "Acceleration y (m/s^2)", "Acceleration z (m/s^2)", "Time (s)"]
loc_height_col = ["Latitude (°)", "Longitude (°)", "Height (m)", "Time (s)"]
gyro_cols = ['Gyroscope x (rad/s)', 'Gyroscope y (rad/s)', 'Gyroscope z (rad/s)', "Time (s)"]
loc_cols = ["Latitude (°)", "Longitude (°)", "Horizontal Accuracy (m)", "Vertical Accuracy (m)", "Time (s)"]
linacc_cols = ["Linear Acceleration y (m/s^2)", "Linear Acceleration z (m/s^2)", "Linear Acceleration x (m/s^2)", "Time (s)"]

# (feature columns, label) pairs, in the order the LOF passes are applied.
# NOTE(review): every group still contains "Time (s)" as an LOF feature, as in
# the original code -- confirm that this is intended.
sensor_groups = [(acc_cols, 'acc'), (gyro_cols, 'gyro'), (loc_cols, 'loc'), (linacc_cols, 'linacc')]

for cat in ["Walking", "Sitting", "Cycling", "Sport"]:
    for f in os.listdir("data/"):
        # Only descend into directories whose name mentions the activity;
        # a plain file matching the name would make os.listdir() fail.
        if cat not in f or not os.path.isdir(os.path.join("data", f)):
            continue

        for subf in os.listdir(os.path.join("data", f)):
            subf_path = os.path.join("data", f, subf)
            if not os.path.isdir(subf_path):
                continue  # Skip if it's not a directory
            start_time = time.time()
            print(subf)

            acc = pd.read_csv(os.path.join(subf_path, "Accelerometer.csv"))
            acc = correct_column_names(acc, column_corrections["Accelerometer"])

            gyro = pd.read_csv(os.path.join(subf_path, "Gyroscope.csv"))
            gyro = correct_column_names(gyro, column_corrections["Gyroscope"])

            # The linear-acceleration export appears under two different file names.
            linacc_path_1 = os.path.join(subf_path, "Linear Acceleration.csv")
            linacc_path_2 = os.path.join(subf_path, "Linear Accelerometer.csv")
            if os.path.exists(linacc_path_1):
                linacc = pd.read_csv(linacc_path_1)
            elif os.path.exists(linacc_path_2):
                linacc = pd.read_csv(linacc_path_2)
            else:
                # An empty placeholder frame has no "Time (s)" column, so the
                # trimming and merging steps below would raise; skip the
                # recording instead of aborting the whole batch.
                print(f"Warning: Neither Linear Acceleration.csv nor Linear Accelerometer.csv found in {subf_path}. Skipping this recording.")
                continue
            linacc = correct_column_names(linacc, column_corrections["Linear Acceleration"])

            loc = pd.read_csv(os.path.join(subf_path, "Location.csv"))
            # NOTE(review): the original code called
            #   loc.drop(columns=["Height (m)", "Velocity (m/s)", "Direction (°)"])
            # without using the result, so nothing was ever dropped. The no-op
            # is removed here to keep the output columns unchanged; if these
            # columns should be excluded, assign the result back to `loc`.
            loc = correct_column_names(loc, column_corrections["Location"])

            acc = remove_start_end(acc)
            loc = remove_start_end(loc)
            gyro = remove_start_end(gyro)
            linacc = remove_start_end(linacc)

            # Give each sensor's "Datetime" column a distinct name before merging.
            acc = acc.rename(columns={"Datetime": "Datetime_acc"})
            loc = loc.rename(columns={"Datetime": "Datetime_loc"})
            gyro = gyro.rename(columns={"Datetime": "Datetime_gyro"})
            linacc = linacc.rename(columns={"Datetime": "Datetime_linacc"})

            # Nearest-timestamp joins; the left-most frame (linacc) determines
            # the rows of the merged result.
            acc_gyro = pd.merge_asof(acc, gyro, on="Time (s)", direction="nearest")
            loc_acc_gyro = pd.merge_asof(loc, acc_gyro, on="Time (s)", direction="nearest")
            df = pd.merge_asof(linacc, loc_acc_gyro, on="Time (s)", direction="nearest")

            if df.empty:
                # Nothing is left after trimming (e.g. a very short recording);
                # the LOF fit cannot run on zero rows.
                print(f"Warning: no rows left after trimming in {subf_path}. Skipping this recording.")
                continue

            #Activity column
            df['Activity'] = cat

            # Calculate LOF for every sensor before any value is altered, so
            # each fit sees the merged (not yet imputed) data.
            for sensor_cols, label in sensor_groups:
                df = calculate_lof(df, sensor_cols, label)

            # Remove outliers and impute
            for sensor_cols, label in sensor_groups:
                if f'lof_{label}' not in df.columns:
                    continue  # LOF was skipped for this sensor (missing columns)
                # Timestamps are not measurements: keep "Time (s)" intact rather
                # than blanking and re-interpolating it for each flagged row.
                value_cols = [c for c in sensor_cols if c != "Time (s)"]
                df = remove_outliers_and_impute(df, value_cols, label)

            # Create the output directory if it doesn't exist
            output_dir = os.path.join("data_processed", f, subf)
            os.makedirs(output_dir, exist_ok=True)

            # Write out the merged dataframe to CSV
            df.to_csv(os.path.join(output_dir, f"{subf}.csv"), index=False)

            end_time = time.time()
            print(f"Processed {subf} in {f} for category {cat} in {end_time - start_time:.2f} seconds.")

            # PLOTTING
            # plot_lof_3d_grad(df, acc_cols, cat)
            # plot_lof_3d_grad(df, gyro_cols, cat)
            # plot_lof_2d_grad(df, loc_cols, cat)


    


QS 2024-06-04 14-52-10
Processed QS 2024-06-04 14-52-10 in Walking for category Walking in 0.40 seconds.
QS 2024-06-05 16-40-25
Processed QS 2024-06-05 16-40-25 in Walking for category Walking in 1.15 seconds.
QS 2024-06-05 16-52-32
Processed QS 2024-06-05 16-52-32 in Walking for category Walking in 1.36 seconds.
Aj walking 2024-06-09 09-56-12
Processed Aj walking 2024-06-09 09-56-12 in Walking for category Walking in 1.09 seconds.
QS 2024-06-05 15-54-27
Processed QS 2024-06-05 15-54-27 in Walking for category Walking in 0.97 seconds.
QS 2024-06-04 14-06-23
Processed QS 2024-06-04 14-06-23 in Walking for category Walking in 1.12 seconds.
Walk 2024-06-04 18-07-03
Processed Walk 2024-06-04 18-07-03 in Walking for category Walking in 0.62 seconds.
Aj walking 2024-06-04 15-48-32
Processed Aj walking 2024-06-04 15-48-32 in Walking for category Walking in 0.46 seconds.
AJRest1 2024-06-04 14-43-06
Processed AJRest1 2024-06-04 14-43-06 in Sitting for category Sitting in 0.41 seconds.
AJRest1 2