In [131]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
from scipy import stats
from datetime import datetime, timedelta
import time
from src.OutlierDetection import DistanceBasedOutlierDetection
from sklearn.neighbors import LocalOutlierFactor
from matplotlib.legend_handler import HandlerPathCollection
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from util.plot_util import plot_lof_2d_grad, plot_lof_2d_circle, plot_lof_3d_grad ,plot_lof_3d_circle


In [132]:
def remove_start_end(df, secs=5):
    return df[(df["Time (s)"] > secs) & (df["Time (s)"] < max(df["Time (s)"]) - secs)]


def calculate_lof(df, cols, label):
    if not all(col in df.columns for col in cols):
        missing_cols = [col for col in cols if col not in df.columns]
        print(f"Missing columns for LOF calculation: {missing_cols}")
        return df
    
    df[cols] = df[cols].apply(lambda x: x.fillna(x.median()))
    #df[cols] = df[cols].interpolate()  
    lof = LocalOutlierFactor(n_neighbors=20, contamination=0.2)
    y_pred = lof.fit_predict(df[cols])
    df[f'lof_{label}'] = y_pred
    df[f'lof_factor_{label}'] = -lof.negative_outlier_factor_  # Negate to align with typical plotting conventions
    return df

def remove_outliers_and_impute(df, cols, label):
    # Mark outliers with NaN
    df.loc[df[f'lof_{label}'] == -1, cols] = pd.NA
    # Interpolate NaNs
    df[cols] = df[cols].interpolate()
    return df

def correct_column_names(df, corrections):
    return df.rename(columns=corrections)

def add_missing_datetime_column(folder_path):
    # Extract start time from the folder name
    folder_name = os.path.basename(folder_path)
    
    try:
        start_time_str = ' '.join(folder_name.split()[-2:])
        start_time = datetime.strptime(start_time_str, "%Y-%m-%d %H-%M-%S")
    except ValueError as e:
        print(f"Skipping folder {folder_path} due to ValueError: {e}")
        return

    # Iterate over each CSV file in the folder
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        
        # Skip non-CSV files
        if not filename.endswith('.csv'):
            continue
        
        try:
            df = pd.read_csv(file_path)
            
            # Drop rows where 'Time (s)' is missing
            df = drop_rows_with_no_time(df)
            
            # Ensure the Time (s) column exists and the DataFrame is not empty
            if 'Time (s)' in df.columns and not df.empty:
                df['Datetime'] = df['Time (s)'].apply(lambda x: start_time + timedelta(seconds=x))
                df.to_csv(file_path, index=False)
                print(f"Updated {filename} with Datetime column.")
            else:
                print(f"Skipped {filename}: 'Time (s)' column not found or DataFrame is empty.")
        except Exception as e:
            print(f"Error processing {file_path}: {e}")

def drop_rows_with_no_time(df):
    """
    Drop rows where there is no value in the 'Time (s)' column.

    Parameters:
    df (pd.DataFrame): The DataFrame to process.

    Returns:
    pd.DataFrame: The DataFrame with rows removed where 'Time (s)' is NaN or missing.
    """
    if 'Time (s)' in df.columns:
        # Drop rows where 'Time (s)' is NaN or an empty string
        df_cleaned = df[df['Time (s)'].notna() & (df['Time (s)'] != '')]
        print(f"Dropped rows: {len(df) - len(df_cleaned)}")
        return df_cleaned
    else:
        print("The column 'Time (s)' does not exist in the DataFrame.")
        return df


column_corrections = {
        "Accelerometer": {
                "X (m/s^2)": "Acceleration x (m/s^2)",
                "Y (m/s^2)": "Acceleration y (m/s^2)",
                "Z (m/s^2)": "Acceleration z (m/s^2)"
        },
        "Gyroscope": {
                "X (rad/s)": "Gyroscope x (rad/s)",
                "Y (rad/s)": "Gyroscope y (rad/s)",
                "Z (rad/s)": "Gyroscope z (rad/s)"
        },
        "Linear Acceleration": {
                "X (m/s^2)": "Linear Acceleration x (m/s^2)",
                "Y (m/s^2)": "Linear Acceleration y (m/s^2)",
                "Z (m/s^2)": "Linear Acceleration z (m/s^2)"},
         "Location": {
                "Vertical Accuracy (°)": "Vertical Accuracy (m)"}}

In [134]:
for cat in ["Walking", "Sitting", "Cycling", "Sport"]:
    for f in os.listdir("data/"):
        if cat in f:
            for subf in os.listdir("data/" + f):

                add_missing_datetime_column("data/" + f + "/" + subf)
                start_time = time.time()
                subf_path = os.path.join("data", f, subf)
                if not os.path.isdir(subf_path):
                    continue  # Skip if it's not a directory
                print(subf)
                acc = pd.read_csv("data/" + f + "/" + subf + "/Accelerometer.csv")
                acc = correct_column_names(acc, column_corrections["Accelerometer"])

                gyro = pd.read_csv("data/" + f + "/" + subf + "/Gyroscope.csv")
                gyro = correct_column_names(gyro, column_corrections["Gyroscope"])

                linacc_path_1 = "data/" + f + "/" + subf + "/Linear Acceleration.csv"
                linacc_path_2 = "data/" + f + "/" + subf + "/Linear Accelerometer.csv"
                if os.path.exists(linacc_path_1):
                    linacc = pd.read_csv(linacc_path_1)
                elif os.path.exists(linacc_path_2):
                    linacc = pd.read_csv(linacc_path_2)
                else:
                    print(f"Warning: Neither Linear Acceleration.csv nor Linear Accelerometer.csv found in {subf_path}.")
                    linacc = pd.DataFrame()  
                linacc = correct_column_names(linacc, column_corrections["Linear Acceleration"])

                loc = pd.read_csv("data/" + f + "/" + subf + "/Location.csv")
                loc.drop(columns = ["Height (m)", "Velocity (m/s)", "Direction (°)"])
                loc = correct_column_names(loc, column_corrections["Location"])



                cols = ["Acceleration x (m/s^2)", "Acceleration y (m/s^2)", "Acceleration z (m/s^2)",
                        "Gyroscope x (rad/s)", "Gyroscope y (rad/s)", "Gyroscope z (rad/s)",
                        "Velocity (m/s)", "Height (m)", "Linear Acceleration x (m/s^2)", 
                        "Linear Acceleration y (m/s^2)", "Linear Acceleration z (m/s^2)"]
                
                         
                acc_cols = ["Acceleration x (m/s^2)", "Acceleration y (m/s^2)", "Acceleration z (m/s^2)", "Time (s)"]
                loc_height_col = ["Latitude (°)", "Longitude (°)", "Height (m)", "Time (s)"]
                gyro_cols = ['Gyroscope x (rad/s)', 'Gyroscope y (rad/s)', 'Gyroscope z (rad/s)', "Time (s)"]
                loc_cols = ["Latitude (°)", "Longitude (°)", "Horizontal Accuracy (m)", "Vertical Accuracy (m)", "Time (s)"]#, "Height (m)", "Velocity (m/s), "Direction (°)","]
                linacc_cols = [ "Linear Acceleration y (m/s^2)", "Linear Acceleration z (m/s^2)", "Linear Acceleration x (m/s^2)", "Time (s)"]

                
                acc = remove_start_end(acc)
                loc = remove_start_end(loc)
                gyro = remove_start_end(gyro)
                linacc = remove_start_end(linacc)


                acc = acc.rename(columns={"Datetime": "Datetime_acc"})
                loc = loc.rename(columns={"Datetime": "Datetime_loc"})
                gyro = gyro.rename(columns={"Datetime": "Datetime_gyro"})
                linacc = linacc.rename(columns={"Datetime": "Datetime_linacc"})
        
                df = pd.merge_asof(linacc, pd.merge_asof(loc, pd.merge_asof(acc, gyro, on="Time (s)", direction="nearest"), on="Time (s)", direction="nearest"), on="Time (s)", direction="nearest")
                #Activity column
                df['Activity'] = cat


                #Remove rows with no Times (s)
                print(df.head)
                df = drop_rows_with_no_time(df)

                # Calculate LOF
                df = calculate_lof(df, acc_cols, 'acc')
                df = calculate_lof(df, gyro_cols, 'gyro')
                df = calculate_lof(df, loc_cols, 'loc')
                df = calculate_lof(df, linacc_cols, 'linacc')

                # Remove outliers and impute
                df = remove_outliers_and_impute(df, acc_cols, 'acc')
                df = remove_outliers_and_impute(df, gyro_cols, 'gyro')
                df = remove_outliers_and_impute(df, loc_cols, 'loc')
                df = remove_outliers_and_impute(df, linacc_cols, 'linacc')


                # Create the output directory if it doesn't exist
                output_dir = "data_processed/" + f + "/" + subf
                os.makedirs(output_dir, exist_ok=True)

                # Write out the merged dataframe to CSV
                df.to_csv(output_dir + f"/{subf}.csv", index=False)
                

                end_time = time.time()
                print(f"Processed {subf} in {f} for category {cat} in {end_time - start_time:.2f} seconds.")


                # PLOTTING
                # plot_lof_3d_grad(df, acc_cols, cat)
                # plot_lof_3d_grad(df, gyro_cols, cat)
                # plot_lof_2d_grad(df, loc_cols, cat)


    


Dropped rows: 0
Updated Location.csv with Datetime column.
Dropped rows: 0
Updated Accelerometer.csv with Datetime column.
Dropped rows: 0
Updated Gyroscope.csv with Datetime column.
Dropped rows: 0
Updated Linear Acceleration.csv with Datetime column.
QS 2024-06-04 14-52-10
<bound method NDFrame.head of        Time (s)  Linear Acceleration x (m/s^2)  Linear Acceleration y (m/s^2)  \
0      5.017430                      -0.536451                      -0.442930   
1      5.057918                      -0.218776                       0.049279   
2      5.098400                      -0.061657                       0.216771   
3      5.138949                       0.659386                       0.515375   
4      5.179377                       0.590524                       0.388112   
...         ...                            ...                            ...   
2050  88.018003                       0.159312                       0.023964   
2051  88.058486                       1.007364