In [45]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
from scipy import stats
from datetime import datetime, timedelta
import time
from src.OutlierDetection import DistanceBasedOutlierDetection
from sklearn.neighbors import LocalOutlierFactor
from matplotlib.legend_handler import HandlerPathCollection
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from util.plot_util import plot_lof_2d_grad, plot_lof_2d_circle, plot_lof_3d_grad ,plot_lof_3d_circle


In [49]:
def remove_start_end(df, secs=5):
    return df[(df["Time (s)"] > secs) & (df["Time (s)"] < max(df["Time (s)"]) - secs)]


def calculate_lof(df, cols, label):
    if not all(col in df.columns for col in cols):
        missing_cols = [col for col in cols if col not in df.columns]
        #print(f"Missing columns for LOF calculation: {missing_cols}")
        return df
    
    df[cols] = df[cols].apply(lambda x: x.fillna(x.median()))
    #df[cols] = df[cols].interpolate()  
    lof = LocalOutlierFactor(n_neighbors=20, contamination=0.2)
    y_pred = lof.fit_predict(df[cols])
    df[f'lof_{label}'] = y_pred
    df[f'lof_factor_{label}'] = -lof.negative_outlier_factor_  # Negate to align with typical plotting conventions
    return df

def remove_outliers_and_impute(df, cols, label):
    df.loc[df[f'lof_{label}'] == -1, cols] = pd.NA
    # Forward fill NaNs
    df[cols] = df[cols].ffill()
    df.drop(columns = [f'lof_{label}', f'lof_factor_{label}'])
    return df

def correct_column_names(df, corrections):
    return df.rename(columns=corrections)

def add_missing_datetime_column(folder_path):
    # Extract start time from the folder name
    folder_name = os.path.basename(folder_path)
    
    try:
        start_time_str = ' '.join(folder_name.split()[-2:])
        start_time = datetime.strptime(start_time_str, "%Y-%m-%d %H-%M-%S")
    except ValueError as e:
        #print(f"Skipping folder {folder_path} due to ValueError: {e}")
        return

    # Iterate over each CSV file in the folder
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        
        # Skip non-CSV files
        if not filename.endswith('.csv'):
            continue
        
        try:
            df = pd.read_csv(file_path)
            
            # Drop rows where 'Time (s)' is missing
            df = drop_rows_with_no_time(df)
            
            # Ensure the Time (s) column exists and the DataFrame is not empty
            if 'Time (s)' in df.columns and not df.empty:
                df['Datetime'] = df['Time (s)'].apply(lambda x: start_time + timedelta(seconds=x))
                df.to_csv(file_path, index=False)
                #print(f"Updated {filename} with Datetime column.")
            else:
                print(f"Skipped {filename}: 'Time (s)' column not found or DataFrame is empty.")
        except Exception as e:
            print(f"Error processing {file_path}: {e}")

def drop_rows_with_no_time(df):
    """
    Drop rows where there is no value in the 'Time (s)' column.

    Parameters:
    df (pd.DataFrame): The DataFrame to process.

    Returns:
    pd.DataFrame: The DataFrame with rows removed where 'Time (s)' is NaN or missing.
    """
    if 'Time (s)' in df.columns:
        #print(df.head)
        # Drop rows where 'Time (s)' is NaN or an empty string
        df_cleaned = df[df['Time (s)'].notna() & (df['Time (s)'] != '')]
        #print(f"Dropped rows: {len(df) - len(df_cleaned)}")
        return df_cleaned
    else:
        #print("The column 'Time (s)' does not exist in the DataFrame.")
        return df


column_corrections = {
        "Accelerometer": {
                "X (m/s^2)": "Acceleration x (m/s^2)",
                "Y (m/s^2)": "Acceleration y (m/s^2)",
                "Z (m/s^2)": "Acceleration z (m/s^2)"
        },
        "Gyroscope": {
                "X (rad/s)": "Gyroscope x (rad/s)",
                "Y (rad/s)": "Gyroscope y (rad/s)",
                "Z (rad/s)": "Gyroscope z (rad/s)"
        },
        "Linear Acceleration": {
                "X (m/s^2)": "Linear Acceleration x (m/s^2)",
                "Y (m/s^2)": "Linear Acceleration y (m/s^2)",
                "Z (m/s^2)": "Linear Acceleration z (m/s^2)"},
         "Location": {
                "Vertical Accuracy (°)": "Vertical Accuracy (m)"}}


acc_cols = ["Acceleration x (m/s^2)", "Acceleration y (m/s^2)", "Acceleration z (m/s^2)"]
loc_height_col = ["Latitude (°)", "Longitude (°)", "Height (m)"]
gyro_cols = ['Gyroscope x (rad/s)', 'Gyroscope y (rad/s)', 'Gyroscope z (rad/s)']
loc_cols = ["Latitude (°)", "Longitude (°)", "Horizontal Accuracy (m)", "Vertical Accuracy (m)"]#, "Height (m)", "Velocity (m/s), "Direction (°)","]
linacc_cols = [ "Linear Acceleration y (m/s^2)", "Linear Acceleration z (m/s^2)", "Linear Acceleration x (m/s^2)"]

In [50]:
for cat in ["Walking", "Sitting", "Cycling", "Sport"]:
    for f in os.listdir("data/"):
        if cat in f:
            for subf in os.listdir("data/" + f):

                add_missing_datetime_column("data/" + f + "/" + subf)
                start_time = time.time()
                subf_path = os.path.join("data", f, subf)
                if not os.path.isdir(subf_path):
                    continue  # Skip if it's not a directory
                #print(subf)
                acc = pd.read_csv("data/" + f + "/" + subf + "/Accelerometer.csv")
                acc = correct_column_names(acc, column_corrections["Accelerometer"])

                gyro = pd.read_csv("data/" + f + "/" + subf + "/Gyroscope.csv")
                gyro = correct_column_names(gyro, column_corrections["Gyroscope"])

                linacc_path_1 = "data/" + f + "/" + subf + "/Linear Acceleration.csv"
                linacc_path_2 = "data/" + f + "/" + subf + "/Linear Accelerometer.csv"
                if os.path.exists(linacc_path_1):
                    linacc = pd.read_csv(linacc_path_1)
                elif os.path.exists(linacc_path_2):
                    linacc = pd.read_csv(linacc_path_2)
                else:
                    #print(f"Warning: Neither Linear Acceleration.csv nor Linear Accelerometer.csv found in {subf_path}.")
                    linacc = pd.DataFrame()  
                linacc = correct_column_names(linacc, column_corrections["Linear Acceleration"])

                loc = pd.read_csv("data/" + f + "/" + subf + "/Location.csv")
                loc = correct_column_names(loc, column_corrections["Location"])

                cols = ["Acceleration x (m/s^2)", "Acceleration y (m/s^2)", "Acceleration z (m/s^2)",
                        "Gyroscope x (rad/s)", "Gyroscope y (rad/s)", "Gyroscope z (rad/s)",
                        "Velocity (m/s)", "Height (m)", "Linear Acceleration x (m/s^2)", 
                        "Linear Acceleration y (m/s^2)", "Linear Acceleration z (m/s^2)"]
                
                acc = remove_start_end(acc)
                loc = remove_start_end(loc)
                gyro = remove_start_end(gyro)
                linacc = remove_start_end(linacc)

                acc = acc.rename(columns={"Datetime": "Datetime_acc"})
                loc = loc.rename(columns={"Datetime": "Datetime_loc"})
                gyro = gyro.rename(columns={"Datetime": "Datetime_gyro"})
                linacc = linacc.rename(columns={"Datetime": "Datetime_linacc"})
        
                df = pd.merge_asof(linacc, pd.merge_asof(loc, pd.merge_asof(acc, gyro, on="Time (s)", direction="nearest"), on="Time (s)", direction="nearest"), on="Time (s)", direction="nearest")
                #Activity column
                df['Activity'] = cat

                #Remove rows with no Times (s)
                df = drop_rows_with_no_time(df)

                # Calculate LOF
                df = calculate_lof(df, acc_cols, 'acc')
                df = calculate_lof(df, gyro_cols, 'gyro')
                df = calculate_lof(df, loc_cols, 'loc')
                df = calculate_lof(df, linacc_cols, 'linacc')

                # Remove outliers and impute
                df = remove_outliers_and_impute(df, acc_cols, 'acc')
                df = remove_outliers_and_impute(df, gyro_cols, 'gyro')
                df = remove_outliers_and_impute(df, loc_cols, 'loc')
                df = remove_outliers_and_impute(df, linacc_cols, 'linacc')

                #df = df.drop(columns=['lof_acc', 'lof_gyro', 'lof_loc', 'lof_linacc'])

                # Create the output directory if it doesn't exist
                output_dir = "data_processed/" + f + "/" + subf
                os.makedirs(output_dir, exist_ok=True)

                # Write out the merged dataframe to CSV
                df.to_csv(output_dir + f"/{subf}.csv", index=False)

                end_time = time.time()
                #print(f"Processed {subf} in {f} for category {cat} in {end_time - start_time:.2f} seconds.")

            

Skipped Barometer.csv: 'Time (s)' column not found or DataFrame is empty.
Skipped Barometer.csv: 'Time (s)' column not found or DataFrame is empty.
Skipped Barometer.csv: 'Time (s)' column not found or DataFrame is empty.
Skipped Barometer.csv: 'Time (s)' column not found or DataFrame is empty.
Skipped Barometer.csv: 'Time (s)' column not found or DataFrame is empty.
Skipped Barometer.csv: 'Time (s)' column not found or DataFrame is empty.
Skipped Barometer.csv: 'Time (s)' column not found or DataFrame is empty.
Skipped Barometer.csv: 'Time (s)' column not found or DataFrame is empty.
Skipped Barometer.csv: 'Time (s)' column not found or DataFrame is empty.
Skipped Barometer.csv: 'Time (s)' column not found or DataFrame is empty.
Skipped Barometer.csv: 'Time (s)' column not found or DataFrame is empty.
Skipped Barometer.csv: 'Time (s)' column not found or DataFrame is empty.
Skipped Barometer.csv: 'Time (s)' column not found or DataFrame is empty.
Skipped Barometer.csv: 'Time (s)' colu

In [51]:
def concatenate_and_sort_nested_csv_files(base_directory, sort_column):
    df_list = []
    
    # Traverse the base directory and read CSV files
    for root, dirs, files in os.walk(base_directory):
        for file in files:
            if file.endswith('.csv'):
                
                file_path = os.path.join(root, file)
                df = pd.read_csv(file_path)
                df_list.append(df)
    
    # Concatenate all DataFrames and sort by the specified column
    concatenated_df = pd.concat(df_list, ignore_index=True)
    sorted_df = concatenated_df.sort_values(by=sort_column)
    return sorted_df

# Concatenate and sort the CSV files by 'Datetime_linacc'
sorted_nested_df = concatenate_and_sort_nested_csv_files("data_processed", 'Datetime_linacc')

# Save the sorted DataFrame to a new CSV file
sorted_nested_output_file = 'data_processed/final_aggregated_output.csv'
sorted_nested_df.to_csv(sorted_nested_output_file, index=False)


In [52]:
#remove nulls
df = pd.read_csv('data_processed/final_aggregated_output.csv')
df_cleaned = df.dropna(subset=["Time (s)"])
df_cleaned.to_csv("data_processed/final_aggregated_output.csv", index=False)

In [53]:
df = pd.read_csv('data_processed/final_aggregated_output.csv')
df.drop(columns=["Direction (°)"], inplace=True)

# Replace every null value for "height" and "velocity" with 0
df["Height (m)"].fillna(0, inplace=True)
df["Velocity (m/s)"].fillna(0, inplace=True)
df.to_csv("data_processed/final_aggregated_output.csv", index=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Height (m)"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Velocity (m/s)"].fillna(0, inplace=True)


In [55]:
df = pd.read_csv('data_processed/final_aggregated_output.csv')

df = remove_outliers_and_impute(df, acc_cols, 'acc')
df = remove_outliers_and_impute(df, gyro_cols, 'gyro')
df = remove_outliers_and_impute(df, loc_cols, 'loc')
df = remove_outliers_and_impute(df, linacc_cols, 'linacc')


df.to_csv(f"data_processed/final_aggregated_output_no_outlier.csv", index=False)



In [None]:
#NBNB:: BROKEN WORK IN PRGRESS, IGNORE

# Load the dataset
file_path = 'data_processed/final_aggregated_output_no_outlier.csv'  # replace with your dataset path
df = pd.read_csv(file_path)

# Convert the datetime column to a pandas datetime object
df['Datetime_linacc'] = pd.to_datetime(df['Datetime_linacc'], errors='coerce')
df = df.dropna(subset=['Datetime_linacc'])  # Drop rows where datetime conversion failed

# Set the datetime column as the index
df.set_index('Datetime_linacc', inplace=True)

# Keep only numeric columns and the 'Activity' column
numeric_columns = df.select_dtypes(include=['number']).columns
df_filtered = df[numeric_columns.to_list() + ['Activity']]

# Group by 5-second intervals and aggregate
df_grouped_5s = df_filtered.groupby(pd.Grouper(freq='5S')).agg({**{col: 'mean' for col in numeric_columns}, 'Activity': 'first'})
# Drop rows where all values are NaN
df_grouped_5s = df_grouped_5s.dropna(how='all')

# Group by 0.25-second intervals and aggregate
df_grouped_0_25s = df_filtered.groupby(pd.Grouper(freq='250L')).agg({**{col: 'mean' for col in numeric_columns}, 'Activity': 'first'})
# Drop rows where all values are NaN
df_grouped_0_25s = df_grouped_0_25s.dropna(how='all')

# Save the aggregated data to new CSV files
df_grouped_5s.to_csv('data_processed/grouped_5s.csv')
df_grouped_0_25s.to_csv('data_processed/grouped_0_25s.csv')


  df_grouped_5s = df_filtered.groupby(pd.Grouper(freq='5S')).agg({**{col: 'mean' for col in numeric_columns}, 'Activity': 'first'})
  df_grouped_0_25s = df_filtered.groupby(pd.Grouper(freq='250L')).agg({**{col: 'mean' for col in numeric_columns}, 'Activity': 'first'})
