In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
from scipy import stats
from datetime import datetime, timedelta
import time
from src.OutlierDetection import DistanceBasedOutlierDetection
from sklearn.neighbors import LocalOutlierFactor
from matplotlib.legend_handler import HandlerPathCollection
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from util.plot_util import plot_lof_2d_grad, plot_lof_2d_circle, plot_lof_3d_grad ,plot_lof_3d_circle


In [14]:
def remove_start_end(df, secs=5):
    return df[(df["Time (s)"] > secs) & (df["Time (s)"] < max(df["Time (s)"]) - secs)]


def calculate_lof(df, cols, label):
    if not all(col in df.columns for col in cols):
        missing_cols = [col for col in cols if col not in df.columns]
        #print(f"Missing columns for LOF calculation: {missing_cols}")
        return df
    
    df[cols] = df[cols].apply(lambda x: x.fillna(x.median()))
    #df[cols] = df[cols].interpolate()  
    lof = LocalOutlierFactor(n_neighbors=20, contamination=0.2)
    y_pred = lof.fit_predict(df[cols])
    df[f'lof_{label}'] = y_pred
    df[f'lof_factor_{label}'] = -lof.negative_outlier_factor_  # Negate to align with typical plotting conventions
    return df

def remove_outliers_and_impute(df, cols, label):
    df.loc[df[f'lof_{label}'] == -1, cols] = pd.NA
    # Forward fill NaNs
    df[cols] = df[cols].ffill()
    df.drop(columns = [f'lof_{label}', f'lof_factor_{label}'])
    return df

def correct_column_names(df, corrections):
    return df.rename(columns=corrections)

def add_missing_datetime_column(folder_path):
    # Extract start time from the folder name
    folder_name = os.path.basename(folder_path)
    
    try:
        start_time_str = ' '.join(folder_name.split()[-2:])
        start_time = datetime.strptime(start_time_str, "%Y-%m-%d %H-%M-%S")
    except ValueError as e:
        #print(f"Skipping folder {folder_path} due to ValueError: {e}")
        return

    # Iterate over each CSV file in the folder
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        
        # Skip non-CSV files
        if not filename.endswith('.csv'):
            continue
        
        try:
            df = pd.read_csv(file_path)
            
            # Drop rows where 'Time (s)' is missing
            df = drop_rows_with_no_time(df)
            
            # Ensure the Time (s) column exists and the DataFrame is not empty
            if 'Time (s)' in df.columns and not df.empty:
                df['Datetime'] = df['Time (s)'].apply(lambda x: start_time + timedelta(seconds=x))
                df.to_csv(file_path, index=False)
                #print(f"Updated {filename} with Datetime column.")
            else:
                pass
                # print(f"Skipped {filename}: 'Time (s)' column not found or DataFrame is empty.")
        except Exception as e:
            print(f"Error processing {file_path}: {e}")

def drop_rows_with_no_time(df):
    """
    Drop rows where there is no value in the 'Time (s)' column.

    Parameters:
    df (pd.DataFrame): The DataFrame to process.

    Returns:
    pd.DataFrame: The DataFrame with rows removed where 'Time (s)' is NaN or missing.
    """
    if 'Time (s)' in df.columns:
        #print(df.head)
        # Drop rows where 'Time (s)' is NaN or an empty string
        df_cleaned = df[df['Time (s)'].notna() & (df['Time (s)'] != '')]
        #print(f"Dropped rows: {len(df) - len(df_cleaned)}")
        return df_cleaned
    else:
        #print("The column 'Time (s)' does not exist in the DataFrame.")
        return df
    

def haversine(lat1, lon1, lat2, lon2, to_radians=True, earth_radius=6371):
    """
    slightly modified version: of http://stackoverflow.com/a/29546836/2901002

    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees or in radians)

    All (lat, lon) coordinates must have numeric dtypes and be of equal length.

    """
    if to_radians:
        lat1, lon1, lat2, lon2 = np.radians([lat1, lon1, lat2, lon2])

    a = np.sin((lat2-lat1)/2.0)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin((lon2-lon1)/2.0)**2

    return earth_radius * 2 * np.arcsin(np.sqrt(a))


column_corrections = {
        "Accelerometer": {
                "X (m/s^2)": "Acceleration x (m/s^2)",
                "Y (m/s^2)": "Acceleration y (m/s^2)",
                "Z (m/s^2)": "Acceleration z (m/s^2)"
        },
        "Gyroscope": {
                "X (rad/s)": "Gyroscope x (rad/s)",
                "Y (rad/s)": "Gyroscope y (rad/s)",
                "Z (rad/s)": "Gyroscope z (rad/s)"
        },
        "Linear Acceleration": {
                "X (m/s^2)": "Linear Acceleration x (m/s^2)",
                "Y (m/s^2)": "Linear Acceleration y (m/s^2)",
                "Z (m/s^2)": "Linear Acceleration z (m/s^2)"},
         "Location": {
                "Vertical Accuracy (°)": "Vertical Accuracy (m)"}}


acc_cols = ["Acceleration x (m/s^2)", "Acceleration y (m/s^2)", "Acceleration z (m/s^2)"]
loc_height_col = ["Latitude (°)", "Longitude (°)", "Height (m)"]
gyro_cols = ['Gyroscope x (rad/s)', 'Gyroscope y (rad/s)', 'Gyroscope z (rad/s)']
loc_cols = ["Latitude (°)", "Longitude (°)", "Horizontal Accuracy (m)", "Vertical Accuracy (m)"]#, "Height (m)", "Velocity (m/s), "Direction (°)","]
linacc_cols = [ "Linear Acceleration y (m/s^2)", "Linear Acceleration z (m/s^2)", "Linear Acceleration x (m/s^2)"]

In [15]:
for cat in ["Walking", "Sitting", "Cycling", "Sport"]:
    for f in os.listdir("data/"):
        if cat in f:
            for subf in os.listdir("data/" + f):

                add_missing_datetime_column("data/" + f + "/" + subf)
                start_time = time.time()
                subf_path = os.path.join("data", f, subf)
                if not os.path.isdir(subf_path):
                    continue  # Skip if it's not a directory
                #print(subf)
                acc = pd.read_csv("data/" + f + "/" + subf + "/Accelerometer.csv")
                acc = correct_column_names(acc, column_corrections["Accelerometer"])

                gyro = pd.read_csv("data/" + f + "/" + subf + "/Gyroscope.csv")
                gyro = correct_column_names(gyro, column_corrections["Gyroscope"])

                linacc_path_1 = "data/" + f + "/" + subf + "/Linear Acceleration.csv"
                linacc_path_2 = "data/" + f + "/" + subf + "/Linear Accelerometer.csv"
                if os.path.exists(linacc_path_1):
                    linacc = pd.read_csv(linacc_path_1)
                elif os.path.exists(linacc_path_2):
                    linacc = pd.read_csv(linacc_path_2)
                else:
                    #print(f"Warning: Neither Linear Acceleration.csv nor Linear Accelerometer.csv found in {subf_path}.")
                    linacc = pd.DataFrame()  
                linacc = correct_column_names(linacc, column_corrections["Linear Acceleration"])

                loc = pd.read_csv("data/" + f + "/" + subf + "/Location.csv")
                loc = correct_column_names(loc, column_corrections["Location"])

                cols = ["Acceleration x (m/s^2)", "Acceleration y (m/s^2)", "Acceleration z (m/s^2)",
                        "Gyroscope x (rad/s)", "Gyroscope y (rad/s)", "Gyroscope z (rad/s)",
                        "Velocity (m/s)", "Height (m)", "Linear Acceleration x (m/s^2)", 
                        "Linear Acceleration y (m/s^2)", "Linear Acceleration z (m/s^2)"]
                
                acc = remove_start_end(acc)
                loc = remove_start_end(loc)
                gyro = remove_start_end(gyro)
                linacc = remove_start_end(linacc)

                acc = acc.rename(columns={"Datetime": "Datetime_acc"})
                loc = loc.rename(columns={"Datetime": "Datetime_loc"})
                gyro = gyro.rename(columns={"Datetime": "Datetime_gyro"})
                linacc = linacc.rename(columns={"Datetime": "Datetime_linacc"})

                # print(df[["Latitude (°)", "Longitude (°)"]].isna().sum())

                loc['Displacement (m)'] = haversine(loc['Latitude (°)'].iloc[0], loc['Longitude (°)'].iloc[0], loc['Latitude (°)'].iloc[-1], loc['Longitude (°)'].iloc[-1])
                try:
                    loc['Distance From Last (m)'] = haversine(loc['Latitude (°)'].shift().astype(float), loc['Longitude (°)'].shift().astype(float), loc.loc[1:, 'Latitude (°)'].astype(float), loc.loc[1:, 'Longitude (°)'].astype(float))
                except:
                    loc['Distance From Last (m)'] = 0

                loc["Distance From Last (m)"] = loc["Distance From Last (m)"].fillna(0)
        
                df = pd.merge_asof(linacc, pd.merge_asof(loc, pd.merge_asof(acc, gyro, on="Time (s)", direction="nearest"), on="Time (s)", direction="nearest"), on="Time (s)", direction="nearest")
                #Activity column
                df['Activity'] = cat

                #Remove rows with no Times (s)
                df = drop_rows_with_no_time(df)

                # Calculate LOF
                df = calculate_lof(df, acc_cols, 'acc')
                df = calculate_lof(df, gyro_cols, 'gyro')
                df = calculate_lof(df, loc_cols, 'loc')
                df = calculate_lof(df, linacc_cols, 'linacc')

                # Remove outliers and impute
                df = remove_outliers_and_impute(df, acc_cols, 'acc')
                df = remove_outliers_and_impute(df, gyro_cols, 'gyro')
                df = remove_outliers_and_impute(df, loc_cols, 'loc')
                df = remove_outliers_and_impute(df, linacc_cols, 'linacc')

                # Create the output directory if it doesn't exist
                output_dir = "data_processed/" + f + "/" + subf
                os.makedirs(output_dir, exist_ok=True)

                df.drop_duplicates(inplace=True)

                # Write out the merged dataframe to CSV
                df.to_csv(output_dir + f"/{subf}.csv", index=False)

                end_time = time.time()
                #print(f"Processed {subf} in {f} for category {cat} in {end_time - start_time:.2f} seconds.")

            

In [16]:
def concatenate_and_sort_nested_csv_files(base_directory, sort_column):
    df_list = []
    
    # Traverse the base directory and read CSV files
    for root, dirs, files in os.walk(base_directory):
        for file in files:
            if file.endswith('.csv'):
                
                file_path = os.path.join(root, file)
                df = pd.read_csv(file_path)
                df_list.append(df)
    
    # Concatenate all DataFrames and sort by the specified column
    concatenated_df = pd.concat(df_list, ignore_index=True)
    sorted_df = concatenated_df.sort_values(by=sort_column)
    return sorted_df

# Concatenate and sort the CSV files by 'Datetime_linacc'
sorted_nested_df = concatenate_and_sort_nested_csv_files("data_processed", 'Datetime_linacc')

# Save the sorted DataFrame to a new CSV file
sorted_nested_output_file = 'data_agg/final_aggregated_output.csv'
sorted_nested_df.to_csv(sorted_nested_output_file, index=False)


In [17]:
df = sorted_nested_df
df = df.dropna(subset=["Time (s)"])
df.drop(columns=["Direction (°)"], inplace=True)

# Replace every null value for "height" and "velocity" with 0
df["Height (m)"].fillna(0, inplace=True)
df["Velocity (m/s)"].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Height (m)"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Velocity (m/s)"].fillna(0, inplace=True)


In [18]:
df = remove_outliers_and_impute(df, acc_cols, 'acc')
df = remove_outliers_and_impute(df, gyro_cols, 'gyro')
df = remove_outliers_and_impute(df, loc_cols, 'loc')
df = remove_outliers_and_impute(df, linacc_cols, 'linacc')

df.drop_duplicates(inplace=True)

df.drop(columns=df.columns[-8:], inplace=True)
df.set_index(pd.to_datetime(df['Datetime_linacc']), inplace=True)
df.drop(columns=[c for c in df.columns if "Datetime" in c], inplace=True)
df.drop(columns=["Latitude (°)", "Longitude (°)", "Horizontal Accuracy (m)", "Vertical Accuracy (m)"], inplace=True)

df.to_csv(f"data_agg/final_aggregated_output_no_outlier.csv")
df

Unnamed: 0_level_0,Time (s),Linear Acceleration x (m/s^2),Linear Acceleration y (m/s^2),Linear Acceleration z (m/s^2),Height (m),Velocity (m/s),Displacement (m),Distance From Last (m),Acceleration x (m/s^2),Acceleration y (m/s^2),Acceleration z (m/s^2),Gyroscope x (rad/s),Gyroscope y (rad/s),Gyroscope z (rad/s),Activity
Datetime_linacc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2024-06-04 14:06:28.024712,5.024712,1.475718,-1.026762,-1.760057,-4.745554,0.98,0.105261,0.000000,0.147458,3.445964,9.239586,-0.027829,-0.391340,0.373098,Walking
2024-06-04 14:06:28.065210,5.065210,-0.108604,-0.699649,-3.437370,-4.745554,0.98,0.105261,0.000000,0.147458,3.445964,9.239586,-0.027829,-0.391340,0.373098,Walking
2024-06-04 14:06:28.105655,5.105655,-1.297407,-0.599205,-2.629323,-4.745554,0.98,0.105261,0.000000,0.147458,3.445964,9.239586,-0.027829,-0.391340,0.373098,Walking
2024-06-04 14:06:28.146197,5.146197,1.051061,-0.073347,-0.506624,-4.745554,0.98,0.105261,0.000000,0.147458,3.445964,9.239586,-0.027829,-0.391340,0.373098,Walking
2024-06-04 14:06:28.186665,5.186665,0.602122,-0.430895,-0.995502,-4.745554,0.98,0.105261,0.000000,0.147458,3.445964,9.239586,-0.027829,-0.391340,0.373098,Walking
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-06-11 14:47:20.032852,296.032852,5.564073,-3.819289,3.387449,5.859561,0.00,0.004123,0.000246,13.409976,-8.886046,-0.433699,-0.054194,0.124233,0.115445,Sport
2024-06-11 14:47:20.073338,296.073338,5.564073,-3.819289,3.387449,5.859561,0.00,0.004123,0.000246,13.409976,-8.886046,-0.433699,-0.054194,0.124233,0.115445,Sport
2024-06-11 14:47:20.113847,296.113847,2.491645,2.183503,2.287658,5.859561,0.00,0.004123,0.000246,13.409976,-8.886046,-0.433699,-0.054194,0.124233,0.115445,Sport
2024-06-11 14:47:20.154319,296.154319,-2.570542,-3.156097,0.373885,5.859561,0.00,0.004123,0.000246,13.409976,-8.886046,-0.433699,-0.054194,0.124233,0.115445,Sport


In [22]:


# Load the CSV file
file_path = 'data_agg/final_aggregated_output_no_outlier.csv'
df = pd.read_csv(file_path)

# Convert the 'Datetime_linacc' column to datetime
df['Datetime_linacc'] = pd.to_datetime(df['Datetime_linacc'])

# Set the 'Datetime_linacc' column as the index
df.set_index('Datetime_linacc', inplace=True)

# Define a custom resampling function to handle non-numeric 'Activity' column
def resample_with_activity(data, interval):
    numeric_df = data.drop(columns=['Activity']).resample(interval).mean()
    activity_df = data['Activity'].resample(interval).first()
    return numeric_df.join(activity_df)

# Resample the data into 5-second intervals
df_5s = resample_with_activity(df, '5S')

# Resample the data into 0.2-second intervals
df_0_2s = resample_with_activity(df, '200ms')

df_5s_clean = df_5s.dropna()
df_0_2s_clean = df_0_2s.dropna()

# Save the cleaned, resampled dataframes to CSV files
df_5s_clean.to_csv("data_agg/five_sec_agg_clean.csv")
df_0_2s_clean.to_csv("data_agg/point_two_sec_agg_clean.csv")


  numeric_df = data.drop(columns=['Activity']).resample(interval).mean()
  activity_df = data['Activity'].resample(interval).first()
