In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os

from tqdm import tqdm

In [2]:
def load_data(path: str):
    """Read every file in a directory into a pandas DataFrame.

    Args:
        path: Directory containing the recording files. A trailing path
            separator is optional.

    Returns:
        A list with one DataFrame per regular file in ``path``, ordered by
        file name. Each frame carries the originating file name in an ad-hoc
        ``name`` attribute, which the rest of the notebook uses as an
        identifier.
    """
    df_list = []

    # Sort the listing: os.listdir returns entries in arbitrary order, and a
    # stable order keeps re-runs of the notebook reproducible.
    for csv_file in sorted(os.listdir(path)):
        # Build the location from the `path` argument (not from a notebook
        # global) so the function works for any directory it is given.
        file_path = os.path.join(path, csv_file)
        if not os.path.isfile(file_path):
            # Sub-directories cannot be parsed as CSV; skip them.
            continue

        # The first five lines of each file are skipped before parsing
        # (presumably a device preamble -- confirm against the raw files).
        df = pd.read_csv(file_path, skiprows=5)
        df.name = csv_file
        df_list.append(df)

    return df_list


# Directory (relative to the notebook) that is scanned for recording files.
data_path = r'./data/'
# One DataFrame per directory entry, each tagged with its file name.
dataframes = load_data(path=data_path)

In [3]:
def set_dataframes_format(df_list):
    """Normalize the column header and row index of every recording in place.

    Each frame is expected to end up with the columns
    ``['Time [sec]', 'ACC X', 'ACC Y', 'ACC Z']``:

    * If the header was not picked up on load (the frame has an
      ``"Unnamed: 0"`` column), the first data row is promoted to be the
      header and removed from the data.
    * If the columns still differ from the expected names, they are
      overwritten with the expected names.

    Afterwards any frame whose index no longer starts at 0 gets a fresh
    0-based index.

    Args:
        df_list: List of DataFrames. The frames are modified in place so that
            ad-hoc attributes set on them (such as ``name``) survive.

    Returns:
        The same list object that was passed in.

    Raises:
        ValueError: If a frame's column count does not match the number of
            expected column names.
    """
    good_columns = ['Time [sec]', 'ACC X', 'ACC Y', 'ACC Z']

    for dataframe in df_list:
        # At most two passes: the first may promote a header row, the second
        # forces the expected names if the promoted header is still off.
        for _ in range(2):
            columns_list = list(dataframe.columns)
            if columns_list == good_columns:
                break

            if "Unnamed: 0" in columns_list:
                # Use a plain list so the column axis does not inherit the
                # row label as its name.
                dataframe.columns = list(dataframe.iloc[0])
                # Drop the row that was just promoted, addressed by its actual
                # label rather than a hard-coded 0.
                dataframe.drop(index=dataframe.index[0], inplace=True)
                # NOTE(review): values are not re-parsed after the promotion,
                # so these columns may keep a non-numeric dtype -- confirm.
            else:
                dataframe.columns = good_columns

        # Re-number rows from 0 after a header row was removed. reset_index
        # works for any index type, and the length check keeps empty frames
        # from raising on index[0].
        if len(dataframe.index) > 0 and dataframe.index[0] != 0:
            dataframe.reset_index(drop=True, inplace=True)

    return df_list


# Fix headers and row indices of all loaded recordings (frames are modified in place).
dataframes = set_dataframes_format(dataframes)

In [4]:
def plot_df(df):
    """Plot the three acceleration columns of one recording against time.

    Args:
        df: DataFrame with the columns 'Time [sec]', 'ACC X', 'ACC Y' and
            'ACC Z', and a ``name`` attribute that is used as the title.
    """
    time_axis = df['Time [sec]']

    # One line per acceleration column, drawn in X, Y, Z order so the legend
    # entries below line up with the curves.
    for column in ('ACC X', 'ACC Y', 'ACC Z'):
        plt.plot(time_axis, df[column])

    plt.legend(['X', 'Y', 'Z'])
    plt.title(df.name)
    plt.show()

We start by cleaning the data, since it contains **a lot** of garbage.
We remove the following recordings:

1. All recordings with timeskips, where the timestamp jumps to a high value and then back to a lower value
2. All recordings with high variance in at least one axis
3. All recordings where there are missing measurements for at least 1.5 seconds straight

Also, for each recording, if there's a measurement with a value > 1e6, we remove that measurement from the file.

In [5]:
def find_timeskips(dataframes):
    """Drop recordings whose timestamps are not strictly increasing.

    A recording is removed when any value in its first column is less than
    or equal to the value in the row before it.

    Args:
        dataframes: List of DataFrames whose first column holds the
            timestamps and which carry a ``name`` attribute.

    Returns:
        A new list with the offending recordings removed. Removal is by
        ``name``, so every frame sharing a flagged name is dropped.

    Notes:
        If the timestamps of a frame cannot be subtracted (a ``TypeError``,
        e.g. when the values are strings), the frame name is printed and the
        frame is kept in the result.
    """
    names_with_timeskips = set()

    for df in dataframes:
        if df.shape[0] < 2:
            # Fewer than two rows: there is no pair of timestamps to compare.
            continue

        try:
            timestamps = df.iloc[:, 0].to_numpy()
            # Compare every row with its successor in one vectorised step
            # instead of a Python-level loop over .iat lookups.
            steps_back = timestamps[:-1] - timestamps[1:]
            if bool((steps_back >= 0).any()):
                names_with_timeskips.add(df.name)
        except TypeError:
            print(f"{df.name} causes a TypeError")

    return [df for df in dataframes if df.name not in names_with_timeskips]

In [6]:
def find_high_variance(dataframes, max_range=None):
    """Drop recordings whose acceleration values spread too widely.

    For each of 'ACC X', 'ACC Y' and 'ACC Z' the spread is measured as
    ``max - min`` of the column.

    Args:
        dataframes: List of DataFrames carrying a ``name`` attribute.
        max_range: Largest allowed ``max - min`` for any single axis. With
            the default ``None`` no threshold is applied and only frames
            whose spread cannot be computed are removed. This matches the
            previous behaviour, where the spread was computed but never
            compared against anything.

    Returns:
        A new list without the flagged recordings. Removal is by ``name``.

    Notes:
        A frame is also flagged when computing the spread raises a
        ``TypeError`` (e.g. the column holds strings).
    """
    high_variance_names = set()

    for df in dataframes:
        try:
            for col in ('ACC X', 'ACC Y', 'ACC Z'):
                # Always evaluate the spread, even without a threshold: the
                # subtraction is what surfaces non-numeric columns.
                value_range = df[col].max() - df[col].min()
                if max_range is not None and value_range > max_range:
                    high_variance_names.add(df.name)
                    break
        except TypeError:
            high_variance_names.add(df.name)

    return [df for df in dataframes if df.name not in high_variance_names]

In [7]:
def find_missing_measurements(dataframes, max_gap=1):
    """Drop recordings that contain a long gap between consecutive samples.

    Args:
        dataframes: List of DataFrames whose first column holds the
            timestamps and which carry a ``name`` attribute.
        max_gap: A recording is removed when the difference between two
            consecutive timestamps is greater than or equal to this value.
            The default of 1 is the threshold this function has always used.
            NOTE(review): the notebook text speaks of 1.5 seconds -- confirm
            which value is intended.

    Returns:
        A new list without the flagged recordings. Removal is by ``name``.
    """
    names_with_gaps = set()

    for df in dataframes:
        if df.shape[0] < 2:
            # Fewer than two rows: no consecutive pair to measure.
            continue

        timestamps = df.iloc[:, 0].to_numpy()
        # Differences between each row and its predecessor, computed in one
        # vectorised step instead of a Python-level loop over .iat lookups.
        gaps = timestamps[1:] - timestamps[:-1]
        if bool((gaps >= max_gap).any()):
            names_with_gaps.add(df.name)

    return [df for df in dataframes if df.name not in names_with_gaps]

In [8]:
def find_peaks(dataframes):
    """Strip rows with out-of-range acceleration readings from every recording.

    A row is kept only when all of 'ACC X', 'ACC Y' and 'ACC Z' are below
    1e6. The input frames are left untouched; each result is a new frame
    that carries over the ``name`` attribute of its source.

    Args:
        dataframes: List of DataFrames carrying a ``name`` attribute.

    Returns:
        A list of filtered DataFrames, in the same order as the input.
    """
    axis_columns = ['ACC X', 'ACC Y', 'ACC Z']
    cleaned = []

    for recording in dataframes:
        # True for rows where every axis is under the limit.
        within_limit = (recording[axis_columns] < 1e6).all(axis=1)
        trimmed = recording[within_limit]
        trimmed.name = recording.name
        cleaned.append(trimmed)

    return cleaned

In [9]:
def filter_data(df_list: list):
    """Run the complete cleaning pipeline over a list of recordings.

    The stages run in a fixed order, each receiving the output of the
    previous one: timeskip removal, high-variance removal, missing-measurement
    removal and finally per-row peak removal.

    Args:
        df_list: List of DataFrames to clean.

    Returns:
        The list produced by the last stage.
    """
    pipeline = (
        find_timeskips,
        find_high_variance,
        find_missing_measurements,
        find_peaks,
    )

    remaining = df_list
    for stage in pipeline:
        remaining = stage(remaining)
    return remaining

# Apply the whole cleaning pipeline; `dataframes` is rebound to the filtered list.
dataframes = filter_data(dataframes)

11_walk_1_1.csv causes a TypeError
11_walk_2_1.csv causes a TypeError
11_walk_3_1.csv causes a TypeError
11_walk_5_1.csv causes a TypeError
6_run_3_1.csv causes a TypeError
6_run_4_1.csv causes a TypeError
6_walk_5_1.csv causes a TypeError


In [11]:
# NOTE(review): presumably a deliberate stop so that "Run All" halts before the
# scratch cell below -- confirm, and remove before sharing the notebook.
raise TypeError

TypeError: 

In [None]:
# Scratch investigation of one of the files reported as causing a TypeError.
path = r'./data/11_walk_1_1.csv'
df = pd.read_csv(path, skiprows=5)
fixed_df = set_dataframes_format([df])
# Overwrite the time column with a flag: 1 where the value is a str, 0 otherwise.
fixed_df[0]['Time [sec]'] = fixed_df[0]['Time [sec]'].apply(lambda x: 1 if isinstance(x, str) else 0)
# fixed_df[0]['Time [sec]']


# Show the rows whose time value was not a string.
fixed_df[0][fixed_df[0]['Time [sec]'] == 0]