In [87]:
import pandas as pd  
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.dates as mdates

In [105]:
# Load the CSV data and parse the 'NTP' column into pandas datetimes.
df = pd.read_csv('../data/handlebar/Accelerometer_data_merged_ESP1.csv')
df['NTP'] = pd.to_datetime(df['NTP'])
# Keep only the columns used downstream. The explicit .copy() makes df_filtered
# an independent frame, so the in-place fills performed by later cells cannot
# trigger SettingWithCopyWarning or be silently lost on a temporary object.
df_filtered = df[['NTP', 'Acc-Z', 'curb_scene']].copy()
df_filtered.head()

Unnamed: 0,NTP,Acc-Z,curb_scene
0,2024-07-24 10:26:01.863,8.762772,0.0
1,2024-07-24 10:26:01.867,8.753204,0.0
2,2024-07-24 10:26:01.872,8.762772,0.0
3,2024-07-24 10:26:01.877,8.781937,0.0
4,2024-07-24 10:26:01.884,8.753204,0.0


In [106]:
def handle_missing_values_length(df_filtered):
    """Measure every run of consecutive missing 'Acc-Z' samples.

    A run is a maximal stretch of consecutive rows whose 'Acc-Z' value is NaN.
    Runs longer than 100 samples are printed; if there are none, a single
    confirmation line is printed instead.

    Args:
        df_filtered: DataFrame with an 'Acc-Z' column.

    Returns:
        dict mapping the index label of the first row of each run to the
        number of rows in that run. Empty when nothing is missing.
    """
    # The confirmation message below equates this many samples with half a
    # second -- NOTE(review): that assumes roughly 5 ms between samples; confirm.
    long_gap_threshold = 100

    is_missing = df_filtered['Acc-Z'].isna().to_numpy()

    # Pad with False on both sides so runs touching the first or last row still
    # produce a start edge and an end edge; edges then alternate start, end, ...
    padded = np.concatenate(([False], is_missing, [False]))
    edges = np.flatnonzero(padded[1:] != padded[:-1])
    run_starts = edges[0::2]
    run_lengths = edges[1::2] - run_starts

    missing_info = {
        label: int(length)
        for label, length in zip(df_filtered.index[run_starts], run_lengths)
    }

    long_gaps = {
        start: count
        for start, count in missing_info.items()
        if count > long_gap_threshold
    }
    if long_gaps:
        for start, count in long_gaps.items():
            print("Start Index:", start, "Count:", count)
    else:
        # Only claim this when it is actually true.
        print("No missing value longer than half a second")

    return missing_info


In [109]:
# Function to fill missing values based on the specified conditions
def fill_missing_values(df):
    """Fill missing 'Acc-Z' samples in place and return the same frame.

    Rows are handled top to bottom. For each row whose 'Acc-Z' is missing:

    * if the previous row has the same 'curb_scene' value, copy the previous
      row's 'Acc-Z' (which may itself have just been filled) and set 'NTP' to
      the previous row's 'NTP' plus one millisecond;
    * otherwise (different 'curb_scene', or there is no previous row), take
      'Acc-Z' from the next row below that has a value and set 'NTP' to that
      row's 'NTP' minus one millisecond. If no such row exists the row is
      left unchanged.

    All reads and writes are positional, so the result does not depend on the
    frame's index labels.

    Args:
        df: DataFrame with 'NTP' (datetime), 'Acc-Z' and 'curb_scene' columns.
            It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    acc_col = df.columns.get_loc('Acc-Z')
    ntp_col = df.columns.get_loc('NTP')
    scene_col = df.columns.get_loc('curb_scene')
    n_rows = len(df)
    one_ms = pd.Timedelta(milliseconds=1)

    # Only originally-missing rows can ever need work; visiting them in
    # ascending order keeps the sequential forward-fill behaviour.
    missing_positions = np.flatnonzero(df['Acc-Z'].isna().to_numpy()).tolist()

    for pos in missing_positions:
        same_scene_as_previous = (
            pos > 0 and df.iat[pos - 1, scene_col] == df.iat[pos, scene_col]
        )
        if same_scene_as_previous:
            df.iat[pos, acc_col] = df.iat[pos - 1, acc_col]
            df.iat[pos, ntp_col] = df.iat[pos - 1, ntp_col] + one_ms
        else:
            # Borrow from the next row below that still has a measurement.
            for nxt in range(pos + 1, n_rows):
                if not pd.isnull(df.iat[nxt, acc_col]):
                    df.iat[pos, acc_col] = df.iat[nxt, acc_col]
                    df.iat[pos, ntp_col] = df.iat[nxt, ntp_col] - one_ms
                    break
    return df

In [None]:
#check for missing values longer than 0.5 seconds
#missing_info = handle_missing_values_length(df_filtered)

In [None]:
# Fill the missing Acc-Z samples. fill_missing_values writes into df_filtered
# itself and returns that same frame, so the return value is only displayed here.
fill_missing_values(df_filtered)

Unnamed: 0,NTP,Acc-Z,curb_scene
0,2024-07-24 10:26:01.863,8.762772,0.0
1,2024-07-24 10:26:01.867,8.753204,0.0
2,2024-07-24 10:26:01.872,8.762772,0.0
3,2024-07-24 10:26:01.877,8.781937,0.0
4,2024-07-24 10:26:01.884,8.753204,0.0
...,...,...,...
549230,2024-07-24 11:11:20.851,8.777145,0.0
549231,2024-07-24 11:11:20.857,8.681381,0.0
549232,2024-07-24 11:11:20.866,8.370132,0.0
549233,2024-07-24 11:11:20.870,8.307877,0.0


In [None]:
# Check if there are any missing values
#missing_info= handle_missing_values_length(df_filtered)
# #check some random numbers:
# print(df_filtered.loc[21827 -10:21827 +5])
# print(df_filtered.loc[41417 -10:41417 +5]) 
# print(df_filtered.loc[41458 -10:41458 +5])

No missing value longer than half a second


In [None]:
# Prototype of the wide format: flatten the rows labelled 0..99 into one record
# holding the label of the first row followed by Acc-Z_1 ... Acc-Z_100.
slice_df = df_filtered.loc[0:99]
acc_z_values = slice_df['Acc-Z'].to_numpy()
curb_scene_value = slice_df['curb_scene'].iloc[0]

# 'curb_scene' goes first so it becomes the leading column of the frame.
data = {'curb_scene': curb_scene_value}
data.update(
    (f'Acc-Z_{position}', sample)
    for position, sample in enumerate(acc_z_values, start=1)
)

# A list holding one dict produces a single-row DataFrame.
new_df = pd.DataFrame([data])
new_df

Unnamed: 0,curb_scene,Acc-Z_1,Acc-Z_2,Acc-Z_3,Acc-Z_4,Acc-Z_5,Acc-Z_6,Acc-Z_7,Acc-Z_8,Acc-Z_9,...,Acc-Z_91,Acc-Z_92,Acc-Z_93,Acc-Z_94,Acc-Z_95,Acc-Z_96,Acc-Z_97,Acc-Z_98,Acc-Z_99,Acc-Z_100
0,0.0,8.762772,8.753204,8.762772,8.781937,8.753204,8.748413,8.738831,8.748413,8.743622,...,8.786728,8.801086,8.820236,8.786728,8.777145,8.772354,8.781937,8.801086,8.781937,8.753204


In [120]:
# Class balance: how many samples carry each curb_scene label.
grouped = df_filtered.groupby(by='curb_scene')
lengths = grouped.size()
print(lengths)

curb_scene
0.0    530463
1.0     18772
dtype: int64


In [None]:

# Cut each curb_scene group into fixed-length windows and store every window as
# one wide row: the label followed by Acc-Z_1 ... Acc-Z_<SEGMENT_LENGTH>.
SEGMENT_LENGTH = 100  # samples per output row

# One plain dict per window; the DataFrame is built once at the end instead of
# creating (and later concatenating) a one-row DataFrame for every window.
processed_segments = []
grouped = df_filtered.groupby('curb_scene')
for name, group in grouped:
    group = group.sort_values(by='NTP')
    # NOTE(review): windows are cut from all rows of a label after sorting by
    # time, so a window may join samples from separate stretches of the
    # recording -- confirm this is intended.
    acc_z_all = group['Acc-Z'].to_numpy()
    # Integer division drops the trailing partial window, as before.
    n_full_segments = len(group) // SEGMENT_LENGTH
    for k in range(n_full_segments):
        acc_z_values = acc_z_all[k * SEGMENT_LENGTH:(k + 1) * SEGMENT_LENGTH]
        # Every row of this group shares the group key, so use it as the label.
        data = {'curb_scene': name}
        data.update(
            (f'Acc-Z_{j}', value)
            for j, value in enumerate(acc_z_values, start=1)
        )
        processed_segments.append(data)

# Building from records also copes with the case where no group has a full
# window (pd.concat on an empty list would raise ValueError).
final_df = pd.DataFrame(processed_segments)

# Save the final DataFrame to a CSV file
final_df.to_csv('processed_segments.csv', index=False)
