In [None]:
import sys
from pathlib import Path
sys.path.append(str(Path('..', 'src')))

import utils
import config
import pandas as pd

In [None]:
# Clean every raw recording listed in config.RAW_FILES and write the result,
# under the same file name, to <DATA_PATH>/clean.
#
# Per file:
#   1. keep only rows whose Action2 is NaN,
#   2. collapse Action1/Action2 into a single 'Action' column,
#   3. drop the first ONSET_TRIM_SECONDS of every run of consecutive
#      identical actions.

# Amount subtracted from the start of each action segment; compared against
# the 'Timestamp' column, so it is in whatever unit that column uses
# (the step description below calls it seconds).
ONSET_TRIM_SECONDS = 0.2

raw_path = Path(config.DATA_PATH, 'raw')
clean_path = Path(config.DATA_PATH, 'clean')
# parents=True so a missing DATA_PATH directory does not abort the run.
clean_path.mkdir(parents=True, exist_ok=True)

for file_name in config.RAW_FILES:
    csv_file = raw_path / file_name

    df = pd.read_csv(csv_file)

    # Step 1: Remove rows where Action2 is not NaN.
    # .copy() gives an independent frame, so the column assignment in step 2
    # does not write into a slice of the frame that was read from disk.
    df = df[df['Action2'].isna()].copy()
    print(f"After removing rows with non-NaN Action2: {df.shape}")

    # Step 2: Create 'Action' column from Action1 and drop Action1 and Action2
    df['Action'] = df['Action1']
    df = df.drop(columns=['Action1', 'Action2'])

    # Step 3: Remove ONSET_TRIM_SECONDS from the start of each new action.
    # A row opens a new segment when its Action differs from the previous
    # row's; the running count of such rows is a 1-based segment id.
    is_segment_start = df['Action'].ne(df['Action'].shift())
    df['action_change'] = is_segment_start.cumsum()

    # Timestamp of the first row of each segment, in segment order, then
    # broadcast back to every row through the segment id.
    timestamps = df['Timestamp'].to_numpy()
    start_times = timestamps[is_segment_start.to_numpy()]
    segment_start = start_times[df['action_change'].to_numpy() - 1]

    # Keep only rows where Timestamp >= segment start + trim. When no row
    # survives, the result is an empty frame that still has its columns, so
    # the CSV written below keeps its header.
    keep = timestamps >= segment_start + ONSET_TRIM_SECONDS
    df_clean = df[keep].reset_index(drop=True)

    # NOTE(review): the helper column 'action_change' is still written to the
    # CSV, as before — drop it here if downstream code does not need it.

    # Save the cleaned data to the clean directory with the same filename
    output_file = clean_path / file_name
    df_clean.to_csv(output_file, index=False)
    print(f"Processed and saved: {output_file}")

(12737, 6)
       Timestamp    Channel1     Channel2    Channel3 Action  action_change
0          0.000  677.859866  1164.716559  717.733847   Rest              1
1          0.282  668.390195  1165.841803  695.476999   Rest              1
2          0.285  669.773064  1165.709453  693.656169   Rest              1
3          0.288  670.606267  1165.900061  670.324506   Rest              1
4          0.291  672.538846  1166.019899  672.959012   Rest              1
...          ...         ...          ...         ...    ...            ...
15508     47.028  667.142799  1164.278397  689.959485   Rest             13
15509     47.031  671.017109  1164.277338  696.657565   Rest             13
15510     47.034  673.360860  1164.460522  703.118061   Rest             13
15511     47.037  672.898276  1164.262602  700.319771   Rest             13
15512     47.040  670.999857  1164.339815  679.767386   Rest             13

[12737 rows x 6 columns]
Processed and saved: ../data/badr-data/clean/WS_R_c