In [4]:
import pandas as pd
import numpy as np
from pathlib import Path

# Setup paths
CLEAN_DIR = Path('../data/badr-data/clean')

In [5]:
# Check mean and std for each file
files = sorted(CLEAN_DIR.glob('*.csv'))
print(f"Found {len(files)} files\n")

for file in files:
    df = pd.read_csv(file)
    print(f"{file.name}:")
    print(f"  Shape: {df.shape}")
    
    # Get sensor columns (Channel1, Channel2, Channel3)
    sensor_cols = [col for col in df.columns if col.startswith('Channel')]
    
    for col in sensor_cols:
        mean = df[col].mean()
        std = df[col].std()
        print(f"  {col}: mean={mean:.4f}, std={std:.4f}")
    print()

Found 6 files

WS_R_ch3_seq4_250523175746.csv:
  Shape: (12062, 5)
  Channel1: mean=690.5718, std=10.9362
  Channel2: mean=1167.0594, std=0.2565
  Channel3: mean=704.6077, std=13.3446

WS_R_ch3_seq4_250523180012.csv:
  Shape: (12485, 5)
  Channel1: mean=674.1098, std=11.8944
  Channel2: mean=1166.7730, std=0.2443
  Channel3: mean=687.2567, std=25.1928

WS_R_ch3_seq4_250523180128.csv:
  Shape: (11070, 5)
  Channel1: mean=672.2220, std=9.6132
  Channel2: mean=1166.6852, std=0.2192
  Channel3: mean=685.3770, std=15.4254

WS_R_ch3_seq4_250523180233.csv:
  Shape: (12786, 5)
  Channel1: mean=670.7056, std=17.4000
  Channel2: mean=1166.6600, std=0.2722
  Channel3: mean=684.5361, std=13.1949

WS_R_ch3_seq4_250523180349.csv:
  Shape: (12438, 5)
  Channel1: mean=668.7648, std=16.4162
  Channel2: mean=1163.1282, std=0.2663
  Channel3: mean=682.1575, std=12.7902

WS_R_ch3_seq4_250523180502.csv:
  Shape: (11936, 5)
  Channel1: mean=671.4891, std=17.1172
  Channel2: mean=1164.5004, std=0.6220
  Chan

In [6]:
# Combine all files
dfs = [pd.read_csv(file) for file in files]
df_combined = pd.concat(dfs, ignore_index=True)

print(f"Combined data shape: {df_combined.shape}\n")

# Check mean and std of combined data
sensor_cols = [col for col in df_combined.columns if col.startswith('Channel')]

for col in sensor_cols:
    mean = df_combined[col].mean()
    std = df_combined[col].std()
    print(f"{col}: mean={mean:.4f}, std={std:.4f}")

Combined data shape: (72777, 5)

Channel1: mean=674.6097, std=16.0839
Channel2: mean=1165.7916, std=1.5128
Channel3: mean=688.3832, std=17.6697


In [7]:
# Test bandpass filter effect
import sys
sys.path.append('../src')
import config
from preprocessing import bandpass_filter

df_test = df_combined.copy()
print("Before bandpass filter:")
for col in ['Channel1', 'Channel2', 'Channel3']:
    print(f"  {col}: mean={df_test[col].mean():.4f}, std={df_test[col].std():.4f}")

df_filtered = bandpass_filter(df_test)

print("\nAfter bandpass filter:")
for col in ['Channel1', 'Channel2', 'Channel3']:
    print(f"  {col}: mean={df_filtered[col].mean():.4f}, std={df_filtered[col].std():.4f}")
    print(f"  {col}: min={df_filtered[col].min():.4f}, max={df_filtered[col].max():.4f}")

Before bandpass filter:
  Channel1: mean=674.6097, std=16.0839
  Channel2: mean=1165.7916, std=1.5128
  Channel3: mean=688.3832, std=17.6697
sampling rate: 1556.34 Hz

After bandpass filter:
  Channel1: mean=-16016984980640362486050418766856623797756776598800675693707796135171687648171229043284576605409133921958056754983270647872215983149918453508341760.0000, std=147375887055171797103671835638164993350423304638018030190892992614321663978420661247461521064295486864617640420096957348573572133182546618229981184.0000
  Channel1: min=-2739049775356903989396769674223304005685106231812961667584993297593468931989804479861454925639812047487219501520539272753128495576623181496737333248.0000, max=-906638802630947316792190622058974412248106342663042626346909120222724096.0000
  Channel2: mean=-1006479188936096686799429710121440439352637161170240562303735452277018752560139008328843074499351531611918371820444130847085576828968476518383616.0000, std=9283503618903293278993051699247236317705012846845185

In [8]:
# Check filter design
from scipy.signal import butter
sampling_rate = 1556.34
lowcut = 0.5
highcut = 20.0
order = 4

nyquist = 0.5 * sampling_rate
low_normalized = lowcut / nyquist
high_normalized = highcut / nyquist

print(f"Sampling rate: {sampling_rate:.2f} Hz")
print(f"Nyquist: {nyquist:.2f} Hz")
print(f"Low normalized: {low_normalized:.6f}")
print(f"High normalized: {high_normalized:.6f}")

b, a = butter(order, [low_normalized, high_normalized], btype='band')
print(f"\nFilter coefficients b: {b}")
print(f"Filter coefficients a: {a}")

# Check if filter is stable
from numpy import roots, abs
poles = roots(a)
print(f"\nPoles magnitude (should be < 1 for stability): {abs(poles)}")

Sampling rate: 1556.34 Hz
Nyquist: 778.17 Hz
Low normalized: 0.000643
High normalized: 0.025701

Filter coefficients b: [ 2.17036667e-06  0.00000000e+00 -8.68146668e-06  0.00000000e+00
  1.30222000e-05  0.00000000e+00 -8.68146668e-06  0.00000000e+00
  2.17036667e-06]
Filter coefficients a: [  1.          -7.79367314  26.57733566 -51.7955525   63.09632901
 -49.19794426  23.97848466  -6.67899211   0.81401267]

Poles magnitude (should be < 1 for stability): [0.97106676 0.97106676 1.00072886 1.00072886 0.99660081 0.99660081
 0.93159862 0.93159862]


In [9]:
# Test with SOS (second-order sections) - more stable
from scipy.signal import sosfiltfilt

sos = butter(order, [low_normalized, high_normalized], btype='band', output='sos')

df_test2 = df_combined.copy()
print("Before SOS bandpass filter:")
for col in ['Channel1', 'Channel2', 'Channel3']:
    print(f"  {col}: mean={df_test2[col].mean():.4f}, std={df_test2[col].std():.4f}")

# Apply SOS filter
for col in ['Channel1', 'Channel2', 'Channel3']:
    df_test2[col] = sosfiltfilt(sos, df_test2[col].values)

print("\nAfter SOS bandpass filter:")
for col in ['Channel1', 'Channel2', 'Channel3']:
    print(f"  {col}: mean={df_test2[col].mean():.4f}, std={df_test2[col].std():.4f}")
    print(f"  {col}: min={df_test2[col].min():.4f}, max={df_test2[col].max():.4f}")

Before SOS bandpass filter:
  Channel1: mean=674.6097, std=16.0839
  Channel2: mean=1165.7916, std=1.5128
  Channel3: mean=688.3832, std=17.6697

After SOS bandpass filter:
  Channel1: mean=0.0199, std=6.4263
  Channel1: min=-29.3002, max=33.8879
  Channel2: mean=0.0008, std=0.2087
  Channel2: min=-1.7076, max=2.0525
  Channel3: mean=0.1243, std=7.8672
  Channel3: min=-86.3149, max=255.7386


In [10]:
# Check sampling rate calculation
timestamps = df_combined['Timestamp'].values

# Calculate time differences between consecutive samples
time_diffs = np.diff(timestamps)

print(f"Time differences (first 10): {time_diffs[:10]}")
print(f"\nMean time difference: {time_diffs.mean():.6f} seconds")
print(f"Median time difference: {np.median(time_diffs):.6f} seconds")
print(f"Std of time differences: {time_diffs.std():.6f} seconds")

# Sampling rate = 1 / mean_time_diff
sampling_rate_correct = 1.0 / time_diffs.mean()
sampling_rate_median = 1.0 / np.median(time_diffs)

print(f"\nSampling rate (from mean): {sampling_rate_correct:.2f} Hz")
print(f"Sampling rate (from median): {sampling_rate_median:.2f} Hz")

# Check what the preprocessing code calculates
from preprocessing import _calculate_sampling_rate
sampling_rate_code = _calculate_sampling_rate(df_combined['Timestamp'])
print(f"\nSampling rate (from code): {sampling_rate_code:.2f} Hz")

Time differences (first 10): [0.004 0.003 0.003 0.003 0.003 0.003 0.003 0.003 0.003 0.003]

Mean time difference: 0.000643 seconds
Median time difference: 0.003000 seconds
Std of time differences: 0.388045 seconds

Sampling rate (from mean): 1556.34 Hz
Sampling rate (from median): 333.33 Hz

Sampling rate (from code): 1556.34 Hz


In [11]:
# Investigate the time difference distribution
print(f"Min time diff: {time_diffs.min():.6f} seconds")
print(f"Max time diff: {time_diffs.max():.6f} seconds")
print(f"25th percentile: {np.percentile(time_diffs, 25):.6f} seconds")
print(f"75th percentile: {np.percentile(time_diffs, 75):.6f} seconds")
print(f"99th percentile: {np.percentile(time_diffs, 99):.6f} seconds")

# Count how many are close to 0.003 (normal) vs outliers
normal_samples = np.sum((time_diffs >= 0.002) & (time_diffs <= 0.004))
outliers = np.sum((time_diffs < 0.002) | (time_diffs > 0.004))
print(f"\nNormal samples (~0.003s): {normal_samples} ({100*normal_samples/len(time_diffs):.2f}%)")
print(f"Outliers: {outliers} ({100*outliers/len(time_diffs):.2f}%)")

# Where are the large gaps?
large_gaps = time_diffs > 1.0  # gaps > 1 second
print(f"\nLarge gaps (>1s): {np.sum(large_gaps)}")
if np.sum(large_gaps) > 0:
    print(f"Max gap: {time_diffs.max():.2f} seconds")
    gap_indices = np.where(large_gaps)[0]
    print(f"Gap locations (first 5): {gap_indices[:5]}")

Min time diff: -48.594000 seconds
Max time diff: 1.187000 seconds
25th percentile: 0.003000 seconds
75th percentile: 0.003000 seconds
99th percentile: 0.004000 seconds

Normal samples (~0.003s): 71926 (98.83%)
Outliers: 850 (1.17%)

Large gaps (>1s): 5
Max gap: 1.19 seconds
Gap locations (first 5): [51102 53035 61620 63404 67145]


In [12]:
# Load the requested file and inspect data around row 51102
PREPROC_DIR = Path('../data/badr-data/proc')

df_check = pd.read_csv(PREPROC_DIR / 'WS_R_ch3_seq4.csv')

row_idx = 51102


df_check.iloc[row_idx - 10: row_idx + 10]

Unnamed: 0,Timestamp,Channel1,Channel2,Channel3,Action
51092,10.138,0.796027,1.095359,0.386365,Rest
51093,10.141,0.846655,0.780524,0.401404,Rest
51094,10.144,0.817573,0.362459,0.421537,Rest
51095,10.147,0.706319,-0.147586,0.451641,Rest
51096,10.15,0.519191,-0.724765,0.493571,Rest
51097,10.153,0.271336,-1.331162,0.54531,Rest
51098,10.156,-0.01421,-1.918495,0.601171,Rest
51099,10.159,-0.308482,-2.4331,0.652994,Rest
51100,10.162,-0.579379,-2.822902,0.691975,Rest
51101,10.165,-0.795046,-3.045583,0.710551,Rest
