# Batch Processed Data Visualization & QA

This notebook helps you quickly visualize and compare raw vs. processed sensor data, highlight outliers, and automate QA for any comparison Excel file generated by your batch workflow.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import glob
import os

# Set the directory containing comparison files
comparison_dir = '../data/output/comparisons/'

# List all comparison Excel files.
# - glob returns matches in arbitrary, filesystem-dependent order, so the list
#   is sorted to keep the printed indices (and therefore `file_idx`) stable
#   from one run to the next.
# - Files starting with '~$' are the lock files Excel drops next to an open
#   workbook; they match '*.xlsx' but are not readable workbooks, so skip them.
comparison_files = sorted(
    path
    for path in glob.glob(os.path.join(comparison_dir, '*.xlsx'))
    if not os.path.basename(path).startswith('~$')
)
print(f'Found {len(comparison_files)} comparison files.')
# Display file list
for i, f in enumerate(comparison_files):
    print(f'[{i}] {os.path.basename(f)}')

# Select a file to visualize (change index as needed)
file_idx = 0  # <-- Change this to select a different file

# Fail with an actionable message rather than a bare "list index out of range"
# when the folder is empty or the chosen index does not exist.
if not comparison_files:
    raise FileNotFoundError(
        f"No .xlsx comparison files found in {os.path.abspath(comparison_dir)!r}; "
        "check `comparison_dir` or run the batch workflow first."
    )
if not -len(comparison_files) <= file_idx < len(comparison_files):
    raise IndexError(
        f'file_idx={file_idx} is out of range; valid indices are '
        f'0..{len(comparison_files) - 1} (see the list printed above).'
    )
comparison_file = comparison_files[file_idx]
print(f'Using file: {comparison_file}')

# Load the comparison data (first sheet of the workbook, pandas' default)
df = pd.read_excel(comparison_file)
df.head()

In [None]:
# Identify value columns: string headers containing 'Value' and either 'raw'
# or 'processed'. The isinstance check keeps non-string headers (e.g. numeric
# column names read from Excel) from raising on the substring test.
value_cols = [
    c for c in df.columns
    if isinstance(c, str) and 'Value' in c and ('raw' in c or 'processed' in c)
]
print('Value columns:', value_cols)

# Plot raw vs. processed.
# NOTE(review): this assumes the first matching column is the raw series and
# the second is the processed one (i.e. file column order) -- confirm against
# the workflow that writes the comparison files.
if len(value_cols) < 2:
    # Same guard the summary cell uses; without it value_cols[1] raises IndexError.
    print('Skipping plot: fewer than two raw/processed value columns found.')
else:
    has_time = 'Time (Seconds)' in df.columns
    # Fall back to row position on the x axis when there is no time column.
    x = df['Time (Seconds)'] if has_time else pd.RangeIndex(len(df))

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(x, df[value_cols[0]], label='Raw')
    ax.plot(x, df[value_cols[1]], label='Processed')

    if 'Outlier_Flag' in df.columns:
        # Build an explicit boolean mask. Passing the raw column to .loc only
        # works for a strictly boolean column: 0/1 integers would be treated as
        # row labels and NaN entries would raise. `.eq(True)` maps True/1 to
        # True and everything else (including NaN) to False.
        outlier_mask = df['Outlier_Flag'].eq(True).to_numpy()
        ax.scatter(x[outlier_mask], df.loc[outlier_mask, value_cols[0]],
                   color='red', label='Outliers', zorder=5)

    ax.legend()
    ax.set_title('Raw vs. Processed Data with Outliers Highlighted')
    ax.set_xlabel('Time (Seconds)' if has_time else 'Index')
    ax.set_ylabel('Value')
    plt.show()

## Batch QA: Summary Table

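This cell summarizes the comparison data currently loaded in `df`: the total number of points, the number of flagged outliers, and (when both a raw and a processed value column are present) the mean raw value, the mean processed value, and the mean absolute change between them.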


In [None]:
# Number of flagged rows; a frame without the flag column reports zero.
if 'Outlier_Flag' in df.columns:
    outlier_count = int(df['Outlier_Flag'].sum())
else:
    outlier_count = 0

summary = {'Total Points': len(df), 'Outliers Detected': outlier_count}

# The mean statistics need both series, taken positionally from value_cols.
if len(value_cols) >= 2:
    raw_values = df[value_cols[0]]
    processed_values = df[value_cols[1]]
    summary.update({
        'Mean Raw': raw_values.mean(),
        'Mean Processed': processed_values.mean(),
        'Mean Abs Change': (raw_values - processed_values).abs().mean(),
    })

# One-row table; left as the last expression so the notebook renders it.
pd.DataFrame([summary])

## Batch Automation (Optional)

Run this cell to generate plots and QA summaries for ALL comparison files in the folder.


In [None]:
def _report_comparison_file(path):
    """Print QA statistics for one comparison workbook and plot raw vs. processed.

    Reads the workbook at ``path`` with ``pd.read_excel``, prints the row count,
    the sum of ``Outlier_Flag`` (0 when the column is absent) and, when at least
    two raw/processed value columns exist, their means and mean absolute
    difference. It then draws both series with flagged rows highlighted.

    All work happens on local variables so that running the batch does not
    overwrite the notebook-level ``df`` / ``value_cols`` used by earlier cells.

    Parameters
    ----------
    path : str
        Path to a comparison ``.xlsx`` file.
    """
    data = pd.read_excel(path)
    cols = [
        c for c in data.columns
        if isinstance(c, str) and 'Value' in c and ('raw' in c or 'processed' in c)
    ]
    has_flags = 'Outlier_Flag' in data.columns

    print(f'\nFile: {os.path.basename(path)}')
    print(f'Total Points: {len(data)}')
    outliers = int(data['Outlier_Flag'].sum()) if has_flags else 0
    print(f'Outliers Detected: {outliers}')

    if len(cols) < 2:
        # Previously the plot indexed cols[0]/cols[1] unconditionally, so one
        # file without both columns raised IndexError and aborted the batch.
        print('Skipping plot: fewer than two raw/processed value columns found.')
        return

    # NOTE(review): assumes the first matching column is raw and the second is
    # processed (file column order) -- confirm against the producing workflow.
    raw, processed = data[cols[0]], data[cols[1]]
    print(f'Mean Raw: {raw.mean():.3f}')
    print(f'Mean Processed: {processed.mean():.3f}')
    print(f'Mean Abs Change: {(raw - processed).abs().mean():.3f}')

    has_time = 'Time (Seconds)' in data.columns
    # Fall back to row position on the x axis when there is no time column.
    x = data['Time (Seconds)'] if has_time else pd.RangeIndex(len(data))

    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(x, raw, label='Raw')
    ax.plot(x, processed, label='Processed')
    if has_flags:
        # Explicit boolean mask: 0/1 integer flags would otherwise be used as
        # row labels by .loc, and NaN flags would raise.
        mask = data['Outlier_Flag'].eq(True).to_numpy()
        ax.scatter(x[mask], raw[mask], color='red', label='Outliers', zorder=5)
    ax.legend()
    ax.set_title(os.path.basename(path))
    fig.tight_layout()
    plt.show()
    # Release the figure so a large batch does not accumulate open figures.
    plt.close(fig)


for file in comparison_files:
    _report_comparison_file(file)