# Agilent HPLC Data Parsing Workflow

This notebook demonstrates a complete workflow for parsing LC-MS chromatography data from Agilent .D folders using the rainbow-api library. The workflow covers parallel data reading, inspection of the data structure, single- and multi-trace chromatogram plots, and CSV export.

## 1. Import Required Libraries

Essential imports for data processing, visualization, and parallel execution.

In [None]:
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
import time

import matplotlib.pyplot as plt
import numpy as np
import rainbow as rb

print("✅ Libraries imported successfully")

## 2. Set Up Data Folder Path and Discovery

Define the data container folder and discover all .D data folders using pathlib.

In [None]:
# Container directory that holds the Agilent .D run folders
DATA_FOLDER = Path("sample-data")

# Report whether the container directory is present
if DATA_FOLDER.exists():
    print(f"📁 Data folder found: {DATA_FOLDER.absolute()}")
else:
    print(f"⚠️  Warning: Data folder '{DATA_FOLDER}' not found!")
    print("Please update DATA_FOLDER path to your chromatography data location")

# Collect every *.D entry directly inside the container, in sorted order
data_paths = sorted(DATA_FOLDER.glob("*.D"))

print(f"\n🔍 Discovery Results:")
print(f"   Found {len(data_paths)} .D folders")

if not data_paths:
    print("   ❌ No .D folders found - check your data path")
else:
    print(f"   Example paths:")
    # Preview at most the first 3 entries, numbered from 1
    for position, folder in enumerate(data_paths[:3], start=1):
        print(f"     {position}. {folder.name}")
    remaining = len(data_paths) - 3
    if remaining > 0:
        print(f"     ... and {remaining} more")

## 3. Define Data Reading Function

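Function to read an individual .D data folder. On failure it prints a message and returns `None` instead of raising, so one bad folder does not stop a batch read.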

In [None]:
def read_chromatogram_data(file_path):
    """
    Load one .D chromatography data folder through rb.read.

    Problems are reported on stdout instead of being raised, so a caller
    processing many folders can keep going after a bad one.

    Args:
        file_path (str): Location of the .D data folder

    Returns:
        The object produced by rb.read, or None when rb.read raised
        or itself returned None
    """
    try:
        loaded = rb.read(file_path)

        # rb.read handing back None is treated as a soft failure
        if loaded is None:
            print(f"⚠️  Warning: rb.read returned None for {file_path}")

        return loaded

    except Exception as e:
        print(f"❌ Error reading {file_path}: {str(e)}")
        return None

# Smoke-test the reader against the first discovered folder, when there is one
if not data_paths:
    print("⚠️  No files available for testing")
else:
    print("🧪 Testing data reading function...")
    test_result = read_chromatogram_data(str(data_paths[0]))

    if test_result is None:
        print("❌ Test failed - check file path and rb.read compatibility")
    else:
        print(f"✅ Test successful - data object created")
        print(f"📊 Test file: {data_paths[0].name}")

## 4. Read Data Files Using ThreadPoolExecutor

Use parallel processing to efficiently read all .D data files into a list.

In [None]:
# Use ThreadPoolExecutor to read all data files in parallel
print("🚀 Starting parallel data reading...")
print(f"📁 Files to process: {len(data_paths)}")

# Convert Path objects to strings for rb.read compatibility
path_strings = [str(path) for path in data_paths]

# perf_counter is a monotonic clock, so the measured duration cannot be
# skewed by system clock adjustments the way time.time() differences can
start_time = time.perf_counter()

# executor.map yields results in input order, so data_objects[i] always
# belongs to data_paths[i]; a None entry marks a failed read
with ThreadPoolExecutor(max_workers=4) as executor:
    data_objects = list(executor.map(read_chromatogram_data, path_strings))

end_time = time.perf_counter()
processing_time = end_time - start_time

# Filter out None results (failed reads).
# NOTE: after filtering, positions no longer line up with data_paths.
successful_data_objects = [obj for obj in data_objects if obj is not None]

total_files = len(data_objects)
failed_count = total_files - len(successful_data_objects)

print(f"\n📊 Processing Results:")
print(f"   ⏱️  Processing time: {processing_time:.2f} seconds")
print(f"   ✅ Successfully read: {len(successful_data_objects)} files")
print(f"   ❌ Failed reads: {failed_count} files")
if total_files > 0:
    print(f"   🎯 Success rate: {len(successful_data_objects)/total_files*100:.1f}%")
else:
    # No .D folders were discovered; avoid dividing by zero
    print("   🎯 Success rate: n/a (no files to process)")

if len(successful_data_objects) > 0:
    print(f"\n🎉 Data reading completed successfully!")
    print(f"📦 Created list with {len(successful_data_objects)} data objects")
else:
    print(f"\n⚠️  No data objects were successfully created")
    print("Check file paths and rb.read compatibility")

## 5. Inspect Data Objects

Examine the structure and properties of the data objects.

In [None]:
# Quick look at the ylabels (m/z values) of the MSD2.MS file in the first
# successfully read sample.
# NOTE(review): unguarded - this raises IndexError when no sample was read;
# the following cell performs the same lookup behind explicit checks.
successful_data_objects[0].get_file("MSD2.MS").ylabels

In [None]:
# Inspect the first data object for lab manager discussion
if len(successful_data_objects) > 0:
    # Select first data object
    sample_data = successful_data_objects[0]

    # Failed reads are filtered out of successful_data_objects, so its first
    # entry is not necessarily data_paths[0]. Recover the matching folder from
    # the unfiltered data_objects list, which is aligned with data_paths.
    sample_path = next(
        (path for path, obj in zip(data_paths, data_objects) if obj is sample_data),
        data_paths[0],
    )

    print("🔍 DATA STRUCTURE INSPECTION")
    print("=" * 50)

    # Show directory metadata: public (non-underscore) instance attributes only
    print(f"📁 Sample File: {sample_path.name}")
    print(f"🏷️  Directory Metadata:")
    try:
        for key, value in vars(sample_data).items():
            if not key.startswith('_'):
                print(f"     {key}: {value}")
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt still
        # stops the cell; covers objects that expose no __dict__
        print("     Metadata not directly accessible")

    print()

    # Get MSD2.MS file for detailed inspection
    msd_data = sample_data.get_file("MSD2.MS")

    if msd_data is not None:
        print("📊 MSD2.MS DATA STRUCTURE:")
        print(f"     ⏱️  Time points (xlabels): {msd_data.xlabels.shape[0]} points")
        print(f"     🎯 m/z values (ylabels): {msd_data.ylabels.shape[0]} traces")
        print(f"     📈 Data matrix shape: {msd_data.data.shape}")
        print(f"     🔬 Total measurements: {msd_data.data.size:,}")

        print(f"\n     ⏰ Time range: {msd_data.xlabels[0]:.3f} to {msd_data.xlabels[-1]:.1f} minutes")
        print(f"     ⚗️  m/z range: {msd_data.ylabels.min():.1f} to {msd_data.ylabels.max():.1f}")

        print(f"\n     🎯 Monitored m/z values:")
        mz_values = ", ".join(f"{mz:.0f}" for mz in msd_data.ylabels)
        print(f"     {mz_values}")

        # Show metadata: a dict is printed one key per line, anything else as-is
        print(f"\n     📋 Acquisition info:")
        try:
            info = msd_data.get_info()
            if isinstance(info, dict):
                for key, value in info.items():
                    print(f"        {key}: {value}")
            else:
                print(f"        {info}")
        except Exception as e:
            print(f"        Unable to retrieve info: {e}")

    else:
        print("❌ MSD2.MS file not found in data object")

    print(f"\n✅ Inspection completed for sample data")

else:
    print("⚠️  No data objects available for inspection")

## 8. Plot Single Trace

Create a visualization for a single m/z trace to show chromatogram structure.

In [None]:
# Chromatogram of a single m/z trace taken from the first successfully read sample
if len(successful_data_objects) == 0:
    print("⚠️  No data objects available for plotting")
else:
    sample_data = successful_data_objects[0]
    msd_data = sample_data.get_file("MSD2.MS")

    if msd_data is None:
        print("❌ Cannot create plot - MSD2.MS data not available")
    else:
        # m/z trace used for the demonstration (e.g., m/z 546)
        target_mz = 546

        if target_mz not in msd_data.ylabels:
            print(f"⚠️  m/z {target_mz} not found in data")
            print(f"Available m/z values: {msd_data.ylabels}")
        else:
            # Column of the data matrix that holds the requested m/z
            trace_column = np.where(msd_data.ylabels == target_mz)[0][0]
            trace = msd_data.data[:, trace_column]
            retention_times = msd_data.xlabels

            # Draw the trace
            plt.figure(figsize=(12, 6))
            plt.plot(retention_times, trace, 'b-', linewidth=1.5, alpha=0.8)

            # Axis labels and title
            plt.xlabel('Retention Time (minutes)', fontsize=12)
            plt.ylabel('Intensity (detector response)', fontsize=12)
            plt.title(
                f'Single Trace Chromatogram: m/z {target_mz}',
                fontsize=14,
                fontweight='bold',
            )

            # Grid and layout
            plt.grid(True, alpha=0.3)
            plt.tight_layout()

            # Mark the highest point of the trace
            peak_height = trace.max()
            peak_time = retention_times[np.argmax(trace)]
            plt.annotate(
                f'Peak: {peak_height:,.0f} at {peak_time:.2f} min',
                xy=(peak_time, peak_height),
                xytext=(peak_time + 10, peak_height * 0.8),
                arrowprops={'arrowstyle': '->', 'color': 'red', 'alpha': 0.7},
                fontsize=10,
                color='red',
            )

            plt.show()

            # Text summary of what was plotted
            print(f"📊 SINGLE TRACE SUMMARY:")
            print(f"   🎯 m/z value: {target_mz}")
            print(f"   ⏱️  Time range: {retention_times[0]:.3f} to {retention_times[-1]:.1f} minutes")
            print(f"   📈 Peak intensity: {peak_height:,.0f} at {peak_time:.2f} minutes")
            print(f"   📋 Data points: {len(trace):,}")

## 9. Plot Multiple (4) Traces from a Sample

Generate comparison plots showing 4 different traces for comprehensive analysis.

In [None]:
# Plot 4 different traces for comprehensive lab analysis
if len(successful_data_objects) > 0:

    # Get MSD data from first sample
    sample_data = successful_data_objects[0]
    msd_data = sample_data.get_file("MSD2.MS")

    # Failed reads are filtered out of successful_data_objects, so its first
    # entry is not necessarily data_paths[0]. Recover the matching folder name
    # from the unfiltered data_objects list, which is aligned with data_paths.
    sample_label = next(
        (path.name for path, obj in zip(data_paths, data_objects) if obj is sample_data),
        data_paths[0].name,
    )

    if msd_data is not None:
        # Select 4 representative m/z values for comparison
        target_mz_values = [544, 546, 590, 618]  # Adjust based on your data

        # Keep only the requested m/z values that are actually monitored
        available_mz = [mz for mz in target_mz_values if mz in msd_data.ylabels]

        # If any requested value is missing, fall back to the first 4 monitored traces
        if len(available_mz) < 4:
            available_mz = msd_data.ylabels[:4].tolist()

        times = msd_data.xlabels
        colors = ['blue', 'red', 'green', 'orange']

        # Look up each trace and its peak once; the grid, the overlay and the
        # text summary below all reuse these tuples
        traces = []
        for mz in available_mz[:4]:
            mz_index = np.where(msd_data.ylabels == mz)[0][0]
            chromatogram = msd_data.data[:, mz_index]
            max_intensity = chromatogram.max()
            max_time = times[np.argmax(chromatogram)]
            traces.append((mz, chromatogram, max_intensity, max_time))

        # Create subplot layout
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        axes = axes.ravel()  # Flatten for easy indexing

        for ax, color, (mz, chromatogram, max_intensity, max_time) in zip(axes, colors, traces):
            ax.plot(times, chromatogram, color=color, linewidth=1.5, alpha=0.8)

            # Format subplot
            ax.set_xlabel('Retention Time (minutes)', fontsize=10)
            ax.set_ylabel('Intensity', fontsize=10)
            ax.set_title(f'm/z {mz:.0f}', fontsize=12, fontweight='bold')
            ax.grid(True, alpha=0.3)

            # Add peak annotation
            ax.text(0.02, 0.98, f'Peak: {max_intensity:,.0f}\nat {max_time:.2f} min',
                    transform=ax.transAxes,
                    verticalalignment='top',
                    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8),
                    fontsize=9)

        # With fewer than 4 traces, hide the panels that would otherwise be empty axes
        for ax in axes[len(traces):]:
            ax.set_visible(False)

        # Overall title
        fig.suptitle(f'Four-Trace Chromatogram Comparison\nSample: {sample_label}',
                     fontsize=16, fontweight='bold')

        plt.tight_layout()
        plt.show()

        # Create overlay plot for direct comparison
        plt.figure(figsize=(14, 8))

        for color, (mz, chromatogram, max_intensity, max_time) in zip(colors, traces):
            # Scale each trace by its own maximum so traces of very different
            # intensity share one axis. A trace whose maximum is 0 is plotted
            # unscaled instead of being divided by zero.
            if max_intensity != 0:
                normalized_chromatogram = chromatogram / max_intensity
            else:
                normalized_chromatogram = chromatogram

            plt.plot(times, normalized_chromatogram,
                     color=color, linewidth=2, alpha=0.8,
                     label=f'm/z {mz:.0f}')

        plt.xlabel('Retention Time (minutes)', fontsize=12)
        plt.ylabel('Normalized Intensity', fontsize=12)
        plt.title(f'Normalized Four-Trace Overlay\nSample: {sample_label}',
                  fontsize=14, fontweight='bold')
        plt.legend(fontsize=11)
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

        # Summary for lab manager
        print(f"📊 FOUR-TRACE ANALYSIS SUMMARY:")
        print(f"   🎯 Compared m/z values: {[f'{mz:.0f}' for mz in available_mz[:4]]}")
        print(f"   📋 Sample: {sample_label}")
        print(f"   ⏱️  Analysis time range: {msd_data.xlabels[0]:.3f} to {msd_data.xlabels[-1]:.1f} minutes")

        print(f"\n   📈 Peak intensities:")
        for mz, chromatogram, max_intensity, max_time in traces:
            print(f"      m/z {mz:.0f}: {max_intensity:,.0f} at {max_time:.2f} min")

    else:
        print("❌ Cannot create plots - MSD2.MS data not available")
else:
    print("⚠️  No data objects available for plotting")

## 10. Export a Sample to CSV

In [None]:
# Export the MSD2.MS data of the first successfully read sample to CSV.
# The sample and its MSD2.MS file are looked up here instead of relying on an
# msd_data variable left behind by an earlier plotting cell, so this cell
# still works when that cell only printed a warning.
if len(successful_data_objects) > 0:
    sample_data = successful_data_objects[0]
    msd_data = sample_data.get_file("MSD2.MS")

    if msd_data is not None:
        # Failed reads are filtered out of successful_data_objects, so its
        # first entry is not necessarily data_paths[0]; recover the matching
        # folder from the unfiltered, path-aligned data_objects list.
        sample_path = next(
            (path for path, obj in zip(data_paths, data_objects) if obj is sample_data),
            data_paths[0],
        )

        # Create CSV filename based on sample name
        sample_name = sample_path.stem  # Get filename without .D extension
        csv_filename = f"{sample_name}_chromatogram_data.csv"

        print(f"📁 Source: {sample_path.name}")
        print(f"💾 Output: {csv_filename}")
        print(f"📊 Data dimensions: {msd_data.data.shape}")

        # Use rainbow's built-in CSV export method
        msd_data.export_csv(csv_filename)
        print(f"✅ Successfully exported using rainbow's export_csv() method")

        # Show export details
        print(f"📈 CSV contains:")
        print(f"   • {len(msd_data.xlabels):,} time points")
        print(f"   • {len(msd_data.ylabels)} m/z traces")
        print(f"   • {msd_data.data.size:,} total intensity measurements")

        # Show file size (pathlib is already imported, so no extra import is needed)
        csv_path = Path(csv_filename)
        if csv_path.exists():
            file_size = csv_path.stat().st_size
            print(f"\n💾 File size: {file_size:,} bytes ({file_size/1024/1024:.2f} MB)")

        # Instructions for lab manager
        print(f"\n💡 NOTES:")
        print(f"   📊 This CSV file can be opened in Excel, R, MATLAB, etc.")
        print(f"   🕒 First column contains retention times in minutes")
        print(f"   ⚗️  Subsequent columns contain intensity data for each m/z value")
        print(f"   📈 Column headers show the exact m/z values monitored")
        print(f"   🔄 Data format is rainbow's standard export format")
        print(f"   ⚡ Generated using rainbow's built-in export_csv() method")
    else:
        print("❌ Cannot export - MSD2.MS data not available")
else:
    print("⚠️  No data objects available for export")

## 11. Export all samples to CSV

In [None]:
# Export all samples to CSV
# Pair each folder with its read result through the unfiltered data_objects
# list. successful_data_objects has failed reads removed, so zipping it with
# data_paths would export every sample after the first failure under the
# wrong folder's name and silently drop the trailing samples.
for data_path, sample_data in zip(data_paths, data_objects):
    if sample_data is None:
        # This folder failed to read earlier (already reported at read time)
        continue

    msd_data = sample_data.get_file("MSD2.MS")
    if msd_data is not None:
        sample_name = data_path.stem
        csv_filename = f"{sample_name}.csv"
        msd_data.export_csv(csv_filename)
        print(f"✅ Exported {csv_filename} with shape {msd_data.data.shape}")
    else:
        print(f"❌ MSD2.MS not found for sample {data_path.name}, skipping export")

# Summary

1. **Data Structure**: Each .D folder contains chromatography data with:
   - **Traces (columns)**: Individual m/z values monitored
   - **X-values**: Retention times in minutes  
   - **Intensity values**: Raw detector responses (NOT concentrations)

2. **Processing Workflow**: 
   - Automatic discovery and sorting of .D folders
   - Parallel reading using ThreadPoolExecutor for efficiency
   - Error handling for robust data processing

3. **Analysis Capabilities**:
   - Individual trace inspection and visualization
   - Multi-trace comparison for method validation

4. **Data Format**: Compatible with rainbow-api library for:
   - Reading Agilent .D files
   - Exporting to CSV for further analysis