Skip to content

Commit

Permalink
Support Chemstation "31" format and bump to 0.3.1
Browse files Browse the repository at this point in the history
Closes #34.
  • Loading branch information
bovee committed Jul 23, 2022
1 parent b0283c4 commit b82f1e3
Show file tree
Hide file tree
Showing 11 changed files with 168 additions and 88 deletions.
4 changes: 2 additions & 2 deletions entab-cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "entab-cli"
version = "0.3.0"
version = "0.3.1"
authors = ["Roderick <rbovee@gmail.com>"]
edition = "2018"
description = "Record-format file reader CLI"
Expand All @@ -11,7 +11,7 @@ categories = ["command-line-utilities", "parsing", "science"]

[dependencies]
clap = { version = "3.1.5", features = ["cargo"] }
entab = { path = "../entab", version = "0.3.0" }
entab = { path = "../entab", version = "0.3.1" }
memchr = "2.4"
memmap2 = { version = "0.5.3", optional = true }

Expand Down
2 changes: 1 addition & 1 deletion entab-js/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "entab-js"
version = "0.3.0"
version = "0.3.1"
authors = ["Roderick <rbovee@gmail.com>"]
license = "MIT"
description = "Record-format file reader"
Expand Down
2 changes: 1 addition & 1 deletion entab-py/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "entab-py"
version = "0.3.0"
version = "0.3.1"
authors = ["Roderick <rbovee@gmail.com>"]
license = "MIT"
description = "Record-format file reader"
Expand Down
2 changes: 1 addition & 1 deletion entab-r/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "entab-r"
version = "0.3.0"
version = "0.3.1"
authors = ["Roderick <rbovee@gmail.com>"]
edition = "2018"

Expand Down
2 changes: 1 addition & 1 deletion entab-r/DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: entab
Type: Package
Title: Entab
Version: 0.3.0
Version: 0.3.1
Author: Roderick
Maintainer: Roderick <rbovee@gmail.com>
Description: Entab is a record-format file reader.
Expand Down
2 changes: 1 addition & 1 deletion entab/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "entab"
version = "0.3.0"
version = "0.3.1"
authors = ["Roderick <rbovee@gmail.com>"]
edition = "2018"
description = "Record-format file reader"
Expand Down
9 changes: 8 additions & 1 deletion entab/src/filetype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ pub enum FileType {
// chemoinformatics
/// Agilent format used for MS-MS trace data
AgilentMsMsScan, // bin 0x01, 0x01
/// Agilent format used for UV-visible array data
AgilentChemstationDad,
/// Agilent format used for flame ionization trace data
AgilentChemstationFid,
/// Agilent format used for mass spectrometry trace data
Expand Down Expand Up @@ -99,6 +101,7 @@ impl FileType {
b"BAM\x01" => return FileType::Bam,
b"@HD\t" | b"@SQ\t" => return FileType::Sam,
b"\x2Escf" => return FileType::Scf,
[0x02, 0x33, 0x31, 0x00] => return FileType::AgilentChemstationDad,
[0x02, 0x38, 0x31, 0x00] => return FileType::AgilentChemstationFid,
[0x01, 0x32, 0x00, 0x00] => return FileType::AgilentChemstationMs,
[0x02, 0x33, 0x30, 0x00] => return FileType::AgilentChemstationMwd,
Expand Down Expand Up @@ -165,7 +168,10 @@ impl FileType {
"scf" => &[FileType::Scf],
"sd" => &[FileType::AgilentMasshunterDadHeader],
"sp" => &[FileType::AgilentMasshunterDad],
"uv" => &[FileType::AgilentChemstationUv],
"uv" => &[
FileType::AgilentChemstationDad,
FileType::AgilentChemstationUv,
],
"xz" => &[FileType::Lzma],
"zstd" => &[FileType::Zstd],
"ztr" => &[FileType::Ztr],
Expand All @@ -179,6 +185,7 @@ impl FileType {
/// If a file is unsupported, an error will be returned.
pub fn to_parser_name<'a>(&self, hint: Option<&'a str>) -> Result<&'a str, EtError> {
Ok(match (self, hint) {
(FileType::AgilentChemstationDad, None) => "chemstation_dad",
(FileType::AgilentChemstationFid, None) => "chemstation_fid",
(FileType::AgilentChemstationMs, None) => "chemstation_ms",
(FileType::AgilentChemstationMwd, None) => "chemstation_mwd",
Expand Down
210 changes: 148 additions & 62 deletions entab/src/parsers/agilent/chemstation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,25 +81,32 @@ impl<'r> From<&ChemstationMetadata> for BTreeMap<String, Value<'r>> {
}
}

fn get_metadata(header: &[u8]) -> Result<ChemstationMetadata, EtError> {
if header.len() < 652 {
fn get_metadata(header: &[u8], has_signal: bool) -> Result<ChemstationMetadata, EtError> {
if has_signal && header.len() < 652 {
return Err(
EtError::from("Chemstation header needs to be at least 648 bytes long").incomplete(),
);
} else if !has_signal && header.len() < 512 {
return Err(
EtError::from("Chemstation header needs to be at least 512 bytes long").incomplete(),
);
}
let start_time = f64::from(i32::extract(&header[282..], &Endian::Big)?) / 60000.;
let end_time = f64::from(i32::extract(&header[286..], &Endian::Big)?) / 60000.;

let offset_correction = f64::extract(&header[636..], &Endian::Big)?;
let mult_correction = f64::extract(&header[644..], &Endian::Big)?;
let mut offset_correction = 0.;
let mut mult_correction = 1.;
let mut signal_name = "";
if has_signal {
offset_correction = f64::extract(&header[636..], &Endian::Big)?;
mult_correction = f64::extract(&header[644..], &Endian::Big)?;

let signal_name_len = usize::from(header[596]);
if signal_name_len > 40 {
return Err("Invalid signal name length".into());
let signal_name_len = usize::from(header[596]);
if signal_name_len > 40 {
return Err("Invalid signal name length".into());
}
signal_name = str::from_utf8(&header[597..597 + signal_name_len])?.trim();
}
let signal_name = str::from_utf8(&header[597..597 + signal_name_len])?
.trim()
.to_string();

let sample_len = usize::from(header[24]);
if sample_len > 60 {
Expand Down Expand Up @@ -164,7 +171,7 @@ fn get_metadata(header: &[u8]) -> Result<ChemstationMetadata, EtError> {
Ok(ChemstationMetadata {
start_time,
end_time,
signal_name,
signal_name: signal_name.to_string(),
offset_correction,
mult_correction,
sequence,
Expand Down Expand Up @@ -213,7 +220,7 @@ impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationFidState {
}

fn get(&mut self, rb: &'b [u8], _state: &'s Self::State) -> Result<(), EtError> {
let metadata = get_metadata(rb)?;
let metadata = get_metadata(rb, true)?;
// offset the current time back one step so it'll be right after the first time that parse
self.cur_time = metadata.start_time - CHEMSTATION_TIME_STEP;
self.cur_intensity = 0.;
Expand Down Expand Up @@ -312,7 +319,7 @@ impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationMsState {
}

fn get(&mut self, buffer: &'b [u8], _state: &'s Self::State) -> Result<(), EtError> {
let metadata = get_metadata(buffer)?;
let metadata = get_metadata(buffer, true)?;
let n_scans = u32::extract(&buffer[278..], &Endian::Big)? as usize;

self.n_scans_left = n_scans;
Expand Down Expand Up @@ -430,7 +437,7 @@ impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationMwdState {
}

fn get(&mut self, buf: &'b [u8], _state: &'s Self::State) -> Result<(), EtError> {
let metadata = get_metadata(buf)?;
let metadata = get_metadata(buf, true)?;

self.n_wvs_left = 0;
// offset the current time back one step so it'll be right after the first time that parse
Expand Down Expand Up @@ -514,55 +521,134 @@ impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationMwdRecord<'s> {
}
}

// scratch with offsets for info in different files

// FID - 02 38 31 00 ("81") (missing 01 38 00 00)
// MWD - 02 33 30 00 ("30")
// MS - 01 32 00 00 ("2") (missing 02 32 30?)
// (possibly also 03 31 37 39 and 03 31 38 31 ?)
// - 5 - "GC / MS Data File" or other?
// - 24 - Sample Name
// - 86 - Sample Description?
// - 148 - Operator Name
// - 178 - Run Date
// - 208 - Instrument Name
// - 218 - LC or GC
// - 228 - Method Name
// - 252 - Sequence? (u16)
// - 254 - Vial? (u16)
// - 256 - Replicate? (u16)
// - 260 - TIC Offset? (i32)
// * 264 - FID/MWD - 512 byte header chunks // 2 + 1
// - 264 - MS - total header bytes // 2 + 1
// - 272 - Normalization offset? (i32)
// * 282 - Start Time (i32)
// * 286 - End Time (i32)
// M 322 - Collection software?
// M 355 - Software Version?
// - 368 - "GC / MS Data File" as utf16
// M 405 - Another Version?
// - 448 - MS - Instrument name as utf16
// - 530 - lower end of mz/wv range?
// - 532 - upper end of mz/wv range?
// - 576 - MS - "GC"
// - 580 - Units
// M 596 - Channel Info (str)
// - 616 - MS - Method directory
// - 644 - (f32/64?)
// - 5768 - MS - data start (GC)

// LC - 03 31 33 31 ("131")
// * 264 - 512 byte header chunks // 2 + 1
// ? 278 - Number of Records
// - 858 - Sample Name
// - 1880 - Operator Name
// - 2391 - Run Date
// - 2492 - Instrument Name
// - 2533 - "LC"
// - 2574 - Method Name
// - 3093 - Units
// 4096 - data start?
#[derive(Clone, Debug, Default)]
/// Internal state for the `ChemstationDadRecord` parser
///
/// Tracks the position inside the stream of scans: each scan starts with a
/// small header (time, starting wavelength, wavelength step) and is followed
/// by a run of intensity values, one per wavelength.
pub struct ChemstationDadState {
/// Number of scan headers that have not been read yet; initialized from the
/// big-endian `u32` at offset 278 of the file header.
n_scans_left: usize,
/// Bytes of intensity data still unread in the scan currently being parsed;
/// `0` means the next bytes in the stream are a scan header.
n_bytes_left: usize,
/// Raw time value of the current scan as read from its header (records
/// report this divided by 60,000).
cur_time: f64,
/// Running raw intensity for the current scan; reset to 0 at each scan
/// header, then updated by deltas or replaced by an absolute value (records
/// report this divided by 2,000).
cur_intensity: f64,
/// Raw wavelength of the current point (records report this divided by 20).
cur_wv: f64,
/// Raw wavelength increment added to `cur_wv` for every point after the
/// first one in a scan.
wv_step: f64,
/// Metadata parsed from the file header by `get_metadata`.
metadata: ChemstationMetadata,
}

impl StateMetadata for ChemstationDadState {
    /// Key/value view of the metadata that was parsed from the file header.
    fn metadata(&self) -> BTreeMap<String, Value> {
        BTreeMap::from(&self.metadata)
    }

    /// Column names, in the same order as the fields of `ChemstationDadRecord`.
    fn header(&self) -> Vec<&str> {
        ["time", "wavelength", "intensity"].to_vec()
    }
}

impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationDadState {
    type State = ();

    /// Measure the file header with `read_agilent_header` and mark that many
    /// bytes as consumed. Always reports that a state was found.
    fn parse(
        rb: &[u8],
        _eof: bool,
        consumed: &mut usize,
        _state: &mut Self::State,
    ) -> Result<bool, EtError> {
        let header_len = read_agilent_header(rb, false)?;
        *consumed += header_len;
        Ok(true)
    }

    /// Fill in the metadata and the scan count from the header bytes.
    ///
    /// `get_metadata` is called with `has_signal = false`, i.e. the signal
    /// name and the offset/multiplier corrections are not read for this format.
    fn get(&mut self, buf: &'b [u8], _state: &'s Self::State) -> Result<(), EtError> {
        // Run both fallible reads first so `self` is only touched on success.
        let parsed_metadata = get_metadata(buf, false)?;
        // The scan count is a big-endian u32 stored at offset 278.
        let scan_count = u32::extract(&buf[278..], &Endian::Big)?;

        self.n_scans_left = scan_count as usize;
        self.metadata = parsed_metadata;
        Ok(())
    }
}

#[derive(Clone, Copy, Debug, Default)]
/// A single (time, wavelength, intensity) point from a Chemstation DAD
/// (UV-visible array) file
pub struct ChemstationDadRecord {
/// The time the point was recorded at (the raw scan time divided by 60,000)
pub time: f64,
/// The wavelength the point was recorded at (the raw wavelength divided by 20)
pub wavelength: f64,
/// The intensity recorded (the raw intensity divided by 2,000)
pub intensity: f64,
}

impl_record!(ChemstationDadRecord: time, wavelength, intensity);

impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationDadRecord {
    type State = ChemstationDadState;

    /// Advance the parser by one (time, wavelength, intensity) point.
    ///
    /// The data section is a sequence of scans. Every scan starts with a
    /// 22-byte little-endian header:
    ///
    /// * `u16` scan type (`67` for a data scan)
    /// * `u16` scan length in bytes, including these 22 header bytes
    /// * `u32` raw time
    /// * `u16` raw starting wavelength
    /// * `u16` raw ending wavelength (ignored)
    /// * `u16` raw wavelength step
    /// * 8 bytes that are skipped
    ///
    /// The header is followed by intensities, one per wavelength, stored as
    /// `i16` deltas from the previous value. The sentinel `-32768` means the
    /// next `i32` is an absolute value instead of a delta.
    ///
    /// Returns `Ok(true)` when a point was read (with `consumed` advanced and
    /// `state` updated) and `Ok(false)` when there is nothing left to read.
    /// Errors from `extract` are propagated; in that case neither `consumed`
    /// nor `state` is modified, so the same call can safely be repeated.
    fn parse(
        rb: &[u8],
        _eof: bool,
        consumed: &mut usize,
        state: &mut Self::State,
    ) -> Result<bool, EtError> {
        // `n_scans_left` is decremented as soon as a scan *header* is read, so
        // it already reaches zero while the last scan still has points in it.
        // Only stop when that scan has been fully drained as well; otherwise
        // everything after the first point of the final scan would be dropped.
        if state.n_scans_left == 0 && state.n_bytes_left == 0 {
            return Ok(false);
        }
        let con = &mut 0;
        // Work on local copies and commit them at the very end so that a
        // failing `extract` (presumably an incomplete buffer that the caller
        // retries after refilling) cannot leave the state half-updated, e.g.
        // with the wavelength advanced twice for a single point.
        let mut n_scans_left = state.n_scans_left;
        let mut n_bytes_left = state.n_bytes_left;
        let mut cur_time = state.cur_time;
        let mut cur_wv = state.cur_wv;
        let mut wv_step = state.wv_step;
        let mut cur_intensity = state.cur_intensity;
        if n_bytes_left == 0 {
            let scan_type = extract::<u16>(rb, con, &mut Endian::Little)?;
            if scan_type != 67 {
                // i'm not sure we ever hit this (tracking the n_scans_left should prevent it), but
                // sometimes there's a different type of scan (68) at the end which starts a stream
                // of u16, u32, u32 data; the u32's appear to both increment separately and the u16
                // is either 80 or 81 ~95% of the time and a number in the 50s-60s otherwise.
                return Ok(false);
            }
            // the stored length includes the 22 header bytes read in this branch
            n_bytes_left =
                usize::from(extract::<u16>(rb, con, &mut Endian::Little)?.saturating_sub(22));
            cur_time = f64::from(extract::<u32>(rb, con, &mut Endian::Little)?);
            cur_wv = f64::from(extract::<u16>(rb, con, &mut Endian::Little)?);
            let _ = extract::<u16>(rb, con, &mut Endian::Little)?; // the end wavelength
            wv_step = f64::from(extract::<u16>(rb, con, &mut Endian::Little)?);
            let _ = extract::<&[u8]>(rb, con, &mut 8)?;
            // intensities are cumulative within a scan, so restart from zero
            cur_intensity = 0.;
            if n_bytes_left == 0 {
                // TODO: consume the rest of the file so this can't accidentally repeat?
                return Ok(false);
            }
            n_scans_left -= 1;
        } else {
            // same scan as the previous point: just step to the next wavelength
            cur_wv += wv_step;
        }

        let intensity: i16 = extract(rb, con, &mut Endian::Little)?;
        if intensity == -32768 {
            // sentinel: an absolute i32 value follows (2 + 4 bytes in total)
            cur_intensity = f64::from(extract::<i32>(rb, con, &mut Endian::Little)?);
            n_bytes_left = n_bytes_left.saturating_sub(6);
        } else {
            // regular point: a 2-byte delta from the previous intensity
            cur_intensity += f64::from(intensity);
            n_bytes_left = n_bytes_left.saturating_sub(2);
        }

        // every read succeeded; publish the new position in one go
        state.n_scans_left = n_scans_left;
        state.n_bytes_left = n_bytes_left;
        state.cur_time = cur_time;
        state.cur_wv = cur_wv;
        state.wv_step = wv_step;
        state.cur_intensity = cur_intensity;
        *consumed += *con;
        Ok(true)
    }

    /// Convert the raw values held in `state` into the record's fields by
    /// applying the fixed divisors (20 for wavelength, 60,000 for time and
    /// 2,000 for intensity).
    fn get(&mut self, _rb: &'b [u8], state: &'s Self::State) -> Result<(), EtError> {
        self.wavelength = state.cur_wv / 20.;
        self.time = state.cur_time / 60_000.;
        self.intensity = state.cur_intensity / 2000.;
        Ok(())
    }
}

// Declares `ChemstationDadReader` through the crate's `impl_reader!` macro,
// wiring it to `ChemstationDadRecord` and `ChemstationDadState`.
// NOTE(review): the macro definition is not visible here — presumably the
// arguments are (reader name, record type, returned record type, state type,
// state parameters); confirm against the macro before relying on this.
impl_reader!(
ChemstationDadReader,
ChemstationDadRecord,
ChemstationDadRecord,
ChemstationDadState,
()
);
impl_reader!(
ChemstationFidReader,
ChemstationFidRecord,
Expand Down
14 changes: 0 additions & 14 deletions entab/src/parsers/agilent/chemstation_new.rs
Original file line number Diff line number Diff line change
Expand Up @@ -224,20 +224,6 @@ impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationUvRecord {
}
}

// scratch with offsets for info in different files

// LC - 03 31 33 31 ("131")
// * 264 - 512 byte header chunks // 2 + 1
// ? 278 - Number of Records
// - 858 - Sample Name
// - 1880 - Operator Name
// - 2391 - Run Date
// - 2492 - Instrument Name
// - 2533 - "LC"
// - 2574 - Method Name
// - 3093 - Units
// 4096 - data start?

impl_reader!(
ChemstationUvReader,
ChemstationUvRecord,
Expand Down
6 changes: 2 additions & 4 deletions entab/src/parsers/agilent/mod.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
// TODO: finish and reenable this
// /// Readers for instrument telemetry data generated by Chemstation
// pub mod chemstation_reg;
/// Readers for formats generated by the GC/LC control software Chemstation
pub mod chemstation;
/// Readers for newer formats generated by the GC/LC control software Chemstation
pub mod chemstation_new;
// /// Reader for Chemstation's logging files
// TODO: finish and reenable this
// /// Readers for instrument telemetry data generated by Chemstation
// pub mod chemstation_reg;
/// Readers for formats generated by the GC/LC control software Masshunter
#[cfg(feature = "std")]
Expand Down
Loading

0 comments on commit b82f1e3

Please sign in to comment.