# CHI File Parsing

> Parses files from a CH Instruments potentiostat

In [None]:
#| default_exp chi

In [None]:
#| hide
from nbdev.showdoc import *
from fastcore.all import *

CHI text file formats have a few features we want to handle:
1. Included sections. CHI allows the presence or absence of each of these four: memo, parameters, results, numeric data
2. Separator: comma, tab (and also space and linefeed, although these are less common and aren't planned features)
3. Different column formats (multichannel results in different columns, multichannel files are currently not supported)
4. Empty time column: some files have an empty time column for some reason

Let's look at an example file to get started.

In [None]:
import os

In [None]:
test_root = "../test_files"

test_dpv_dir = os.path.join(test_root, "DPVs")
test_cv_dir = os.path.join(test_root, "CVs")
os.listdir(test_dpv_dir), os.listdir(test_cv_dir)

(['MPRN_comma.txt',
  'PN_tab_time.txt',
  'MPRN_tab.txt',
  'ps_single.csv',
  'MPN_tab.txt',
  'MPRN_tab_time.txt',
  'ps_multi.csv'],
 ['MPRN_charge.txt'])

In [None]:
with open(os.path.join(test_dpv_dir, "MPRN_comma.txt")) as f:
    lines = f.read().splitlines()

lines[:30]

['Feb. 6, 2025   17:35:06',
 'Differential Pulse Voltammetry',
 'File: iaa_7.8125um_mnge_gce_2_250206.bin',
 'Data Source:  Experiment',
 'Instrument Model:  CHI1040C',
 'Header: ',
 'Note: ',
 '',
 'Init E (V) = 0.2',
 'Final E (V) = 1',
 'Incr E (V) = 0.002',
 'Amplitude (V) = 0.025',
 'Pulse Width (sec) = 0.05',
 'Sample Width (sec) = 0.0167',
 'Pulse Period (sec) = 0.5',
 'Quiet Time (sec) = 2',
 'Sensitivity (A/V) = 1e-6',
 '',
 'Results:',
 '',
 'Channel 1:',
 'Ep = 0.698V',
 'ip = -1.338e-6A',
 'Ap = -2.077e-7VA',
 '',
 'Potential/V, Current/A',
 '',
 '0.202, -6.596e-7',
 '0.204, -6.557e-7',
 '0.206, -6.549e-7']

Now we can break down the sections:

In [None]:
memo = lines[:7]
memo

['Feb. 6, 2025   17:35:06',
 'Differential Pulse Voltammetry',
 'File: iaa_7.8125um_mnge_gce_2_250206.bin',
 'Data Source:  Experiment',
 'Instrument Model:  CHI1040C',
 'Header: ',
 'Note: ']

In [None]:
params = lines[8:17] # parameters
params

['Init E (V) = 0.2',
 'Final E (V) = 1',
 'Incr E (V) = 0.002',
 'Amplitude (V) = 0.025',
 'Pulse Width (sec) = 0.05',
 'Sample Width (sec) = 0.0167',
 'Pulse Period (sec) = 0.5',
 'Quiet Time (sec) = 2',
 'Sensitivity (A/V) = 1e-6']

In [None]:
results = lines[18:24] # results
results

['Results:',
 '',
 'Channel 1:',
 'Ep = 0.698V',
 'ip = -1.338e-6A',
 'Ap = -2.077e-7VA']

In [None]:
numeric_data = lines[25:]
numeric_data[:5]

['Potential/V, Current/A',
 '',
 '0.202, -6.596e-7',
 '0.204, -6.557e-7',
 '0.206, -6.549e-7']

First, let's turn the date string within a memo into a proper datetime object

In [None]:
#| export
from datetime import datetime

def parse_datetime(
    chi_timestamp: str
) -> datetime:
    """
    Turn a CHI timestamp into a python datetime object.
    If parsing fails, this function returns `None`.
    """
    
    # try multiple date formats, because CHI has a nonstandard way of displaying the month. It'd be better to directly obtain the exact strings for each month, but this works for now 
    date_formats = [
        "%B %d, %Y %H:%M:%S",      # "July 8, 2025 15:52:52"
        "%b. %d, %Y %H:%M:%S",     # "Sept. 29, 2022 16:23:14"
        "%b %d, %Y %H:%M:%S",      # "Sep 29, 2022 16:23:14"
    ]

    timestamp = None
    for date_format in date_formats:
        try:
            timestamp = datetime.strptime(chi_timestamp, date_format)
            break
        except ValueError:
            continue

    return timestamp

In [None]:
# abbreviated month with a period, and extra whitespace before the time
assert parse_datetime("Feb. 6, 2025   17:35:06") == datetime(2025, 2, 6, 17, 35, 6)
# unparseable input yields None (identity check, since None is a singleton)
assert parse_datetime("invalid") is None
assert parse_datetime("Oct. 10, 2022   09:30:07") == datetime(2022, 10, 10, 9, 30, 7)
assert parse_datetime("Oct. 10, 2022   12:12:21") == datetime(2022, 10, 10, 12, 12, 21)
# fully spelled-out month
assert parse_datetime("July 19, 2022   10:35:26") == datetime(2022, 7, 19, 10, 35, 26)

Now notice that most metadata in this document is either colon separated for a string (e.g. `File: iaa_7.8125um_mnge_gce_2_250206.bin`, `Instrument Model:  CHI1040C`) or equals separated for a float (`Init E (V) = 0.2`, `Quiet Time (sec) = 2`) so let's create some parsers for those.

In [None]:
#| export
# Because colon parsing and equals parsing are very similar, let's create a generalized function
def parse_line(line, sep):
    """
    Parses a `sep` separated line into its two halves.

    Returns a two-element tuple: the text before the first occurrence of `sep`
    and the text after it, both stripped of surrounding whitespace.
    For instance, `Instrument Model:  CHI1040C` with `sep=":"` becomes
    `("Instrument Model", "CHI1040C")`. Only the first occurrence splits the line,
    so later occurrences stay in the second half.

    Raises a `ValueError` if `sep` is empty or does not occur in `line`.
    """
    if not sep:
        raise ValueError("Separator must be a non-empty string")

    # partition splits on the first occurrence and consumes the whole separator,
    # so multi-character separators work as well as single characters
    first_part, found, second_part = line.partition(sep)
    if not found:
        raise ValueError(f"Separator {sep!r} not found in line: {line!r}")

    return first_part.strip(), second_part.strip()

In [None]:
#| export
# the colon parser is simply a line parser with a colon separator
def parse_colon(line):
    """
    Parse a colon line, e.g. `Instrument Model:  CHI1040C` becomes
    `("Instrument Model", "CHI1040C")`.

    Splits on the first colon only. Raises a `ValueError` if the line has no colon.
    """
    return parse_line(line, ":")

In [None]:
assert parse_colon("File: iaa_7.8125um_mnge_gce_2_250206.bin") == ("File", "iaa_7.8125um_mnge_gce_2_250206.bin")
assert parse_colon("Instrument Model:  CHI1040C") == ('Instrument Model', 'CHI1040C')
assert parse_colon("Header: ") == ('Header', '')
assert parse_colon("Note: This one contains a colon: for testing") == ('Note', 'This one contains a colon: for testing')
test_fail(lambda: parse_colon("no colon in this one"), exc=ValueError)

In [None]:
#| export
# equals parsing additionally converts the second number to a float
def parse_equals(
    line,
) -> tuple[str, str | float]:
    """
    Split a `name = value` line, e.g. `Init E (V) = 0.5`, into `(name, value)`.

    The value is returned as a float when `float()` accepts it; otherwise the
    stripped text is returned unchanged (e.g. `Init P/N = P` gives `"P"`).
    A line without `"="` raises a `ValueError`.
    """
    name, raw_value = parse_line(line, "=")

    try:
        return name, float(raw_value)
    except ValueError:
        # not numeric: keep the original text
        return name, raw_value

In [None]:
assert parse_equals("Init E (V) = 0.5") == ('Init E (V)', 0.5)
assert parse_equals("Final E (V) = 1") == ('Final E (V)', 1.0)
assert parse_equals("Sample Width (sec) = 0.0167") == ('Sample Width (sec)', 0.0167)
assert parse_equals("Sensitivity (A/V) = 1e-6") == ('Sensitivity (A/V)', 1e-06)
test_fail(lambda: parse_equals("no equals in this one"), exc=ValueError)

now we have everything we need to parse the memo

In [None]:
memo

['Feb. 6, 2025   17:35:06',
 'Differential Pulse Voltammetry',
 'File: iaa_7.8125um_mnge_gce_2_250206.bin',
 'Data Source:  Experiment',
 'Instrument Model:  CHI1040C',
 'Header: ',
 'Note: ']

In [None]:
#| export
def parse_memo(memo_lines: list[str]) -> dict[str, str]:
    """
    Parses the seven lines of a CHI memo into a dictionary, e.g.

    ```python
    ['Feb. 6, 2025   17:35:06',
     'Differential Pulse Voltammetry',
     'File: iaa_7.8125um_mnge_gce_2_250206.bin',
     'Data Source:  Experiment',
     'Instrument Model:  CHI1040C',
     'Header: ',
     'Note: ']
    ```

    into

    ```python
    {'timestamp': '2025-02-06T17:35:06',
     'technique': 'Differential Pulse Voltammetry',
     'file': 'iaa_7.8125um_mnge_gce_2_250206.bin',
     'instrument_model': 'CHI1040C'}
    ```

    Raises a `ValueError` if `memo_lines` does not have exactly seven entries,
    if the timestamp cannot be parsed, or if the file / instrument model lines
    do not contain a colon.
    """

    # Let's do a simple format check first:
    if len(memo_lines) != 7:
        raise ValueError(f"Expected the memo to be a list of seven strings. Instead received: {memo_lines!r}")

    # parse timestamp; parse_datetime signals failure by returning None
    timestamp = parse_datetime(memo_lines[0])
    if timestamp is None:
        raise ValueError(f"Error parsing memo timestamp: {memo_lines[0]!r}")

    return {
        "timestamp": timestamp.isoformat(),
        "technique": memo_lines[1],
        "file": parse_colon(memo_lines[2])[1],
        # skip memo_lines[3], which is data source. it should always be experiment
        "instrument_model": parse_colon(memo_lines[4])[1],
        # skip memo_lines[5], which is header, because nobody uses it
        # skip memo_lines[6], which is note, because nobody uses it
    }

In [None]:
assert parse_memo(memo) == {'timestamp': '2025-02-06T17:35:06',
 'technique': 'Differential Pulse Voltammetry',
 'file': 'iaa_7.8125um_mnge_gce_2_250206.bin',
 'instrument_model': 'CHI1040C'}

assert parse_memo("""Oct. 10, 2022   12:12:21
Differential Pulse Voltammetry
File: pg62.5um_dapg0um_m9_gce_dpv_221010.bin
Data Source:  Experiment
Instrument Model:  CHI1040C
Header: 
Note: """.splitlines()) == {'timestamp': '2022-10-10T12:12:21',
 'technique': 'Differential Pulse Voltammetry',
 'file': 'pg62.5um_dapg0um_m9_gce_dpv_221010.bin',
 'instrument_model': 'CHI1040C'}

test_fail(lambda: parse_memo("Not a memo"), exc=ValueError)
test_fail(lambda: parse_memo(["Feb. 6, 2025   17:35:06"] + [""]*6))

Parsing the results section is currently not a supported feature. Usually, this information isn't important in my analysis.

Now we can parse parameters. Each line should contain an equals sign, and we can just extract the value with parse_equals

In [None]:
#| export
def parse_parameters(lines: list[str]) -> dict[str, str | float]:
    """
    Parse a block of `name = value` parameter lines into a dictionary.

    Every line goes through `parse_equals`, so numeric values become floats and
    anything else stays a string. If a name appears more than once, the last
    occurrence wins.
    """
    return dict(parse_equals(line) for line in lines)

In [None]:
lines = """Init E (V) = -0.9
High E (V) = 0.9
Low E (V) = -0.9
Init P/N = P
Scan Rate (V/s) = 0.025
Segment = 8
Sample Interval (V) = 0.001
Quiet Time (sec) = 2
Sensitivity (A/V) = 1e-5""".splitlines()

assert parse_parameters(lines) == {'Init E (V)': -0.9,
 'High E (V)': 0.9,
 'Low E (V)': -0.9,
 'Init P/N': 'P',
 'Scan Rate (V/s)': 0.025,
 'Segment': 8.0,
 'Sample Interval (V)': 0.001,
 'Quiet Time (sec)': 2.0,
 'Sensitivity (A/V)': 1e-05}

Now we need to extract column names from the numeric data. There's a catch here, in that the separator could be one of a few options. These are both valid:
```python
Potential/V	Current/A

0.204	-5.151e-7
0.208	-5.008e-7
0.212	-4.915e-7
```

```python
Potential/V, Current/A

0.202, -6.596e-7
0.204, -6.557e-7
0.206, -6.549e-7
```

So let's first detect the separator for this section. The assumption is that `Potential/V` is always the first column, so we can just look at the character directly after it.

In [None]:
#| export
def detect_column_sep(
    line: str # line containing columns
) -> str: # separator, usually either ", " or a tab
    """
    Detects the separator for the line containing columns within a CHI numeric data section.

    e.g. `Potential/V, Current/A` returns `", "` (CHI's comma format adds a space)

    e.g. `Potential/V,Current/A` returns `","`

    e.g. a tab separated column line returns the tab character

    Raises a `ValueError` if the line does not start with `Potential/V` or if
    nothing follows it, in which case there is no separator to detect.
    """
    first_col = "Potential/V"
    if not line.startswith(first_col):
        raise ValueError(f"Column line expected to start with 'Potential/V': {line!r}")

    rest = line[len(first_col):]
    if not rest:
        raise ValueError(f"No separator found after 'Potential/V' in column line: {line!r}")

    # CHI's comma separated format adds a space after the comma; only report the
    # space as part of the separator when it is really there
    if rest.startswith(", "):
        return ", "

    return rest[0]

In [None]:
tab_sep_example = """Potential/V	Current/A

0.204	-5.151e-7
0.208	-5.008e-7
0.212	-4.915e-7""".splitlines()

comma_sep_example = """Potential/V, Current/A

0.202, -6.596e-7
0.204, -6.557e-7
0.206, -6.549e-7""".splitlines()

assert detect_column_sep(tab_sep_example[0]) == "\t"
assert detect_column_sep(comma_sep_example[0]) == ", "
test_fail(lambda: detect_column_sep("this is not a column line"), exc=ValueError)

Now we can parse the whole numeric data section

In [None]:
#| export
import numpy as np

def parse_numeric_data(
    lines: list[str] # data lines, starting with the column header
) -> dict[str, np.ndarray]: # a dictionary of column -> numpy float array
    """
    Parses numerical data within a CHI data file.

    `lines[0]` is the column header, `lines[1]` must be blank, and the remaining
    lines are rows of numbers using the same separator as the header. Blank lines
    among or after the rows are ignored.

    Raises a `ValueError` if the header is missing or malformed, if the line
    after the header is not blank, if there are no data rows, if a value is not
    numeric, or if the number of columns does not match the data.
    """
    if not lines:
        raise ValueError("Expected a column header line, but received no lines")

    # parse column line
    col_line = lines[0]
    sep = detect_column_sep(col_line)
    cols = col_line.split(sep)

    # next line should be empty
    if len(lines) > 1 and lines[1].strip() != "":
        raise ValueError(f"Expected an empty line after the column header, but found: {lines[1]!r}")

    # parse the data columns, skipping blank lines (e.g. trailing newlines at the end of a file)
    rows = [line.split(sep) for line in lines[2:] if line.strip()]
    if not rows:
        raise ValueError(f"No numeric data rows found after the column header: {col_line!r}")
    data = np.array(rows, dtype=float) # raises ValueError on non-numeric or ragged rows

    # sometimes, CHI adds a Time/s column even when there's no time data. in that case, eliminate time
    if len(cols) == data.shape[1]+1 and "Time/s" in cols: # there's one more column than the data indicates and one of the columns is time
        cols.remove("Time/s")

    if len(cols) != data.shape[1]:
        raise ValueError(f"The number of columns does not match the data. Columns: {cols!r}; Data: {rows[0]}")

    return {col: data[:, i] for i, col in enumerate(cols)}

In [None]:
# tab sep example
test(parse_numeric_data("""Potential/V	Current/A

0.204	-5.151e-7
0.208	-5.008e-7
0.212	-4.915e-7""".splitlines()), {'Potential/V': np.array([0.204, 0.208, 0.212]),
 'Current/A': np.array([-5.151e-07, -5.008e-07, -4.915e-07])}, all_equal)

# comma sep example
test(parse_numeric_data("""Potential/V, Current/A

0.202, -6.596e-7
0.204, -6.557e-7
0.206, -6.549e-7""".splitlines()), {'Potential/V': np.array([0.202, 0.204, 0.206]),
 'Current/A': np.array([-6.596e-07, -6.557e-07, -6.549e-07])}, all_equal)

# tab sep with faulty time column
test(parse_numeric_data("""Potential/V	Current/A	Time/s

0.504	-4.187e-7
0.508	-3.890e-7
0.512	-3.711e-7
0.516	-3.597e-7""".splitlines()), {'Potential/V': np.array([0.504, 0.508, 0.512, 0.516]),
 'Current/A': np.array([-4.187e-07, -3.890e-07, -3.711e-07, -3.597e-07])}, all_equal)

# number of columns mismatched
test_fail(lambda: parse_numeric_data("""Potential/V	Current/A	Imaginary/W

0.504	-4.187e-7
0.508	-3.890e-7
0.512	-3.711e-7
0.516	-3.597e-7""".splitlines()), exc=ValueError)

Now that we can parse all the sections, we just need to auto-detect them within files. Sections are always in order, so we can detect whether the memo is present by checking that the third line (index 2) starts with `"File:"` and the fourth line (index 3) starts with `"Data Source:"`.

In [None]:
#| export
def contains_memo(lines: list[str]) -> bool:
    """
    Returns `True` if `lines` looks like it begins with a CHI memo.

    A memo is recognised by its third line starting with `"File:"` and its
    fourth line starting with `"Data Source:"`. Input with fewer than four
    lines cannot contain a memo, so it returns `False`.
    """
    if len(lines) < 4:
        return False

    file_line = lines[2].startswith("File:")
    data_source_line = lines[3].startswith("Data Source:")
    return file_line and data_source_line

In [None]:
assert contains_memo("""Oct. 4, 2024   11:25:46
Cyclic Voltammetry
File: 241004_3.125um_curcumin_incitratenah2po4buffer_ph3_50um_methylv
Data Source:  Experiment
Instrument Model:  CHI1040C
Header: 
Note: """.splitlines()) == True

assert contains_memo("""Init E (V) = 0.5
Final E (V) = 1
Incr E (V) = 0.002
Amplitude (V) = 0.025
Pulse Width (sec) = 0.05
Sample Width (sec) = 0.0167
Pulse Period (sec) = 0.5
Quiet Time (sec) = 2
Sensitivity (A/V) = 1e-6""".splitlines()) == False

we can tell if the next section contains parameters by checking if it starts with `"Init E (V) = "`, which is the first parameter for both DPV and CV

In [None]:
#| export
def contains_parameters(lines) -> bool:
    """
    Returns `True` if `lines` begins with a parameters section, i.e. its first
    line starts with `"Init E (V) = "`. Empty input returns `False`.
    """
    # bool(lines) guards the lines[0] lookup against empty input
    return bool(lines) and lines[0].startswith("Init E (V) = ")

In [None]:
assert contains_parameters("""Init E (V) = -0.9
High E (V) = 0.9
Low E (V) = -0.9
Init P/N = P""".splitlines())

assert not contains_parameters("""
Init E (V) = -0.9
High E (V) = 0.9
Low E (V) = -0.9
Init P/N = P
""".splitlines())

assert not contains_parameters("""Potential/V, Current/A, Charge/C, Time/s

-0.900, 1.305e-5, 5.218e-7, 4.000e-2
-0.899, 1.291e-5, 1.038e-6, 8.000e-2
-0.898, 1.277e-5, 1.549e-6, 1.200e-1""".splitlines())

and we can find the end of the parameters lines by looking for the `"Sensitivity (A/V) = "` line.

In [None]:
#| export
def find_parameters_end(
    lines: list[str], # lines beginning at the first parameter line
    max_lines: int = 20, # how many leading lines to search for the sensitivity line
) -> int: # returns the index of line after the last parameter line
    """
    Finds where the parameters section ends by looking for the
    `"Sensitivity (A/V) = "` line within the first `max_lines` lines.

    Raises a `TypeError` if `lines` is not a list, and a `ValueError` if `lines`
    does not start with parameters or the sensitivity line is not found.
    """
    if not isinstance(lines, list):
        raise TypeError(f"Expected a list of strings, got {type(lines).__name__}")

    if not contains_parameters(lines):
        raise ValueError(f"Expected lines to start with parameters, received: {lines[:5]!r}")

    # the parameters section is short, so only the first `max_lines` lines are searched
    for i, line in enumerate(lines[:max_lines]):
        if line.startswith("Sensitivity (A/V) = "):
            return i+1

    raise ValueError(f"Never found 'Sensitivity (A/V) = ' in lines: {lines[:max_lines]!r}")

In [None]:
assert find_parameters_end("""Init E (V) = -0.9
High E (V) = 0.9
Low E (V) = -0.9
Init P/N = P
Scan Rate (V/s) = 0.025
Segment = 8
Sample Interval (V) = 0.001
Quiet Time (sec) = 2
Sensitivity (A/V) = 1e-5""".splitlines()) == 9

# remove sensitivity so it fails
test_fail(lambda: find_parameters_end("""Init E (V) = -0.9
High E (V) = 0.9
Low E (V) = -0.9
Init P/N = P
Scan Rate (V/s) = 0.025
Segment = 8
Sample Interval (V) = 0.001
Quiet Time (sec) = 2

Segment 1:
""".splitlines()))

In [None]:
#| export
def find_start_of_numeric_data(lines: list[str]) -> int:
    """
    Return the index of the column header line, i.e. the first line that
    starts with `Potential/V`. Raises a `ValueError` when no line does.
    """
    header_index = next(
        (index for index, text in enumerate(lines) if text.startswith("Potential/V")),
        None,
    )

    if header_index is None:
        raise ValueError("No line found starting with 'Potential/V'")

    return header_index

In [None]:
assert find_start_of_numeric_data(['Results:',
 '',
 'Channel 1:',
 'Ep = 0.698V',
 'ip = -1.338e-6A',
 'Ap = -2.077e-7VA',
 '',
 'Potential/V, Current/A',
 '',
 '0.202, -6.596e-7',
 '0.204, -6.557e-7',
 '0.206, -6.549e-7',
 '0.208, -6.547e-7']) == 7

test_fail(lambda: find_start_of_numeric_data(['Results:',
 '',
 'Channel 1:',
 'Ep = 0.698V',
 'ip = -1.338e-6A',
 'Ap = -2.077e-7VA']), exc=ValueError)

In [None]:
#| export
def parse_chi_file(contents: str) -> dict:
    """
    Parses the text of a CHI file into its sections.

    Returns a dictionary with three keys:

    - `"memo"`: the output of `parse_memo`, or `{}` if the file has no memo
    - `"parameters"`: the output of `parse_parameters`, or `{}` if the file has no parameters
    - `"numeric_data"`: the output of `parse_numeric_data`, or `{}` if no numeric
      data could be found or parsed

    The results section is skipped. Empty or very short input yields empty sections.
    """
    lines = contents.splitlines()

    # extract the memo, if it's present. The length check keeps short or empty
    # files from failing inside contains_memo, which indexes lines[2] and lines[3]
    if len(lines) >= 4 and contains_memo(lines):
        memo = parse_memo(lines[:7]) # the memo is always the first 7 lines
        lines_left = lines[7:]
    else:
        memo = {}
        lines_left = lines

    # skip the blank separator line(s) in front of the next section
    first_content = next((i for i, line in enumerate(lines_left) if line.strip()), len(lines_left))
    lines_left = lines_left[first_content:]

    # now extract parameters
    if lines_left and contains_parameters(lines_left):
        end_line = find_parameters_end(lines_left)
        parameters = parse_parameters(lines_left[:end_line])
        lines_left = lines_left[end_line:]
    else:
        parameters = {}

    # now extract numerical data. Anything in between (blank lines, the results
    # section) is skipped by searching for the column header line.
    try:
        numeric_data_index = find_start_of_numeric_data(lines_left)
        numeric_data = parse_numeric_data(lines_left[numeric_data_index:])
    except ValueError:
        # best effort: numeric data is optional in CHI files, so a missing (or
        # unparseable) numeric section yields an empty result instead of an error
        numeric_data = {}

    return {
        "memo": memo,
        "parameters": parameters,
        "numeric_data": numeric_data
    }

Now for the DPV tests

In [None]:
# MPN_tab test
with open(os.path.join(test_dpv_dir, "MPN_tab.txt")) as f:
    lines = f.read()

parsed_data = parse_chi_file(lines)
assert parsed_data["memo"]["timestamp"] == "2022-10-10T09:30:07"
assert parsed_data["parameters"]["Init E (V)"] == 0.2
assert parsed_data["parameters"]["Final E (V)"] == 0.9
assert parsed_data["numeric_data"]["Current/A"][0] == -2.681e-7

In [None]:
# MPRN_comma test
with open(os.path.join(test_dpv_dir, "MPRN_comma.txt")) as f:
    lines = f.read()

parsed_data = parse_chi_file(lines)
assert parsed_data["memo"]["timestamp"] == "2025-02-06T17:35:06"
assert parsed_data["parameters"]["Init E (V)"] == 0.2
assert parsed_data["parameters"]["Final E (V)"] == 1
assert parsed_data["numeric_data"]["Current/A"][0] == -6.596e-07

In [None]:
# MPRN_tab test
with open(os.path.join(test_dpv_dir, "MPRN_tab.txt")) as f:
    lines = f.read()

parsed_data = parse_chi_file(lines)
assert parsed_data["memo"]["timestamp"] == "2022-10-10T12:12:21"
assert parsed_data["parameters"]["Init E (V)"] == 0.2
assert parsed_data["parameters"]["Final E (V)"] == 0.9
assert parsed_data["numeric_data"]["Current/A"][0] == -5.151e-7

In [None]:
# MPRN_tab_time test
with open(os.path.join(test_dpv_dir, "MPRN_tab_time.txt")) as f:
    lines = f.read()

parsed_data = parse_chi_file(lines)
assert parsed_data["memo"]["timestamp"] == "2022-07-19T10:35:26"
assert parsed_data["parameters"]["Init E (V)"] == 0.4
assert parsed_data["parameters"]["Final E (V)"] == 1.4
assert parsed_data["numeric_data"]["Current/A"][0] == -5.844e-7

In [None]:
# PN_tab_time test
with open(os.path.join(test_dpv_dir, "PN_tab_time.txt")) as f:
    lines = f.read()

parsed_data = parse_chi_file(lines)
assert parsed_data["memo"] == {}
assert parsed_data["parameters"]["Init E (V)"] == 0.5
assert parsed_data["parameters"]["Final E (V)"] == 1
assert parsed_data["numeric_data"]["Current/A"][0] == -4.187e-7

and the CV tests

In [None]:
with open(os.path.join(test_cv_dir, "MPRN_charge.txt")) as f:
    lines = f.read()

parsed_data = parse_chi_file(lines)
assert parsed_data["memo"]["timestamp"] == "2024-10-04T11:25:46"
assert parsed_data["parameters"]["Init E (V)"] == -0.9
assert parsed_data["parameters"]["High E (V)"] == 0.9
assert parsed_data["numeric_data"]["Current/A"][0] == 1.305e-5
assert parsed_data["numeric_data"]["Charge/C"][0] == 5.218e-7

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()