# NMR Processing Overview

---

1. Split files into different categories.
    1. How many individual fids?
    2. How many array experiments?
    3. How are temperature sets stored?
    4. How are materials stored?
2. Develop / confirm metadata for those categories.
    + Cross reference with documentation provided by Trent.
    + Compare processing demo results to Trent's data. 
    + Meet with Trent to confirm assignments.
3. Prioritize subsets.
4. **Design Bokeh application**
5. Process subsets.

In [1]:
%load_ext autoreload
%autoreload 2

#### Set Local Data Path

---

Since the total available data is around 2 GB, it may be stored in different locations on different machines. Define a base path to the data to simplify this.

In [2]:
# Base path of the raw NMR data on this machine.
# (On the other machine the same data lives under
# "/home/tyler/data/Sep-2016-23Na".)
data_folder = "/home/tylerbiggs/data/Sep-2016-23Na"

# Folder handed to the conversion and processing helpers further down.
processed_data = "/home/tylerbiggs/data/processed_nmr"

In [3]:
import nmrglue as ng
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import itertools
import multiprocessing as mp
import glob
import re
import os
from tqdm import tqdm
%matplotlib inline

In [4]:
from trentnmr import *

# File Structure

---

Output of `tree -I '*.fid'`, used to list all non-fid directories:

```bash
└── Sep-2016-23Na
    ├── 23Na
    │   └── 27Al
    │       ├── 0808G1-0p15M-AlOH3-3M-NaOH-D2O
    │       ├── 0808G1-0p5M-AlOH3-3M-NaOH-D2O
    │       ├── 0808G1-1M-AlOH3-3M-NaOH-D2O
    │       ├── 0819G1-0p1M-AlOH3-3M-LiOH-D2O
    │       ├── 0819G1-0p5M-AlOH3-3M-KOH-D2O
    │       ├── 0819G1-0p5M-AlOH3-3M-LiOH-D2O
    │       ├── 0819G1-1M-AlOH3-3M-NaOH-D2O
    │       ├── background
    │       └── standard
    └── VT

```

This nesting (`27Al` inside `23Na`) seems like an error. Re-ordering to:

```bash
└── Sep-2016-23Na
    ├── 23Na
    ├── 27Al
    │   ├── 0808G1-0p15M-AlOH3-3M-NaOH-D2O
    │   ├── 0808G1-0p5M-AlOH3-3M-NaOH-D2O
    │   ├── 0808G1-1M-AlOH3-3M-NaOH-D2O
    │   ├── 0819G1-0p1M-AlOH3-3M-LiOH-D2O
    │   ├── 0819G1-0p5M-AlOH3-3M-KOH-D2O
    │   ├── 0819G1-0p5M-AlOH3-3M-LiOH-D2O
    │   ├── 0819G1-1M-AlOH3-3M-NaOH-D2O
    │   ├── background
    │   └── standard
    └── VT

```

## Glob Parent Folders

---

In [5]:
# Top-level folders under the data root: the two sodium folders (VT, 23Na)
# and the aluminum folder (27Al).
VT, Na23, Al27 = (
    os.path.join(data_folder, top_level)
    for top_level in ('VT', '23Na', '27Al')
)

# Names of the sub-folders inside the aluminum folder.
sub_paths_strings = [
    "0808G1-0p15M-AlOH3-3M-NaOH-D2O",
    "0808G1-0p5M-AlOH3-3M-NaOH-D2O",
    "0808G1-1M-AlOH3-3M-NaOH-D2O",
    "0819G1-0p1M-AlOH3-3M-LiOH-D2O",
    "0819G1-0p5M-AlOH3-3M-KOH-D2O",
    "0819G1-0p5M-AlOH3-3M-LiOH-D2O",
    "0819G1-1M-AlOH3-3M-NaOH-D2O",
    "background",
    "standard",
]

# Full path of every aluminum sub-folder, in the order listed above.
Al_sub_paths = [os.path.join(Al27, name) for name in sub_paths_strings]

In [6]:
# Also glob the aluminum folder itself, not only its sub-folders.  The guard
# keeps this cell idempotent: without it, every re-run appends the folder
# again and each file directly inside it is globbed (and counted) twice.
if Al27 not in Al_sub_paths:
    Al_sub_paths.append(Al27)

# The two sodium folders are globbed as their own group.
sodium_paths = [VT, Na23]

## Glob Helper Functions

---

In [7]:
# Glob patterns.  They are concatenated directly onto a folder path, hence
# the leading '/'.
array_glob = '/*arrays*.fid'
# Same pattern with the letters transposed ("arryas") -- presumably to catch
# entries whose names misspell "arrays".
mis_arrays = '/*arryas*.fid'
fid_glob = '/*.fid'
# Name fragments that put a .fid entry into the "special" group.
special_files = ['reference', 'REF', 'calibration', 'pwX90', 'static',
                 'spin-up', 'without-liquid']


def nmr_glob(path):
    """Sort the ``.fid`` entries directly inside *path* into three groups.

    Parameters
    ----------
    path : str
        Folder to search.  The search is not recursive.

    Returns
    -------
    list of list of str
        ``[arrays, fids, other_fids]``, each sorted alphabetically:

        * ``arrays`` -- entries matching ``array_glob`` or ``mis_arrays``.
        * ``fids`` -- the remaining ``.fid`` entries that are neither arrays
          nor special.
        * ``other_fids`` -- entries whose name contains one of
          ``special_files``.  An array entry with a special name appears
          here as well as in ``arrays``.

        A folder without matches (or a missing folder) yields three empty
        lists.
    """
    arrays = set(glob.iglob(path + array_glob, recursive=False))
    arrays |= set(glob.iglob(path + mis_arrays, recursive=False))

    all_fids = set(glob.iglob(path + fid_glob, recursive=False))

    # Test the keywords against the entry name only.  Matching the full path
    # would flag every file in a folder whose own name (or any ancestor's)
    # happens to contain a keyword such as "static".
    other_fids = {
        f for f in all_fids
        if any(sf in os.path.basename(f) for sf in special_files)
    }

    fids = all_fids - other_fids - arrays

    # Sets iterate in arbitrary order; sort so that repeated runs (and
    # anything that indexes into the result) see the same ordering.
    return [sorted(group) for group in (arrays, fids, other_fids)]


def trim_folder(folders, depth=5):
    """Drop the leading components from every path in *folders*.

    Each path is normalised, split on ``os.sep`` and re-joined without its
    first *depth* pieces.  For an absolute path the first piece is the empty
    string in front of the root separator, so the default of 5 removes the
    root plus four directory levels.

    Parameters
    ----------
    folders : iterable of str
        Paths to shorten.
    depth : int, optional
        Number of leading split components to discard.  Defaults to 5, the
        value that used to be hard-coded; pass another value when the data
        root sits at a different depth.

    Returns
    -------
    list of str
        The shortened paths, in input order.
    """
    return [os.sep.join(os.path.normpath(path).split(os.sep)[depth:])
            for path in folders]


def process_group(path_list):
    """Glob every folder in *path_list* and pool the results per category.

    ``nmr_glob`` is called once per folder; its three result lists are
    concatenated across folders, preserving folder order.

    Returns ``[arrays, fids, others]`` as three flat lists of paths.
    """
    arrays, fids, others = [], [], []
    for folder in path_list:
        found_arrays, found_fids, found_others = nmr_glob(folder)
        arrays.extend(found_arrays)
        fids.extend(found_fids)
        others.extend(found_others)

    return [arrays, fids, others]

## Running the Globs

---

In [8]:
# Glob the aluminum folders and the sodium folders separately.  Each call
# yields three flat path lists: array experiments, plain fids, special fids.
al_array, al_fid, al_other = process_group(Al_sub_paths)
na_array, na_fid, na_other = process_group(sodium_paths)

In [9]:
# Report how many plain (non-array, non-special) aluminum fids were found.
print(f'Normal Al fids: {len(al_fid)}')

Normal Al fids: 139


# Processing the .fid Files

---

### Convert to NMRPipe files

In [10]:
# Run write_varian_as_pipe on every plain aluminum fid, with a progress bar.
# The collected return values are what pool_nmr_proc is fed further down
# (presumably the paths of the written NMRPipe files -- confirm in trentnmr).
al_fid_pipes = [write_varian_as_pipe(fid_path, processed_data)
                for fid_path in tqdm(al_fid)]

100%|██████████| 139/139 [00:01<00:00, 106.60it/s]


### Process NMR Pipe Files

Takes a few minutes...

In [11]:
def mp_proc_pipe(in_file, processed_data):
    """Forward both arguments to ``process_pipe_file`` and return its result.

    This is the callable that ``pool_nmr_proc`` submits to its worker pool.
    """
    outcome = process_pipe_file(in_file, processed_data)
    return outcome

def pool_nmr_proc(file_list, processes=max(1, mp.cpu_count() - 1)):
    """Run ``mp_proc_pipe`` over *file_list* in a pool of worker processes.

    Parameters
    ----------
    file_list : iterable
        Items passed one by one as the first argument of ``mp_proc_pipe``.
        The second argument is always the module-level ``processed_data``.
    processes : int, optional
        Number of worker processes.  Defaults to one fewer than the CPU
        count, but never less than 1 (``mp.Pool`` rejects 0, which the plain
        ``cpu_count() - 1`` yields on a single-core machine).

    Returns
    -------
    list
        The ``mp_proc_pipe`` results, in the order of *file_list*.  An
        exception raised in a worker is re-raised here by ``get()``.
    """
    # The context manager shuts the pool down on exit, so worker processes
    # are not left running after each call (or after an error).
    with mp.Pool(processes=processes) as pool:
        pending = [pool.apply_async(mp_proc_pipe, args=(v, processed_data))
                   for v in file_list]
        # Collect inside the ``with`` block: leaving it terminates the pool,
        # so every result has to be fetched first.
        return [job.get() for job in pending]

In [None]:
# Process every converted aluminum file in parallel; the results are used as
# file names by the peak-picking cells below.
proccessed_al_fids = pool_nmr_proc(al_fid_pipes)

Optimization terminated successfully.
         Current function value: 262772480007.739594
         Iterations: 142
         Function evaluations: 282
Optimization terminated successfully.
         Current function value: 8921374720009.123047
         Iterations: 97
         Function evaluations: 199
Optimization terminated successfully.
         Current function value: 178217152008.822418
Optimization terminated successfully.
         Current function value: 1197989632008.932129
         Iterations: 111
         Function evaluations: 221
         Iterations: 107
         Function evaluations: 223
Optimization terminated successfully.
         Current function value: 4221594112008.951660
         Iterations: 116
         Function evaluations: 239
Optimization terminated successfully.
         Current function value: 24433563648009.207031
         Iterations: 115
         Function evaluations: 237
Optimization terminated successfully.
         Current function value: 7395328512009.02734

In [None]:
# Quick check on a single file: read the first processed result and pick its
# peaks.  500000 is passed as pick()'s second positional argument
# (presumably the peak threshold -- confirm against the nmrglue docs).
dic, data = ng.pipe.read(proccessed_al_fids[0])
ng.analysis.peakpick.pick(data, 500000)

In [None]:
peak_dict = {f: ng.analysis.peakpick.pick(data, 500000)
             for f in proccessed_al_fids}

In [None]:
df = pd.DataFrame(peak_dict).melt()
df[['raw_hz', 'peak_id', 'line_width', 'integral']] = df['value'].apply(pd.Series)
df = df.drop(columns='value')
def trim_file(path):
    return os.sep.join(os.path.normpath(path).split(os.sep)[5:])

df['file_path'] = df['variable'].apply(trim_file)
df = df.drop(columns='variable')
df