In [282]:
import re
import csv
import pandas as pd
import numpy as np

In [2]:
import os
def diriter(root_dir, extension):
    '''Lazily yield ``(file name, full path)`` pairs found below ``root_dir``.

    The directory tree is traversed with ``os.walk`` and only files whose
    name ends with ``extension`` are produced. Nothing is read from disk
    until the returned generator is iterated.
    '''
    walker = os.walk(root_dir)

    def _matching_files():
        for folder, _subdirs, names in walker:
            for name in names:
                if name.endswith(extension):
                    yield name, os.path.join(folder, name)

    return _matching_files()

In [18]:
# Path of the CSV file that is opened and passed to scan_csv further down.
file_path = 'data/FA03.D/RESULTS.CSV'

In [40]:
''' conditions
'''
def istablerow(line):
    '''Tell whether ``line`` starts with a table-row marker such as ``12=``.

    ``line`` is a single string (callers pass the first token of a row).
    Returns the ``re`` match object when the string begins with one or more
    digits followed by ``=``, otherwise ``None``.
    '''
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    return re.match(r'\d+=', line)

def isheader(line):
    '''Tell whether ``line`` (a sequence of tokens) is a table header row.

    A header row is one whose first token is exactly ``'Header='``.
    An empty row is never a header; it used to raise ``IndexError``.
    '''
    return bool(line) and line[0] == 'Header='

In [125]:
def seek_rows(header, gen):
    '''Collect the table rows that follow ``header`` in ``gen``.

    ``gen`` must be positioned just after the header row. Rows are consumed
    until one is found whose first token is not a table-row marker (see
    ``istablerow``), or until ``gen`` is exhausted.

    Returns ``(line, table)`` where ``table`` is ``[header, row, row, ...]``
    and ``line`` is the row that ended the table (already consumed from
    ``gen``), or ``None`` when ``gen`` ran out.

    An empty row ends the table as well. The previous implementation only
    got that result because ``return`` inside ``finally`` swallowed the
    ``IndexError``; it also hid every other exception, which now propagates.
    '''
    table = [header]
    for line in gen:
        # An empty row has no first token to inspect, so it ends the table.
        if not line or not istablerow(line[0]):
            return line, table
        table.append(line)
    return None, table

In [126]:
def scan_csv(gen):
    '''Split an iterator of CSV rows into meta rows and tables.

    ``gen`` is an iterator yielding rows as lists of tokens (for example a
    ``csv.reader``). Every row recognised by ``isheader`` starts a table,
    whose rows are gathered by ``seek_rows``. All other non-empty rows are
    collected as meta information.

    Returns ``(meta, tables)``: ``meta`` is a list of rows and ``tables`` is
    a list of tables, each table being ``[header, row, row, ...]``.
    '''
    meta, tables = [], []
    line = next(gen, None)
    while line is not None:
        # Check for emptiness first so a blank row never reaches isheader.
        if line and isheader(line):
            line, table = seek_rows(line, gen)
            tables.append(table)
            # The row that ended the table may itself be a header, so it is
            # examined again rather than being filed under meta.
            continue
        if line:
            meta.append(line)
        line = next(gen, None)
    return meta, tables

In [248]:
def make_dataframes(tables):
    '''Build one DataFrame per table.

    Each table is a list of token rows: the first row supplies the column
    names and the remaining rows supply the data.
    '''
    frames = []
    for rows in tables:
        frames.append(pd.DataFrame(rows[1:], columns=rows[0]))
    return frames

def clean_col(col):
    '''Normalise a column name: drop '/', ' ' and '.' and lower-case the rest.
    '''
    unwanted = ['/', ' ', '.']
    kept = [ch for ch in col if ch not in unwanted]
    return ''.join(kept).lower()

def make_dataframe(table):
    '''Turn one table (header row followed by data rows) into a DataFrame.

    Column names are normalised with ``clean_col`` and then prefixed with
    ``tic_`` when the raw header contains an ``R.T.`` column, or with
    ``lib_`` otherwise.
    '''
    header, rows = table[0], table[1:]
    frame = pd.DataFrame(rows, columns=header)

    # The prefix is chosen from the raw names, before they are cleaned.
    if 'R.T.' in frame.columns:
        label = 'tic'
    else:
        label = 'lib'

    frame.columns = list(map(clean_col, frame.columns))
    return frame.add_prefix(label + '_')


def cbind_dataframes(dfs):
    '''Column-bind the frames in ``dfs`` into a single DataFrame.

    ``dfs`` may be any iterable of DataFrames. The frames are placed side
    by side with ``pd.concat(axis=1)``; columns with the same name are all
    kept.
    '''
    frames = list(dfs)
    return pd.concat(frames, axis=1)

In [249]:
# Open the file in a context manager so the handle is closed once scan_csv
# has consumed the reader. newline='' is the mode the csv module asks for.
with open(file_path, newline='') as results_file:
    meta, tables = scan_csv(csv.reader(results_file))

In [250]:
# One frame per parsed table, joined side by side into a single frame.
df = cbind_dataframes(make_dataframe(table) for table in tables)

In [279]:
def find_match(df):
    '''Match every row's ``tic_rt`` to the library entry with the closest ``lib_rt``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``tic_rt``, ``lib_rt`` and
        ``lib_libraryid``. Both retention-time columns are converted to
        ``float32`` before comparison.

    Returns
    -------
    pandas.Series
        One ``lib_libraryid`` value per row of ``df``, in row order and
        carrying ``df``'s own index, so that assigning it as a new column
        lines up with the rows it was computed for.

    Notes
    -----
    The result used to be indexed by the labels of the *matched* library
    rows. Assigning it to a column then re-aligned the values to the wrong
    rows, and failed on duplicate labels whenever two peaks shared the same
    nearest library entry.
    '''
    # Convert once up front instead of once per row inside the lambda.
    lib_rt = df['lib_rt'].astype(np.float32)
    tic_rt = df['tic_rt'].astype(np.float32)

    # Index label of the library row with the smallest squared RT distance.
    nearest = tic_rt.apply(lambda rt: ((rt - lib_rt) ** 2).idxmin())

    matched = df['lib_libraryid'].loc[nearest.values]
    # Re-index by the originating row rather than by the matched library row.
    return pd.Series(matched.values, index=df.index, name=matched.name)

In [280]:
# Store the Series returned by find_match as a new column; pandas aligns the
# assigned Series to df by index label.
df['match_libid'] = find_match(df)

In [281]:
# Show the first ten rows of the combined frame.
df.head(10)

Unnamed: 0,tic_header=,tic_peak,tic_rt,tic_first,tic_max,tic_last,tic_pkty,tic_height,tic_area,tic_pctmax,tic_pcttotal,lib_header=,lib_pk,lib_rt,lib_areapct,lib_libraryid,lib_ref,lib_cas,lib_qual,match_libid
0,1=,1,5.788,465,473,494,rBV3,257808,1489466,13.12,2.034,1=,1,5.7877,2.0335,Methyl octanoate,17,000000-00-0,96,Methyl octanoate
1,2=,2,7.344,733,745,812,rBV,964743,2491449,21.94,3.401,2=,2,7.3441,3.4015,Methyl decanoate,1,000000-00-0,98,Methyl decanoate
2,3=,3,8.036,859,866,904,rBV,608418,1277982,11.25,1.745,3=,3,8.0364,1.7448,Methyl undecanoate,2,000000-00-0,98,Methyl undecanoate
3,4=,4,8.672,970,977,1017,rBV,1929049,2905961,25.59,3.967,4=,4,8.6715,3.9674,Methyl dodecanoate,3,000000-00-0,98,Methyl dodecanoate
4,5=,5,9.278,1077,1083,1116,rBV,882521,1436154,12.65,1.961,5=,5,9.2781,1.9607,Methyl tridecanoate,4,000000-00-0,99,Methyl tridecanoate
5,6=,6,9.793,1168,1173,1179,rBV,1075433,1379404,12.15,1.883,6=,6,9.793,1.8832,Methyl myristoleic acid,5,000000-00-0,99,Methyl myristoleic acid
6,7=,7,9.85,1179,1183,1220,rVB,2474155,3443757,30.32,4.702,7=,7,9.8503,4.7016,Methyl myristate,6,000000-00-0,99,Methyl myristate
7,8=,8,10.36,1267,1272,1278,rBV,1073796,1449016,12.76,1.978,8=,8,10.3595,1.9783,cis-10-Pentadecenoic acid methyl ester,7,000000-00-0,99,cis-10-Pentadecenoic acid methyl ester
8,9=,9,10.417,1278,1282,1315,rVB,1116633,1821171,16.04,2.486,9=,9,10.4168,2.4864,Methyl myristate,6,000000-00-0,83,Methyl myristate
9,10=,10,10.932,1363,1372,1386,rBV,1047443,1571132,13.83,2.145,10=,10,10.9317,2.145,Methyl palmitoleate,9,000000-00-0,99,Methyl palmitoleate
