In [229]:
import csv
import pandas as pd

In [230]:
import os
def diriter(root_dir, extension):
    ''' lazily yield (file name, joined path) pairs for every file found
        under root_dir (sub directories included) whose name ends with
        extension
    '''
    # start the walk here so the os.walk call happens when diriter is called
    walker = os.walk(root_dir)

    def matching():
        for folder, _subdirs, names in walker:
            for name in names:
                if name.endswith(extension):
                    yield name, os.path.join(folder, name)

    return matching()

In [231]:

def make_tokens(filepath, delim=','):
    ''' split csv file into list of token lists

    filepath: path of the delimited text file to read
    delim:    one-character field separator passed to csv.reader
              (default ',')

    returns a list with one entry per row read by csv.reader; each
    entry is the list of string fields of that row
    '''
    # the context manager closes the handle even if parsing raises
    with open(filepath) as handle:
        return list(csv.reader(handle, delimiter=delim))


def find_headers(lines):
    ''' find position of headers in file

    lines: list of token lists, one per row of the file

    returns the indices of every row whose first token, stripped of
    surrounding whitespace, is 'Header=', followed by a trailing 0.
    The 0 is always appended; make_lists treats the last entry as a
    terminator rather than as the start of a section.
    '''
    # csv.reader yields [] for blank lines, so check the row is non-empty
    # before looking at its first token
    return [i for i, line in enumerate(lines)
            if line and line[0].strip() == 'Header='] + [0]

def make_lists(hpos, lines):
    ''' split lists of tokens by header

    hpos:  row indices of the header rows, followed by one terminating
           entry (find_headers appends 0) that is never used as a
           section start
    lines: list of token lists, one per row of the file

    returns one list per header: the header row followed by the rows
    that belong to it. A section stops at the next header position, at
    the first blank row, at the first row whose first token starts with
    '[' (a section title such as '[PBM Apex]'), or at the end of the
    file, whichever comes first.
    '''
    total = len(lines)

    def is_table_row(cells):
        # blank rows and '[title]' rows mark the end of a table
        return bool(cells) and not cells[0].strip().startswith('[')

    sections = []
    for idx, start in enumerate(hpos[:-1]):
        following = hpos[idx + 1]
        # the terminator (or any position not after start) means this is
        # the last section, so it may run to the end of the file
        limit = following if following > start else total
        stop = start + 1
        while stop < limit and is_table_row(lines[stop]):
            stop += 1
        sections.append(lines[start:stop])
    return sections
    
def make_dataframes(lines):
    ''' build one data frame per section; the first row of a section
        supplies the column names and the remaining rows the data
    '''
    frames = []
    for section in lines:
        header, body = section[0], section[1:]
        frames.append(pd.DataFrame(body, columns=header))
    return frames


def cbind_dataframes(dfs):
    ''' place the frames side by side, remove the 'Header=', 'R.T.',
        'Area Pct' and 'Peak' columns and normalise the remaining
        column names ('/' and ' ' become '_', lower case)
    '''
    unwanted = ['Header=', 'R.T.', 'Area Pct', 'Peak']
    side_by_side = pd.concat(list(dfs), axis=1)
    trimmed = side_by_side.drop(unwanted, axis=1)

    renamed = []
    for label in trimmed.columns:
        renamed.append(label.replace('/', '_').replace(' ', '_').lower())
    trimmed.columns = renamed
    return trimmed
    

In [232]:
# Input file for this walkthrough; the path is relative to the working directory.
file_path = 'data/FA03.D/RESULTS.CSV'

In [233]:
# Read the whole file into a list of rows, each row a list of string fields.
tokens = make_tokens(file_path)

In [234]:
tokens[:10]

[['[contents]'],
 ['count=2'],
 ['Name=', 'C:\\HANNA AUCOIN\\2016.05.13\\FA03.D'],
 ['1=', 'INT TIC: FA03.D\\data.ms'],
 ['2=', 'PBM Apex'],
 ['[INT TIC: FA03.D\\data.ms]'],
 ['Time=', 'Fri May 13 13:03:08 2016'],
 ['Header=',
  'Peak',
  'R.T.',
  'First',
  'Max',
  'Last',
  'PK  TY',
  'Height',
  'Area',
  'Pct Max',
  'Pct Total'],
 ['1=',
  '  1',
  '  5.788',
  '  465',
  ' 473',
  ' 494',
  'rBV3',
  ' 257808',
  '  1489466',
  ' 13.12',
  '  2.034'],
 ['2=',
  '  2',
  '  7.344',
  '  733',
  ' 745',
  ' 812',
  'rBV ',
  ' 964743',
  '  2491449',
  ' 21.94',
  '  3.401']]

In [235]:
# Row indices of the 'Header=' rows; find_headers always appends a trailing 0.
hposition = find_headers(tokens)

In [236]:
hposition

[7, 38, 0]

In [237]:
# Cut the rows into one list per header position.
token_lists = make_lists(hposition, tokens)

In [238]:
# One DataFrame per section; the first row of each section becomes the column names.
dfs = make_dataframes(token_lists)

In [239]:
dfs[0].head()

Unnamed: 0,Header=,Peak,R.T.,First,Max,Last,PK TY,Height,Area,Pct Max,Pct Total
0,1=,1,5.788,465,473,494,rBV3,257808,1489466,13.12,2.034
1,2=,2,7.344,733,745,812,rBV,964743,2491449,21.94,3.401
2,3=,3,8.036,859,866,904,rBV,608418,1277982,11.25,1.745
3,4=,4,8.672,970,977,1017,rBV,1929049,2905961,25.59,3.967
4,5=,5,9.278,1077,1083,1116,rBV,882521,1436154,12.65,1.961


In [None]:
dfs[1].head()

In [None]:
# Join the section frames column-wise, drop the listed columns, normalise the names.
final = cbind_dataframes(dfs)

In [None]:
final.head()

Unnamed: 0,first,max,last,pk__ty,height,area,pct_max,pct_total,pk,rt,library_id,ref,cas,qual
0,465.0,473.0,494.0,rBV3,257808.0,1489466.0,13.12,2.034,1,5.7877,Methyl octanoate,17,000000-00-0,96
1,733.0,745.0,812.0,rBV,964743.0,2491449.0,21.94,3.401,2,7.3441,Methyl decanoate,1,000000-00-0,98
2,859.0,866.0,904.0,rBV,608418.0,1277982.0,11.25,1.745,3,8.0364,Methyl undecanoate,2,000000-00-0,98
3,970.0,977.0,1017.0,rBV,1929049.0,2905961.0,25.59,3.967,4,8.6715,Methyl dodecanoate,3,000000-00-0,98
4,1077.0,1083.0,1116.0,rBV,882521.0,1436154.0,12.65,1.961,5,9.2781,Methyl tridecanoate,4,000000-00-0,99
5,1168.0,1173.0,1179.0,rBV,1075433.0,1379404.0,12.15,1.883,6,9.793,Methyl myristoleic acid,5,000000-00-0,99
6,1179.0,1183.0,1220.0,rVB,2474155.0,3443757.0,30.32,4.702,7,9.8503,Methyl myristate,6,000000-00-0,99
7,1267.0,1272.0,1278.0,rBV,1073796.0,1449016.0,12.76,1.978,8,10.3595,cis-10-Pentadecenoic acid methyl ester,7,000000-00-0,99
8,1278.0,1282.0,1315.0,rVB,1116633.0,1821171.0,16.04,2.486,9,10.4168,Methyl myristate,6,000000-00-0,83
9,1363.0,1372.0,1386.0,rBV,1047443.0,1571132.0,13.83,2.145,10,10.9317,Methyl palmitoleate,9,000000-00-0,99
