In [4]:
import pandas as pd
import numpy as np
import re
import os
import pickle
import patsy

from phenom.design import Formula
from phenom.phenotype import Phenotype

In [5]:
def parseFile(f, strain=None, header=0):
    """Load the growth-curve sheets of one workbook into ``(data, meta)``.

    Every sheet whose name matches ``pH <value>, <conc> mM`` (or is named
    ``All pH 0 mM``) is read, truncated to its leading columns and inner-joined
    with the other sheets on the time column.

    Parameters
    ----------
    f : path or file-like
        Workbook handed to ``pandas.read_excel``.
    strain : optional
        If not ``None``, stored in a ``'strain'`` column of ``meta``.
    header : int or None
        Forwarded to ``pandas.read_excel``. With ``None`` the sheets are
        joined on column ``0``; otherwise on the column labelled
        ``'Time (hours)'``.

    Returns
    -------
    data : pandas.DataFrame
        Joined measurements. Columns are relabelled ``'time', 0, 1, ...``
        positionally (this assumes the time column is the first column of
        the first sheet -- TODO confirm against the raw workbooks).
    meta : pandas.DataFrame
        One row per non-time column of ``data`` with ``'pH'``, ``'mM-acid'``,
        ``'batch'`` and, optionally, ``'strain'``. For ``pH ..., ... mM``
        sheets the pH and concentration are the strings captured from the
        sheet name; for ``All pH 0 mM`` they are numeric.

    Raises
    ------
    ValueError
        If no sheet name matches the expected patterns.
    """
    pa = pd.read_excel(f, sheet_name=None, header=header)
    useSheets = [s for s in pa.keys() if re.match(
        '(pH ?[0-9.]+,? [0-7.]+ ?(mM)?)|(All pH 0 mM)', s)]

    if not useSheets:
        raise ValueError(
            'no sheet named like "pH <x>, <y> mM" or "All pH 0 mM" in %r' % (f,))

    # Column every sheet is joined on.
    key = 0 if header is None else 'Time (hours)'

    data = None
    metas = []
    nseries = 0  # running counter used to give every series a unique label

    for s in useSheets:

        m = re.match('pH ?([0-9.]+),? ([0-7.]+) ?(mM)?', s)

        if m:
            # One pH / concentration per sheet: time column + 6 series,
            # the first three tagged batch 0 and the last three batch 1.
            ph = m.group(1)
            la = m.group(2)
            temp = pa[s].iloc[:, :7].copy()

            newmeta = pd.DataFrame([[ph, la]] * 6, columns=['pH', 'mM-acid'])
            newmeta['batch'] = [0] * 3 + [1] * 3
        else:
            # "All pH 0 mM": time column + 7 series, one per pH level.
            temp = pa[s].iloc[:, :8].copy()

            newmeta = pd.DataFrame([[None, 0]] * 7, columns=['pH', 'mM-acid'])
            newmeta['pH'] = [7, 6.5, 6, 5.5, 5, 4.5, 4]
            newmeta['batch'] = 1

        if strain is not None:
            newmeta['strain'] = strain
        metas.append(newmeta)

        # Give every non-key column a label that is unique across all sheets.
        # Relying on merge's default '_x'/'_y' suffixes yields duplicate labels
        # once four or more sheets are joined, which recent pandas rejects.
        # The labels are temporary: they are overwritten after the loop.
        labels = []
        for c in temp.columns:
            if c == key:
                labels.append(c)
            else:
                labels.append('series_%d' % nseries)
                nseries += 1
        temp.columns = labels

        if data is None:
            data = temp
        else:
            data = pd.merge(data, temp, on=key)

    # Concatenate once, with a fresh 0..n-1 index matching the data columns.
    meta = pd.concat(metas, axis=0, ignore_index=True)
    data.columns = ['time'] + list(range(data.shape[1] - 1))

    return data, meta
 

In [21]:
# Pre-process every raw workbook: parse it, clean and log2-transform the
# readings, baseline-correct each (pH, mM-acid) group and write the result to
# data/processed/<strain>-<acid>/{data,meta}.csv.
#
# Each entry is (workbook name without extension, strain, acid, header), where
# `header` is forwarded to parseFile (None = sheets have no header row).
#
# NOTE(review): several entries share the same (strain, acid) pair (e.g. the
# three PA01 lactic workbooks), so they write to the same output folder and
# the last one processed overwrites the earlier ones -- confirm this is intended.
for target, strain, acid, header in [
            ('PA1054 Sodium Benzoate', 'PA1054', 'sodium-benzoate', 0),
            ('PA1054 Citric Acid', 'PA1054', 'citric', 0),
            ('PA1054 Potassium Sorbate', 'PA1054', 'potassium-sorbate', 0),
            ('PA1054 Butyric Acid', 'PA1054', 'butyric', 0),
            ('PA01 Malic 09.03.17', 'PA01', 'malic', 0),
            ('PA01 Lactic 06.03.17', 'PA01', 'lactic', 0),
            ('PA01 citric 15 min time points 06.03.17', 'PA01', 'citric', 0),
            ('PA01 Benzoate 15 min time points', 'PA01', 'benzoate', 0),
            ('PA1054 Propionic Acid', 'PA1054', 'propionic', None),
            ('PA1054 Acetic Acid', 'PA1054', 'acetic', None),
            ('PA01 Potassium Sorbate 01.12.16', 'PA01', 'potassium-sorbate', None),
            ('PA01 Butyric Acid 15 min time points 10.11.16', 'PA01', 'butyric', None),
            ('Propionic acid 15 min time points PA01 02.11.16', 'PA01', 'propionic', 0),
            ('PA01 Acetic 15 min time points 14.10.16', 'PA01', 'acetic', 0),
            ('PAB Lactic Acid (1)', 'PAB', 'lactic', 0),
            ('PA01 Lactic Acid (1)', 'PA01', 'lactic', 0),
            ('PA1054 Lactic Acid', 'PA1054', 'lactic', 0),
            ('PA1054 Malic Acid', 'PA1054', 'malic', 0),
            ('PA01 Benzoate repeat 19.07.17', 'PA01', 'benzoate', None),
            ('PA01 Citric rerun 11.07.17', 'PA01', 'citric', None),
            ('PA01 Lactic repeat 13.07.17', 'PA01', 'lactic', None),
            ('PA01 Malic repeat 27.07.17', 'PA01', 'malic', None)
        ]:
    data, meta = parseFile(
        os.path.join('data/raw/', target) + '.xlsx', strain, header)

    data = data.set_index('time')

    meta['acid'] = acid
    meta['genus'] = 'pseudomonas'
    meta['strain'] = strain
    meta.index = range(meta.shape[0])

    # Drop series that contain no measurements at all, keeping the rows of
    # `meta` and the columns of `data` in step (they share labels 0..n-1).
    empty = data.isnull().all()
    meta = meta.loc[~empty, :]
    data = data.loc[:, ~empty]

    # Replace non-positive readings (log2 is undefined for them) with the
    # smallest positive reading in the table. `mask` touches only the
    # offending cells; indexing `.iloc[rows, cols]` with the paired output of
    # np.where would instead overwrite the whole rows x cols cross product.
    nonpositive = data <= 0
    if nonpositive.any().any():
        data = data.mask(nonpositive, data[data > 0].min().min())

    data = np.log2(data)

    # Keep only time points measured in every series, then discard the first
    # five of them. Copy so the in-place baseline correction below operates on
    # an independent frame rather than a slice.
    data = data.loc[~data.isnull().any(axis=1), :]
    data = data.iloc[5:, :].copy()

    # Baseline correction: for each (pH, mM-acid) group fit a quadratic to the
    # first five retained time points of all its series and subtract the
    # fitted value at the first time point.
    time = data.index.values[:5]
    group = meta.groupby(['pH', 'mM-acid'])
    for k, index in group.groups.items():
        temp = data.loc[:, index]
        # Transpose before flattening so the readings are ordered series by
        # series, matching the x vector below (the time axis repeated once
        # per series).
        od = temp.values[:5, :].T.ravel()

        coeff = np.polyfit(time.tolist() * temp.shape[1], od, 2)

        temp = temp - np.polyval(coeff, data.index.values[0])
        data.loc[:, index] = temp

    # Keep every third time point.
    data = data.iloc[::3, :]

    # save output
    d = '-'.join([strain, acid])
    outdir = os.path.join('data/processed/', d)
    # Create the folder (and any missing parents) only when it is absent.
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    data.to_csv(os.path.join(outdir, 'data.csv'))
    meta.to_csv(os.path.join(outdir, 'meta.csv'))
