In [1]:
from chemtbd.io import Agilent
from scipy.stats import linregress
import numpy as np
import pandas as pd

In [2]:
agi = Agilent.from_root('data')

__matclibarea_one__ matches lib dataframe to fid or tic dataframes

- key: string, typically the file name
- libdf: dataframe, dataframe containing chemical identification information i.e. library/id, rt, etc
- areaddf: dataframe, dataframe containing either tic or fid areas and retetion times (rt)
- areatype: string, `fid` or `tic`

example:
```
key = 'FA13.D'
libdf = agi[key].results.lib
ticdf  = agi[key].results.tic
fiddf = agi[key].results.fid
matchlib2area_one('FA14.D',libdf,ticdf,'tic')
```

__matchlib2area_all__ matches lib dataframe to fid or tic dataframes for all data sets in a folder containing .D files generated from Agilent

example:
```
matched_df = matchlib2area_all(agi,'tic')
```

In [3]:
def matchlib2area_one(key,libdf, areadf,areatype):
    returndf = libdf.copy()
    
    if areatype == 'tic':
        colname = 'tic_area'
    elif areatype == 'fid':
        colname = 'fid_area'
    else:
        print('incorrect area type')

    returndf[colname] = np.nan
    #find shortest
    if len(libdf) <= len(areadf):
        short = libdf
        long = areadf
    else:
        short = areadf
        long = libdf
        
    #loop and match
    match_list = []
    for index,row in short.iterrows():
        sqrError = (short['rt'].ix[index]-long['rt'].ix[index:len(long)])**2    
        area_row = sqrError.idxmin(axis=1)
        match_list.append(areadf['area'].ix[area_row])
    if areatype == 'tic':
        returndf['tic_area'] = match_list
    elif areatype == 'fid':
        returndf['fid_area'] = match_list
    
    return returndf

def matchlib2area_all(data_dic,areatype='tic'):
    libdf = data_dic.results('lib').groupby('key')
    areadf  = data_dic.results(areatype).groupby('key')

    #return_dic = {}
    return_df = pd.DataFrame()
    for name, group in libdf:
        try:
            return_df = return_df.append(matchlib2area_one(name,group,areadf.get_group(name),areatype))
            # return_dic[name] = matchlib2area_one(name,group,areadf.get_group(name),areatype)
        except:
            print(name+' did not match successfully')
    return return_df.reset_index()

matched_df = matchlib2area_all(agi,'tic')
matched_df.head()
matched_df.shape

FA14_2.D did not match successfully


(110, 11)

This creates a df containing the concentrations, I assumed the starting concentration of the stock was 1M, to get from hanna. The general data input represented by __calc_conc_df__ will always be this format; however, we will need to have a different ways for the user to input this information

In [16]:
cal_files = ['FA03.D','FA04.D','FA05.D']
cal_dilutions = {'FA03.D': 0.25,'FA04.D':0.50,'FA05.D':1.0}
cal_conc_df = pd.DataFrame({'library_id':agi['FA05.D'].results['lib']['library_id'],'Conc_0':1})

for key,val in cal_dilutions.items():
    cal_conc_df[key] = cal_conc_df['Conc_0']*val
cal_conc_df.drop('Conc_0',1,inplace=True)
cal_conc_df.head()

(31, 4)

Calculate a linear fit to calibration data for each species

In [28]:
def match_cal_conc(matched_df, cal_conc_df):
    '''
    this  function takes a dataframe which contains species matched to an area (matched_df) 
    and a calibration concentration dataframe and matches these two based on library_id
    '''
    cal_conc_df_melted = pd.melt(cal_conc_df, id_vars=['library_id'],value_vars=cal_files)
    cal_conc_df_melted.columns = ['library_id','key','cal_conc']
    
    return_df = pd.merge(matched_df,cal_conc_df_melted,how='left',on=['library_id','key'])
    return return_df.dropna(subset=['cal_conc'])

matched_cal_conc = match_cal_conc(matched_df, cal_conc_df)

def cal_curves_tic(matched_cal_conc):
    '''
    this function takes a matched calibration concentration dataframe (matched_cal_conc)
    and does a linear regression and returns a dataframe of the library_ids with 
    linregress stats and the min/max areas which is the range for which the calibration
    curve can (should only) be used.
    
    this is only for tic data, fid data calculation will be different
    '''
    b = (matched_cal_conc.groupby('library_id')
                         .apply(lambda a: linregress(a.tic_area,a.cal_conc))
                         .apply(pd.Series)
                         .reset_index())
    b.columns = ['library_id','slope','intercept','rvalue','pvalue','stderr']

    d = pd.DataFrame({'max':matched_cal_conc.groupby('library_id')['tic_area'].max(),
                    'min':matched_cal_conc.groupby('library_id')['tic_area'].min()}).reset_index()
    return pd.merge(b,d,on='library_id')
#pd.merge(b,d,on='library_id')

calced_curves = cal_curves_tic(matched_cal_conc)
calced_curves.head()

  slope = r_num / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)


Unnamed: 0,library_id,slope,intercept,rvalue,pvalue,stderr,max,min
0,"All cis-4,7,10,13,16,19-docosahexaenoate methy...",1.367304e-07,0.103774,0.999932,0.007420556,1.593827e-09,6566581.0,1094384.0
1,Methyl arachidate,7.146232e-08,-0.063537,0.999076,0.02736712,3.073928e-09,14785000.0,4222622.0
2,Methyl arachidonate,1.344684e-07,0.067813,0.999319,7.894786e-10,2.027146e-09,7056294.0,1390417.0
3,"Methyl cis-11,14,17-eicosatrienoate",1.306034e-07,0.102132,1.0,0.0,0.0,6874772.0,3046387.0
4,Methyl cis-11-eicosenoate,6.336257e-08,0.008515,1.0,0.0001097315,1.092154e-11,15647413.0,3810380.0


In [63]:
def calc_conc_tic(cal_curves, matched_df):
    '''
    this  function takes a dataframe which contains species matched to an area (matchlib2area_all) 
    and a calibration curve dataframe (cal_curves_tic)
    '''
    # calculate concentration of species
    return_df = pd.merge(matched_df,cal_curves,on='library_id',how='outer')
    return_df['conc'] = return_df.slope*return_df.tic_area+return_df.intercept
    return_df.drop(['rvalue','pvalue','stderr'],1,inplace=True)

    #calculate concentration percentage 
    totals_c = pd.DataFrame({'totals_c':(return_df.groupby('key')['conc']
                                            .apply(np.sum,axis=0))}).reset_index()
                        
    return_df = return_df.merge(totals_c, on=['key'])
    return_df['conc%']=return_df['conc']/return_df['totals_c']
    return_df.drop(['totals_c'],1,inplace=True)
    
    #calculate area percentage
    totals_a = pd.DataFrame({'totals_a':(return_df.groupby('key')['tic_area']
                                            .apply(np.sum,axis=0))}).reset_index()
                        
    return_df = return_df.merge(totals_a, on=['key'])
    return_df['tic_area%']=return_df['tic_area']/return_df['totals_a']
    return_df.drop(['totals_a'],1,inplace=True)
    
    return return_df

calced_conc_tic = calc_conc_tic(calced_curves, matched_df)
calced_conc_tic.groupby('key').get_group('FA14.D')

Unnamed: 0,index,header=,pk,rt,pct_area,library_id,ref,cas,qual,key,tic_area,slope,intercept,max,min,conc,conc%,tic_area%
106,0,1=,2.0,10.9317,7.8521,Methyl palmitoleate,9.0,000000-00-0,99.0,FA14.D,937080.0,1.415591e-07,0.027371,6870049.0,1571132.0,0.160023,0.445834,0.089095
107,1,2=,3.0,11.0462,47.9339,Methyl palmitate,10.0,000000-00-0,97.0,FA14.D,5720503.0,6.078866e-08,-0.096756,17912728.0,5489579.0,0.250985,0.699258,0.54389
108,3,4=,7.0,12.3222,2.4524,Methyl stearate,19.0,000000-00-0,97.0,FA14.D,292677.0,7.494718e-08,-0.074013,14241399.0,4171988.0,-0.052078,-0.145092,0.027827
109,2,3=,6.0,12.1734,29.8931,cis-9-Oleic methyl ester,33.0,000000-00-0,94.0,FA14.D,3567489.0,,,,,,,0.339188


Fuctions which still need to be built

In [7]:


def make_plot_percentarea():
    pass

def make_plot_concarea():
    pass

def make_plot_massarea():
    pass

def calc_mass():
    pass