In [1]:
import csv 
import json
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from src.utils import parse_lipid, parse_p_value, ppm_tol

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.stats.multitest import fdrcorrection

In [2]:
# Load the plotting colour schemes, the class -> superclass lookup, the animal
# phenotype sheet and the per-file grouping table.
# Forward slashes keep the relative paths valid on every OS, and the JSON files
# are read inside `with` blocks so their handles are closed straight away.
with open('../data/metadata/color_schemes.json') as f:
    colors = json.load(f)
with open('../data/metadata/compound_superclasses.json', 'r') as f:
    compound_superclasses = json.load(f)
ap = pd.read_excel('../data/metadata/animal_phenotypes.xlsx', index_col=0)
fg = pd.read_csv('../data/metadata/combined_metab_lipid_file_grouping.csv', index_col=0)

In [3]:
# Helper functions
def make_unique_name(name, rt, mz):
    """Build a feature label of the form ``<name>_<rt>_<mz>``.

    ``rt`` is rounded to 3 decimals and ``mz`` to 4 decimals before being
    converted to text. ``name`` must already be a string.
    """
    rounded_rt = str(round(rt, 3))
    rounded_mz = str(round(mz, 4))
    return '_'.join([name, rounded_rt, rounded_mz])

def parse_tags(text):
    """Return five booleans telling which marker substrings occur in ``text``.

    The order of the returned list is fixed:
    ``mzCloudID``, ``mzVaultID``, ``ChemSpiderID``, ``Lipid``, ``11.5``.
    """
    markers = ('mzCloudID', 'mzVaultID', 'ChemSpiderID', 'Lipid', '11.5')
    return [marker in text for marker in markers]

# def ppm_tol(mz1, mz2, tol=10):
#     return abs(mz1 - mz2) / mz1 * 1e6 < tol

# Lipid Data Processing


In [4]:
# Read the lipid peak table (forward slashes keep the path portable across OSes)
fr_raw = pd.read_csv('../data/search/CD31/peak_finder/Final_Results_CD31.csv')
# Filter out the "Unnamed: " column(s)
fr_raw = fr_raw.drop(columns=fr_raw.filter(regex='Unnamed').columns)
# log2 transform every intensity column ("Area (max)" and the per-file ".raw" columns).
# The pattern is a raw string so `\(` and `\.` reach the regex engine without
# Python flagging them as invalid string escapes.
intensity_cols = fr_raw.filter(regex=r'Area \(max\)|\.raw').columns
fr_raw.loc[:, intensity_cols] = np.log2(fr_raw.loc[:, intensity_cols])
# fr_raw

# Normalize to correct for run order bias

In [7]:
# Files flagged `original` in the grouping table, ordered by lipid run order
runorder = fg.loc[fg['original']].sort_values('lipid_run_order')['fr_name']
# Median log2 intensity of every file, in acquisition order
file_medians = fr_raw[runorder].median(axis=0)

# Fit a straight line through the per-file medians against run position
run_positions = np.arange(file_medians.shape[0])
model = LinearRegression().fit(run_positions.reshape(-1, 1), file_medians)
slope = model.coef_[0]
intercept = model.intercept_

# Scale each file so the fitted trend becomes flat at the level of the first run.
# The number of positions is taken from the data rather than hard-coded (was 60),
# so the cell keeps working if the number of run-order files changes.
fitted_medians = run_positions * slope + intercept
normalization_factor_arr = fitted_medians[0] / fitted_medians

fr_norm = fr_raw.copy()
fr_norm.loc[:, runorder] = fr_norm[runorder] * normalization_factor_arr

# Retain FAs (see `BJA-1.0.2`)

In [8]:
# MS1-matched fatty acids, indexed by the same row labels as the lipid table
FAs = pd.read_csv(r'../data/interim/FAs_MS1_matched.csv', index_col=0)
fr_norm.loc[FAs.index, 'is_fa'] = True
# Rows that were not flagged above hold NaN; turn the column into a clean boolean.
# The result is assigned back explicitly because an `inplace=True` replace on a
# `.loc[...]` selection is not guaranteed to modify the parent frame.
fr_norm['is_fa'] = fr_norm['is_fa'].eq(True)
fr_norm.loc[FAs.index, 'Identification'] = FAs.values
fr_norm.loc[FAs.index, 'Lipid Class'] = 'FA'
# fr_norm.loc[fr_norm['is_fa']]

# Many other tests for filtering lipids are in `BJA-1.1.1`, but the primary filters are:
1. High RSD compared to the QCs
2. Low RT
3. Small m/z
4. Ratio of mean/blank < 3
5. Features found < 5

In [9]:
# Per-feature summaries used by the blank filter. Intensities are log2 values,
# so the "diff" columns are log2 differences between samples and blanks.
blank_intensities = fr_norm.filter(regex='Blank')
fr_norm['sample_mean'] = fr_norm[runorder].mean(axis=1)
fr_norm['blank_mean'] = blank_intensities.mean(axis=1)
fr_norm['blank_max'] = blank_intensities.max(axis=1)
for blank_stat, diff_col in (('blank_mean', 'sample_blank_diff'),
                             ('blank_max', 'sample_maxblank_diff')):
    fr_norm[diff_col] = fr_norm['sample_mean'] - fr_norm[blank_stat]

In [10]:
# A feature is rejected when it fails ANY of the criteria below ...
too_early = fr_norm['Retention Time (min)'] < 0.8
too_light = fr_norm['Quant Ion'] < 350
# sample mean is less than 3 log2 units above the highest blank
near_blank = fr_norm['sample_maxblank_diff'] < 3
rarely_found = fr_norm['Features Found'] < 5
rejected = too_early | too_light | near_blank | rarely_found

# ... unless it was flagged as a fatty acid, in which case it is always retained
is_fatty_acid = fr_norm['is_fa'] == True
fr_final = fr_norm.loc[~rejected | is_fatty_acid].copy()
fr_final

Unnamed: 0,Retention Time (min),Quant Ion,Polarity,Area (max),Identification,Lipid Class,Features Found,20210729_AJ_Toh_RatBloodGlucose_ExtractionBlank.raw (F1),20210729_AJ_Toh_RatBloodGlucose_SolventBlank.raw (F2),20210729_AJ_Toh_RatBloodGlucose_T1060M_20210322_8wk_FBG.raw (F3),...,20210729_AJ_Toh_RatBloodGlucose_T1102M_20210413_10wk_RBG.raw (F64),20210729_AJ_Toh_RatBloodGlucose_Water_R1.raw (F65),20210729_AJ_Toh_RatBloodGlucose_Water_R2.raw (F66),20210729_AJ_Toh_RatBloodGlucose_Water_R3.raw (F67),is_fa,sample_mean,blank_mean,blank_max,sample_blank_diff,sample_maxblank_diff
67,0.818,622.29077,+,21.982179,,,19,12.941802,12.860211,20.182971,...,19.179026,13.015148,12.822537,12.804904,False,20.282531,12.901006,12.941802,7.381524,7.340729
75,0.871,453.16891,+,20.845767,,,19,15.573107,14.237157,20.067110,...,19.228641,14.003633,15.866233,15.239032,False,19.749192,14.905132,15.573107,4.844061,4.176086
76,0.873,645.11963,+,21.450707,,,7,14.087519,14.005928,20.045002,...,19.943100,14.185127,13.968254,13.950621,False,20.081812,14.046724,14.087519,6.035089,5.994293
77,0.876,631.34955,-,20.889744,,,10,12.929552,12.888105,18.343723,...,18.053283,13.326093,12.985003,12.902867,False,18.400726,12.908829,12.929552,5.491897,5.471174
84,0.933,376.25961,+,26.350005,,,58,12.595866,12.403226,23.817503,...,23.664533,12.755033,12.533462,12.467585,False,23.802752,12.499546,12.595866,11.303206,11.206886
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1473,19.885,908.86542,+,23.050005,TG 18:0_18:0_18:0,TG,29,14.966498,15.312007,20.999142,...,20.648004,18.511598,16.680126,16.234939,False,20.379256,15.139252,15.312007,5.240004,5.067250
1479,20.144,948.89648,+,22.185651,,,7,13.641698,13.780943,19.774817,...,19.189126,14.638193,14.434339,13.820448,False,18.777856,13.711320,13.780943,5.066536,4.996913
1480,20.148,897.82532,+,21.562814,,,15,13.549287,13.688532,20.517490,...,17.314487,13.863481,14.341928,13.728038,False,19.001263,13.618910,13.688532,5.382353,5.312730
1482,20.175,593.58716,+,21.652712,,,19,13.716785,13.856031,20.603688,...,17.454831,13.960918,13.906388,13.895536,False,18.908803,13.786408,13.856031,5.122395,5.052772


## Add column `lc_type` to distinguish the RP lipid features from the ACs (also lipids), which come from the HILIC run

In [11]:
# Every row of the lipid table comes from the RP run and is a lipid
fr_final = fr_final.assign(lc_type='RP', Type='lipid')

# Rename columns

In [12]:
# Shorten the descriptive columns ...
column_renames = {
    'Retention Time (min)': 'RT',
    'Quant Ion': 'm/z',
    'Polarity': 'polarity',
    'Identification': 'ID',
    'Lipid Class': 'molec_class',
}
# ... and map each raw-file column (`fr_name`) to its label in the grouping table index
column_renames.update(zip(fg['fr_name'], fg.index))
fr_final = fr_final.rename(columns=column_renames)
# fr_final

## Fill in ID, molec_class and superclass

In [13]:
# Features without an annotation get an explicit 'Unidentified' label
for column in ('ID', 'molec_class'):
    fr_final[column] = fr_final[column].replace(np.nan, 'Unidentified')
# Look up the superclass of each molecular class
fr_final['superclass'] = fr_final['molec_class'].map(compound_superclasses)
# Drop 3 ACs that get overlapping IDs from the HILIC data
fr_final = fr_final.loc[fr_final['molec_class'] != 'AC']

# Metabolite Data Table Processing

## Parse the Compounds Table and the Chemspider subtable together 

In [14]:
cd_path = '../data/CD33/Nilerat_metabs_20220516_compounds.csv'
cs_path = '../data/CD33/Nilerat_metabs_20220516_chemspider.csv'
cd_all = pd.read_csv(cd_path)

# Read the whole ChemSpider export into memory as a list of rows
with open(cs_path, encoding='utf-8-sig') as f:
    cs = list(csv.reader(f))

headers = cs[0]     # column names of the compound-level rows
cs_headers = cs[2]  # presumably the header row of the nested ChemSpider sub-table
ion_type_index = headers.index('Reference Ion')  # position of the Reference Ion (e.g. [M+H]+1)
cs_name_index = cs_headers.index('Name')         # position of the ChemSpider name

df = []
for compound_line_index in range(1, len(cs)):
    line = cs[compound_line_index]
    # Only rows whose Reference Ion cell contains '[' start a new compound
    if '[' not in line[ion_type_index]:
        continue
    # Collect names from the rows two below the compound row onwards, for as
    # long as their second cell reads 'FALSE'
    cs_names = []
    cs_line_index = compound_line_index + 2
    try:
        while cs[cs_line_index][1] == 'FALSE':
            cs_names.append(cs[cs_line_index][cs_name_index])
            cs_line_index += 1
    except IndexError:
        # Ran past the end of the file or hit a row that is too short
        pass
    result = dict(zip(headers, line))
    result['chemspider_matches'] = '\n'.join(cs_names) if cs_names else None
    df.append(result)
cd = pd.DataFrame(df)
# cd

## Parse columns

In [15]:
# The export was read as text: convert the area columns to numbers and log2 them
area_cols = cd.filter(regex='^Area: ').columns
cd[area_cols] = np.log2(cd[area_cols].astype('float'))
cd['lc_type'] = 'HILIC'
# Second-to-last character of the Reference Ion string (e.g. '+' in '[M+H]+1')
cd['polarity'] = cd['Reference Ion'].str[-2]
cd['Checked'] = cd['Checked'].map({'TRUE': True, 'FALSE': False})
cd['is_id'] = cd['Name'].ne('') & cd['Checked']
for numeric_col in ('RT [min]', 'm/z'):
    cd[numeric_col] = cd[numeric_col].astype('float')
cd['Type'] = 'metabolite'
# One boolean column per marker looked for by `parse_tags`
tag_flags = zip(*cd['Tags'].map(parse_tags))
cd['mzcloud'], cd['mzvault'], cd['chemspider'], cd['is_HILIC_lipid'], cd['11.5'] = tag_flags

## Assign ACs (see `BJA-1.0.2`)

In [16]:
# MS1-matched acylcarnitines; the first JSON field is used as the row label in `cd`.
# The file is read inside a `with` block so its handle is closed promptly.
with open('../data/interim/ACs_MS1_matched.json') as f:
    ACs = pd.DataFrame(json.load(f)).set_index(0)
cd.loc[ACs.index, 'Name'] = ACs.values
# NOTE: `confident_id` is recomputed for every row in the following cell
cd.loc[ACs.index, 'confident_id'] = True
cd.loc[ACs.index, 'is_HILIC_lipid'] = False
cd.loc[ACs.index, 'Type'] = 'lipid'
cd.loc[ACs.index, 'molec_class'] = 'AC'
# cd.loc[ACs.index]

In [17]:
# An ID is trusted when it has a spectral-library hit (mzVault or mzCloud), is
# not tagged as a HILIC lipid, was checked, and is not a marked duplicate ...
library_hit = cd['mzvault'] | cd['mzcloud']
not_hilic_lipid = ~cd['is_HILIC_lipid']
not_duplicate = ~cd['Name'].str.contains('drop this duplicate')
# ... or when it is one of the assigned ACs
is_ac = cd['molec_class'] == 'AC'
cd['confident_id'] = (library_hit & not_hilic_lipid & cd['Checked'] & not_duplicate) | is_ac

## Parse Tracefinder ID names from YYZ's library

In [18]:
# Fix the metabolite names to be user-friendly and remove comments.
# Keys are compound names as they appear in the TraceFinder table; values are
# the cleaned names. Names missing from this dict are kept unchanged by the
# `.map(...).fillna(...)` call that applies it below.
# NOTE(review): keys starting with '?' presumably stand for a character that was
# lost upstream (e.g. a Greek letter) - confirm against the source file.
fixed_metab_names = {
    "Acetyl-β-methylcholine": "Acetyl-beta-methylcholine",
    "Adenosine 5'-monophosphate": "Adenosine 5'-monophosphate",
    "Cytidine 5'-diphosphocholine": "Cytidine 5'-diphosphocholine",
    "DL-Carnitine": 'Carnitine',
    "DL-Glutamine": 'Glutamine',
    "DL-Leucine/Isoleucine": 'Leucine/Isoleucine',
    "DL-Lysine": 'Lysine',
    "DL-Proline": 'Proline',
    "Guanosine: all right MS2": 'Guanosine',
    "L-(+)-Arginine": 'Arginine',
    "L-(+)-Citrulline": 'Citrulline',
    "L(+)-Ornithine": 'Ornithine',
    "L-Alanine": 'Alanine',
    "L-Aspartic acid": 'Aspartic acid',
    "L-Glutamic acid": 'Glutamic acid',
    "L-Iditol to Six-carbon sugar alcohol": 'Iditol',
    "L-Phenylalanine": 'Phenylalanine',
    "L-Serine": 'Serine', 
    "L-Tyrosine": 'Tyrosine',
    "L-Valine": 'Valine',
    # Acylcarnitines are renamed to the shorthand "AC <carbons>:<double bonds>"
    "O-Isovaleryl-L-carnitine: rt a little shift": 'AC 5:0',
    'O-Butyryl-L-carnitine': 'AC 4:0',
    'Acetyl-L-carnitine': 'AC 2:0',
    'Propionylcarnitine': 'AC 3:0',
    "?-Lactose": 'alpha-Lactose',
    "Xylitol to Arabitol": 'Xylitol',
    '?-Glutamylcysteine': 'Glutamylcysteine',
    'Uridinediphosphateglucose: standard RIAILs study': 'Uridinediphosphateglucose',
#     'Nicotinamide adenine dinucleotide (NAD+)': 'Nicotinamide adenine dinucleotide',
    'L-Pyroglutamic acid': 'Pyroglutamic acid',
    'Citric acid/Isocitric acid': 'Citric acid',
    'D-Hexose 1-phosphate': 'Hexose-1-phosphate',
    'L-Glutathione (reduced)': 'Glutathione',
    'L-2-Aminoadipic acid': '2-Aminoadipic acid',
    'N2-Acetyl-Lysine': 'N2-Acetyllysine',
    'L-Glutathione oxidized': 'Glutathione oxidized',
}
# Read the TraceFinder target list (forward slashes keep the path portable across
# OSes); bytes that cannot be decoded as UTF-8 are dropped.
tf_rt_mz = pd.read_csv('../data/interim/Tracefinder_RT_MZ_name.csv', encoding='utf-8', encoding_errors='ignore')
# Discard columns that are entirely empty
tf_rt_mz = tf_rt_mz.dropna(how='all', axis=1)
# Strip any leading/trailing ' ', 'm' and 'z' characters before converting to float
tf_rt_mz['Quan Mass'] = tf_rt_mz['Quan Mass'].str.strip(' mz').astype('float')
# Apply the name clean-up; names without an entry in the dict are kept as they are
tf_rt_mz['Compound'] = tf_rt_mz['Compound'].map(fixed_metab_names).fillna(tf_rt_mz['Compound'])
tf_rt_mz = tf_rt_mz.set_index('Compound')
# display(tf_rt_mz)

## Find matches between TraceFinder targeted method and the IDs from CD 

In [32]:
# Inspect the cleaned TraceFinder target table (indexed by compound name).
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours, so the output shown may not come from a clean top-to-bottom run.
tf_rt_mz

Unnamed: 0_level_0,Quan Mass,RT
Compound,Unnamed: 1_level_1,Unnamed: 2_level_1
Nicotinamide,123.06,2.83
AC 5:0,246.17,6.0
Doxycycline,445.16,6.26
AC 4:0,232.15,7.09
AC 3:0,218.14,7.84
AC 2:0,204.12,8.72
Nicotinic acid,122.02,8.79
Phenylalanine,164.07,9.03
Leucine/Isoleucine,130.09,9.2
Pantothenic acid,218.1,9.3


In [34]:
# List the distinct compound names present in the metabolite table.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours, so the output shown may not come from a clean top-to-bottom run.
cd.Name.unique()

array(['Wow! 1082 1076 Piracetam/Ectoine',
       'Very close mz and RT to Tyrosine but actually in the 11.5 minute gang',
       'Urocanic acid', 'Uridine', 'Uric acid', 'Tyrosine', 'Tryptophan',
       'Trigonelline', 'Tretinoin Glucuronide', 'Indole-3-acrylic acid',
       'Threonine', 'Threonic acid compare with neighbor',
       'Threonic acid', 'Testosterone sulfate', 'Taurodeoxycholic acid',
       'Taurocholic acid', 'Taurine', 'streptidine',
       'Streptamine phosphate', 'Stachydrine', 'SM', 'Serine', 'Ribose',
       'Quinic acid', 'Pyroglutamic acid', 'Pseudouridine', 'Proline',
       'Piperidine or methylpyrrolidine', 'Pipecolic acid',
       'Phenylalanine', 'Phenylacetylglycine', 'Pentose sugar', 'PE',
       'PC', 'p-Cresylsulfate', 'Ornithine (drop this duplicate)',
       'Ornithine', 'Oleamide or Sphingosine [M-H2O]+1',
       'Nicotinamide 1-oxide', 'Nicotinamide',
       'N6,N6,N6-Trimethyl-L-lysine', 'N3,N4-Dimethyl-L-arginine',
       'N2-Dimethylguanosine part

In [19]:
cd['tracefinder_id'] = False
# Threonine and Proline already confirmed, so they are never overwritten
already_confirmed = ('Proline', 'Threonine')
for i_cd, mrow in cd.iterrows():
    for compound_name, tfrow in tf_rt_mz.iterrows():
        # A match needs m/z within 15 ppm AND retention time within 1 minute
        mz_matches = ppm_tol(mrow['m/z'], tfrow['Quan Mass'], tol=15)
        if not (mz_matches and abs(mrow['RT [min]'] - tfrow['RT']) < 1):
            continue
        if compound_name in already_confirmed:
            continue
        # If several targets match the same row, the last one in the table wins
        cd.loc[i_cd, 'Name'] = compound_name
        cd.loc[i_cd, 'tracefinder_id'] = True
#             print(mrow['Name'], mrow['m/z'], mrow['RT [min]'], compound_name, tfrow['RT'])

In [25]:
# Inspect the metabolite table after the TraceFinder matching step.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours, so the output shown may not come from a clean top-to-bottom run.
cd

Unnamed: 0,Name,Formula,Annot. Source: Predicted Compositions,Annot. Source: mzCloud Search,Annot. Source: mzVault Search,Annot. Source: ChemSpider Search,Reference Ion,Annot. DeltaMass [ppm],m/z,RT [min],...,is_id,Type,mzcloud,mzvault,chemspider,is_HILIC_lipid,11.5,confident_id,molec_class,tracefinder_id
0,Wow! 1082 1076 Piracetam/Ectoine,C6 H10 N2 O2,Full match,No results,No results,Full match,[M-H]-1,-7.31,141.06591,2.353,...,True,metabolite,False,False,True,False,False,False,,False
1,Very close mz and RT to Tyrosine but actually ...,C9 H11 N O3,No match,Not the top hit,No results,No results,[M-H]-1,-44.8,180.05850,11.539,...,True,metabolite,False,False,False,False,True,False,,False
2,Urocanic acid,C6 H6 N2 O2,Full match,Full match,Full match,Partial match,[M+H]+1,-2.71,139.05028,11.365,...,True,metabolite,True,True,False,False,False,True,,False
3,Uridine,C9 H12 N2 O6,Full match,Full match,Full match,Full match,[M-H]-1,-3.02,243.06152,7.513,...,True,metabolite,False,True,False,False,False,True,,False
4,Uric acid,C5 H4 N4 O3,Full match,Full match,Full match,Full match,[M-H]-1,-5.85,167.02007,12.732,...,True,metabolite,False,True,False,False,False,True,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
625,,C12 H16,Full match,Invalid mass,No results,Full match,[M+H]+1,0.34,161.13253,1.403,...,False,metabolite,False,False,False,False,False,False,,False
626,,C12 H10 N2 O S,Not the top hit,No results,No results,Full match,[M-H]-1,-3.14,229.04342,8.657,...,False,metabolite,False,False,False,False,False,False,,False
627,,C24 H33 F O6,No match,Full match,No results,Full match,[M+H]+1,5.7,437.23588,1.775,...,False,metabolite,False,False,False,True,False,False,,False
628,,C12 H17 N4 O2 P S,Full match,Invalid mass,No results,No match,[M-H]-1,-6.28,311.07175,9.539,...,False,metabolite,False,False,False,False,False,False,,False


In [30]:
# Print the name of every row that was identified through TraceFinder.
# A plain loop is used so the cell does not also emit a list of `None` values.
for name in cd.loc[cd['tracefinder_id'], 'Name']:
    print(name)

Indole-3-acrylic acid
Phenylalanine
Pyroglutamic acid
Iditol
Gluconic acid
Gluconic acid
AC 5:0
AC 3:0
Arginine
Alanine
Adenosine 5'-monophosphate


[None, None, None, None, None, None, None, None, None, None, None]

## Remove any IDs that aren't from mzVault, mzCloud, TraceFinder, or the ACs

In [23]:
# Keep a name only when it is a confident ID or a TraceFinder match;
# every other name is blanked out (set to NaN)
is_trusted = cd['confident_id'] | cd['tracefinder_id']
not_confident = ~is_trusted
cd['Name'] = cd['Name'].where(is_trusted)
# cd.loc[cd['molec_class'] == 'AC']

## Remove HILIC lipids

In [24]:
# Keep only the rows that are not tagged as HILIC lipids
cd = cd.loc[cd['is_HILIC_lipid'].eq(False)]

## Rename columns

In [25]:
# Create a map from the filename with "Area: ... (F2)" to the shortened col name
metab_data_col_map = {}
for old_col in cd.filter(regex='^Area:').columns:
    interim_col = old_col.replace('Area: ', '').split('.raw')[0]
    # Look the file up explicitly instead of hiding every error behind a bare
    # `except:`; only a file missing from the grouping table is skipped.
    matches = fg.index[fg['File Name_metab'] == interim_col]
    if len(matches) == 0:
        continue
    shortened_col = matches[0]
    # Skip files whose label in the grouping table is not a string (e.g. NaN)
    if not isinstance(shortened_col, str):
        continue
    metab_data_col_map[old_col] = shortened_col

cd = cd.rename({'Name': 'ID', 'RT [min]': 'RT', **metab_data_col_map}, axis=1)

### Remove extra metabolites

In [26]:
# Drop every row identified as pyroglutamic acid
is_pyroglutamic = cd['ID'] == 'Pyroglutamic acid'
cd = cd.loc[~is_pyroglutamic]
# Drop the gluconic acid row(s) eluting strictly between 13.6 and 13.7 min
in_rt_window = (cd['RT'] > 13.6) & (cd['RT'] < 13.7)
is_windowed_gluconic = (cd['ID'] == 'Gluconic acid') & in_rt_window
cd = cd.loc[~is_windowed_gluconic]

## Assign Metabolite Molecular Class from list in `BJA-0.2.1`
## superclass will be same as molecular class for metabolites

In [27]:
cd['ID'] = cd['ID'].replace(np.nan, 'Unidentified')
# Class looked up directly from the compound name ...
classes_from_name = cd['ID'].map(compound_superclasses)
# ... except that any ID containing the substring 'AC' is classed as an acylcarnitine
# NOTE(review): this is a plain substring match - confirm no other ID contains 'AC'
has_ac_in_name = cd['ID'].str.contains('AC')
cd['molec_class'] = classes_from_name.mask(has_ac_in_name, 'AC')
cd['superclass'] = cd['molec_class'].map(compound_superclasses)
cd['superclass'].unique()

array(['Unidentified', 'Nitrogen heterocycle', 'Amino acid',
       'Organic acid', 'Carbohydrate', 'Amino acid derivative', 'Alcohol',
       'Fatty Acyl'], dtype=object)

# Combine lipid and metabolite dataframes

In [28]:
# Stack the metabolite rows on top of the lipid rows.
# `pd.concat` replaces `DataFrame.append`, which was removed in pandas 2.0.
df = pd.concat([cd, fr_final])
df = df.reset_index()

# Keep select columns: the descriptive ones plus one column per file in the grouping table
cols_to_keep = ['ID', 'm/z', 'RT', 'molec_class', 'Type', 'polarity', 'lc_type', 'superclass']
cols_to_keep.extend(fg.index)
df = df.loc[:, cols_to_keep]

## Drop `1091_9_FBG` due to outlier

In [29]:
# Remove the outlier sample's column from the combined table
df = df.drop(columns=['1091_9_FBG'])

# Make `unique_id` column and shortened `i` column

In [30]:
def _row_unique_name(row):
    """Build the `<ID>_<RT>_<m/z>` label for one row of the combined table."""
    return make_unique_name(row['ID'], row['RT'], row['m/z'])

df['unique_id'] = df.apply(_row_unique_name, axis=1)

# Short identifier: first letter of the type plus a running number within that
# type, e.g. 'l_0', 'l_1', ... for lipids and 'm_0', 'm_1', ... for metabolites
df['i'] = None
for _type in ['lipid', 'metabolite']:
    for counter, i in enumerate(df.index[df['Type'] == _type]):
        df.loc[i, 'i'] = _type[0] + '_' + str(counter)

### Drop unusual FAs

In [31]:
# Fatty acid IDs that are excluded from the combined table
unusual_fas = ['FA 27:3', 'FA 29:3', 'FA 31:3']
is_unusual_fa = df['ID'].isin(unusual_fas)
df = df.loc[~is_unusual_fa]

## Define non-fasted and fasted columns

In [32]:
# Sample labels of the RBG and FBG files, taken from the grouping table
rbg_cols = fg.index[fg['bg_type'] == 'RBG'].tolist()
fbg_cols = fg.index[fg['bg_type'] == 'FBG'].tolist()
# REMOVE OUTLIER SAMPLE
fbg_cols.remove('1091_9_FBG')
data_cols = [*rbg_cols, *fbg_cols]

## Include log2 fold change of average of `nonfasted - fasted` for collaborators

In [33]:
# Mean log2 intensity per feature within each sampling type (RBG / FBG).
# Grouping is done on the transposed sample columns because
# `groupby(..., axis=1)` is deprecated in recent pandas releases.
gb = df[data_cols].T.groupby(fg['bg_type']).mean().T
# Difference of log2 means = log2 fold change of RBG over FBG
df['mean_sampling_log2fc'] = gb['RBG'] - gb['FBG']

# Do linear models

In [34]:
def fit_anova_fixed_effects(feature, all_terms, debug=False):
    """Fit an OLS model for one feature and collect its ANOVA p-values and coefficients.

    The formula is ``<feature> ~ term1 + term2 + ...`` and is fitted on the
    module-level ``data`` frame.

    Parameters
    ----------
    feature : str
        Name of the response column in ``data``.
    all_terms : list of str
        Right-hand-side terms of the formula.
    debug : bool, default False
        When True, the fitted model is returned as well.

    Returns
    -------
    dict, or (dict, fitted model) when ``debug`` is True
        ``pval_<term>`` entries hold the type-1 ANOVA p-value of each term
        (the residual row is left out); ``model_param_<name>`` entries hold
        the fitted coefficients.
    """
    right_hand_side = ' + '.join(all_terms)
    full_model = smf.ols(f'{feature} ~ {right_hand_side}', data=data).fit()

    anova_table = sm.stats.anova_lm(full_model, typ=1)
    term_pvals = anova_table.loc[anova_table.index != 'Residual', 'PR(>F)'].to_dict()

    summary = {'pval_' + term: pval for term, pval in term_pvals.items()}
    for name, value in full_model.params.to_dict().items():
        summary['model_param_' + name] = value

    if debug:
        return summary, full_model
    return summary

In [35]:
# One row per sample, one column per feature (labelled by the short id `i`)
intensities = (
    df.filter(regex='RBG|FBG', axis=1)
      .set_index(df['i'])
      .T
      .astype('float')
)
features = intensities.columns

# Attach the per-sample covariates from the grouping table
sample_covariates = fg[['bg_type', 'bg', 'week', 'gluc_tol', 'animal', 'ogtt', 'litter']]
data = intensities.join(sample_covariates)

# ### ENCODING WEEK AND LITTER AS A CATEGORICAL STRING INSTEAD OF NUMERIC ###
data[['week', 'litter']] = data[['week', 'litter']].astype('str')

# Per-animal insulin AUC and week-12 pre-fast weight, looked up for each sample
weight_insulin = fg['animal'].apply(lambda x: ap.loc[x, ['Insulin (AUC)', 'Weightprefastweek12']])
data = data.join(weight_insulin).rename(
    {'bg_type': 'sampling',
     'Insulin (AUC)': 'insulin',
     'Weightprefastweek12': 'weight'},
    axis=1,
)
data

Unnamed: 0,m_0,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,...,l_916,sampling,bg,week,gluc_tol,animal,ogtt,litter,insulin,weight
1091_8_FBG,18.724636,20.77958,24.180859,24.88768,27.518513,23.065294,22.586018,21.670779,22.690034,22.292465,...,18.620317,FBG,53.333333,8,normal,1091,19060.0,2,653.495,92.4
1091_10_FBG,17.600816,20.575689,23.58308,24.71325,27.163433,23.132749,22.706016,22.116365,22.69058,21.771079,...,19.778245,FBG,48.5,10,normal,1091,19060.0,2,653.495,92.4
1093_8_FBG,16.451618,20.902141,23.548337,24.647222,27.405718,23.138487,22.633815,22.139245,22.665739,22.337209,...,21.063577,FBG,47.0,8,normal,1093,25057.5,2,453.485,94.5
1093_9_FBG,18.286166,20.87746,22.368788,24.526725,27.051667,23.178672,22.559027,22.469655,22.629048,22.274394,...,20.968101,FBG,64.0,9,normal,1093,25057.5,2,453.485,94.5
1093_10_FBG,18.054307,20.415204,22.704412,24.404735,26.756463,23.060708,22.793575,22.273762,22.923587,21.875123,...,19.638826,FBG,56.5,10,normal,1093,25057.5,2,453.485,94.5
1060_8_FBG,17.59473,20.198771,23.895235,24.71491,17.148994,22.653452,22.084504,21.397206,22.065355,21.67906,...,19.836798,FBG,41.0,8,impaired,1060,48742.5,0,422.94,105.1
1060_9_FBG,21.809716,20.524661,24.263211,24.520405,19.16921,23.207043,22.450039,21.42402,22.534742,21.84956,...,20.234283,FBG,48.0,9,impaired,1060,48742.5,0,422.94,105.1
1060_10_FBG,18.748109,20.303969,23.084387,24.792384,27.209158,22.793616,22.563256,21.784634,22.664376,21.631447,...,19.279616,FBG,52.5,10,impaired,1060,48742.5,0,422.94,105.1
1062_8_FBG,18.098242,21.019048,24.873347,24.787384,21.774699,23.221875,22.35222,21.964129,22.346725,22.098343,...,18.585339,FBG,52.333333,8,impaired,1062,43717.5,0,432.02,109.1
1062_9_FBG,17.286972,20.641492,24.131393,24.408454,23.189693,22.831914,22.452734,21.841118,22.588667,21.916517,...,18.670027,FBG,48.0,9,impaired,1062,43717.5,0,432.02,109.1


In [36]:
ogtt_sampling_terms = ['ogtt', 'sampling',  'sampling:ogtt']

# fer = Fixed Effects Result: one row of ANOVA p-values and coefficients per feature
ogtt_fer = pd.DataFrame(
    [fit_anova_fixed_effects(feature=feature, all_terms=ogtt_sampling_terms)
     for feature in features],
    index=features,
)

### ADDITIONAL CODE TO DO LIKELIHOOD RATIO TEST FOR (INSULIN OGTT):SAMPLING TERMS 

# itt_sampling_terms =  ['insulin', 'sampling', 'sampling:insulin']
# itt_fer = [] 
# for feature in features:
#     itt_fer.append(fit_anova_fixed_effects(feature=feature, all_terms=itt_sampling_terms))
# itt_fer = pd.DataFrame(itt_fer, index=features)

## Perform 2 separate linear regression fits on only Fed data, and only Fasted data for each feature

In [37]:
# One example of a linear regression fit: feature `l_762` against `ogtt`,
# restricted to the rows listed in `rbg_cols`.
fit = smf.ols('l_762 ~ ogtt', data=data, subset=rbg_cols).fit()
# Not the last expression of the cell, so this p-value is evaluated but not displayed
fit.pvalues['ogtt']

def get_linear_regression(feature):
    """Regress one feature on ``ogtt`` separately for the two sample subsets.

    The model ``<feature> ~ ogtt`` is fitted on the module-level ``data`` frame,
    once on the rows in ``rbg_cols`` (reported as 'fed') and once on the rows
    in ``fbg_cols`` (reported as 'fasted').

    Returns a dict with the keys ``pval_<subset>``, ``coef_<subset>`` and
    ``intercept_<subset>`` for each of the two subsets.
    """
    subsets = {'fed': rbg_cols, 'fasted': fbg_cols}
    d = {}
    for bg_type, cols in subsets.items():
        fit = smf.ols(f'{feature} ~ ogtt', data=data, subset=cols).fit()
        d.update({
            'pval_' + bg_type: fit.pvalues['ogtt'],
            'coef_' + bg_type: fit.params['ogtt'],
            'intercept_' + bg_type: fit.params['Intercept'],
        })
    return d

# Spot-check the helper on a single feature; the returned dict is the cell output
feature = 'm_1'
get_linear_regression(feature)

{'pval_fed': 3.1590160514400507e-08,
 'coef_fed': 3.69042256293342e-05,
 'intercept_fed': 20.089207679864238,
 'pval_fasted': 0.8511333563584583,
 'coef_fasted': 1.0966543412315325e-06,
 'intercept_fasted': 20.674434437962763}

In [38]:
# One row per feature with the fed / fasted regression p-value, slope and intercept
fits = pd.DataFrame(
    [get_linear_regression(feature) for feature in features],
    index=features,
)
fits

Unnamed: 0_level_0,pval_fed,coef_fed,intercept_fed,pval_fasted,coef_fasted,intercept_fasted
i,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
m_0,1.774542e-03,0.000093,14.839852,0.003910,0.000078,15.116552
m_1,3.159016e-08,0.000037,20.089208,0.851133,0.000001,20.674434
m_2,4.526725e-01,-0.000010,23.382609,0.858745,-0.000002,23.388362
m_3,4.126169e-01,-0.000004,23.982960,0.067430,-0.000009,24.869487
m_4,4.510871e-01,-0.000032,26.522464,0.023551,-0.000121,29.475577
...,...,...,...,...,...,...
l_912,3.003979e-05,0.000051,18.452536,0.803593,0.000003,19.990039
l_913,1.311126e-04,0.000054,16.772587,0.254531,-0.000025,19.505542
l_914,1.666406e-03,0.000038,16.665978,0.133203,-0.000018,20.500428
l_915,3.461004e-04,0.000045,16.114024,0.158286,-0.000019,20.590182


In [39]:
# FDR correction
# Adds an FDR-corrected `qval_*` column for every `pval_*` column of the
# ogtt_fer and fits DataFrames.
for fer in [ogtt_fer, fits]:  # itt_fer
    for i, pval_col in enumerate(fer.filter(regex='pval')):
        fdr_col_name = 'qval_' + pval_col.split('pval_')[1]
        qvals = fdrcorrection(fer[pval_col])[1]
        if fdr_col_name in fer.columns:
            # Cell was run before: refresh the values instead of failing on
            # `insert`, which refuses to add a column that already exists
            fer[fdr_col_name] = qvals
        else:
            fer.insert(i, fdr_col_name, qvals)

# Combine DFs

In [44]:
# Attach the per-feature ANOVA results and the fed/fasted regression results to
# the combined feature table, matching on the short id `i`, and index the
# result by `unique_id`.
# NOTE(review): this rebinds `data`, which until now held the samples-by-features
# frame that `fit_anova_fixed_effects` and `get_linear_regression` read as a
# global; re-running the model cells after this one would use the wrong frame.
data = df.join(ogtt_fer, on=df['i']).set_index('unique_id')
data = data.join(fits, on=data['i'])

# Make useful columns for later processing 

In [46]:
# Significance flags at q < 0.05
data['signif_interaction'] = data['qval_sampling:ogtt'] < 0.05
data['signif_sampling'] = data['qval_sampling'] < 0.05

# Mean log2 intensity per feature within each sampling type. Grouping is done
# on the transposed sample columns because `groupby(..., axis=1)` is deprecated
# in recent pandas releases.
gb_means = (data
            .loc[:, data_cols]
            .T
            .groupby(fg['bg_type'])
            .mean()
            .T
           )

data['fasted_mean'] = gb_means['FBG']
data['fed_mean'] = gb_means['RBG']
data['Log2 Fold Change'] = data['fed_mean'] - data['fasted_mean']

data['Fed - Fasted slope'] = data['coef_fed'] - data['coef_fasted']
# Same flag as `signif_interaction`, kept under both names
data['signif_interact'] = data['qval_sampling:ogtt'] < 0.05
# -log10 of the q-values
data['log_qval_sampling'] = -np.log10(data['qval_sampling'])
data['log_qval_ogtt'] = -np.log10(data['qval_ogtt'])
data['log_qval_sampling:ogtt'] = -np.log10(data['qval_sampling:ogtt'])
data['is_id'] = data['superclass'] != 'Unidentified'

# NOTE(review): unannotated rows are labelled 'Unidentified' earlier in the
# notebook, so the `!= 'unknown'` test presumably never excludes anything - confirm.
l_ids = data.loc[(data['Type'] == 'lipid') & (data['ID'] != 'unknown')].index
# Parse each lipid ID once and spread the five parsed fields over their columns
parsed_lipids = data.loc[l_ids, 'ID'].apply(parse_lipid)
lipid_fields = ['lipid_class', 'extra_lipid_label', 'fa_carbons', 'fa_unsat', 'fa_carbon:unsat']
for position, column in enumerate(lipid_fields):
    data.loc[l_ids, column] = parsed_lipids.apply(lambda parsed, k=position: parsed[k])
data['pval_asterisks'] = data['qval_sampling:ogtt'].apply(parse_p_value)

# Combine and write to files

In [47]:
# Forward slashes keep the output paths valid on every OS (a backslash path is
# only treated as a directory path on Windows).
# 1) Full table, including the model coefficients
data.to_csv('../data/processed/combined_metabolites_data_with_model_params.csv')
display(data)
# 2) Same table without the model parameter / slope / intercept columns
data = data.drop(columns=data.filter(regex='model_param|coef_|intercept_').columns)
display(data)
data.to_csv('../data/processed/combined_metabolites_data.csv')
data.to_excel('../data/processed/combined_metabolites_data.xlsx')

Unnamed: 0_level_0,ID,m/z,RT,molec_class,Type,polarity,lc_type,superclass,1091_8_FBG,1091_10_FBG,...,log_qval_sampling,log_qval_ogtt,log_qval_sampling:ogtt,is_id,lipid_class,extra_lipid_label,fa_carbons,fa_unsat,fa_carbon:unsat,pval_asterisks
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Unidentified_2.353_141.0659,Unidentified,141.06591,2.353,Unidentified,metabolite,-,HILIC,Unidentified,18.724636,17.600816,...,0.209851,3.911627,0.092460,False,,,,,,ns
Unidentified_11.539_180.0585,Unidentified,180.05850,11.539,Unidentified,metabolite,-,HILIC,Unidentified,20.779580,20.575689,...,11.987252,4.759830,3.239845,False,,,,,,***
Urocanic acid_11.365_139.0503,Urocanic acid,139.05028,11.365,Nitrogen heterocycle,metabolite,+,HILIC,Nitrogen heterocycle,24.180859,23.583080,...,0.748819,0.233844,0.104163,True,,,,,,ns
Uridine_7.513_243.0615,Uridine,243.06152,7.513,Nitrogen heterocycle,metabolite,-,HILIC,Nitrogen heterocycle,24.887680,24.713250,...,9.482876,0.889251,0.159312,True,,,,,,ns
Uric acid_12.732_167.0201,Uric acid,167.02007,12.732,Nitrogen heterocycle,metabolite,-,HILIC,Nitrogen heterocycle,27.518513,27.163433,...,0.459055,1.250723,0.480416,True,,,,,,ns
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TG 18:0_18:0_18:0_19.885_908.8654,TG 18:0_18:0_18:0,908.86542,19.885,TG,lipid,+,RP,Glycerolipid,19.413597,19.899861,...,1.893991,2.856877,1.814371,True,TG,,54.0,0.0,54:0,*
Unidentified_20.144_948.8965,Unidentified,948.89648,20.144,Unidentified,lipid,+,RP,Unidentified,16.606829,18.266873,...,1.258580,0.595997,1.828730,False,,,,,,*
Unidentified_20.148_897.8253,Unidentified,897.82532,20.148,Unidentified,lipid,+,RP,Unidentified,19.073429,20.133479,...,7.730918,0.617574,2.049913,False,,,,,,**
Unidentified_20.175_593.5872,Unidentified,593.58716,20.175,Unidentified,lipid,+,RP,Unidentified,19.091184,19.996615,...,9.410573,0.848449,2.257369,False,,,,,,**


Unnamed: 0_level_0,ID,m/z,RT,molec_class,Type,polarity,lc_type,superclass,1091_8_FBG,1091_10_FBG,...,log_qval_sampling,log_qval_ogtt,log_qval_sampling:ogtt,is_id,lipid_class,extra_lipid_label,fa_carbons,fa_unsat,fa_carbon:unsat,pval_asterisks
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Unidentified_2.353_141.0659,Unidentified,141.06591,2.353,Unidentified,metabolite,-,HILIC,Unidentified,18.724636,17.600816,...,0.209851,3.911627,0.092460,False,,,,,,ns
Unidentified_11.539_180.0585,Unidentified,180.05850,11.539,Unidentified,metabolite,-,HILIC,Unidentified,20.779580,20.575689,...,11.987252,4.759830,3.239845,False,,,,,,***
Urocanic acid_11.365_139.0503,Urocanic acid,139.05028,11.365,Nitrogen heterocycle,metabolite,+,HILIC,Nitrogen heterocycle,24.180859,23.583080,...,0.748819,0.233844,0.104163,True,,,,,,ns
Uridine_7.513_243.0615,Uridine,243.06152,7.513,Nitrogen heterocycle,metabolite,-,HILIC,Nitrogen heterocycle,24.887680,24.713250,...,9.482876,0.889251,0.159312,True,,,,,,ns
Uric acid_12.732_167.0201,Uric acid,167.02007,12.732,Nitrogen heterocycle,metabolite,-,HILIC,Nitrogen heterocycle,27.518513,27.163433,...,0.459055,1.250723,0.480416,True,,,,,,ns
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TG 18:0_18:0_18:0_19.885_908.8654,TG 18:0_18:0_18:0,908.86542,19.885,TG,lipid,+,RP,Glycerolipid,19.413597,19.899861,...,1.893991,2.856877,1.814371,True,TG,,54.0,0.0,54:0,*
Unidentified_20.144_948.8965,Unidentified,948.89648,20.144,Unidentified,lipid,+,RP,Unidentified,16.606829,18.266873,...,1.258580,0.595997,1.828730,False,,,,,,*
Unidentified_20.148_897.8253,Unidentified,897.82532,20.148,Unidentified,lipid,+,RP,Unidentified,19.073429,20.133479,...,7.730918,0.617574,2.049913,False,,,,,,**
Unidentified_20.175_593.5872,Unidentified,593.58716,20.175,Unidentified,lipid,+,RP,Unidentified,19.091184,19.996615,...,9.410573,0.848449,2.257369,False,,,,,,**
