In [None]:
import pandas as pd
import numpy as np
import glob as glob
import os
import sys
import matplotlib.pyplot as plt
import networkx as nx
import pingouin as pg
import seaborn as sns

import pandas as pd
import numpy as np
import glob as glob
import matplotlib.pyplot as plt
import networkx as nx
import os
import glob

# sys.path.insert(0,'../')
import analysis_tools as at

In [None]:
# ---- experiment selection ----
# Raw-data directory for this run.
# NOTE: files must follow JGI file naming conventions and be converted to hdf5 format.
exp_dir = '/global/cfs/cdirs/metatlas/raw_data/egsb/20231018_EB_MdR_109570-002_WAVEstab_20231017_EXP120A_C18-EP_USDAY72349_vols'

# The two sample groups being compared (treatment vs. control).
group_control = 'supern-WAVE-NatCom-NLDM-Day0'
group_treatment = 'supern-WAVE-NatCom-NLDM-Day7'
my_groups = dict(control=group_control, treatment=group_treatment)

# The experiment name and both group names are baked into the output csv name.
experiment_name = 'wavestab1'
output_filename = f'OUTPUT_{experiment_name}_{group_treatment}-vs-{group_control}.csv'

# ---- feature-finding / scoring thresholds ----
mz_ppm_tolerance = 5       # tolerance in ppm between experimental signal and node mz
peak_height_min = 1e4      # passed to at.get_sample_ms1_data further down
num_datapoints_min = 10    # passed to at.get_sample_ms1_data further down
msms_score_min = 0.5       # minimum MSMS score
msms_matches_min = 3       # minimum MSMS matching ion count
rt_range = [1, 700]        # retention time range in minutes for feature finding
frag_mz_tolerance = 0.05   # tolerance in daltons used for calculating MS/MS similarity scores

# ---- record every setting next to the results ----
# Key order matters: it is the line order of the .params file.
params = dict(
    mz_ppm_tolerance=mz_ppm_tolerance,
    peak_height_min=peak_height_min,
    num_datapoints_min=num_datapoints_min,
    msms_score_min=msms_score_min,
    msms_matches_min=msms_matches_min,
    rt_range=rt_range,
    frag_mz_tolerance=frag_mz_tolerance,
)
params.update(
    exp_dir=exp_dir,
    my_groups=my_groups,
    output_filename=output_filename,
)

# One "key: value" line per setting, written to <output name>.params
with open(output_filename.replace('.csv','.params'),'w') as f:
    f.writelines("%s: %s\n" % entry for entry in params.items())

In [None]:
# Gather the node tables and the per-file metadata used by the rest of the notebook.
node_data = at.graph_to_df()
node_atlas = at.make_node_atlas(node_data, rt_range)
merged_node_data = at.merge_spectral_data(node_data)
files_data = at.get_files_df(exp_dir, parse_filename=True, groups=my_groups)
files = files_data['filename'].tolist()

# One row per identified compound (first occurrence of each InChIKey, rows
# without an InChIKey removed), with the columns renamed for the descriptor step.
cols = ['inchi_key_identity', 'smiles_identity']
data = (
    node_data[cols]
    .drop_duplicates('inchi_key_identity')
    .dropna(subset=['inchi_key_identity'])
    .rename(columns={'inchi_key_identity': 'inchi_key', 'smiles_identity': 'smiles'})
)

# sys.path.insert(0,'.')
from get_compound_descriptors import calc_descriptor_df

# Replace the compound table with whatever calc_descriptor_df returns for it.
# NOTE(review): a later cell merges this result on an 'inchikey' column and reads
# columns starting with 'property' - presumably both are produced here; confirm.
data = calc_descriptor_df(data)

In [None]:
# Point the analysis at the mzML counterpart of every file.
# Only a trailing '.h5' extension is swapped: a plain str.replace('h5', 'mzML')
# would also rewrite any 'h5' that happens to occur inside a directory or
# sample name and silently produce a path that does not exist.
mzml_files = [
    os.path.splitext(file)[0] + '.mzML' if file.endswith('.h5') else file
    for file in files
]
files = mzml_files

In [None]:
# Overwrite the 'filename' column with the current `files` list (the mzML paths
# built in the previous cell) so the metadata table and the file list agree.
files_data['filename'] = files

In [None]:
# Run the MS1 / MS2 extraction and assemble the per-node result table.
# All heavy lifting happens inside analysis_tools (`at`); the comments below
# only describe what is passed in and how the results are chained.

# MS1: extract signal for the node atlas from every file using the ppm, peak
# height and data-point thresholds, then reduce with get_best_ms1_rawdata.
ms1_data = at.get_sample_ms1_data(node_atlas, files, mz_ppm_tolerance,peak_height_min,num_datapoints_min)
max_ms1_data = at.get_best_ms1_rawdata(ms1_data,node_data)
# MS2: score spectra from every file against the merged node spectra using the
# score / matching-ion minimums and the two mass tolerances.
ms2_data = at.get_sample_ms2_data(files,merged_node_data,msms_score_min,msms_matches_min,mz_ppm_tolerance,frag_mz_tolerance)
# ms2_data is concatenated with pd.concat before being reduced
# (presumably one DataFrame per file - TODO confirm).
max_ms2_data = at.get_best_ms2_rawdata(pd.concat(ms2_data))
best_hits = at.get_best_ms1_ms2_combined(max_ms1_data,max_ms2_data)

# Statistics are computed from the MS1 data plus the file metadata table; the
# final table is built from node data, best hits and stats, with
# output_filename passed as `filename` (presumably written to disk - confirm).
stats_df = at.do_basic_stats(ms1_data,files_data)
output_df = at.make_output_df(node_data,best_hits,stats_df,filename=output_filename)

In [None]:
# Display the table returned by at.do_basic_stats.
stats_df

In [None]:
# Screen every compound descriptor for a correlation with the fold change.

# Start from identified nodes that have a fold change and p_value < 0.05 ...
cols = ['inchi_key_identity', 'log2_foldchange', 'p_value']
d = output_df[cols]
keep = (
    d['inchi_key_identity'].notna()
    & d['log2_foldchange'].notna()
    & (d['p_value'] < 0.05)
)
d = d[keep]

# ... and attach the descriptor table.
# NOTE(review): the join key on the descriptor side is 'inchikey'; the table was
# renamed to 'inchi_key' before calc_descriptor_df, so this relies on that
# function emitting an 'inchikey' column - confirm.
d = pd.merge(d, data, left_on='inchi_key_identity', right_on='inchikey', how='inner')

# Descriptor columns are recognised by their 'property' prefix.
cols = [name for name in d.columns if name.startswith('property')]
for c in cols:
    # The float conversion is stored back into d; later cells read these columns.
    d[c] = d[c].astype(float)
    x, y = d['log2_foldchange'], d[c]
    idx = x.notna() & y.notna()
    corr = pg.corr(x[idx], y[idx])
    r = corr["r"].values[0]
    p = corr["p-val"].values[0]
    # Report only descriptors passing both cut-offs.
    # NOTE(review): |r| > 0.04 is a very permissive threshold - confirm it is
    # not meant to be 0.4.
    if abs(r) > 0.04 and p < 0.05:
        print(f'{c}: r={r:.2f}, p={p:.2e}')


In [None]:
# Compare the distribution of one descriptor between compounds that went up
# and compounds that went down.
my_prop = 'property: hallKierAlpha'

# Keep only rows where both the fold change and the descriptor are present.
x, y = d['log2_foldchange'], d[my_prop]
idx = x.notna() & y.notna()
x, y = x[idx], y[idx]

fig, ax = plt.subplots()
# Shared bin edges so the two histograms are directly comparable.
bins = np.linspace(y.min(), y.max(), 20)

# Positive fold change ("not eaten?") is drawn first, negative ("eaten?") second.
for label, idx in (('Increased', x > 0), ('Decreased', x < 0)):
    sns.histplot(y[idx], ax=ax, bins=bins, label=label)
ax.legend()


In [None]:
from rdkit import Chem
from rdkit.Chem import Draw


def _load_unchanged_hits(csv_path, p_value_min=0.05, abs_log2fc_max=0.25):
    """Read one OUTPUT csv and keep the rows that did not change.

    A row is kept when its p_value is above `p_value_min`, the absolute
    log2_foldchange is below `abs_log2fc_max`, and ms2_matches is greater
    than zero. Rows with a missing value in any of these columns are dropped,
    because comparisons against NaN are False.

    Parameters
    ----------
    csv_path : str
        Path of an OUTPUT_*.csv file containing the columns 'p_value',
        'log2_foldchange' and 'ms2_matches'.
    p_value_min : float
        Exclusive lower bound on p_value.
    abs_log2fc_max : float
        Exclusive upper bound on |log2_foldchange|.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, original columns and index preserved.
    """
    hits = pd.read_csv(csv_path)
    unchanged = (
        (hits['p_value'] > p_value_min)
        & (hits['log2_foldchange'].abs() < abs_log2fc_max)
        & (hits['ms2_matches'] > 0)
    )
    return hits[unchanged]


df1 = _load_unchanged_hits('OUTPUT_wavestab1_supern-WAVE-NatCom-NLDM-Day7-vs-supern-WAVE-NatCom-NLDM-Day0.csv')
df2 = _load_unchanged_hits('OUTPUT_wavestab3_supern-CentExp-OMT1d2-NatCom-d7-NA-vs-supern-CentExp-OMT1d2-Sterile-d0-NA.csv')

# Nodes that pass the "unchanged" filter in both experiments.
nodes = list(set(df1['node_id'].tolist()) & set(df2['node_id'].tolist()))

# Unique structures for those nodes. Missing SMILES are dropped first:
# Chem.MolFromSmiles only accepts strings and raises on a NaN entry.
smiles = df1.loc[df1['node_id'].isin(nodes), 'smiles_identity'].dropna().unique()

# NOTE: MolFromSmiles returns None for a SMILES it cannot parse.
mols = [Chem.MolFromSmiles(s) for s in smiles]
Draw.MolsToGridImage(mols, molsPerRow=4)

In [None]:
# List the column names of the filtered first-experiment table.
df1.columns