In [1]:
### Preprocessing requirements
import pandas as pd
# 1. Copy all .h5 and .mzML files from the job folders to a single output directory for easier access.
# 2. Run the deconvolution to make <filename>_deconvoluted.parquet files.
import os
import numpy as np
from scipy.stats import ttest_ind
# import ztest
from statsmodels.stats.weightstats import ztest
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches


In [2]:
# Make the local envnet checkout importable, then load the ENVnet reference
# network and keep only the node-level annotation columns used below.
import sys  # required for sys.path; the imports cell at the top does not import it

PYTHONPATH = "/global/homes/b/bpb/repos/envnet"
if PYTHONPATH not in sys.path:
    sys.path.insert(0, PYTHONPATH)

from envnet.annotation.core import AnnotationEngine
annotation_engine = AnnotationEngine()
ref_dir = '/global/homes/b/bpb/repos/envnet/results/full_build_20250908_181404/'
# NOTE(review): the captured log shows the MGF paths printed as
# ".../envnet/data//global/homes/..." (a data directory prefixed onto this
# absolute base name) — confirm that an absolute mgf_base_name is what
# load_envnet_reference expects.
node_data = annotation_engine.load_envnet_reference(
    graphml_file=os.path.join(ref_dir, "network_with_sirius.graphml"),
    mgf_base_name=os.path.join(ref_dir, "envnet")
)
cols = ['original_index','precursor_mz','inchi_key', 'compound_name', 'smiles','NPC#pathway', 'NPC#superclass', 'NPC#class','predicted_formula']
# load_envnet_reference returns a mapping; only its 'nodes' table is kept,
# restricted to the columns listed in `cols`.
node_data = node_data['nodes'][cols]

Loading ENVnet reference data...
  GraphML file: /global/homes/b/bpb/repos/envnet/results/full_build_20250908_181404/network_with_sirius.graphml
  Deconvoluted MGF: /global/u2/b/bpb/repos/envnet/data//global/homes/b/bpb/repos/envnet/results/full_build_20250908_181404/envnet_deconvoluted_spectra.mgf
  Original MGF: /global/u2/b/bpb/repos/envnet/data//global/homes/b/bpb/repos/envnet/results/full_build_20250908_181404/envnet_original_spectra.mgf
Loaded 22128 ENVnet nodes


In [7]:
# Load the MS1 annotations and the MS2 deconvoluted-match table, flag MS1 rows
# whose node has any MS2 match, and keep only the T0-MeOH runs.
ms1_filename = '/pscratch/sd/b/bpb/envnet_annotation_results/ms1_results_experiments_for_paper/ms1_annotations.parquet'
ms2_filename = '/pscratch/sd/b/bpb/envnet_annotation_results/ms2_results_experiments_for_paper/ms2_deconvoluted_annotations.parquet'
ms1_df = pd.read_parquet(ms1_filename)
print(ms1_df.shape)
# Only these MS2 columns are read from the parquet file.
ms2_cols = ['score_deconvoluted_match', 'matches_deconvoluted_match',
       'original_index_deconvoluted_match',  'filename',
        'mz_diff']
ms2_df = pd.read_parquet(ms2_filename, columns=ms2_cols)
# Disabled alternative: a per-file left merge of the MS2 matches onto the MS1 rows.
# ms1_df = pd.merge(ms1_df, ms2_df, left_on=['lcmsrun_observed','original_index'], right_on=['filename','original_index_deconvoluted_match'], how='left')
# ms1_df.drop(columns=['original_index_deconvoluted_match','filename','confidence_level','h5'], inplace=True)
# The flag matches on node index only: a node counts as having MS2 evidence if
# it was matched in ANY file of ms2_df, not necessarily in the same run.
ms1_df['has_ms2_evidence'] = ms1_df['original_index'].isin(ms2_df['original_index_deconvoluted_match'].unique())
# Keep only runs whose path contains 'T0-MeOH'.
ms1_df = ms1_df[ms1_df['lcmsrun_observed'].str.contains('T0-MeOH')]
# One row per remaining run; the treatment label is the 13th underscore-separated
# token (index 12) of the file basename.
sample_metadata = pd.DataFrame(ms1_df['lcmsrun_observed'].unique(), columns=['lcmsrun_observed'])
sample_metadata['basename'] = sample_metadata['lcmsrun_observed'].apply(lambda x: os.path.basename(x))
sample_metadata['treatment'] = sample_metadata['basename'].apply(lambda x: x.split('_')[12])

print(ms1_df.shape)

(3602431, 10)
(305275, 11)


In [8]:
# Number of runs per treatment label (replicate check).
sample_metadata['treatment'].value_counts()

20M-T0-MeOH    3
66M-T0-MeOH    3
42M-T0-MeOH    3
82M-T0-MeOH    3
55M-T0-MeOH    3
Name: treatment, dtype: int64

In [None]:
# Load the MS1 annotations, MS2 annotations and statistical results for the
# permafrost analysis. data_path is relative to the notebook's working directory.
data_path = '../results/full_build_20250908_181404/permafrost results gnps2'
ms1_filename = os.path.join(data_path,'ms1_results','ms1_annotations.parquet')
ms2_filename = os.path.join(data_path,'ms2_results','ms2_deconvoluted_annotations.parquet')
analysis_filename = os.path.join(data_path,'analysis_results','statistical_results.csv')
# NOTE: ms1_filename / ms2_filename are also assigned by the annotation-loading
# cells; whichever cell ran last determines their value.
ms1_data = pd.read_parquet(ms1_filename)
ms2_data = pd.read_parquet(ms2_filename)
analysis_data = pd.read_csv(analysis_filename)

In [None]:
# Add the run's file basename (with '.mzML' removed) and show how many MS1 rows
# each run contributes.
ms1_data['basename'] = ms1_data['lcmsrun_observed'].apply(lambda x: os.path.basename(x).replace('.mzML',''))
ms1_data['basename'].value_counts()

In [None]:
# Inspect which columns the statistical results table provides.
analysis_data.columns

In [None]:
# For every predicted formula, collect the significance class of each node that
# carries it and rank formulas by how many distinct classes they span.
analysis_data[['original_index']] = analysis_data[['original_index']].astype(int)
cols = ['original_index','log2_foldchange','p_value']
# Inner merge: only nodes present in both the reference and the statistics remain.
temp = pd.merge(node_data, analysis_data[cols], on='original_index', how='inner')
temp['classification'] = 'unclassified'
# Significant (p < 0.05) with more than a two-fold change (|log2 FC| > 1).
idx1 = temp['p_value'] < 0.05
idx2 = temp['log2_foldchange'] > 1
temp.loc[idx1 & idx2, 'classification'] = 'increased'
idx2 = temp['log2_foldchange'] < -1
temp.loc[idx1 & idx2, 'classification'] = 'decreased'
# temp = temp[temp['classification'] != 'unclassified']
# One row per formula holding the list of its nodes' classes.
temp = temp.groupby('predicted_formula')['classification'].apply(list)
temp = temp.reset_index()
# Number of distinct classes per formula (the column name is spelled
# 'classificiation_counts' and is referenced with that spelling below).
temp['classificiation_counts'] = temp['classification'].apply(lambda x: len(pd.unique(x)))
# temp[temp['classificiation_counts'] > 1]
temp.sort_values('classificiation_counts', ascending=False).head(10)

In [None]:
# Plot per-run summed peak area for every node whose predicted formula is
# C16H18O9, one swarm column per node.
node_indices = node_data.loc[node_data['predicted_formula']=='C16H18O9','original_index'].values
print(node_data.loc[node_data['original_index'].isin(node_indices),'precursor_mz'].values)
temp = ms1_data[ms1_data['original_index'].isin(node_indices)].copy()
# Sum peak areas so each (node, run) pair contributes a single point.
temp = temp.groupby(['original_index','lcmsrun_observed'])['peak_area'].sum().reset_index()
# Token 12 of the basename is used as the sample type; tokens 12-15 form the legend label.
temp['sample_type'] = temp['lcmsrun_observed'].apply(lambda x: os.path.basename(x).split('_')[12])
temp['filename_short'] = temp['lcmsrun_observed'].apply(lambda x: '_'.join(os.path.basename(x).split('_')[12:16]) )
# These two imports repeat the imports cell at the top of the notebook.
import seaborn as sns
import matplotlib.pyplot as plt
fig,ax = plt.subplots(figsize=(12,6))
# sns.boxplot(data=temp, hue='sample_type', y='peak_area', ax=ax,x='original_index')
sns.swarmplot(data=temp, hue='filename_short', y='peak_area', ax=ax,x='original_index')
ax.set_yscale('log')
ax.set_title('C16H18O9 abundance in permafrost samples')
ax.set_xlabel('Node original index')
ax.set_ylabel('Peak Area (log scale)')
# move legend outside
ax.legend(bbox_to_anchor=(0.01, -0.3), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# import sys
# PYTHONPATH = "/global/homes/b/bpb/repos/envnet"
# if PYTHONPATH not in sys.path:
#     sys.path.insert(0, PYTHONPATH)

# from envnet.deconvolution.workflows import LCMSWorkflow
# import os
# from pathlib import Path

# # Setup workflow once
# workflow = LCMSWorkflow(do_buddy=True)

# # Process multiple files
# input_dir = Path('/global/cfs/cdirs/metatlas/projects/envnet_build_files/analysis_for_manuscript')
# output_dir = Path('/global/cfs/cdirs/metatlas/projects/envnet_build_files/analysis_for_manuscript')
# output_dir.mkdir(exist_ok=True)

# for file_path in input_dir.glob('*.h5'):
#     output_file = output_dir / f"{file_path.stem}_deconvoluted.parquet"
#     if os.path.exists(output_file):
#         continue
#     df = workflow.run_full_workflow(str(file_path))
#     if df is not None:
#         df.drop(columns=['deconvoluted_spectrum','original_spectrum'], inplace=True, errors='ignore')
#         df.to_parquet(output_file)
#         print(f"Processed: {file_path.name} -> {output_file.name}")
#     else:
#         print(f"No data found in: {file_path.name}")


In [None]:
# NOTE(review): no earlier active cell assigns `df` (the only earlier assignment
# is inside the commented-out deconvolution cell), so on a fresh top-to-bottom
# run this raises NameError — confirm whether this cell is still needed.
df

In [9]:
# Select two nodes of interest and their MS1 rows from run 92.
# Requires `ms1_data` from the permafrost loading cell; the NameError captured
# below comes from executing this cell before that one.
nodes = [2587640,2605164]
# node_slice is reused by the raw-data extraction cell further down.
node_slice = node_data[node_data['original_index'].isin(nodes)]
temp = ms1_data[ms1_data['original_index'].isin(nodes)].copy()
temp = temp[temp['lcmsrun_observed'].str.contains('_Run92')]
temp

NameError: name 'ms1_data' is not defined

In [None]:
# Build the input table for the MS1/MS2 annotation workflow: one row per .h5
# file found in the permafrost run directory.
import pandas as pd
cols = ['h5', 'parquet', 'environmental_subclass', 'lcmsrun_observed',
       'original_file_type']
import os
my_dir = '/pscratch/sd/b/bpb/20231004_JGI_SB_503799_Pfrost_final_QEHF_C18_USDAY81384'
# Keep only .h5 files: the directory listing can also hold other file types
# (the preprocessing notes mention .mzML and *_deconvoluted.parquet), and
# those must not end up in the 'h5' column. Sorted so the CSV is reproducible,
# since os.listdir returns entries in arbitrary order.
my_files = sorted(f for f in os.listdir(my_dir) if f.endswith('.h5'))
df = pd.DataFrame(columns=cols)
df['h5'] = my_files
# Strip only the trailing extension (str.replace would also remove an '.h5'
# occurring in the middle of a name).
df['lcmsrun_observed'] = df['h5'].apply(lambda x: os.path.splitext(os.path.basename(x))[0])
df['parquet'] = df['h5'].apply(lambda x: os.path.splitext(x)[0] + '_deconvoluted.parquet')
df['environmental_subclass'] = 'not applicable'
df['original_file_type'] = 'h5'
output_filename = '/global/homes/b/bpb/repos/envnet/scripts/input_for_ms1-ms2_annotation-permafrost-for-paper.csv'
df.to_csv(output_filename, index=False)

In [None]:
# Add 'lcmsrun_observed' and 'original_file_type' columns to build_files.csv.
# The file is read and then overwritten in place at the same path.
import pandas as pd
df = pd.read_csv('/global/homes/b/bpb/repos/envnet/scripts/build_files.csv')
# Run name = basename of the h5 path with '.h5' removed.
df['lcmsrun_observed'] = df['h5'].apply(lambda x: os.path.basename(x).replace('.h5',''))
df['original_file_type'] = 'h5'

df.to_csv('/global/homes/b/bpb/repos/envnet/scripts/build_files.csv', index=False)

In [None]:
# Display the annotation input table for the experiments dataset.
import pandas as pd
df = pd.read_csv('/global/homes/b/bpb/repos/envnet/scripts/input_for_ms1-ms2_annotation-experiments-for-paper.csv')
df

In [None]:
# Sanity check against raw data: for each node in `node_slice`, sum the raw MS1
# intensity within an m/z tolerance and retention-time window in one run.
# h5_filename first holds the directory and is then replaced by the full file path.
h5_filename = '/global/cfs/cdirs/metatlas/raw_data/jgi/20231004_JGI_SB_503799_Pfrost_final_QEHF_C18_USDAY81384'
basename = '20231004_JGI_SB_503799_Pfrost_final_QEHF_C18_USDAY81384_NEG_MS2_9A_55M-T0-MeOH_3_Rg80to1200-CE205060-perma-S1_Run148.h5'
# basename = '20230822_JGI_SB_503799_Pfrost_final_QEHF_C18_USDAY81384_NEG_MS2_9A_55M-T0-MeOH_3_Rg80to1200-CE205060-perma-S1_Run148.h5'
h5_filename = os.path.join(h5_filename, basename)
raw_data = pd.read_hdf(h5_filename, key='ms1_neg')
# mz_tol is compared against a ppm error below; rt_min / rt_max bound the 'rt'
# column (presumably minutes — TODO confirm units).
mz_tol= 10
rt_min = 1
rt_max = 9
for i,row in node_slice.iterrows():
    precursor_mz = row['precursor_mz']
    # Relative m/z error of every raw point, in parts per million.
    diff = np.abs(raw_data['mz'] - precursor_mz) / precursor_mz * 1e6

    idx = (diff < mz_tol) & (raw_data['rt'] >= rt_min) & (raw_data['rt'] <= rt_max)
    extracted = raw_data[idx]
    # Summed intensity, scaled by 1e8 for readability.
    peak_area = extracted['i'].sum() / 1e8
    print(f"Node {row['original_index']}:{row['precursor_mz']:.4f} ({row['predicted_formula']}): Extracted Peak Area: {peak_area:.4f}")

In [None]:
# Load the MS1 annotations and keep only the rows that have an MS2
# deconvoluted match in the same run (inner merge on run + node index).
ms1_filename = '/pscratch/sd/b/bpb/envnet_annotation_results/ms1_results_experiments_for_paper/ms1_annotations.parquet'
ms2_filename = '/pscratch/sd/b/bpb/envnet_annotation_results/ms2_results_experiments_for_paper/ms2_deconvoluted_annotations.parquet'
ms1_df = pd.read_parquet(ms1_filename)
print(ms1_df.shape)
ms2_cols = ['score_deconvoluted_match', 'matches_deconvoluted_match',
       'original_index_deconvoluted_match',  'filename',
        'mz_diff']
ms2_df = pd.read_parquet(ms2_filename, columns=ms2_cols)
ms1_df = pd.merge(ms1_df, ms2_df, left_on=['lcmsrun_observed','original_index'], right_on=['filename','original_index_deconvoluted_match'], how='inner')
# Drop the merge keys duplicated from the MS2 side and columns not used downstream.
ms1_df.drop(columns=['original_index_deconvoluted_match','filename','confidence_level','h5'], inplace=True)
sample_metadata = pd.DataFrame(ms1_df['lcmsrun_observed'].unique(), columns=['lcmsrun_observed'])
sample_metadata['basename'] = sample_metadata['lcmsrun_observed'].apply(lambda x: os.path.basename(x))
# The treatment label is token 12 of the basename, matching the other cells
# that parse these filenames; token 11 is the preceding field.
sample_metadata['treatment'] = sample_metadata['basename'].apply(lambda x: x.split('_')[12])
print(ms1_df.shape)

In [None]:
# Derive per-run metadata (project, timepoint, priming, soil type) from the run
# filename and attach it to ms1_df. Runs that do not resolve all three of
# timepoint / priming / soil_type are dropped by the final inner merge.
# NOTE: ms1_df is replaced by the merged frame, so re-running this cell without
# reloading ms1_df merges the metadata columns a second time.
sample_metadata = pd.DataFrame(ms1_df['lcmsrun_observed'].unique(), columns=['lcmsrun_observed'])
sample_metadata['basename'] = sample_metadata['lcmsrun_observed'].apply(lambda path: os.path.basename(path))
sample_metadata['project'] = sample_metadata['basename'].apply(lambda name: name.split('_')[4])
sample_metadata['treatment'] = sample_metadata['basename'].apply(lambda name: name.split('_')[12])

_treatment = sample_metadata['treatment']


def _has(token):
    """Case-insensitive literal substring match on the treatment label."""
    return _treatment.str.contains(token, regex=False, case=False)


def _ends(suffix):
    """Case-sensitive suffix match on the treatment label."""
    return _treatment.str.endswith(suffix)


# Rules are applied in order within each column; a later rule overrides an
# earlier one where both match.
_label_rules = {
    'timepoint': [
        (_has('-d7'), 7),
        (_has('-day7'), 7),
        (_has('-d0'), 0),
        (_has('-day0'), 0),
    ],
    'priming': [
        (_ends('-NA'), 'Unprimed'),
        (_has('-na-'), 'Unprimed'),
        (_has('-natcom-salts'), 'Unprimed'),
        (_ends('-Lo'), 'Low'),
        (_has('-0p05xnldm-'), 'Low'),
        (_ends('-Hi'), 'High'),
        (_has('-0p5xnldm-'), 'High'),
    ],
    'soil_type': [
        (_has('supern-wave-natcom'), 'Potting Soil'),
        (_has('omt1d2'), 'Agricultural Soil'),
        (_has('h4171'), 'H4171 lignin'),
        (_has('h4161'), 'H4161 lignin'),
    ],
}
for _column, _rules in _label_rules.items():
    sample_metadata[_column] = None
    for _mask, _label in _rules:
        sample_metadata.loc[_mask, _column] = _label

# Keep only runs where every metadata field was resolved.
cols = ['timepoint','priming','soil_type']
idx = pd.notna(sample_metadata[cols]).all(axis=1)
sample_metadata = sample_metadata[idx].copy()
sample_metadata.drop(columns=['basename','treatment'], inplace=True)
ms1_df = pd.merge(ms1_df, sample_metadata, on='lcmsrun_observed', how='inner')
print(ms1_df.shape)

In [None]:
def row_filter(df,project,column_name):
    """Select the rows of `df` that belong to one project's sample set.

    Rows whose `column_name` value contains 'exctrl', 'qc' or 'txctrl'
    (case-insensitive) are always removed first. The remaining rows are then
    narrowed by project-specific substring/regex patterns; an unrecognised
    project returns every non-control row.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a string column `column_name`.
    project : str
        Project key selecting which pattern set to apply.
    column_name : str
        Name of the column holding the run/file names to match against.

    Returns
    -------
    pandas.DataFrame
        The filtered rows.
    """
    is_control = df[column_name].str.contains('exctrl|qc|txctrl',case=False)
    kept = df[~is_control]
    names = kept[column_name]

    if project == 'Potting Soil':
        keep = names.str.contains('6uL') & names.str.contains('NatCom') & names.str.contains('Day0|Day7')
    elif project == 'Agricultural Soil NA':
        keep = names.str.contains('NatCom') & names.str.contains('NA')
    elif project == 'Agricultural Soil Lo':
        keep = names.str.contains('NatCom') & names.str.contains('Lo')
    elif project == 'Agricultural Soil':
        # The plain 'Agricultural Soil' key selects the 'Hi' samples.
        keep = names.str.contains('NatCom') & names.str.contains('Hi')
    elif project == 'soil-ppl':
        keep = names.str.contains('Run15')
    elif project == 'century-exp':
        keep = names.str.contains('omt|cmt',case=False)
    else:
        # Unknown project: only the control exclusion applies.
        return kept
    return kept[keep]
    
def tost(control, treatment, margin=0.25):
    """Two one-sided z-tests (TOST) for equivalence of two group means.

    The equivalence interval is symmetric around zero with half-width
    ``margin * |mean(control)|``. Two one-sided z-tests are run on the
    difference of means (control vs. treatment), one against each bound, and
    the larger p-value is returned. A small return value therefore means both
    one-sided tests rejected, i.e. the difference lies inside the interval.

    Parameters
    ----------
    control, treatment : array-like of float
        Observations for the two groups.
    margin : float, default 0.25
        Half-width of the equivalence interval as a fraction of the absolute
        control mean.

    Returns
    -------
    float
        max(p_lower, p_upper) of the two one-sided tests.

    Notes
    -----
    The interval scales with the control mean, so it collapses to zero width
    when that mean is near zero and the test then cannot conclude equivalence.
    """
    # abs() keeps lower <= upper when the control mean is negative; without it
    # the two bounds swap and the pair of tests no longer brackets zero.
    half_width = margin * abs(np.mean(control))
    lower_margin = -half_width
    upper_margin = half_width
    _, p_value_lower = ztest(control, treatment, value=lower_margin, alternative='larger')
    _, p_value_upper = ztest(control, treatment, value=upper_margin, alternative='smaller')
    return max(p_value_lower, p_value_upper)

def do_ttest(df,control_group,treatment_group,do_split=True,min_intensity=1e6,margin=0.2):
    """Compare peak areas between two timepoints for every node in `df`.

    `df` is pivoted to a runs x nodes matrix of mean `peak_area`; missing
    values are filled with 1e5. For each node whose mean (+1) reaches
    `min_intensity` in at least one group, the function records a two-sample
    t-test, the log2 fold change (treatment / control) and a TOST equivalence
    p-value.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs columns 'lcmsrun_observed', 'timepoint', 'peak_area' and
        'original_index'.
    control_group, treatment_group
        Values of 'timepoint' identifying the two groups.
    do_split : bool
        Unused; kept for backward compatibility.
    min_intensity : float
        Nodes whose group means are both below this value are skipped.
    margin : float
        Unused: the TOST call below uses a fixed margin of 0.5.
        NOTE(review): confirm whether this argument was meant to be forwarded.

    Returns
    -------
    pandas.DataFrame
        One row per tested node with columns 'original_index', 't', 'p', 'fc',
        'tost', 'mean_treatment', 'mean_control', 'treatment_vals',
        'control_vals'. The columns are present even when no node passes the
        intensity filter, so callers can index them on an empty result.
    """
    result_columns = ['original_index','t','p','fc','tost',
                      'mean_treatment','mean_control',
                      'treatment_vals','control_vals']

    p = pd.pivot_table(df,index=['lcmsrun_observed','timepoint'],values='peak_area',columns='original_index')
    # A node not detected in a run gets a floor value instead of NaN.
    p = p.fillna(1e5)
    cols = p.columns
    p = p.reset_index(drop=False)

    # Group membership does not depend on the node, so compute it once.
    is_treatment = p['timepoint']==treatment_group
    is_control = p['timepoint']==control_group

    ttest = []
    for c in cols:
        treatment = p[c][is_treatment].values
        control = p[c][is_control].values
        # +1 keeps the fold-change ratio finite if a mean is zero.
        mean_treatment = 1+treatment.mean()
        mean_control = 1+control.mean()
        if (mean_treatment<min_intensity) and (mean_control<min_intensity):
            continue
        fold_change = np.log2(mean_treatment/mean_control)
        t,p_value = ttest_ind(treatment, control)
        # Two One-Sided Tests (TOST) procedure using 50% margin
        p_tost = tost(control,treatment, margin=0.5)
        ttest.append({'original_index':c,'t':t,'p':p_value,'fc':fold_change,'tost':p_tost,
                      'mean_treatment':mean_treatment,'mean_control':mean_control,
                      'treatment_vals':treatment,
                      'control_vals':control})
    # Explicit columns keep the schema stable when `ttest` is empty.
    return pd.DataFrame(ttest, columns=result_columns)



In [None]:
# Run the day-0 vs day-7 comparison separately for every
# (project, priming, soil_type) group and label each node as
# 'prefered' (decreased), 'produced' (increased) or 'ignored' (equivalent).
grouped_ms1_df = [g for _, g in ms1_df.groupby(['project','priming','soil_type'])]
out = []
for g_df in grouped_ms1_df:
    project = g_df['project'].values[0]
    priming = g_df['priming'].values[0]
    soil_type = g_df['soil_type'].values[0]
    print(f'Processing {project} {priming} {soil_type}')
    # control timepoint = 0, treatment timepoint = 7
    ttest = do_ttest(g_df,0,7,min_intensity=1e7)
    if ttest.empty:
        # No node passed the intensity filter; an empty result may lack the
        # 'fc' / 'p' / 'tost' columns indexed below, so skip the group.
        print('  no features above the intensity filter; skipped')
        continue
    ttest['project'] = project
    ttest['priming'] = priming
    ttest['soil_type'] = soil_type
    idx_prefered = (ttest['fc']<-0.5) & (ttest['p']<0.05)
    idx_produced = (ttest['fc']>0.5) & (ttest['p']<0.05)
    idx_ignored = ttest['tost']<0.05
    ttest['classification'] = None
    ttest.loc[idx_prefered,'classification'] = 'prefered'
    ttest.loc[idx_produced,'classification'] = 'produced'
    # Assigned last, so 'ignored' wins if a node satisfies more than one rule.
    ttest.loc[idx_ignored,'classification'] = 'ignored'
    out.append(ttest)
final_df = pd.concat(out, ignore_index=True)
print(final_df.shape)

In [None]:
# Count, per predicted formula, how many classified nodes fall in each class
# for the Agricultural Soil samples, and how many formulas span several classes.
temp = final_df[final_df['classification'].notna()]
# 'Agricultural Soil' is a soil_type label; the 'project' column holds a
# filename token, so filtering on it would never select these rows.
idx1 = temp['soil_type'] == 'Agricultural Soil'
# idx2 = temp['classification'] == 'prefered'
# idx3 = temp['classification'] == 'produced'
temp = temp[idx1].copy()# & (idx2 | idx3)].copy()
temp = pd.merge(temp, node_data, on='original_index', how='left')
# Rows = formulas, columns = classification labels, values = node counts.
temp = temp.groupby('predicted_formula')['classification'].value_counts().unstack(fill_value=0)
print("There are {} formulas that are prefered, produced or ignored".format(len(temp)))
# Formulas with nodes in more than one class.
idx1 = (temp>0).sum(axis=1)>1
print('There are {} formulas that are prefered and produced or ignored'.format(temp[idx1].shape[0]))
# temp = temp[(temp['prefered']>1) & (temp['produced']>1)].copy()
# temp

In [None]:
# Load the model predictions, attach node annotations, and find formulas whose
# nodes received more than one observed classification.
import pandas as pd
usecols = ['original_index', 't', 'p', 'fc', 'tost',
       'mean_treatment', 'mean_control', 'treatment_vals', 'control_vals',
       'project', 'priming', 'soil_type', 'classification',
       'summary_classification', 'node_order_index', 'deconvoluted_spectrum',
       'all_features_prediction', 'formula_based_prediction']
prediction_df = pd.read_csv('training_data_with_predictions_full.csv', usecols=usecols)

# Only rows with an observed classification are evaluated.
prediction_df = prediction_df[prediction_df['classification'].notna()]
# NOTE(review): this mask is never applied — idx1 is overwritten below before
# any use — so all projects are included. Confirm whether a filter was intended.
idx1 = prediction_df['project'] == 'Agricultural Soil'


prediction_df = pd.merge(prediction_df, node_data, on='original_index', how='left')

# Rows = formulas, columns = classification labels, values = node counts.
g = prediction_df.groupby('predicted_formula')['classification'].value_counts().unstack(fill_value=0)
print("There are {} formulas that are prefered, produced or ignored".format(len(g)))
idx1 = (g>0).sum(axis=1)>1
# Formulas whose nodes span more than one classification.
complex_ones = g[idx1].index.tolist()
print('There are {} formulas that are prefered and produced or ignored'.format(g[idx1].shape[0]))
# temp = temp[(temp['prefered']>1) & (temp['produced']>1)].copy()
# Collapse to one row per formula; each cell holds a tuple with one entry per node.
g = prediction_df.groupby('predicted_formula')[['classification','formula_based_prediction','all_features_prediction']].agg(lambda x: tuple(x)).reset_index()
def replace_words_with_numbers(x):
    """Encode classification labels as a binary tuple.

    'prefered' and 'produced' both map to 0 and 'ignored' maps to 1. Any other
    value is skipped, so the result can be shorter than the input.

    Parameters
    ----------
    x : iterable
        Classification labels.

    Returns
    -------
    tuple of int
        Encoded labels in input order.
    """
    label_codes = {'prefered': 0, 'produced': 0, 'ignored': 1}
    return tuple(label_codes[label] for label in x if label in label_codes)
    
# Restrict to formulas whose nodes disagree on the binary label (ignored vs.
# not ignored) and count how often each prediction column matches the label.
g['classification'] = g['classification'].apply(replace_words_with_numbers)
g = g[g['predicted_formula'].isin(complex_ones)].copy()
# g = g[['predicted_formula','classification','formula_based_prediction','all_features_prediction']]
# idx = g['formula_based_prediction'] != g['all_features_prediction']
# A formula "disagrees" when its encoded labels contain both 0 and 1.
g['disagreement'] = g['classification'].apply(lambda x: len(set(x))>1)
idx = g['disagreement'] == True
g = g[idx].copy()

# melt and pivot classification, formula_based_prediction, all_features_prediction
# melted = g.melt(id_vars=['predicted_formula'], value_vars=['classification','formula_based_prediction','all_features_prediction'], var_name='type', value_name='values')
cols = ['classification','formula_based_prediction','all_features_prediction']
# Back to one row per node; the three tuple columns must have equal lengths per row.
melted = g.explode(cols)
# melted =pd.pivot_table(melted,index=['predicted_formula'], columns=['type'], values='values')
# melted.reset_index(inplace=True)
correct_formula = melted['classification'] == melted['formula_based_prediction']
correct_all = melted['classification'] == melted['all_features_prediction']
incorrect_formula = melted['classification'] != melted['formula_based_prediction']
incorrect_all = melted['classification'] != melted['all_features_prediction']
# NOTE(review): the value printed as 'TP' is the count of ALL matching rows and
# 'FP' the count of ALL mismatching rows; TN and FN are hard-coded to 0. These
# are agreement counts rather than a confusion matrix — confirm the labels.
print('Formula based prediction')
print('TP: {}, FP: {}, TN: {}, FN: {}'.format(correct_formula.sum(), incorrect_formula.sum(), 0, 0))
print('All features prediction')
print('TP: {}, FP: {}, TN: {}, FN: {}'.format(correct_all.sum(), incorrect_all.sum(), 0, 0))

In [None]:
# Convert the vendor .raw files in the run directory with ProteoWizard msconvert.
# The leading '!' makes this a shell command (a bare shell line is a Python
# SyntaxError in a code cell). '-v' must be followed directly by its
# host:container mapping, so '-it' is placed before it.
!podman-hpc run --rm -it -v /pscratch/sd/b/bpb/20230127_JGI_ER_508059_POM_final_IDX_C18_USDAY63675:/data docker.io/proteowizard/pwiz-skyline-i-agree-to-the-vendor-licenses wine msconvert /data/*.raw

In [None]:


# Volcano plots (log2 fold change vs. -log10 p) in a grid: one row per soil
# type, one column per priming level, points coloured by classification.
row_terms = final_df['soil_type'].unique()
num_rows = len(row_terms)
col_terms = ['Unprimed','Low','High']
num_cols = len(col_terms)

classification_colors = {
    'produced': '#1f77b4',  # Muted blue
    'prefered': '#d62728',  # Muted red
    'ignored': '#7f7f7f'    # Gray
}

# squeeze=False always returns a 2-D axes array, so ax[i,j] also works when
# there is only a single soil type (one row).
fig,ax = plt.subplots(num_rows,num_cols, figsize=(4*num_cols,4*num_rows), sharex=True, sharey=True, squeeze=False)
for i,row in enumerate(row_terms):
    for j,col in enumerate(col_terms):
        ax_ij = ax[i,j]
        idx = (final_df['soil_type']==row) & (final_df['priming']==col)
        plot_df = final_df[idx].copy()
        # idx = plot_df['p']<0.005
        # plot_df.loc[idx,'p'] = 0.005 + np.random.rand(sum(idx))*0.001
        # idx = plot_df['fc']>5
        # plot_df.loc[idx,'fc'] = 5 + np.random.rand(sum(idx))
        # idx = plot_df['fc']<-5
        # plot_df.loc[idx,'fc'] = -5 - np.random.rand(sum(idx))
        sns.scatterplot(data=plot_df, x='fc', y=-np.log10(plot_df['p']),
                        hue='classification',
                        palette=classification_colors,
                        ax=ax_ij, alpha=0.7,legend=False)

        # Reference lines: p = 0.05 and the +/-0.5 log2 fold-change cut-offs
        # used for the classification.
        ax_ij.axhline(-np.log10(0.05), color='red', linestyle='--')
        ax_ij.axvline(0.5, color='blue', linestyle='--')
        ax_ij.axvline(-0.5, color='blue', linestyle='--')
        ax_ij.set_title(f'{row} {col}')
        # Axis labels only on the outer edge of the grid.
        if i == num_rows-1:
            ax_ij.set_xlabel('Log2 Fold Change (Day 7 / Day 0)')
        else:
            ax_ij.set_xlabel('')
        if j == 0:
            ax_ij.set_ylabel('-Log10(p-value)')
        else:
            ax_ij.set_ylabel('')


# Per-panel legends are disabled above; build one shared legend instead.
handles = [mpatches.Patch(color=color, label=label) for label, color in classification_colors.items()]

# Place the legend outside the plot area
fig.legend(handles=handles, loc='center left', bbox_to_anchor=(1, 0.5), title='Classification')

# Adjust layout to make room for the legend
plt.tight_layout(rect=[0, 0, 0.9, 1])

In [None]:
# Display the scratch frame `temp`; several cells reassign this name, so what
# is shown depends on which of them ran last.
temp

In [None]:
# Export the classified t-test table as training data for the stability model.
# NOTE(review): 'treatment_vals' / 'control_vals' hold arrays, which CSV stores
# as their string representation — confirm the consumer parses them.
final_df.to_csv('training data for stability model.csv', index=False)

In [None]:
# One panel per soil type: stacked bars showing, for each classification, the
# fraction of its annotated nodes that fall in each compound class.
soil_types = final_df['soil_type'].unique()
# squeeze=False + ravel gives an indexable 1-D axes array even with one soil type.
fig,all_axes = plt.subplots(figsize=(20,6),nrows=1,ncols=len(soil_types), sharey=True, squeeze=False)
all_axes = all_axes.ravel()
class_term = 'NPC#class'
for counter, soil_type in enumerate(soil_types):
    ax = all_axes[counter]
    # make a stacked bar chart of the compound classes for each classification

    idx1 = final_df['classification'].notna()
    idx2 = final_df['soil_type'] == soil_type
    idx = idx1 & idx2
    # classification label -> array of node indices with that label
    classified_indices = final_df[idx].groupby('classification')['original_index'].unique().to_dict()
    out = []
    for key, value in classified_indices.items():
        idx = node_data['original_index'].isin(value)
        # Name the index and the counts explicitly: the column names produced
        # by value_counts().reset_index() differ between pandas versions, and
        # this also follows `class_term` instead of a hard-coded column name.
        temp = (node_data.loc[idx,class_term]
                .value_counts()
                .rename_axis('class')
                .reset_index(name='count'))
        temp['classification'] = key
        temp['total'] = temp['count'].sum()
        temp['fraction'] = temp['count']/temp['total']
        out.append(temp)
    if not out:
        # No classified nodes for this soil type; pd.concat([]) would raise.
        ax.set_title(f'Distribution of Compound Classes by Classification\n({soil_type})')
        continue
    class_df = pd.concat(out, ignore_index=True)
    class_df = class_df.pivot(index='classification', columns='class', values='fraction')
    class_df = class_df.fillna(0)
    # Keep compound classes whose fractions, summed over classifications, exceed 0.05.
    cols = class_df.columns[class_df.sum(axis=0)>0.05]
    class_df = class_df[cols]
    # sum each row to one
    class_df = class_df.div(class_df.sum(axis=1), axis=0)

    class_df.plot(kind='bar', stacked=True, ax=ax, color=sns.color_palette("tab20", n_colors=len(class_df.columns)))
    ax.set_ylabel('Fraction of Annotations')
    # The bars are indexed by classification; compound classes are the stacked
    # colours shown in the legend.
    ax.set_xlabel('Classification')
    ax.set_title(f'Distribution of Compound Classes by Classification\n({soil_type})')
    # plt.xticks(rotation=45, ha='right')
    # move legend outside of plot
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
