In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from pathlib import Path
import os

# Set up paths
PYTHONPATH = "/global/homes/b/bpb/repos/envnet"
os.environ['PYTHONPATH'] = PYTHONPATH

# Import ENVnet
import sys
sys.path.insert(0, PYTHONPATH)
from envnet.build import quick_envnet
from envnet.config.build_config import BuildConfig

In [2]:


parquet_file = '/global/cfs/cdirs/metatlas/projects/carbon_network/raw_data/metatlas/20221110_EB_MdR_101544-059_HumicAcid_20221110_EXP120A_C18-EP_USDAY63672_NEG_MS2_11_HumicAcid-KOH-nonfiltr-NA-NA_1__078.parquet'
print(f"Loading: {parquet_file}")
df = pd.read_parquet(parquet_file)

print(f"Total spectra in file: {len(df)}")
print(f"Columns: {list(df.columns)}")
print(f"Precursor m/z range: {df['precursor_mz'].min():.3f} - {df['precursor_mz'].max():.3f}")
df.head()


Loading: /global/cfs/cdirs/metatlas/projects/carbon_network/raw_data/metatlas/20221110_EB_MdR_101544-059_HumicAcid_20221110_EXP120A_C18-EP_USDAY63672_NEG_MS2_11_HumicAcid-KOH-nonfiltr-NA-NA_1__078.parquet
Total spectra in file: 216
Columns: ['cluster', 'precursor_mz', 'isolated_precursor_mz', 'rt', 'filename', 'coisolated_precursor_count', 'mdm_mz_vals', 'mdm_i_vals', 'original_mz_vals', 'original_i_vals', 'predicted_formula', 'estimated_fdr']
Precursor m/z range: 85.027 - 591.220


Unnamed: 0,cluster,precursor_mz,isolated_precursor_mz,rt,filename,coisolated_precursor_count,mdm_mz_vals,mdm_i_vals,original_mz_vals,original_i_vals,predicted_formula,estimated_fdr
0,23,85.026703,84.990555,9.573406,/global/cfs/cdirs/metatlas/projects/carbon_net...,2,"[41.03777313232422, 43.017662048339844, 67.013...","[1164.2669677734375, 1092.9249267578125, 1201....","[40.38612365722656, 59.822776794433594, 67.013...","[1617.525390625, 1389.0322265625, 1201.3072509...",H2N6,0.0
1,80,89.024302,88.98793,11.400856,/global/cfs/cdirs/metatlas/projects/carbon_net...,1,"[59.013275146484375, 59.01355743408203, 59.013...","[1932.5098876953125, 1487.2332763671875, 1387....","[89.02423095703125, 88.98790740966797, 60.9967...","[13320.5166015625, 14817.9658203125, 1782.0925...",C3H6O3,0.0
2,167,99.118536,98.948906,9.342418,/global/cfs/cdirs/metatlas/projects/carbon_net...,1,"[43.06032180786133, 55.0954475402832, 70.11597...","[1421.1280517578125, 1410.1922607421875, 1434....","[85.5282211303711, 108.79478454589844, 100.908...","[1273.975341796875, 1299.6181640625, 1417.0789...",C7H16,0.0
3,203,100.003901,99.925575,1.503489,/global/cfs/cdirs/metatlas/projects/carbon_net...,1,"[72.99296569824219, 72.99298095703125, 72.9929...","[27504.388671875, 6628.23974609375, 30990.8222...","[99.92579650878906, 72.99298095703125]","[7064.74853515625, 6628.23974609375]",C3H3NO3,0.0
4,264,100.982393,100.933502,5.935612,/global/cfs/cdirs/metatlas/projects/carbon_net...,1,"[68.9560546875, 68.95609283447266, 68.95610809...","[1187.2337646484375, 1511.3192138671875, 1677....","[115.92060089111328, 100.94574737548828, 100.9...","[4883.25146484375, 6714.56640625, 48528.078125...",C2H2N2OS,0.0


In [3]:

# 2. Select strategic spectra for testing
# Goal: Pick some that should cluster together and some that shouldn't

# Strategy 1: Pick spectra with similar m/z (should connect if similar fragmentation)
similar_mz_group = df[
    (df['precursor_mz'] >= 200) & (df['precursor_mz'] <= 210)
].head(5)

# Strategy 2: Pick spectra with very different m/z (should NOT connect)  
different_mz_1 = df[
    (df['precursor_mz'] >= 300) & (df['precursor_mz'] <= 310)
].head(3)

different_mz_2 = df[
    (df['precursor_mz'] >= 500) & (df['precursor_mz'] <= 510)
].head(3)

# Strategy 3: Pick a few random ones
random_spectra = df.sample(n=min(5, len(df)), random_state=42)

# Combine all test spectra
test_spectra = pd.concat([
    similar_mz_group,
    different_mz_1, 
    different_mz_2,
    random_spectra
]).head(15).reset_index(drop=True)  # Limit to 15 spectra



print(f"\nSelected {len(test_spectra)} test spectra:")
print("Similar m/z group (should potentially connect):")
for _, row in similar_mz_group.iterrows():
    print(f"  m/z: {row['precursor_mz']:.3f}")

print("Different m/z groups (should be separate):")
for _, row in different_mz_1.iterrows():
    print(f"  m/z: {row['precursor_mz']:.3f}")
for _, row in different_mz_2.iterrows():
    print(f"  m/z: {row['precursor_mz']:.3f}")

# 3. Save test spectra to temp file joining PYTHONPATH, results, temp_data
output_dir = Path(os.path.join(PYTHONPATH, "results", "temp_data"))
output_dir.mkdir(parents=True, exist_ok=True)
test_file = os.path.join(output_dir, "test_spectra.parquet")
test_spectra.to_parquet(test_file)
print(f"\nSaved test spectra to: {test_file}")

test_spectra



Selected 12 test spectra:
Similar m/z group (should potentially connect):
  m/z: 201.117
  m/z: 201.960
  m/z: 202.081
  m/z: 202.153
  m/z: 202.947
Different m/z groups (should be separate):
  m/z: 306.937
  m/z: 309.002

Saved test spectra to: /global/homes/b/bpb/repos/envnet/results/temp_data/test_spectra.parquet


Unnamed: 0,cluster,precursor_mz,isolated_precursor_mz,rt,filename,coisolated_precursor_count,mdm_mz_vals,mdm_i_vals,original_mz_vals,original_i_vals,predicted_formula,estimated_fdr
0,1683,201.116954,200.858536,8.519252,/global/cfs/cdirs/metatlas/projects/carbon_net...,1,"[67.17402648925781, 139.1127471923828, 145.078...","[1234.3328857421875, 14225.45703125, 1647.7250...","[163.31222534179688, 200.8587188720703, 200.83...","[1438.153564453125, 42576.80859375, 813.881835...",C8H18N4S,0.0
1,1721,201.960495,202.078537,9.746067,/global/cfs/cdirs/metatlas/projects/carbon_net...,1,"[47.934383392333984, 129.97532653808594, 129.9...","[1271.36572265625, 4805.03564453125, 4088.7429...","[156.77142333984375, 137.8072509765625, 129.97...","[1248.0281982421875, 1214.9593505859375, 4852....",C2H9N3S4,3.404217e-05
2,1735,202.081433,202.078583,2.806789,/global/cfs/cdirs/metatlas/projects/carbon_net...,1,"[54.080326080322266, 129.06344604492188, 129.0...","[1301.15625, 1367.1175537109375, 1205.07482910...","[49.05620574951172, 177.79165649414062, 129.06...","[1270.9776611328125, 1445.50048828125, 1367.11...",C3H9N9O2,0.0
3,1742,202.152565,202.078522,1.991081,/global/cfs/cdirs/metatlas/projects/carbon_net...,3,"[52.161277770996094, 68.17720794677734, 138.17...","[1405.5565185546875, 1425.250244140625, 1342.3...","[140.45738220214844, 140.18478393554688, 139.8...","[940.978271484375, 1021.2156372070312, 981.183...",C5H17N9,0.0
4,1771,202.947296,202.941147,1.49826,/global/cfs/cdirs/metatlas/projects/carbon_net...,1,"[94.99288940429688, 94.992919921875, 170.98425...","[2478.9287109375, 2206.72119140625, 1385.92089...","[204.6928253173828, 202.94125366210938, 170.98...","[1424.439208984375, 7048.18115234375, 1385.920...",C6H4O4S2,0.0
5,2241,306.937244,306.937225,10.952811,/global/cfs/cdirs/metatlas/projects/carbon_net...,1,"[278.9419250488281, 278.9420166015625, 278.942...","[15835.638671875, 17821.1953125, 20716.6210937...","[216.9422607421875, 278.9420166015625, 260.931...","[3681.262451171875, 17821.1953125, 18542.76953...",C12H4O6S2,0.0
6,2246,309.002365,309.173706,6.457131,/global/cfs/cdirs/metatlas/projects/carbon_net...,1,"[221.06137084960938, 265.05126953125, 281.8729...","[1521.8828125, 2882.3662109375, 1438.140014648...","[148.13140869140625, 309.1742248535156, 303.18...","[1269.4288330078125, 26421.236328125, 1260.921...",C10H15O5PS2,2.818273e-10
7,2259,315.035755,315.014313,5.320086,/global/cfs/cdirs/metatlas/projects/carbon_net...,1,"[147.04513549804688, 147.0452880859375, 272.02...","[6271.2763671875, 6572.3876953125, 1498.171142...","[316.06292724609375, 316.017578125, 272.029296...","[2289.124755859375, 31083.83984375, 1498.17114...",C12H12O10,0.0008284325
8,2408,456.993955,456.955139,9.873414,/global/cfs/cdirs/metatlas/projects/carbon_net...,1,"[388.9674987792969, 388.9678039550781, 388.967...","[3147.425537109375, 2915.389892578125, 2694.82...","[185.0063934326172, 320.98291015625, 253.94769...","[13497.6181640625, 8209.02734375, 1486.8059082...",C14H18O11S3,0.0002549107
9,1833,208.025908,207.930801,1.373662,/global/cfs/cdirs/metatlas/projects/carbon_net...,1,"[120.04499816894531, 144.0605926513672, 146.03...","[1435.3018798828125, 1374.4659423828125, 1382....","[117.93590545654297, 179.935791015625, 163.986...","[7818.48779296875, 6327.7099609375, 3691.12329...",C10H11NS2,0.03470453


In [4]:


# 4. Create a simple test data source that points to our temp file
# We need to mock the Google Sheets data loading to use our temp file
class TestDataLoader:
    def load_file_metadata(self, file_source):
        # Return a simple DataFrame pointing to our test file
        return pd.DataFrame({
            'parquet': [str(test_file)],
            'basename': ['test_file']
        })

# 5. Run ENVnet build
print("\n=== Running ENVnet Build ===")

# Create config
config = BuildConfig()
config.min_score = 0.6  # Lower threshold for testing
config.remblink_cutoff = 0.7
config.network_max_mz_difference = 50.0


try:
    # Run quick build
    builder = quick_envnet(
        max_spectra=20,
        max_files=1, 
        output_dir=output_dir,
        config=config,
        verbose=True
    )
    
    # Access the built network
    network = builder.get_network()
    node_data = builder.get_node_data()
    
    print(f"\n=== Network Results ===")
    print(f"Nodes: {network.number_of_nodes()}")
    print(f"Edges: {network.number_of_edges()}")
    print(f"Connected components: {nx.number_connected_components(network)}")
    
    # 6. Analyze and visualize the network
    print(f"\n=== Network Analysis ===")
    
    # Get connected components
    components = list(nx.connected_components(network))
    print(f"Component sizes: {[len(c) for c in components]}")
    
    # Print details about each component
    for i, component in enumerate(components):
        print(f"\nComponent {i+1} ({len(component)} nodes):")
        for node in component:
            node_info = network.nodes[node]
            print(f"  Node {node}: m/z={node_info.get('precursor_mz', 'N/A'):.3f}")
    
    # 7. Visualize the network
    plt.figure(figsize=(12, 8))
    
    # Create layout
    if network.number_of_nodes() > 0:
        pos = nx.spring_layout(network, k=2, iterations=50)
        
        # Draw nodes colored by precursor m/z
        node_colors = [network.nodes[node].get('precursor_mz', 0) for node in network.nodes()]
        
        nx.draw_networkx_nodes(network, pos, 
                              node_color=node_colors, 
                              node_size=300,
                              cmap='viridis',
                              alpha=0.8)
        
        nx.draw_networkx_edges(network, pos, alpha=0.5, width=2)
        
        # Add labels with m/z values
        labels = {node: f"{node}\nm/z:{network.nodes[node].get('precursor_mz', 0):.1f}" 
                 for node in network.nodes()}
        nx.draw_networkx_labels(network, pos, labels, font_size=8)
        
        plt.colorbar(plt.cm.ScalarMappable(cmap='viridis'), 
                    label='Precursor m/z')
        plt.title(f"ENVnet Test Network\n{network.number_of_nodes()} nodes, {network.number_of_edges()} edges")
        
    else:
        plt.text(0.5, 0.5, "No network generated", 
                horizontalalignment='center', verticalalignment='center')
        plt.title("ENVnet Test - No Network")
    
    plt.axis('off')
    plt.tight_layout()
    plt.savefig('/tmp/envnet_test_network.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    # 8. Validation checks
    print(f"\n=== Validation ===")
    
    # Check if similar m/z spectra are connected (they should be if they have similar fragmentation)
    similar_nodes = [node for node in network.nodes() 
                    if 200 <= network.nodes[node].get('precursor_mz', 0) <= 210]
    
    if len(similar_nodes) > 1:
        similar_connected = any(network.has_edge(similar_nodes[0], other) 
                              for other in similar_nodes[1:])
        print(f"Similar m/z spectra connected: {similar_connected}")
    
    # Check that very different m/z spectra are NOT connected
    very_different = []
    for node in network.nodes():
        mz = network.nodes[node].get('precursor_mz', 0)
        if 300 <= mz <= 310 or 500 <= mz <= 510:
            very_different.append((node, mz))
    
    if len(very_different) > 1:
        different_connected = any(network.has_edge(very_different[0][0], other[0]) 
                                for other in very_different[1:])
        print(f"Very different m/z spectra connected: {different_connected} (should be False)")
    
    print(f"\n✅ ENVnet build completed successfully!")
    print(f"Network saved to: {output_dir}")
    print(f"Visualization saved to: /tmp/envnet_test_network.png")

except Exception as e:
    print(f"\n❌ Error in ENVnet build: {e}")
    import traceback
    traceback.print_exc()


=== Running ENVnet Build ===
ENVnet Builder initialized
Starting quick build with max 20 spectra...
Starting quick network build (max 20 spectra, 1 files)...
Configuration saved to: /global/homes/b/bpb/repos/envnet/results/temp_data/build _config.json
Subsampled to 20 spectra
Processing deconvoluted 20230223_EB_MdR_101544-059_SynDAC_20230223_QE144_C18-EP_USDAY72350_NEG_MS2_29_RS-HA-NA_1__18.h5 - found 5 matches
Chunking spectra into groups based on precursor m/z...
Found 16 unique precursor m/z groups
Counter: 0, Processing 3 entries for m/z range 249.0042 to 249.0042, Found 1 unique spectra
Counter: 1, Processing 2 entries for m/z range 237.0042 to 237.0042, Found 2 unique spectra
Counter: 2, Processing 2 entries for m/z range 295.0825 to 295.0825, Found 2 unique spectra
Counter: 3, Processing 1 entries for m/z range 115.0399 to 115.0399, Found 1 unique spectra
Counter: 4, Processing 1 entries for m/z range 169.0509 to 169.0509, Found 1 unique spectra
Counter: 5, Processing 1 entries

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
Traceback (most recent call last):
  File "/tmp/ipykernel_20282/205632641.py", line 32, in <module>
    network = builder.get_network()
AttributeError: 'ENVnetBuilder' object has no attribute 'get_network'


In [5]:
builder

ENVnetBuilder(Network: 0 nodes)