In [1]:
import pandas as pd
import numpy as np
import networkx as nx

import os
import shutil
# Set up paths
# Location of the envnet checkout that the envnet imports below are resolved from.
PYTHONPATH = "/global/homes/b/bpb/repos/envnet"
# NOTE: this overwrites any PYTHONPATH value already present in the environment.
os.environ['PYTHONPATH'] = PYTHONPATH

# Import ENVnet
import sys
# Insert the checkout at the front of sys.path; this must run before the
# envnet imports that follow.
sys.path.insert(0, PYTHONPATH)
from envnet.build import quick_envnet
from envnet.config.build_config import BuildConfig
from envnet.build import ENVnetBuilder
from envnet.build.mgf_tools import MGFGenerator
# BuildConfig is constructed with no arguments; the resulting generator is the
# module-level object that make_mgf_helper calls create_mgf_files on.
config = BuildConfig()
mgf_generator = MGFGenerator(config)




In [2]:
# Recursively collect every '*_deconvoluted.parquet' file.
# Linux `find` is used because it is much faster than glob on this filesystem.
import subprocess

# Arguments are passed as a list (no shell involved), so the pattern needs no
# quoting and is handed to find literally.
cmd = [
    "find",
    "/global/cfs/cdirs/metatlas/projects/envnet_build_files/",
    "-name",
    "*_deconvoluted.parquet",
]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
    # find can exit non-zero while still listing the readable part of the tree,
    # so report the problem instead of aborting or silently ignoring it.
    print("find exited with status %d:\n%s" % (result.returncode, result.stderr))

# Sort so the position of each file is reproducible between runs: the row index
# built from this list is what names the per-file MGF outputs later on.
files = sorted(result.stdout.splitlines())
len(files)

1401

In [3]:
# One row per parquet file; the former positional index is kept as an explicit
# 'index' column, and the table is saved as CSV without the new row index.
file_df = pd.DataFrame(files, columns=['parquet_file']).reset_index(drop=False)
file_df.to_csv(
    '/global/cfs/cdirs/metatlas/projects/carbon_network/mgf_files_for_falcon.csv',
    index=False,
)

In [4]:
# Example of the MGF record layout (TITLE / CLUSTER headers) that the conversion below produces:
# BEGIN IONS
# TITLE=mzspec:USI000000:61_Sammy:scan:Id=503572
# PEPMASS=105.066146850586
# RTINSECONDS=8.392733333333
# CHARGE=1+
# CLUSTER=0
# 50.00249 0.0
# 50.00372 16.0


In [9]:
# Directory handed to create_mgf_files as its output_dir; its group ownership is
# changed recursively once the conversion loop has finished.
output_dir = '/global/cfs/cdirs/metatlas/projects/carbon_network/mgf_files_for_falcon'

def modify_file_for_falcon(filename):
    """Rewrite the MGF file at ``filename`` in place, renaming two header keys.

    Every ``FEATURE_ID=`` is replaced by ``TITLE=<filename>:`` and every
    ``ORIGINAL_ID=`` by ``CLUSTER=``. The whole file is read into memory and
    then written back to the same path.
    """
    with open(filename, 'r') as source:
        contents = source.read()

    # Applied in this order, one after the other.
    substitutions = (
        ('FEATURE_ID=', 'TITLE=%s:' % filename),
        ('ORIGINAL_ID=', 'CLUSTER='),
    )
    for old_key, new_key in substitutions:
        contents = contents.replace(old_key, new_key)

    with open(filename, 'w') as target:
        target.write(contents)

def make_mgf_helper(temp_df):
    """Write MGF files for one table of spectra via the shared ``mgf_generator``.

    A graph without edges is built whose nodes are the values of the
    ``original_index`` column of ``temp_df``; the table and the graph are then
    passed to ``mgf_generator.create_mgf_files`` together with the module-level
    ``output_dir``.

    Returns whatever ``create_mgf_files`` returns.
    """
    node_ids = temp_df['original_index'].to_list()
    graph = nx.Graph()
    graph.add_nodes_from(node_ids)
    return mgf_generator.create_mgf_files(
        node_data=temp_df,
        network=graph,
        output_dir=str(output_dir),
    )

# Convert every parquet file listed in file_df into its own falcon-ready MGF file.
#
# make_mgf_helper writes its deconvoluted output to the same fixed file name on
# every call, so after each conversion that file is moved to 'file_<index>.mgf'
# and its headers are rewritten. Files that already exist are skipped, which
# makes the cell safe to re-run after an interruption.

# NOTE(review): `df` is never filled in this cell; kept in case other cells use it.
df = []
my_cols = ['precursor_mz','rt','original_spectrum_mz_vals','original_spectrum_intensity_vals','deconvoluted_spectrum_mz_vals','deconvoluted_spectrum_intensity_vals']

# Loop-invariant: the fixed name the deconvoluted spectra are written to.
old_filename = os.path.join(str(output_dir), 'envnet_deconvoluted_spectra.mgf')

# Stays None when nothing had to be converted in this run (everything skipped).
created_files = None

for i, row in file_df.iterrows():
    parquet_file = row['parquet_file']
    new_filename = os.path.join(str(output_dir), 'file_%d.mgf' % row['index'])
    if os.path.exists(new_filename):
        # Already converted by an earlier run.
        continue
    temp_df = pd.read_parquet(parquet_file, columns=my_cols)
    # Expose the parquet row index as the 'original_index' column that
    # make_mgf_helper reads.
    temp_df.index.name = 'original_index'
    temp_df = temp_df.reset_index()
    created_files = make_mgf_helper(temp_df)
    # Move the freshly written file to its per-input name, then fix its headers.
    shutil.move(old_filename, new_filename)
    modify_file_for_falcon(new_filename)

# change the group ownership recursively
cmd = ["chgrp", "-R", 'metatlas', str(output_dir)]
result = subprocess.run(cmd, check=True, capture_output=True, text=True)

# Remove the leftover 'original' spectra file of the last conversion. Guarded so
# that a run in which every file was skipped (or the file is already gone) does
# not fail with NameError / FileNotFoundError.
if created_files is not None:
    file_to_delete = created_files['original']
    if os.path.exists(file_to_delete):
        os.remove(file_to_delete)

Created MGF file: /global/cfs/cdirs/metatlas/projects/carbon_network/mgf_files_for_falcon/envnet_deconvoluted_spectra.mgf with 1079 spectra
Created MGF file: /global/cfs/cdirs/metatlas/projects/carbon_network/mgf_files_for_falcon/envnet_original_spectra.mgf with 1079 spectra
Created MGF file: /global/cfs/cdirs/metatlas/projects/carbon_network/mgf_files_for_falcon/envnet_deconvoluted_spectra.mgf with 1736 spectra
Created MGF file: /global/cfs/cdirs/metatlas/projects/carbon_network/mgf_files_for_falcon/envnet_original_spectra.mgf with 1736 spectra
Created MGF file: /global/cfs/cdirs/metatlas/projects/carbon_network/mgf_files_for_falcon/envnet_deconvoluted_spectra.mgf with 1434 spectra
Created MGF file: /global/cfs/cdirs/metatlas/projects/carbon_network/mgf_files_for_falcon/envnet_original_spectra.mgf with 1434 spectra
Created MGF file: /global/cfs/cdirs/metatlas/projects/carbon_network/mgf_files_for_falcon/envnet_deconvoluted_spectra.mgf with 1495 spectra
Created MGF file: /global/cfs/cd

In [8]:
# Ad-hoc inspection of the parquet path held in `row`.
# NOTE(review): `row` is only assigned by the conversion loop, so this cell
# appears to depend on leftover kernel state — confirm it is still needed.
row['parquet_file']

'/global/cfs/cdirs/metatlas/projects/envnet_build_files/metatlas/20230403_EB_BGS_107002-011_BIODESERT_Metagenome_EXP120A_C18-EP_USDAY72349_NEG_MS2_116_KEN-Wamiti-Kalama-1PLH-2GZM_1__288_deconvoluted.parquet'

In [30]:


# change the folder to be read/write by metatlas group
