In [2]:
import os
import subprocess
import pandas as pd

In [3]:
# Recursively list every *.h5 file under the envnet build tree.
# The command is given as an argument list (no shell=True), so the '*.h5' pattern
# reaches `find` verbatim and nothing in the path is interpreted by a shell.
h5_root = "/global/cfs/cdirs/metatlas/projects/envnet_build_files"
cmd = ["find", h5_root, "-name", "*.h5"]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
    # A non-zero exit (e.g. missing root, unreadable sub-directory) means the listing
    # may be empty or incomplete; show find's stderr instead of continuing silently.
    print(f"WARNING: find exited with status {result.returncode}:\n{result.stderr}")
files = result.stdout.splitlines()
len(files)

1886

In [4]:
# Select the .h5 files that do not yet have a "<stem>_deconvoluted.parquet" sibling.
# NOTE(review): the parquet name is assumed to be the .h5 path with its extension
# swapped; confirm against what envnet.deconvolution.workflows actually writes.
h5_suffix = ".h5"
parquet_suffix = "_deconvoluted.parquet"

files_to_convert = []
for f in files:
    if not f.endswith(h5_suffix):
        # Not an .h5 path; there is nothing to convert.
        continue
    # Swap only the trailing extension. str.replace(".h5", ...) would also rewrite a
    # ".h5" appearing earlier in the path (e.g. in a directory name) and so test the
    # wrong parquet location.
    parquet_filename = f[:-len(h5_suffix)] + parquet_suffix

    if not os.path.exists(parquet_filename):
        files_to_convert.append(f)
len(files_to_convert)

485

In [5]:
# Directory that receives the generated sbatch scripts and the Slurm .out/.err logs.
launch_dir = '/pscratch/sd/b/bpb/metatlas_mdm_parquet_files'
# Base name for the jobs; each chunk appends its own "_chunk_<i>" suffix.
job_basename = 'DOM_Interlab-LCMS_Lab018_M_Neg_MS2_rep2'

# Header shared by every generated script. {job_name} and {log_dir} are filled in per
# chunk so that each job has its own name and its own log files; with one fixed name,
# jobs running at the same time would all write to the same .out/.err paths.
sbatch_header_template = """#!/bin/bash
#SBATCH --job-name="{job_name}"
#SBATCH --output="{log_dir}/{job_name}.out"
#SBATCH --error="{log_dir}/{job_name}.err"
#SBATCH --time=08:00:00
#SBATCH --qos=regular
#SBATCH --ntasks-per-node=128
#SBATCH --licenses=cfs
#SBATCH --exclusive
#SBATCH --constraint=cpu
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --account=m2650

export PYTHONPATH=/global/homes/b/bpb/repos/envnet

"""

# Kept for backward compatibility: renders to the original, un-suffixed header text.
sbatch_header = sbatch_header_template.format(job_name=job_basename, log_dir=launch_dir)

# One conversion command per pending .h5 file; the path is double-quoted for the shell.
python_binary = '/global/common/software/m2650/msbuddy/bin/python'
python_command = '-m envnet.deconvolution.workflows --do_buddy'
commands = ['%s %s "%s"' % (python_binary, python_command, f) for f in files_to_convert]

# Split the commands into chunks of at most chunk_size; one sbatch script per chunk.
# NOTE(review): the commands in a script are plain lines, so bash runs them one after
# another even though the header reserves an exclusive node with 128 tasks - confirm
# that the workflow parallelises internally or that serial execution is intended.
chunk_size = 200
command_chunks = [commands[i:i + chunk_size] for i in range(0, len(commands), chunk_size)]

# Make sure the target directory exists before writing scripts into it.
os.makedirs(launch_dir, exist_ok=True)

sbatch_files = []
for i, chunk in enumerate(command_chunks):
    job_name = f"{job_basename}_chunk_{i}"
    sbatch_file = os.path.join(launch_dir, f"sbatch_chunk_{i}.sbatch")
    with open(sbatch_file, 'w') as f:
        f.write(sbatch_header_template.format(job_name=job_name, log_dir=launch_dir))
        # Terminate the last command with a newline so the script is a well-formed text file.
        f.write("\n".join(chunk) + "\n")
    sbatch_files.append(sbatch_file)

# Write a helper shell script that submits every generated sbatch file, one
# `sbatch <path>` line per script, in the order the scripts were created.
submit_script_path = os.path.join(launch_dir, "submit_all_sbatch.sh")
submit_lines = ["#!/bin/bash\n"]
submit_lines.extend(f"sbatch {script_path}\n" for script_path in sbatch_files)
with open(submit_script_path, 'w') as submit_fh:
    submit_fh.writelines(submit_lines)

In [24]:
# Two deconvoluted parquet tables with the same basename: one under
# envnet_build_files (new) and one under carbon_network/raw_data (old).
new_file = '/global/cfs/cdirs/metatlas/projects/envnet_build_files/metatlas/20230403_EB_BGS_107002-011_BIODESERT_Metagenome_EXP120A_C18-EP_USDAY72349_NEG_MS2_84_HUN-Valk-Nagyivan-1PLH-2GZM_1__623_deconvoluted.parquet'
old_file = '/global/cfs/cdirs/metatlas/projects/carbon_network/raw_data/metatlas/20230403_EB_BGS_107002-011_BIODESERT_Metagenome_EXP120A_C18-EP_USDAY72349_NEG_MS2_84_HUN-Valk-Nagyivan-1PLH-2GZM_1__623_deconvoluted.parquet'

# Load both tables (new first, then old) and display their (rows, columns) shapes.
new_df, old_df = (pd.read_parquet(parquet_path) for parquet_path in (new_file, old_file))
tuple(table.shape for table in (new_df, old_df))

((1698, 19), (1500, 19))

In [25]:
# Show the ten rows of the new table with the largest coisolated_precursor_count values.
by_coisolation = new_df.sort_values(by='coisolated_precursor_count', ascending=False)
by_coisolation.head(10)

Unnamed: 0,temp_index,rt,count,precursor_mz,sum_frag_intensity,max_frag_intensity,obs,isolated_precursor_mz,filename,basename,coisolated_precursor_count,coisolated_precursor_mz_list,assumed_adduct,predicted_formula,estimated_fdr,deconvoluted_spectrum_mz_vals,deconvoluted_spectrum_intensity_vals,original_spectrum_mz_vals,original_spectrum_intensity_vals
1043,1043,3.858191,4,343.214015,44900.896729,28929.775391,"[C2H4O, CO2, 2x H2O, H2O]",343.212097,/global/cfs/cdirs/metatlas/projects/envnet_bui...,20230403_EB_BGS_107002-011_BIODESERT_Metagenom...,5,"[343.0795742879019, 343.1034858493162, 343.140...",[M-H]-,C19H28N4O2,0.0,"[299.18884, 299.2265, 307.19165, 325.20358]","[3514.345, 2299.0498, 10157.727, 28929.775]","[215.12862, 223.09627, 223.13383, 225.15053, 2...","[2426.6694, 2161.1072, 2669.3396, 2391.8945, 1..."
1112,1112,3.976118,4,301.128843,28275.557129,10119.420898,"[2x CO2 + CO, 2x CO2 + H2O, 2x CO2, CO2 + CO]",301.201813,/global/cfs/cdirs/metatlas/projects/envnet_bui...,20230403_EB_BGS_107002-011_BIODESERT_Metagenom...,5,"[301.0318475693559, 301.05811512533637, 301.10...",[M-H]-,C14H22O7,0.0,"[185.15382, 195.13893, 213.14984, 229.14413]","[10119.421, 7883.9907, 2633.1753, 7638.97]","[221.15338, 223.13022, 226.028, 227.03514, 237...","[1866.7537, 2054.268, 3815.6853, 6462.6787, 17..."
1040,1040,3.858191,6,343.103486,51416.438232,22295.535156,"[3x CO2 + H2O + CO, 3x CO2 + CH4, 2x CO2 + H2O...",343.212097,/global/cfs/cdirs/metatlas/projects/envnet_bui...,20230403_EB_BGS_107002-011_BIODESERT_Metagenom...,5,"[343.0795742879019, 343.1034858493162, 343.140...",[M-H]-,C15H20O9,5.082965e-09,"[165.12871, 195.10341, 209.11829, 211.13391, 2...","[2199.623, 5480.181, 16091.89, 22295.535, 2161...","[215.12862, 223.09627, 223.13383, 225.15053, 2...","[2426.6694, 2161.1072, 2669.3396, 2391.8945, 1..."
1041,1041,3.858191,4,343.14007,13666.897949,5211.439941,"[3x CO2 + CH4, 2x CO2 + CH4O, 2x CO2 + H2O, 2x...",343.212097,/global/cfs/cdirs/metatlas/projects/envnet_bui...,20230403_EB_BGS_107002-011_BIODESERT_Metagenom...,5,"[343.0795742879019, 343.1034858493162, 343.140...",[M-H]-,C16H24O8,0.01488036,"[195.13953, 223.13383, 237.15013, 255.16031]","[1987.2334, 2669.3396, 3798.885, 5211.44]","[215.12862, 223.09627, 223.13383, 225.15053, 2...","[2426.6694, 2161.1072, 2669.3396, 2391.8945, 1..."
1042,1042,3.858191,4,343.176265,14713.61499,4917.296387,"[CO2 + CH4O, C4H4O, CO2 + H2O, H2O]",343.212097,/global/cfs/cdirs/metatlas/projects/envnet_bui...,20230403_EB_BGS_107002-011_BIODESERT_Metagenom...,5,"[343.0795742879019, 343.1034858493162, 343.140...",[M-H]-,C17H28O7,0.0,"[267.16125, 275.15033, 281.17548, 325.16373]","[4917.2964, 4561.711, 2448.8008, 2785.807]","[215.12862, 223.09627, 223.13383, 225.15053, 2...","[2426.6694, 2161.1072, 2669.3396, 2391.8945, 1..."
585,585,2.92554,3,225.112802,29017.872681,23874.125,"[C4H4O, CO2, C2H4]",225.074081,/global/cfs/cdirs/metatlas/projects/envnet_bui...,20230403_EB_BGS_107002-011_BIODESERT_Metagenom...,4,"[225.00378470652512, 225.03707331127316, 225.0...",[M-H]-,C12H18O4,1.130873e-07,"[157.0868, 181.12378, 197.0795]","[23874.125, 1779.4579, 3364.2898]","[167.03519, 165.01912, 157.0868, 153.08978, 13...","[1904.4008, 7457.445, 23874.125, 9689.003, 196..."
1188,1188,4.120216,3,341.087702,30332.703857,24066.441406,"[4x CO2 + CH4, 3x CO2 + H2O + CO, 3x CO2]",341.19632,/global/cfs/cdirs/metatlas/projects/envnet_bui...,20230403_EB_BGS_107002-011_BIODESERT_Metagenom...,4,"[341.08770186237416, 341.13064540632854, 341.1...",[M-H]-,C15H18O9,6.828871e-05,"[149.0973, 163.1126, 209.1182]","[4079.4778, 2186.7847, 24066.441]","[223.13388, 243.17622, 235.16718, 235.13438, 2...","[3384.5376, 2436.0469, 1547.353, 3462.9326, 10..."
1190,1190,4.120216,6,341.160102,14431.778809,3314.373291,"[2x CO2 + H2O, 2x CO2, CO2 + CH4O, CO2 + H2O, ...",341.19632,/global/cfs/cdirs/metatlas/projects/envnet_bui...,20230403_EB_BGS_107002-011_BIODESERT_Metagenom...,4,"[341.08770186237416, 341.13064540632854, 341.1...",[M-H]-,C17H26O7,0.0,"[235.16718, 253.17906, 265.14468, 279.16022, 2...","[1547.353, 2397.539, 2809.9324, 3273.9338, 331...","[223.13388, 243.17622, 235.16718, 235.13438, 2...","[3384.5376, 2436.0469, 1547.353, 3462.9326, 10..."
1584,1584,4.918899,5,327.217933,180084.198242,98826.671875,"[CO2 + H2O, C2H4O, CO2, 2x H2O, H2O]",327.217346,/global/cfs/cdirs/metatlas/projects/envnet_bui...,20230403_EB_BGS_107002-011_BIODESERT_Metagenom...,4,"[327.0723861988474, 327.112456780347, 327.1805...",[M-H]-,C18H32O5,0.0,"[265.21875, 283.1937, 283.2287, 291.1964, 309....","[8281.501, 5626.5293, 9580.316, 57769.18, 9882...","[247.20634, 243.1603, 239.20078, 231.462, 227....","[31215.697, 3395.8184, 3732.539, 2811.4077, 17..."
1191,1191,4.120216,4,341.197237,78454.046875,40226.183594,"[CO2 + H2O, CO2, 2x H2O, H2O]",341.19632,/global/cfs/cdirs/metatlas/projects/envnet_bui...,20230403_EB_BGS_107002-011_BIODESERT_Metagenom...,4,"[341.08770186237416, 341.13064540632854, 341.1...",[M-H]-,C18H30O6,5.951043e-09,"[279.19742, 297.2084, 305.1759, 323.18665]","[4285.2266, 5504.6797, 28437.957, 40226.184]","[223.13388, 243.17622, 235.16718, 235.13438, 2...","[3384.5376, 2436.0469, 1547.353, 3462.9326, 10..."
