In [1]:
import pandas as pd
import numpy as np
import glob as glob
import os

import multiprocessing

data_dirs = ['/pscratch/sd/b/bpb/massive',
             '/global/cfs/cdirs/metatlas/projects/carbon_network/raw_data']
              # '/global/cfs/cdirs/metatlas/projects/rawdata_for_scn']
            #   '/global/cfs/cdirs/metatlas/projects/massive_data_for_scn']



In [2]:

def get_files(main_dir):
    mzml_files = glob.glob(main_dir + '/**/*.mzML', recursive=True)
    mzxml_files = glob.glob(main_dir + '/**/*.mzXML', recursive=True)
    hdf5_files = glob.glob(main_dir + '/**/*.h5', recursive=True)
    buddy_mdm_files = glob.glob(main_dir + '/**/*.parquet', recursive=True)
    buddy_failed_mdm_files = glob.glob(main_dir + '/**/*.parquet-failed', recursive=True)

    df_mzml = pd.DataFrame({'mzml': mzml_files})
    df_mzml['no_extension'] = df_mzml['mzml'].apply(lambda x: x.replace('.mzML', ''))
    df_mzml.set_index('no_extension', inplace=True)
    df_mzxml = pd.DataFrame({'mzxml': mzxml_files})
    df_mzxml['no_extension'] = df_mzxml['mzxml'].apply(lambda x: x.replace('.mzXML', ''))
    df_mzxml.set_index('no_extension', inplace=True)
    df_h5 = pd.DataFrame({'h5': hdf5_files})
    df_h5['no_extension'] = df_h5['h5'].apply(lambda x: x.replace('.h5', ''))
    df_h5.set_index('no_extension', inplace=True)
    df_buddy = pd.DataFrame({'buddy': buddy_mdm_files})
    df_buddy['no_extension'] = df_buddy['buddy'].apply(lambda x: x.replace('.parquet', ''))
    df_buddy.set_index('no_extension', inplace=True)
    df_buddy_failed = pd.DataFrame({'buddy_failed': buddy_failed_mdm_files})
    df_buddy_failed['no_extension'] = df_buddy_failed['buddy_failed'].apply(lambda x: x.replace('.parquet-failed', ''))
    df_buddy_failed.set_index('no_extension', inplace=True)

    df = df_mzml.join(df_mzxml, how='outer').join(df_h5, how='outer').join(df_buddy, how='outer').join(df_buddy_failed, how='outer')
    df['data_dir'] = main_dir
    return df

def get_files_from_dirs(data_dirs):
    out = []
    for data_dir in data_dirs:
        out.append(get_files(data_dir))
        print(data_dir,len(out))
    if len(data_dirs)>1:
        return pd.concat(out)
    else:
        return out[0]

df = get_files_from_dirs(data_dirs)
df = df[pd.notna(df['buddy'])]
cols = ['h5','buddy','data_dir']
df = df[cols]

pattern = r'(?<=/)(MSV.*?)(?=/)'
a = df.copy().index.str.extract(pattern)
df['massive_id'] = a[0].tolist()
df['h5_basename'] = df['h5'].apply(lambda x: os.path.basename(x))
df['no_extension_basename'] = df['h5_basename'].apply(lambda x: x.replace('.h5',''))
print(df.shape)
df = df[pd.notna(df['buddy'])]
df.reset_index(inplace=True,drop=False)
print(df.shape)

/pscratch/sd/b/bpb/massive 1
/global/cfs/cdirs/metatlas/projects/carbon_network/raw_data 2
(33229, 6)
(33229, 7)


In [3]:
df = df[~df['buddy'].str.contains('qc',case=False)]
df = df[~df['buddy'].str.contains('blank',case=False)]
df.shape

(26534, 7)

In [4]:

# hits = pd.read_csv('../data/nl_spectra_gnps_fastsearch.csv.gz')
# hits['filename'] = hits['USI'].apply(lambda x: x.split(':')[-3])
# print(hits.shape[0])
# idx = df['data_dir']=='/global/cfs/cdirs/metatlas/projects/massive_data_for_scn'
# hits = hits[hits['filename'].isin(df.loc[idx,'no_extension_basename'])]
# print(hits.shape[0])
# hits = hits.groupby(['Dataset', 'filename'])['original_index'].nunique().reset_index()
# hits.sort_values('original_index', ascending=False, inplace=True)
# hits = hits[hits['original_index'] > 1]
# hits.drop_duplicates(subset='filename', keep='first', inplace=True)
# hits.rename(columns={'original_index':'num_hits_fasst'},inplace=True)
# hits.reset_index(drop=True,inplace=True)
# hits

In [5]:
# idx = df['data_dir']=='/global/cfs/cdirs/metatlas/projects/massive_data_for_scn'
# df.loc[idx,'massive_id'] = df.loc[idx,'no_extension_basename'].map(hits.set_index('filename')['Dataset'])


In [6]:
df['redu_filename'] = df['h5'].str.extract(r'(/MSV.*)')
df['redu_filename'] = df['redu_filename'].str.replace(r'\.h5$', '', regex=True)
df['redu_filename'] = df['redu_filename'].str.replace(r'^/', 'f.', regex=True)

df.loc[0,'redu_filename']

'f.MSV000079542/ccms_peak/Peak_List_Files/OMICS_IM102_691_1d_Lipid_2_005_NEG_150mm_10May15_Polaroid_14-12-16'

In [7]:
usecols = ['title','dataset','description','keywords','instrument']
metadata = pd.read_csv('/global/cfs/cdirs/metatlas/projects/carbon_network/massive_metadata_2024.tsv', sep='\t',usecols=usecols)
metadata = metadata[metadata['dataset'].str.contains('MSV')]
metadata = metadata[metadata['dataset'].isin(df['massive_id'])]
df = pd.merge(df,metadata,left_on='massive_id',right_on='dataset',how='left')

df['keywords'] = df['keywords'].apply(lambda x: x.split('###') if type(x)==str else [])
df['keyword_DOM'] = df['keywords'].apply(lambda x: True if (('dom' in x) | ('organic matter' in x) | ('soil' in x)) else False)
# metadata = metadata[metadata['proteins']==0]
df.shape

(26534, 14)

In [8]:
redu = pd.read_csv('/global/cfs/cdirs/metatlas/projects/carbon_network/all_sampleinformation.tsv', sep='\t')
redu = redu[redu['ATTRIBUTE_DatasetAccession'].str.contains('MSV')]
redu['filename'] = redu['filename'].str.replace(r'\.mz(ML|XML)$', '', regex=True)
# redu.loc[0,'filename']
redu = redu[redu['filename'].isin(df['redu_filename'])]

cols = list(set(list(redu.columns)) - set(['InternalStandardsUsed','SampleTypeSub1','UniqueSubjectID','HumanPopulationDensity',
                                           'UBERONOntologyIndex','TermsofPosition','HealthStatus','ComorbidityListDOIDIndex','AgeInYears',
                                           'SampleCollectionDateandTime','DOIDOntologyIndex','DOIDCommonName','SubjectIdentifierAsRecorded',
                                           'LifeStage','BiologicalSex','UBERONBodyPartName','LatitudeandLongitude','Country','DepthorAltitudeMeters']))
redu = redu[cols]

df = pd.merge(df,redu,left_on='redu_filename',right_on='filename',how='left')
df['in_redu_plant'] = df['SampleType']=='plant'

df.shape

  redu = pd.read_csv('/global/cfs/cdirs/metatlas/projects/carbon_network/all_sampleinformation.tsv', sep='\t')


(26534, 33)

In [9]:
dom_samples = pd.read_csv('/global/homes/b/bpb/repos/scndb/data/dom_public_datasets.csv',usecols=['dataset'])
dom_samples = pd.Series(dom_samples['dataset'].unique())
# bad_massive = ['MSV000092338','MSV000093271','MSV000092599','MSV000093514','MSV000092622','MSV000092604','MSV000092520']
# dom_samples = dom_samples[~dom_samples['massive_id'].isin(bad_massive)]

# MSV000088823: GNPS DOM LC-MS/MS Interlab Comparison 2020 COMPILED Dataset
# dom_samples = pd.concat([dom_samples,pd.Series(['MSV000088823'])],ignore_index=True)  # these are ones that came up in addition to Thomas's list
df['in_massive_dom_list'] = df['massive_id'].isin(dom_samples)
df.shape

(26534, 34)

In [10]:


def count_rows_and_unique_formulas(row):
    # try:
    row = row[-1]
    parquet_file = row['buddy']
    try:
        df_parquet = pd.read_parquet(parquet_file)
        num_rows = len(df_parquet)
        num_unique_formulas = df_parquet['predicted_formula'].nunique()
        return num_rows, num_unique_formulas
    except:
        print('Error',parquet_file)
        return None, None

def parallel_count_rows_and_unique_formulas(df):
    pool = multiprocessing.Pool(20)
    results = pool.map(count_rows_and_unique_formulas, df.iterrows())
    pool.close()
    pool.join()
    return results

# results = df.head(10).apply(count_rows_and_unique_formulas,axis=1)
results = parallel_count_rows_and_unique_formulas(df)
df[['num_unique_spectra', 'num_unique_formula']] = pd.DataFrame(results, columns=['num_unique_spectra', 'num_unique_formulas'])
df.shape

(26534, 36)

In [11]:
# d = '/global/cfs/cdirs/metatlas/projects/rawdata_for_scn'
idx = df['buddy'].str.contains('rawdata_for_scn')
print(sum(idx))
df.loc[idx,'massive_id'] = df.loc[idx,'no_extension_basename'].apply(lambda x: '_'.join(x.split('_')[4:6]))
df_grouped = df.groupby('massive_id').size().reset_index(name='row_count')
df = pd.merge(df, df_grouped, on='massive_id', how='left')
df.shape

274


(26534, 37)

In [12]:
import pandas as pd

# Set the display width for pandas columns
pd.set_option('display.max_colwidth', None)

# Your code here
# ...

# Reset the display width to the default value (optional)
# pd.reset_option('display.max_colwidth')


In [13]:
sum(pd.isna(df['buddy']))

0

In [14]:
import hashlib
import pandas as pd
import multiprocessing

def hash_dataframe_row(filename):
    cols = ['precursor_mz', 'rt', 'coisolated_precursor_count', 'predicted_formula', 'estimated_fdr']
    try:
        t = pd.read_parquet(filename)
        hash_value = int(hashlib.sha256(pd.util.hash_pandas_object(t[cols], index=True).values).hexdigest(), 16)
        return hash_value
    except:
        print('Error',filename)
        return None

def parallel_hash_dataframe(files):
    with multiprocessing.Pool(20) as pool:
        results = pool.map(hash_dataframe_row, files)
    return results

hash_values = parallel_hash_dataframe(df['buddy'].tolist())
df['hash_value'] = hash_values


In [15]:
# print(df.shape)
# df.drop_duplicates(subset=['massive_id','hash_value'],keep='first',inplace=True)
# print(df.shape)

In [16]:
df.sort_values(['in_massive_dom_list','in_redu_plant','keyword_DOM','row_count'],ascending=False,inplace=True) # True is greater than False

df.drop_duplicates(subset=['hash_value'],keep='first',inplace=True)
print(df.shape)

# df

(13247, 38)


In [17]:
df.columns

Index(['no_extension', 'h5', 'buddy', 'data_dir', 'massive_id', 'h5_basename',
       'no_extension_basename', 'redu_filename', 'title', 'dataset',
       'description', 'instrument', 'keywords', 'keyword_DOM',
       'YearOfAnalysis', 'USI', 'NCBIRank', 'ChromatographyAndPhase',
       'NCBIDivision', 'ENVOEnvironmentMaterial',
       'ENVOEnvironmentMaterialIndex', 'ENVOEnvironmentBiomeIndex',
       'SampleType', 'ENVOEnvironmentBiome', 'SampleExtractionMethod',
       'ATTRIBUTE_DatasetAccession', 'SampleCollectionMethod',
       'MassSpectrometer', 'NCBITaxonomy', 'filename',
       'IonizationSourceAndPolarity', 'DataSource', 'in_redu_plant',
       'in_massive_dom_list', 'num_unique_spectra', 'num_unique_formula',
       'row_count', 'hash_value'],
      dtype='object')

In [None]:
# cols = ['hash_value']
# g = df.groupby(cols)['no_extension'].count().sort_values(ascending=False)
# g = g.to_frame()
# g.columns = ['duplicate_count']
# g.reset_index(drop=False,inplace=True)
# g = g[g['duplicate_count']>1]
# cols2 = ['massive_id','row_count','in_massive_dom_list','data_dir'] + cols
# g = pd.merge(df[cols2],g,on=cols,how='inner')
# g.sort_values('duplicate_count',ascending=False,inplace=True)
# print(g.shape)
# g.drop_duplicates(subset=['massive_id','hash_value'],keep='first',inplace=True)
# print(g.shape)
# g.head(20)

In [None]:


# cols = ['massive_id','h5_basename','num_unique_spectra', 'num_unique_formula']
# df.sort_values('num_unique_spectra',ascending=False,inplace=True)
# df.drop_duplicates(subset=cols,keep='first',inplace=True)
# df.reset_index(drop=True,inplace=True)
# df

In [18]:
import re
from rdkit.Chem import rdchem
from multiprocessing import Pool



def calculate_mass(formula):
    # Regular expression to match elements and their counts
    pattern = r'([A-Z][a-z]*)(\d*)'    
    mass = 0
    pt = rdchem.GetPeriodicTable()

    for el, count in re.findall(pattern, formula):
        # If count is an empty string, it means there's only one atom of this element
        count = int(count) if count else 1
        mass += pt.GetMostCommonIsotopeMass(el) * count
    return mass

def process_row(filename):
    temp = pd.read_parquet(filename)
    if temp.shape[0]>0:
        temp['predicted_mass'] = temp['predicted_formula'].apply(lambda x: calculate_mass(x))
        temp['predicted_mass'] = temp['predicted_mass'] - 1.007276
        temp['mass_error'] = temp['precursor_mz'] - temp['predicted_mass']
        count_good = temp[temp['mass_error'].abs() < 0.001].shape[0] #0.002 was the parameter used in the original code
        fraction_good = count_good / temp.shape[0]
    else:
        count_good = 0
        fraction_good = 0
    return {'buddy':filename,'fraction_within_half_tolerance':fraction_good,'total_formula':temp.shape[0],'good_formula':count_good}

with Pool(20) as pool:
    out = pool.map(process_row, df['buddy'].tolist())
out = pd.DataFrame(out)
df = pd.merge(df,out,on='buddy',how='left')


df.shape

(13247, 41)

In [19]:
df = df[df['total_formula']>20]
df = df[df['fraction_within_half_tolerance']>0.5]


In [20]:

df.shape

(8634, 41)

In [None]:
df['SampleType'].value_counts()

In [21]:
df[df['in_massive_dom_list']]

Unnamed: 0,no_extension,h5,buddy,data_dir,massive_id,h5_basename,no_extension_basename,redu_filename,title,dataset,...,DataSource,in_redu_plant,in_massive_dom_list,num_unique_spectra,num_unique_formula,row_count,hash_value,fraction_within_half_tolerance,total_formula,good_formula
0,/pscratch/sd/b/bpb/massive/v01/MSV000088543/ccms_peak/raw_data/20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Bot-T2-1-E216_IR002_352,/pscratch/sd/b/bpb/massive/v01/MSV000088543/ccms_peak/raw_data/20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Bot-T2-1-E216_IR002_352.h5,/pscratch/sd/b/bpb/massive/v01/MSV000088543/ccms_peak/raw_data/20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Bot-T2-1-E216_IR002_352.parquet,/pscratch/sd/b/bpb/massive,MSV000088543,20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Bot-T2-1-E216_IR002_352.h5,20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Bot-T2-1-E216_IR002_352,f.MSV000088543/ccms_peak/raw_data/20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Bot-T2-1-E216_IR002_352,Exometabolomics of Switchgrass rhizosphere,MSV000088543,...,,False,True,34,30,456,45430261681085743627944794508087828225761024817821271817688870204326127138120,1.000000,34,34
1,/pscratch/sd/b/bpb/massive/v01/MSV000088543/ccms_peak/raw_data/20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Bot-T3-1-E226_IR002_021,/pscratch/sd/b/bpb/massive/v01/MSV000088543/ccms_peak/raw_data/20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Bot-T3-1-E226_IR002_021.h5,/pscratch/sd/b/bpb/massive/v01/MSV000088543/ccms_peak/raw_data/20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Bot-T3-1-E226_IR002_021.parquet,/pscratch/sd/b/bpb/massive,MSV000088543,20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Bot-T3-1-E226_IR002_021.h5,20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Bot-T3-1-E226_IR002_021,f.MSV000088543/ccms_peak/raw_data/20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Bot-T3-1-E226_IR002_021,Exometabolomics of Switchgrass rhizosphere,MSV000088543,...,,False,True,32,29,456,11466997312021510781671838716572440673198672003821569169593285051705087203843,1.000000,32,32
2,/pscratch/sd/b/bpb/massive/v01/MSV000088543/ccms_peak/raw_data/20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Mid-T2-1-E159_IR002_030,/pscratch/sd/b/bpb/massive/v01/MSV000088543/ccms_peak/raw_data/20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Mid-T2-1-E159_IR002_030.h5,/pscratch/sd/b/bpb/massive/v01/MSV000088543/ccms_peak/raw_data/20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Mid-T2-1-E159_IR002_030.parquet,/pscratch/sd/b/bpb/massive,MSV000088543,20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Mid-T2-1-E159_IR002_030.h5,20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Mid-T2-1-E159_IR002_030,f.MSV000088543/ccms_peak/raw_data/20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Mid-T2-1-E159_IR002_030,Exometabolomics of Switchgrass rhizosphere,MSV000088543,...,,False,True,36,34,456,60502225826992405275824700640197620211318721337838281046483288341483812443647,1.000000,36,36
3,/pscratch/sd/b/bpb/massive/v01/MSV000088543/ccms_peak/raw_data/20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Mid-T3-1-E096_IR002_063,/pscratch/sd/b/bpb/massive/v01/MSV000088543/ccms_peak/raw_data/20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Mid-T3-1-E096_IR002_063.h5,/pscratch/sd/b/bpb/massive/v01/MSV000088543/ccms_peak/raw_data/20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Mid-T3-1-E096_IR002_063.parquet,/pscratch/sd/b/bpb/massive,MSV000088543,20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Mid-T3-1-E096_IR002_063.h5,20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Mid-T3-1-E096_IR002_063,f.MSV000088543/ccms_peak/raw_data/20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Mid-T3-1-E096_IR002_063,Exometabolomics of Switchgrass rhizosphere,MSV000088543,...,,False,True,43,40,456,37023138541985243015317401734624756585962291374287723754385517802264610591228,1.000000,43,43
4,/pscratch/sd/b/bpb/massive/v01/MSV000088543/ccms_peak/raw_data/20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Mid-T3-2-E148_IR002_260,/pscratch/sd/b/bpb/massive/v01/MSV000088543/ccms_peak/raw_data/20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Mid-T3-2-E148_IR002_260.h5,/pscratch/sd/b/bpb/massive/v01/MSV000088543/ccms_peak/raw_data/20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Mid-T3-2-E148_IR002_260.parquet,/pscratch/sd/b/bpb/massive,MSV000088543,20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Mid-T3-2-E148_IR002_260.h5,20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Mid-T3-2-E148_IR002_260,f.MSV000088543/ccms_peak/raw_data/20181019_JJ_KZ_Switchgrass_Greenhouse_Rhizo1_QE119_Ag68377-924_USHXG01162_NEG_MSMS-v2_Rhizo-12C-C-Mid-T3-2-E148_IR002_260,Exometabolomics of Switchgrass rhizosphere,MSV000088543,...,,False,True,41,38,456,32045406411099848028751851487113919624590309777555431682307114313708195524278,1.000000,41,41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2138,/pscratch/sd/b/bpb/massive/v05/MSV000090529/peak/mzml/Rose-DOM_neg_1,/pscratch/sd/b/bpb/massive/v05/MSV000090529/peak/mzml/Rose-DOM_neg_1.h5,/pscratch/sd/b/bpb/massive/v05/MSV000090529/peak/mzml/Rose-DOM_neg_1.parquet,/pscratch/sd/b/bpb/massive,MSV000090529,Rose-DOM_neg_1.h5,Rose-DOM_neg_1,f.MSV000090529/peak/mzml/Rose-DOM_neg_1,GNPS Non-targted LC-MS/MS from Roseobacter DOM for Bioactivity Screens,MSV000090529,...,,False,True,96,93,16,43369017745121175503979957132052360431808146987875251941117119818633007186620,1.000000,96,96
2139,/pscratch/sd/b/bpb/massive/v05/MSV000090529/peak/mzml/Rose-DOM_neg_2,/pscratch/sd/b/bpb/massive/v05/MSV000090529/peak/mzml/Rose-DOM_neg_2.h5,/pscratch/sd/b/bpb/massive/v05/MSV000090529/peak/mzml/Rose-DOM_neg_2.parquet,/pscratch/sd/b/bpb/massive,MSV000090529,Rose-DOM_neg_2.h5,Rose-DOM_neg_2,f.MSV000090529/peak/mzml/Rose-DOM_neg_2,GNPS Non-targted LC-MS/MS from Roseobacter DOM for Bioactivity Screens,MSV000090529,...,,False,True,90,87,16,54053923677227204806183398256130420739812272345691476749704228536018757459138,1.000000,90,90
2140,/pscratch/sd/b/bpb/massive/v01/MSV000090282/ccms_peak/TRM_0522_neg_LCMS_DDA_rep1,/pscratch/sd/b/bpb/massive/v01/MSV000090282/ccms_peak/TRM_0522_neg_LCMS_DDA_rep1.h5,/pscratch/sd/b/bpb/massive/v01/MSV000090282/ccms_peak/TRM_0522_neg_LCMS_DDA_rep1.parquet,/pscratch/sd/b/bpb/massive,MSV000090282,TRM_0522_neg_LCMS_DDA_rep1.h5,TRM_0522_neg_LCMS_DDA_rep1,f.MSV000090282/ccms_peak/TRM_0522_neg_LCMS_DDA_rep1,Tjarno Reference Material Dissolved Organic Matter LCMS/MS mass spectra,MSV000090282,...,,False,True,382,324,9,65773427999086734511727561083316518110177481716385103860392989549538949062690,0.997382,382,381
2141,/pscratch/sd/b/bpb/massive/v01/MSV000090282/ccms_peak/TRM_0522_neg_LCMS_DDA_rep2,/pscratch/sd/b/bpb/massive/v01/MSV000090282/ccms_peak/TRM_0522_neg_LCMS_DDA_rep2.h5,/pscratch/sd/b/bpb/massive/v01/MSV000090282/ccms_peak/TRM_0522_neg_LCMS_DDA_rep2.parquet,/pscratch/sd/b/bpb/massive,MSV000090282,TRM_0522_neg_LCMS_DDA_rep2.h5,TRM_0522_neg_LCMS_DDA_rep2,f.MSV000090282/ccms_peak/TRM_0522_neg_LCMS_DDA_rep2,Tjarno Reference Material Dissolved Organic Matter LCMS/MS mass spectra,MSV000090282,...,,False,True,390,325,9,69381679568758820390208905113342296557140517521804029306377622979317887102112,1.000000,390,390


In [22]:
df.to_csv('/global/cfs/cdirs/metatlas/projects/carbon_network/public_and_internal_files_with_massive_and_redu.tsv', sep='\t',index=False)

In [None]:
# f = df.loc[1700,'buddy']
# # t = pd.read_parquet(f)
# process_row(f)
# # t

In [None]:
len(df.loc[df['keyword_DOM']==True,'hash_value'].unique())

In [None]:
len(df.loc[df['in_massive_dom_list']==True,'hash_value'].unique())

In [None]:
len(df.loc[(df['in_massive_dom_list']) | df['keyword_DOM'],'hash_value'].unique()),len(df.loc[df['SampleType']=='plant','hash_value'].unique())

In [None]:
df.groupby('hash_value')['buddy'].count().sort_values(ascending=False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
fig,ax = plt.subplots()

sns.kdeplot(data=df, x='num_unique_spectra', ax=ax, color='grey', label='All')
sns.kdeplot(data=df[df['data_dir']=='/global/cfs/cdirs/metatlas/projects/rawdata_for_scn'], x='num_unique_spectra', ax=ax, color='blue', label='Berkeley Lab',log_scale=False)
sns.kdeplot(data=df[df['data_dir']=='/global/cfs/cdirs/metatlas/projects/massive_data_for_scn'], x='num_unique_spectra', ax=ax, color='purple', label='From FASST search',log_scale=False)
sns.kdeplot(data=df[~df['in_massive_dom_list']], x='num_unique_spectra', ax=ax, color='orange', label='Not in_massive_dom_list',log_scale=False)
sns.kdeplot(data=df[df['in_massive_dom_list']], x='num_unique_spectra', ax=ax, color='green', label='In in_massive_dom_list',log_scale=False)
ax.legend()
plt.show()

# sns.histplot(data=df, x='num_unique_spectra', ax=ax, log_scale=True, kde=True, alpha=0.6)
# sns.histplot(data=df[~df['in_massive_dom_list']], x='num_unique_spectra', ax=ax, label='not in_massive_dom_list', log_scale=True, kde=False, alpha=0.6)
# sns.histplot(data=df[df['in_massive_dom_list']], x='num_unique_spectra', ax=ax, label='in_massive_dom_list', log_scale=True, kde=True, alpha=0.6)
# ax.legend()
# plt.show()





# sns.scatterplot(data=df,x='num_unique_spectra',y='num_unique_formulas',hue='in_massive_dom_list',ax=ax)
# ax.set_xscale('log')
# ax.set_yscale('log')
# plt.show()

In [None]:
fig,ax = plt.subplots()

sns.kdeplot(data=df, x='fraction_within_half_tolerance', ax=ax, color='grey', label='All')
sns.kdeplot(data=df[df['data_dir']=='/global/cfs/cdirs/metatlas/projects/rawdata_for_scn'], x='fraction_within_half_tolerance', ax=ax, color='blue', label='Berkeley Lab',log_scale=False)
sns.kdeplot(data=df[df['data_dir']=='/global/cfs/cdirs/metatlas/projects/massive_data_for_scn'], x='fraction_within_half_tolerance', ax=ax, color='purple', label='From FASST search',log_scale=False)
sns.kdeplot(data=df[~df['in_massive_dom_list']], x='fraction_within_half_tolerance', ax=ax, color='orange', label='Not in_massive_dom_list',log_scale=False)
sns.kdeplot(data=df[df['in_massive_dom_list']], x='fraction_within_half_tolerance', ax=ax, color='green', label='In in_massive_dom_list',log_scale=False)
ax.legend()
plt.show()

In [None]:
df.groupby(['massive_id','title'])['num_unique_formula'].sum().sort_values(ascending=False).head(40)

In [None]:
df['keyword_DOM'] = df['keywords'].str.contains('DOM',case=False,na=False)

In [None]:
fig,ax = plt.subplots()
sns.scatterplot(data=df,x='num_unique_spectra',y='num_unique_formula',hue='keyword_DOM',ax=ax)


In [None]:
fig,ax = plt.subplots()
sns.scatterplot(data=df,x='num_unique_spectra',y='num_unique_formula',hue='in_massive_dom_list',ax=ax)


In [None]:
len(df.loc[df['SampleType']=='plant','NCBITaxonomy'].unique())

In [None]:

unique_values = df['SampleType'].unique()

fig, ax = plt.subplots()
for value in unique_values:
    temp = df[df['SampleType'] == value].copy()
    if temp.shape[0]>100:
        sns.kdeplot(data=temp, x='num_unique_formula', ax=ax, label=value)

ax.legend(title='SampleType', bbox_to_anchor=(1.05, 1), loc='upper left')
# ax.set_yscale('log')
plt.show()
