In [None]:
import pandas as pd
import numpy as np
import glob as glob
import os

import multiprocessing


In [None]:

# Root directories handed to get_files_from_dirs() further down in this cell.
data_dirs = [
    '/pscratch/sd/b/bpb/massive',
    '/global/cfs/cdirs/metatlas/projects/rawdata_for_scn',
    '/global/cfs/cdirs/metatlas/projects/massive_data_for_scn',
]


def _strip_suffix(path, suffix):
    """Return ``path`` without its trailing ``suffix`` (unchanged if it does not end with it)."""
    return path[:-len(suffix)] if path.endswith(suffix) else path


def _indexed_listing(main_dir, suffix, column):
    """Recursively list the files below ``main_dir`` whose name ends in ``suffix``.

    Returns a one-column DataFrame (column name ``column``) holding the full
    paths, indexed by the same path with the trailing ``suffix`` removed
    (index name ``no_extension``).
    """
    paths = glob.glob(main_dir + '/**/*' + suffix, recursive=True)
    # Only the trailing extension is removed: str.replace() would also delete
    # the same text inside a directory name and corrupt the join key.
    stems = [_strip_suffix(path, suffix) for path in paths]
    return pd.DataFrame(
        {column: paths},
        index=pd.Index(stems, dtype=object, name='no_extension'),
    )


def get_files(main_dir):
    """Inventory the data files found below ``main_dir``.

    Five file kinds are searched recursively and outer-joined on their path
    without extension, so each row describes one sample stem:

    * ``mzml``          -- ``*.mzML``
    * ``mzxml``         -- ``*.mzXML``
    * ``h5``            -- ``*.h5``
    * ``buddy``         -- ``*.parquet``
    * ``buddy_failed``  -- ``*.parquet-failed``

    :param main_dir: directory to scan
    :return: DataFrame indexed by ``no_extension`` with the five path columns
        (NaN where a stem has no file of that kind) plus ``data_dir``, which
        repeats ``main_dir`` on every row
    """
    kinds = [
        ('.mzML', 'mzml'),
        ('.mzXML', 'mzxml'),
        ('.h5', 'h5'),
        ('.parquet', 'buddy'),
        ('.parquet-failed', 'buddy_failed'),
    ]
    listings = [_indexed_listing(main_dir, suffix, column) for suffix, column in kinds]

    df = listings[0]
    for other in listings[1:]:
        df = df.join(other, how='outer')
    df['data_dir'] = main_dir
    return df

def get_files_from_dirs(data_dirs):
    """Run :func:`get_files` on every directory and stack the results.

    :param data_dirs: list of directories to scan
    :return: the single inventory when one directory is given, the row-wise
        concatenation when several are given, and an empty frame with the
        inventory columns when ``data_dirs`` is empty (previously this case
        raised ``IndexError``)
    """
    frames = [get_files(data_dir) for data_dir in data_dirs]
    if not frames:
        # Nothing to scan: hand back an empty inventory of the usual layout so
        # that column selections made by callers keep working.
        return pd.DataFrame(
            columns=['mzml', 'mzxml', 'h5', 'buddy', 'buddy_failed', 'data_dir'],
            index=pd.Index([], dtype=object, name='no_extension'),
        )
    if len(frames) == 1:
        return frames[0]
    return pd.concat(frames)

# Inventory of the files already present in the data directories.
done_df = get_files_from_dirs(data_dirs)
cols = ['h5','buddy','data_dir']
# Take an explicit copy: a new column is assigned below, and writing to a
# column slice of another frame triggers pandas' SettingWithCopyWarning.
done_df = done_df[cols].copy()

# Capture the first path component that starts with "MSV" and is enclosed by
# slashes (lazy match up to the next "/").
pattern = r'(?<=/)(MSV.*?)(?=/)'
a = done_df.index.str.extract(pattern)
# `a` has its own 0..n-1 index, so assign by position via a plain list.
done_df['massive_id'] = a[0].tolist()

# Turn the 'no_extension' index back into a regular column.
done_df = done_df.reset_index(drop=False)
print(done_df.shape)

In [None]:
import ftplib
import os
import re

"""
MIT license: 2017 - Jwely

Example usage:
``` python
import ftplib
ftp = ftplib.FTP(mysite, username, password)
download_ftp_tree(ftp, remote_dir, local_dir)
```

The code above will look for a directory called "remote_dir" on the ftp host, and then duplicate the
directory and its entire contents into the "local_dir".

*** Note that if wget is an option, I recommend using that instead ***

"""


def _is_ftp_dir(ftp_handle, name, guess_by_extension=True):
    """ simply determines if an item listed on the ftp server is a valid directory or not """

    # if the name has a "." in the fourth to last position, its probably a file extension
    # this is MUCH faster than trying to set every file to a working directory, and will work 99% of time.
    if guess_by_extension is True:
        if len(name) >= 6:
            if name[-4] == '.': #.raw
                return False
            elif name[-5] == '.': #.mzml
                return False
            elif name[-6] == '.': #.mzxml
                return False

    original_cwd = ftp_handle.pwd()  # remember the current working directory
    try:
        # print(name,original_cwd)
        ftp_handle.cwd(name)  # try to set directory to new name
        ftp_handle.cwd(original_cwd)  # set it back to what it was
        return True

    except ftplib.error_perm as e:
        print(e)
        return False

    except Exception as e:
        print(e)
        return False


def _make_parent_dir(fpath):
    """ ensures the parent directory of a filepath exists """
    dirname = os.path.dirname(fpath)
    while not os.path.exists(dirname):
        try:
            os.makedirs(dirname)
            print("created {0}".format(dirname))
        except OSError as e:
            print(e)
            _make_parent_dir(dirname)


def _download_ftp_file(ftp_handle, name, dest, overwrite):
    """ downloads a single file from an ftp server """
    _make_parent_dir(dest.lstrip("/"))
    if _file_name_match_patern(pattern, name):
        if not os.path.exists(dest) or overwrite is True:
            try:
                s = ftp.size(name)   # Get size of file
                print(s)
                if (s<500e6) & (s>1e6):
                    try:
                        with open(dest, 'wb') as f:
                            ftp_handle.retrbinary("RETR {0}".format(name), f.write)
                        print("downloaded: {0}".format(dest))
                    except:
                        with open(dest, 'wb') as f: pass
                        print("FAILED: {0} is too big or too small".format(dest))
                else:
                    with open(dest, 'wb') as f: pass
                    print("FAILED: {0} is too big or too small".format(dest))
            except FileNotFoundError:
                print("FAILED: {0}".format(dest))
        else:
            print("already exists: {0}".format(dest))
    else:
        pass

def _file_name_match_patern(pattern, name):
    """ returns True if filename matches the pattern"""
    if pattern is None:
        return True
    else:
        pat_match = bool(re.match(pattern, name,re.IGNORECASE))
        return pat_match

# def _check_name_size(name, size):
#     # name = '/v01/MSV000088823/peak/neg-Mode_MS1/Lab_01/DOM_Interlab-LCMS_Lab1_A5M_Neg_MS1_rep3.mzML'
#     s = ftp.size(name)   # Get size of file
#     return s

def _mirror_ftp_dir(ftp_handle, name, overwrite, guess_by_extension, pattern):
    """ replicates a directory on an ftp server recursively """
    for item in ftp_handle.nlst(name):
        if _is_ftp_dir(ftp_handle, item, guess_by_extension):
            _mirror_ftp_dir(ftp_handle, item, overwrite, guess_by_extension, pattern)
        else:
            _download_ftp_file(ftp_handle, item, item, overwrite)
            # else:
            #     # quietly skip the file
            #     pass


def download_ftp_tree(ftp_handle, path, destination, pattern=None, overwrite=False, guess_by_extension=True):
    """
    Downloads an entire directory tree from an ftp server to the local destination
    :param ftp_handle: an authenticated ftplib.FTP instance
    :param path: the folder on the ftp server to download; a leading "/" is removed so that the
        remote paths can double as local paths relative to ``destination``
    :param destination: the local directory to store the copied folder; it must already exist
    :param pattern: Python regex pattern, only files that match this pattern will be downloaded.
    :param overwrite: set to True to force re-download of all files, even if they appear to exist already
    :param guess_by_extension: It takes a while to explicitly check if every item is a directory or a file.
        if this flag is set to True, any name with a "." in its 4th- to 6th-from-last position (a three to
        five character extension) is assumed to be a file and not a directory. Set to False if some folders
        may have a "." in those positions.

    The process working directory is switched to ``destination`` while mirroring and is always
    restored afterwards, also when the download raises.
    """
    path = path.lstrip("/")
    original_directory = os.getcwd()  # remember working directory before function is executed
    os.chdir(destination)  # change working directory to ftp mirror directory
    try:
        _mirror_ftp_dir(
            ftp_handle,
            path,
            pattern=pattern,
            overwrite=overwrite,
            guess_by_extension=guess_by_extension)
    finally:
        # Without this, a failed download would leave the notebook stranded
        # inside the mirror directory.
        os.chdir(original_directory)




In [None]:
# Close a left-over FTP session, if there is one. On a fresh kernel `ftp` does
# not exist yet, and a session that is already closed cannot be quit again;
# neither case should stop "Run All".
try:
    ftp.quit()
except NameError:
    pass  # no FTP session has been opened in this kernel
except Exception as exc:
    print(f"FTP session already closed or unusable: {exc}")

In [None]:
import pandas as pd
def _parse_ftp_list_lines(lines):
    """Parse unix-style FTP ``LIST`` output into a DataFrame, one row per entry.

    Lines with fewer than nine whitespace-separated fields are skipped; the
    ninth field keeps any embedded spaces, so names containing blanks stay
    in one piece.
    """
    columns = ['permissions', 'links', 'owner', 'group', 'size', 'month', 'day', 'time', 'name']
    rows = []
    for line in lines:
        # maxsplit stops splitting after the eighth field
        fields = line.split(None, len(columns) - 1)
        if len(fields) == len(columns):
            rows.append(fields)
    return pd.DataFrame(rows, columns=columns)


def get_ftp_directory_structure(host, username='', password=''):
    """List the contents of every top-level directory of an FTP server.

    :param host: FTP host name
    :param username: login name (passed to ``FTP.login`` as is)
    :param password: login password (passed to ``FTP.login`` as is)
    :return: DataFrame with the parsed ``LIST`` columns (``permissions``,
        ``links``, ``owner``, ``group``, ``size``, ``month``, ``day``,
        ``time``, ``name``) for each entry found one level below the root,
        plus ``subdirectory`` naming the top-level directory it was found
        in. Empty when the server root holds no directories.
    """
    out = []
    # The context manager closes the connection even if a listing fails.
    with ftplib.FTP(host) as ftp:
        ftp.login(username, password)

        # Get the listing of the server root
        listing = []
        ftp.retrlines('LIST', listing.append)
        top_level = _parse_ftp_list_lines(listing)

        # A leading "d" in the permission string marks a directory
        is_directory = top_level['permissions'].str.startswith('d')
        for subdirectory in top_level.loc[is_directory, 'name']:
            sublisting = []
            ftp.retrlines(f'LIST {subdirectory}', sublisting.append)
            subdf = _parse_ftp_list_lines(sublisting)
            subdf['subdirectory'] = subdirectory
            out.append(subdf)

    if not out:
        # pd.concat() refuses an empty list; return an empty frame instead.
        return _parse_ftp_list_lines([]).assign(subdirectory=[])
    df = pd.concat(out)
    return df.reset_index(drop=True)

# FTP host whose top-level directories are listed
host = 'massive.ucsd.edu'

# One row per entry found one level below the server root, with the parent
# directory in the 'subdirectory' column (opens an FTP connection)
df = get_ftp_directory_structure(host)

# Show the dataframe as the cell output
df


In [None]:
# Dataset accessions of the DOM datasets (only the 'dataset' column is read)
dom_samples = pd.read_csv('/global/homes/b/bpb/repos/scndb/data/dom_public_datasets.csv',usecols=['dataset'])
# Disabled: drop the datasets that already appear in done_df
# dom_samples = dom_samples[~dom_samples['dataset'].isin(done_df['massive_id'])]

# bad_massive = ['MSV000092338','MSV000093271','MSV000092599','MSV000093514','MSV000092622','MSV000092604','MSV000092520']
# Keep only the FTP entries whose name is one of the DOM dataset accessions.
# NOTE(review): this rebinds `df`, so the unfiltered listing is gone afterwards
# and every later cell that uses `df` sees this subset.
df = df[df['name'].isin(dom_samples['dataset'])]
# Disabled experiments with individual datasets:
# MSV000082082
# drop = ['MSV000089481']  # These two worked: 'MSV000082082','MSV000089591'
# df = df[~df['name'].isin(drop)]
# d = ['MSV000089481']
# df = df[df['name'].isin(d)]
df

In [None]:
# Sample sheet; only the accession, file name and sample type columns are read.
plant_samples = pd.read_csv('/global/cfs/cdirs/metatlas/projects/carbon_network/all_sampleinformation.tsv',sep='\t',usecols=['ATTRIBUTE_DatasetAccession','filename','SampleType'])

# na=False: a missing file name counts as "no match". Without it str.contains
# returns NaN for such rows and the negated mask cannot be built.
is_qc = plant_samples['filename'].str.contains('qc', case=False, na=False)
is_blank = plant_samples['filename'].str.contains('blank', case=False, na=False)
is_plant = plant_samples['SampleType'] == 'plant'
# Keep plant samples that are neither QC runs nor blanks
plant_samples = plant_samples[~is_qc & ~is_blank & is_plant]

# Keep the FTP entries that belong to a plant dataset ...
df = df[df['name'].isin(plant_samples['ATTRIBUTE_DatasetAccession'])]
# ... and whose accession is not yet present in done_df
df = df[~df['name'].isin(done_df['massive_id'])]

df


In [None]:
# dom_samples[dom_samples['dataset']=='MSV000082082']

In [None]:
# Close a left-over FTP session, if there is one. On a fresh kernel `ftp` does
# not exist yet, and a session that is already closed cannot be quit again;
# neither case should stop "Run All".
try:
    ftp.quit()
except NameError:
    pass  # no FTP session has been opened in this kernel
except Exception as exc:
    print(f"FTP session already closed or unusable: {exc}")

In [None]:
import time

# Mirror the mzML / mzXML files of every dataset listed in `df` into local_dir.
host = "massive.ucsd.edu"
# Empty credentials: ftplib logs in anonymously when no user name is given.
username = ""
password = ""
local_dir = "/pscratch/sd/b/bpb/massive"
# Raw string, so that "\." reaches the regex engine as an escaped dot and does
# not trigger Python's invalid-escape warning. Matches *.mzML and *.mzXML.
pattern = r".*\.mz[X]*ML$"

ftp = ftplib.FTP(host)
try:
    ftp.login(username, password)
    for i,row in df.iterrows():
        remote_dir = f"/{row['subdirectory']}/{row['name']}"
        print(remote_dir)

        download_ftp_tree(ftp, remote_dir, local_dir, pattern=pattern, overwrite=False, guess_by_extension=True)
        # Pause for one second between datasets
        print('waiting...')
        time.sleep(1)
finally:
    # Release the connection even when a download fails part-way through.
    try:
        ftp.quit()
    except ftplib.all_errors:
        # The server is no longer answering; just drop the socket.
        ftp.close()

In [None]:
import subprocess

my_dir = '/pscratch/sd/b/bpb/massive'

# Open the mirror to the group: mode 770 and group "metatlas" on everything
# below my_dir.
permission_commands = [
    ['chmod', '-R', '770', my_dir],     # change permissions
    ['chgrp', '-R', 'metatlas', my_dir],  # change group
]
for command in permission_commands:
    completed = subprocess.run(command, capture_output=True, text=True)
    # The exit status used to be dropped; report failures (for example files
    # owned by another user) without stopping the notebook.
    if completed.returncode != 0:
        print("WARNING: {0} exited with status {1}: {2}".format(
            ' '.join(command), completed.returncode, completed.stderr.strip()))



In [None]:
import json
from urllib.parse import quote

# Build a FASST search link that carries the query peaks in the URL fragment.
# The fragment is the URL-encoded JSON object {"peaks": "<peak list>"}.
# Judging by the example link that used to be pasted here, the peak list is
# written as "<m/z> <intensity>" pairs joined by a JSON-escaped newline, which
# encodes to e.g.
#   289.286376953125%208068.0%5Cn295.5452880859375%2022507.0%5Cn...

# Query peaks, one tab-separated pair per line
spectrum = "\n".join([
    "54.0348\t5",
    "58.0297\t2",
    "68.0504\t2",
    "81.0455\t100",
    "95.061\t6",
    "138.0663\t70",
    "156.0769\t10",
])

# Normalise each line to "<first> <second>", then JSON- and URL-encode the
# whole object so that no raw tab, newline or quote ends up in the link.
peaks_text = "\n".join(" ".join(line.split()) for line in spectrum.splitlines())
peaks_fragment = quote(json.dumps({"peaks": peaks_text}), safe="")

# NOTE(review): precursor_mz=981.54 is kept from the original link; the query
# cell further down uses 156.0769 for the same peak list -- confirm which one
# is intended.
url = (
    'https://fasst.gnps2.org/fastsearch/?usi1=None&precursor_mz=981.54&charge=1'
    '&library_select=gnpslibrary&analog_select=No&delta_mass_below=130&delta_mass_above=200'
    '&pm_tolerance=0.001&fragment_tolerance=0.01&cosine_threshold=0.7&use_peaks=1'
    f'#{peaks_fragment}'
)
url

In [None]:
# Close a left-over FTP session, if there is one. On a fresh kernel `ftp` does
# not exist yet, and a session that is already closed cannot be quit again;
# neither case should stop "Run All".
try:
    ftp.quit()
except NameError:
    pass  # no FTP session has been opened in this kernel
except Exception as exc:
    print(f"FTP session already closed or unusable: {exc}")

In [None]:
import json
import requests
import pandas as pd
def query_fasst_peaks(precursor_mz, peaks, database, serverurl="https://fasst.gnps2.org/", analog=False, precursor_mz_tol=0.05, fragment_mz_tol=0.05, min_cos=0.7):
    """Submit a peak list to a FASST server and return the decoded JSON reply.

    :param precursor_mz: sent as ``precursor_mz`` of the query spectrum
    :param peaks: sent as ``peaks`` of the query spectrum; must be JSON
        serialisable (the notebook passes a list of ``[float, float]`` pairs)
    :param database: value of the ``library`` form field
    :param serverurl: base URL of the service, with or without trailing slash
    :param analog: True sends ``analog=Yes``, False sends ``analog=No``
    :param precursor_mz_tol: value of the ``pm_tolerance`` form field
    :param fragment_mz_tol: value of the ``fragment_tolerance`` form field
    :param min_cos: value of the ``cosine_threshold`` form field
    :return: the response body parsed as JSON
    :raises requests.HTTPError: if the server answers with an error status
    """
    spectrum_query = {
        "peaks": peaks,
        "precursor_mz": precursor_mz
    }

    params = {
        "query_spectrum": json.dumps(spectrum_query),
        "library": database,
        "analog": "Yes" if analog else "No",
        "pm_tolerance": precursor_mz_tol,
        "fragment_tolerance": fragment_mz_tol,
        "cosine_threshold": min_cos,
    }

    # Plain concatenation gave "...orgsearch" for a base URL without a
    # trailing slash; normalise the separator first.
    endpoint = serverurl.rstrip("/") + "/search"
    r = requests.post(endpoint, data=params, timeout=50)

    r.raise_for_status()

    return r.json()



database = "gnpsdata_index"
# database = "gnpslibrary"

# Query peaks: two whitespace-separated numbers per line
# (presumably m/z and intensity -- confirm)
spectrum = """54.0348   5
58.0297 2
68.0504 2
81.0455 100
95.061  6
138.0663    70
156.0769    10"""
# Parse into a list of [float, float] pairs
spectrum = [
    [float(fields[0]), float(fields[1])]
    for fields in (line.split() for line in spectrum.split('\n'))
]
precursor = 156.0769


# results = query_fasst_peaks(json_spectrum["precursor_mz"], json_spectrum["peaks"], database)
results = query_fasst_peaks(precursor, spectrum, database)

results = pd.DataFrame(results['results'])
# A search without hits gives a frame without columns; only filter on
# 'Delta Mass' when there is something to filter.
if not results.empty:
    results = results[abs(results['Delta Mass'])<0.01]
results

In [None]:

# Break the peak list into one [first, second] pair of strings per line.
spectrum = [
    peak_line.split()
    for peak_line in """54.0348   5
58.0297 2
68.0504 2
81.0455 100
95.061  6
138.0663    70
156.0769    10""".split('\n')
]
spectrum