Submitting data from Synapse scratch space

In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
import synapseclient
import cmc_submit2ndar as s2n
import pandas as pd
import numpy as np
import pickle
import os.path
import re

In [2]:
syn = synapseclient.login()

Welcome, Attila Jones!



## Data files to submit
These data files have different types marked by their extension 

In [3]:
fpath = '/home/attila/projects/bsm/results/2021-02-02-submit-to-nda/files_not_uploaded_yet.MSSM.txt'
picklepath = fpath + '.p'
if not os.path.exists(picklepath):
    dfiles = s2n.read_dfiles(syn, fpath)
    pickle.dump(dfiles, open(picklepath, 'wb'))
else:
    print('loading file from', picklepath)
    with open(picklepath, 'rb') as f:
        dfiles = pickle.load(f)
dfiles

loading file from /home/attila/projects/bsm/results/2021-02-02-submit-to-nda/files_not_uploaded_yet.MSSM.txt.p


Unnamed: 0_level_0,Unnamed: 1_level_0,synapseID,filename,data_file1
indivID,filetype,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CMC_MSSM_027,cram,syn22007834,MSSM_027_NeuN_pl.cram,5229ec83-c2eb-4f44-8d2b-f688b476bbda/MSSM_027_...
CMC_MSSM_033,cram,syn22090557,MSSM_033_NeuN_pl.cram,475392a7-5f20-4a07-8694-9dc86a677032/MSSM_033_...
CMC_MSSM_055,cram,syn22162964,MSSM_055_NeuN_pl.cram,e55eb946-1335-4dd5-a26b-85d74b5e3a36/MSSM_055_...
CMC_MSSM_056,cram,syn22093909,MSSM_056_NeuN_pl.cram,af069550-3c69-4e3a-bbb3-48d532d2c21c/MSSM_056_...
CMC_MSSM_063,cram,syn22150192,MSSM_063_NeuN_pl.cram,d98961ae-eef8-4eb3-8be8-5a79f9fda507/MSSM_063_...
...,...,...,...,...
CMC_PITT_072,unmapped.bam,syn22385266,PITT_072_NeuN_pl.unmapped.bam,bc52db52-ba9d-49e4-a9fb-5bc2ec7e22e3/PITT_072_...
CMC_PITT_082,unmapped.bam,syn22384272,PITT_082_NeuN_pl.unmapped.bam,a877be33-4ca7-4165-b2a3-178bbd818d0f/PITT_082_...
CMC_PITT_098,unmapped.bam,syn22385231,PITT_098_NeuN_pl.unmapped.bam,abbcfd1c-e2f8-4cdd-8bb3-8b26c0a26788/PITT_098_...
CMC_PITT_101,unmapped.bam,syn22381683,PITT_101_NeuN_pl.unmapped.bam,2b8d3b25-3654-43e8-b188-a4089b1afd93/PITT_101_...


The `prefixes` dictionary contains the AWS S3 prefix for each file type

In [4]:
prefixes = {'cram': '3338602',
            'cram.crai': '3338602',
            'flagstat.txt': '3340241',
            'ploidy_2.vcf.gz': '3340241',
            'ploidy_2.vcf.gz.tbi': '3340241',
            'ploidy_12.vcf.gz': '3340241',
            'ploidy_12.vcf.gz.tbi': '3340241',
            'ploidy_50.vcf.gz': '3340241',
            'ploidy_50.vcf.gz.tbi': '3340241',
            'unmapped.bam': '3338602'
           }
ftypes = dfiles.index.get_level_values(1).unique()
set(ftypes) == set(prefixes.keys())

True

## Creating `genomics_subject02` and `nichd_btb02`
These refer to the same samples and subjects as the ones submitted with the FASTQ files.  Since there were two FASTQ submissions for these set of samples I merge them 

In [5]:
def foo(maniftype='genomics_subject02'):
    basen = '/home/attila/projects/bsm/results/2021-02-02-submit-to-nda/' + maniftype + '-'
    fpaths = [basen + x + '.csv' for x in ['2019-12-09', '2020-06-08']]
    manif = pd.concat([pd.read_csv(x, skiprows=1) for x in fpaths], axis=0)
    target_path = '/home/attila/projects/bsm/results/2021-02-02-submit-to-nda/' + maniftype +'-2021-02-02.csv'
    template_path = '/home/attila/projects/bsm/results/2020-04-22-upload-to-ndar-from-s3/chess-' + maniftype + '.csv'
    s2n.write_manifest(manif, template_path=template_path, target_path=target_path)
    return(manif)

gsub = foo('genomics_subject02')
btb = foo('nichd_btb02')

## Creating `genomics_sample03`

### `genomics_sample03` filled template

I take `genomics_sample03` from a previous submission that was used for the FASTQ files for the same set of 81 samples.  I name this manifest `oldgsam`

In [6]:
basen = '/home/attila/projects/bsm/results/2021-02-02-submit-to-nda/genomics_sample03-'
fpaths = [basen + x + '.csv' for x in ['2019-12-09', '2020-06-08']]
oldgsam = pd.concat([pd.read_csv(x, skiprows=1) for x in fpaths], axis=0)
oldgsam

Unnamed: 0,subjectkey,experiment_id,src_subject_id,interview_age,interview_date,sample_description,sample_id_original,organism,sample_amount,sample_unit,...,patient_id_biorepository,sample_id_biorepository,cell_id_original,cell_id_biorepository,comments_misc,site,rat280,rat230,gqn,seq_batch
0,NDAR_INV0971H4H4,1223,CMC_MSSM_033,972,04/13/2018,frontal cortex,MSSM_033.DLPFC_1355.np1,human,1.0,,...,CMC_MSSM_033,MSSM_033.DLPFC_1355.np1,,,,U01MH106891,,,,
1,NDAR_INV0971H4H4,1223,CMC_MSSM_033,972,04/13/2018,frontal cortex,MSSM_033.DLPFC_1355.np1,human,1.0,,...,CMC_MSSM_033,MSSM_033.DLPFC_1355.np1,,,,U01MH106891,,,,
2,NDAR_INV1VPUF5CL,1223,CMC_MSSM_056,804,04/13/2018,frontal cortex,MSSM_056.DLPFC_1181.np1,human,1543.0,ng,...,CMC_MSSM_056,MSSM_056.DLPFC_1181.np1,,,,U01MH106891,,,,
3,NDAR_INV1VPUF5CL,1223,CMC_MSSM_056,804,04/13/2018,frontal cortex,MSSM_056.DLPFC_1181.np1,human,1543.0,ng,...,CMC_MSSM_056,MSSM_056.DLPFC_1181.np1,,,,U01MH106891,,,,
4,NDAR_INV1VPUF5CL,1223,CMC_MSSM_056,804,04/13/2018,frontal cortex,MSSM_056.DLPFC_1181.np1,human,1543.0,ng,...,CMC_MSSM_056,MSSM_056.DLPFC_1181.np1,,,,U01MH106891,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,NDAR_INVTY432EB9,1223,CMC_MSSM_379,852,06/08/2020,frontal cortex,1238.np1,human,1.0,,...,CMC_MSSM_379,1238.np1,,,,U01MH106891,,,,
65,NDAR_INVLH141LX6,1223,CMC_MSSM_340,912,06/08/2020,frontal cortex,1247.np1,human,1.0,,...,CMC_MSSM_340,1247.np1,,,,U01MH106891,,,,
66,NDAR_INVAK206LGH,1223,CMC_MSSM_362,876,06/08/2020,frontal cortex,1346.np1,human,1.0,,...,CMC_MSSM_362,1346.np1,,,,U01MH106891,,,,
67,NDAR_INVTM018EUJ,1223,CMC_MSSM_405,708,06/08/2020,frontal cortex,1357.np1,human,1.0,,...,CMC_MSSM_405,1357.np1,,,,U01MH106891,,,,


### Editing `genomics_sample03`

The `edit_gsam` function takes the `oldgsam` template and simplifies it to one row for each sample.  Then it replaces the `data_file1_type` and `data_file1` fields with the file type (e.g `unmapped` and `295ba82f-8f14-4e50-8945-4496f5131c9e/MSSM_027_NeuN_pl.unmapped.bam`, respectively).

`edit_gsam` edits `data_file1` in a certain way; see Cindy Molitor's email from Nov 30, 2020, 1:29 PM

> Note that the s3 prefix for the scratch bucket should not be included in the file path in the data_file columns, e.g. the value of the column should be 
>
> 89eac311-84e3-4be0-9f0e-c6186104f71c/NC1-CX-ASTMIG.cram
>
> instead of
>
> 3340241/89eac311-84e3-4be0-9f0e-c6186104f71c/NC1-CX-ASTMIG.cram

In [7]:
template_path = '/home/attila/projects/bsm/results/2020-04-22-upload-to-ndar-from-s3/genomics_sample03_template.csv'
def foo(ftype):
    gsam = s2n.edit_gsam(oldgsam, dfiles, gender=gsub.set_index('src_subject_id')['gender'], dftype=ftype)
    target_path = '/home/attila/projects/bsm/results/2021-02-02-submit-to-nda/genomics_sample03-' + ftype + '.csv'
    s2n.write_manifest(gsam, template_path=template_path, target_path=target_path)
    return(gsam)

gsamd = {ftype: foo(ftype) for ftype in ftypes}

## Submission
First submit everything except for `unmapped.bam`s because `PITT_117_NeuN_pl.unmapped.bam` is still missing

In [9]:
%%bash
cd ~/projects/bsm/results/2021-02-02-submit-to-nda
validate="vtcmd -b -t title -d description -s3 nda-bsmn-scratch -pre 3338602 -u $NDA_USER -p $NDA_PASSWORD -ak ASIAZAAXFM2FOHMRXZUH -sk 'gimUKU1I3rActmMBYSiw9dLBEzQhOw7WCrRiq8+B' -c 2965 -w"
manifests="nichd_btb02-2021-02-02.csv genomics_subject02-2021-02-02.csv"
dftypes="cram.crai cram flagstat.txt ploidy_2.vcf.gz ploidy_2.vcf.gz.tbi ploidy_12.vcf.gz ploidy_12.vcf.gz.tbi ploidy_50.vcf.gz ploidy_50.vcf.gz.tbi unmapped.bam"
for dftype in $dftypes; do
echo $dftype
$validate $manifests genomics_sample03-$dftype.csv
echo '----'
done

cram.crai
Running NDATools Version 0.2.3
Opening log: /home/attila/NDAValidationResults/debug_log_20210222T210149.txt

Validating files...
Validation report output to: /home/attila/NDAValidationResults/validation_results_20210222T210149.csv

All files have finished validating.

The following files passed validation:
UUID c9b5bbbd-4bf2-4587-8a22-73a7de2fc8a8: genomics_subject02-2021-02-02.csv
UUID cee61d03-7f7c-49a4-8df3-89ec94ae157e: genomics_sample03-cram.crai.csv
UUID f691eb48-04a5-46a9-a7cd-274168e20c45: nichd_btb02-2021-02-02.csv

Searching for associated files...

 Your user does NOT have access to the following buckets. Please review the bucket and/or your AWS credentials and try again.
nda-bsmn-scratch
nda-bsmn-scratch
nda-bsmn-scratch
nda-bsmn-scratch
nda-bsmn-scratch
nda-bsmn-scratch
nda-bsmn-scratch
nda-bsmn-scratch
nda-bsmn-scratch
nda-bsmn-scratch
nda-bsmn-scratch
nda-bsmn-scratch
nda-bsmn-scratch
nda-bsmn-scratch
nda-bsmn-scratch
nda-bsmn-scratch
nda-bsmn-scratch
nda-bsmn-

  0%|          | 0/3 [00:00<?, ?it/s] 33%|███▎      | 1/3 [00:01<00:02,  1.42s/it] 67%|██████▋   | 2/3 [00:01<00:01,  1.04s/it]100%|██████████| 3/3 [00:01<00:00,  1.28it/s]100%|██████████| 3/3 [00:01<00:00,  1.66it/s]
Traceback (most recent call last):
  File "/home/attila/.local/bin/vtcmd", line 8, in <module>
    sys.exit(main())
  File "/home/attila/.local/lib/python3.7/site-packages/NDATools/clientscripts/vtcmd.py", line 290, in main
    package_results = build_package(uuid, associated_files, config=config)
  File "/home/attila/.local/lib/python3.7/site-packages/NDATools/clientscripts/vtcmd.py", line 237, in build_package
    package.file_search(directories, source_bucket, source_prefix, retry_allowed=True)
  File "/home/attila/.local/lib/python3.7/site-packages/NDATools/BuildPackage.py", line 226, in file_search
    self.recollect_file_search_info()
  File "/home/attila/.local/lib/python3.7/site-packages/NDATools/BuildPackage.py", line 150, in recollect_file_search_info
    r

In [8]:
%connect_info

{
  "shell_port": 59245,
  "iopub_port": 41551,
  "stdin_port": 34749,
  "control_port": 57099,
  "hb_port": 57723,
  "ip": "127.0.0.1",
  "key": "7fcd38c5-b742590ef2dcdd4f126234c4",
  "transport": "tcp",
  "signature_scheme": "hmac-sha256",
  "kernel_name": ""
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-cfd88a2b-1421-422b-b9c9-ef68b2bf3f90.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.
