In [1]:
from taxonomy import Taxonomy
import pandas as pd
import numpy as np

In [2]:
import warnings
# Hide pandas' PerformanceWarning in all later cells so it does not clutter output.
# NOTE(review): presumably raised by the `.loc` lookups in `sample_taxon` — confirm.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [3]:
# NOTE(review): not referenced in any visible cell — presumably an upper bound on
# sequences per sample; confirm it is still needed.
MAX_SAMPLE_SEQS = 150_000_000

First, get the ENA run metadata.

In [4]:
# Read the tab-separated ENA run table; `tax_id` is parsed as a string rather
# than a number.
runs = pd.read_csv(
    'data/ena_runs_genomic_txomic_metagenomic.txt.gz',
    sep='\t',
    dtype={'tax_id': str},
)

Drop runs that have no FASTQ FTP link or no sample accession, then keep only sample accessions that start with `SAM`.

In [5]:
# Discard rows where either the FASTQ link or the sample accession is missing.
runs = runs.dropna(subset=['fastq_ftp', 'sample_accession'])

In [6]:
# Keep rows whose sample accession begins with 'SAM', ordered by library source,
# then study, then sample.
runs = (
    runs
    .loc[runs['sample_accession'].str.startswith('SAM')]
    .sort_values(by=['library_source', 'study_accession', 'sample_accession'])
)

Keep only runs whose library selection is `RANDOM` or `PolyA`.

In [7]:
# Restrict to the two accepted library-selection values.
runs = runs.loc[runs['library_selection'].isin(['RANDOM', 'PolyA'])]

Download the taxonomy database and load it.

In [43]:
# Download the NCBI taxonomy dump to data/taxdump.tar.gz (-L follows redirects).
!curl -L -o data/taxdump.tar.gz https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 54.5M  100 54.5M    0     0  8261k      0  0:00:06  0:00:06 --:--:-- 8910k


In [44]:
# Unpack the archive into ./data; the listing below includes the nodes.dmp and
# names.dmp files that the taxonomy loader reads.
!tar -xvzf data/taxdump.tar.gz --directory ./data

citations.dmp
delnodes.dmp
division.dmp
gencode.dmp
merged.dmp
names.dmp
nodes.dmp
gc.prt
readme.txt


In [8]:
# Build the taxonomy object from the NCBI nodes/names dump files in data/.
tax = Taxonomy.from_ncbi('data/nodes.dmp', 'data/names.dmp')

In [11]:
# Spot check: list the lineage nodes for one tax ID (Knautia arvensis in the output below).
tax.lineage('105271')

[<TaxonomyNode (id=105271 rank="species" name="Knautia arvensis"))>,
 <TaxonomyNode (id=59182 rank="genus" name="Knautia"))>,
 <TaxonomyNode (id=4200 rank="family" name="Caprifoliaceae"))>,
 <TaxonomyNode (id=4199 rank="order" name="Dipsacales"))>,
 <TaxonomyNode (id=91882 rank="clade" name="campanulids"))>,
 <TaxonomyNode (id=71274 rank="clade" name="asterids"))>,
 <TaxonomyNode (id=1437201 rank="clade" name="Pentapetalae"))>,
 <TaxonomyNode (id=91827 rank="clade" name="Gunneridae"))>,
 <TaxonomyNode (id=71240 rank="clade" name="eudicotyledons"))>,
 <TaxonomyNode (id=1437183 rank="clade" name="Mesangiospermae"))>,
 <TaxonomyNode (id=3398 rank="class" name="Magnoliopsida"))>,
 <TaxonomyNode (id=58024 rank="clade" name="Spermatophyta"))>,
 <TaxonomyNode (id=78536 rank="clade" name="Euphyllophyta"))>,
 <TaxonomyNode (id=58023 rank="clade" name="Tracheophyta"))>,
 <TaxonomyNode (id=3193 rank="clade" name="Embryophyta"))>,
 <TaxonomyNode (id=131221 rank="subphylum" name="Streptophytina"))>

In [9]:
def get_rank_name(taxid, rank = 'class', taxonomy = None):
    ''' Given a taxonomy ID, get the name for the given rank in its lineage.

    taxid    -- ID handed to ``lineage()``.
    rank     -- rank label to look for (default 'class').
    taxonomy -- optional object exposing ``lineage(taxid)``; when omitted, the
                notebook-level ``tax`` object is used.

    Returns the name of the first lineage node whose ``rank`` matches, None
    when the lineage has no node at that rank, and 'Invalid' when the lookup
    raises an error (for example an unknown or missing ID).
    '''
    if taxonomy is None:
        # Resolved outside the try block so that a missing `tax` (cell not run
        # yet) surfaces as a NameError instead of silently yielding 'Invalid'.
        taxonomy = tax
    try:
        return next(
            (node.name for node in taxonomy.lineage(taxid) if node.rank == rank),
            None,
        )
    except Exception:
        # Not a bare `except:` — KeyboardInterrupt/SystemExit must still be able
        # to stop a long `.apply()` over the whole run table.
        return 'Invalid'

Add some extra taxonomy metadata to the runs.

In [12]:
# Add one '<rank>_name' column per rank of interest, looked up from each run's tax_id.
for rank in ('class', 'superkingdom', 'phylum'):
    runs[rank + '_name'] = runs.tax_id.apply(get_rank_name, rank=rank)

In [13]:
# Persist the annotated run table; pandas infers gzip compression from the
# '.gz' suffix and writes the index as the first column by default.
runs.to_csv('data/ENA_runs.annotated.csv.gz')

# Subsampling

Select the following sample groups:
- Eukaryotic WGS      (n=100)
- Eukaryotic RNA-seq  (n=100)
- Bacterial WGS       (n=100)

In [253]:
def sample_taxon(data, source, toplevel_rank, N=100, random_state=None):
    ''' Randomly choose N distinct taxa from one slice of `data` and return all
    runs of one representative sample per chosen taxon.

    data          -- run table; must support ``data.loc[source, toplevel_rank]``
                     (NOTE(review): presumably indexed by library source and
                     top-level taxon name — confirm against the cell that
                     builds it, which is not visible here).
    source        -- first key of the slice, e.g. 'GENOMIC'.
    toplevel_rank -- second key of the slice, e.g. 'Eukaryota'.
    N             -- number of distinct tax IDs to draw without replacement.
    random_state  -- optional seed for a reproducible draw; None (default)
                     uses numpy's global random state, as before.

    Returns the selected runs indexed by sample_accession.
    Raises ValueError when the slice holds fewer than N distinct tax IDs.
    '''
    # Slice once and reuse it instead of repeating the same lookup.
    subset = data.loc[source, toplevel_rank]
    candidates = subset.tax_id.unique()
    if N > len(candidates):
        raise ValueError(
            'cannot sample {} taxa: only {} distinct tax_id values for ({!r}, {!r})'
            .format(N, len(candidates), source, toplevel_rank)
        )
    rng = np.random if random_state is None else np.random.default_rng(random_state)
    taxon = rng.choice(candidates, N, replace=False)
    # For each chosen taxon keep the first sample accession encountered.
    samples = subset[subset.tax_id.isin(taxon)].groupby(['tax_id']).first()['sample_accession']
    # Pull every run that belongs to those samples.
    sample_runs = subset.reset_index().set_index(['sample_accession']).loc[samples]
    # NOTE(review): `add_class_names` is not defined in the visible cells; it is
    # called for its effect on `sample_runs` (presumably adding the class
    # columns seen in the outputs) — confirm it is defined before this runs.
    add_class_names(sample_runs)
    return sample_runs

In [257]:
# Eukaryotic genomic subsample with the default N=100 taxa.
# NOTE(review): no random seed is set in the visible cells, so re-running draws
# a different selection.
euk_wgs = sample_taxon(runs, 'GENOMIC', 'Eukaryota')

In [270]:
# Preview the first three sampled rows.
euk_wgs.head(3)

Unnamed: 0_level_0,library_source,toplevel_rank,tax_id,study_accession,run_accession,secondary_sample_accession,experiment_accession,scientific_name,instrument_platform,instrument_model,library_layout,library_strategy,library_selection,read_count,base_count,fastq_ftp,sra_ftp,class_id,class_name
sample_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
SAMEA5241046,GENOMIC,Eukaryota,105271,PRJEB30946,ERR3089155,ERS3048450,ERX3149983,Knautia arvensis,ILLUMINA,Illumina HiSeq 2500,PAIRED,WGS,RANDOM,5949258.0,2986528000.0,ftp.sra.ebi.ac.uk/vol1/fastq/ERR308/005/ERR308...,ftp.sra.ebi.ac.uk/vol1/err/ERR308/005/ERR3089155,3398,Magnoliopsida
SAMN06563067,GENOMIC,Eukaryota,1073241,PRJNA378844,SRR5439653,SRS2117572,SRX2729556,Enneacampus ansorgii,ILLUMINA,NextSeq 500,PAIRED,WGS,Hybrid Selection,1066501.0,322083300.0,ftp.sra.ebi.ac.uk/vol1/fastq/SRR543/003/SRR543...,ftp.sra.ebi.ac.uk/vol1/srr/SRR543/003/SRR5439653,186623,Actinopteri
SAMN08535824,GENOMIC,Eukaryota,109475,PRJNA433997,SRR6794555,SRS3009608,SRX3753591,Suncus etruscus,ILLUMINA,Illumina HiSeq 2500,PAIRED,WGS,Hybrid Selection,2523283.0,630820800.0,ftp.sra.ebi.ac.uk/vol1/fastq/SRR679/005/SRR679...,ftp.sra.ebi.ac.uk/vol1/srr/SRR679/005/SRR6794555,40674,Mammalia


In [260]:
# Eukaryotic transcriptomic subsample with the default N=100 taxa.
# NOTE(review): no random seed is set in the visible cells, so re-running draws
# a different selection.
euk_rna = sample_taxon(runs, 'TRANSCRIPTOMIC', 'Eukaryota')

In [269]:
# Preview the first three sampled rows.
euk_rna.head(3)

Unnamed: 0_level_0,library_source,toplevel_rank,tax_id,study_accession,run_accession,secondary_sample_accession,experiment_accession,scientific_name,instrument_platform,instrument_model,library_layout,library_strategy,library_selection,read_count,base_count,fastq_ftp,sra_ftp,class_id,class_name
sample_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
SAMN08159240,TRANSCRIPTOMIC,Eukaryota,1008908,PRJNA421868,SRR6374701,SRS2756987,SRX3469531,Euonymus carnosus,ILLUMINA,Illumina HiSeq 2000,PAIRED,RNA-Seq,cDNA,9070007.0,1832141000.0,ftp.sra.ebi.ac.uk/vol1/fastq/SRR637/001/SRR637...,ftp.sra.ebi.ac.uk/vol1/srr/SRR637/001/SRR6374701,3398,Magnoliopsida
SAMN11493215,TRANSCRIPTOMIC,Eukaryota,1036259,PRJNA534635,SRR9618648,SRS5042126,SRX6381693,Chaetomium megalocarpum,ILLUMINA,Illumina NovaSeq 6000,PAIRED,RNA-Seq,RT-PCR,71871266.0,21705120000.0,ftp.sra.ebi.ac.uk/vol1/fastq/SRR961/008/SRR961...,ftp.sra.ebi.ac.uk/vol1/srr/SRR961/008/SRR9618648,147550,Sordariomycetes
SAMN03575891,TRANSCRIPTOMIC,Eukaryota,104587,PRJNA281136,SRR2103696,SRS993177,SRX1098213,Pteridium aquilinum subsp. latiusculum,ILLUMINA,Illumina HiSeq 2500,PAIRED,RNA-Seq,cDNA,23368696.0,4720477000.0,ftp.sra.ebi.ac.uk/vol1/fastq/SRR210/006/SRR210...,ftp.sra.ebi.ac.uk/vol1/srr/SRR210/006/SRR2103696,241806,Polypodiopsida


In [265]:
# Bacterial genomic subsample with the default N=100 taxa.
# NOTE(review): no random seed is set in the visible cells, so re-running draws
# a different selection.
bact_wgs = sample_taxon(runs, 'GENOMIC', 'Bacteria')

In [268]:
# Preview the first three sampled rows.
bact_wgs.head(3)

Unnamed: 0_level_0,library_source,toplevel_rank,tax_id,study_accession,run_accession,secondary_sample_accession,experiment_accession,scientific_name,instrument_platform,instrument_model,library_layout,library_strategy,library_selection,read_count,base_count,fastq_ftp,sra_ftp,class_id,class_name
sample_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
SAMN00794582,GENOMIC,Bacteria,1005485,PRJNA65835,SRR427070,SRS297181,SRX125239,Escherichia coli NE1487,ILLUMINA,Illumina HiSeq 2000,PAIRED,WGS,RANDOM,6015188.0,1215068000.0,ftp.sra.ebi.ac.uk/vol1/fastq/SRR427/SRR427070/...,ftp.sra.ebi.ac.uk/vol1/srr/SRR427/SRR427070,1236,Gammaproteobacteria
SAMN02441634,GENOMIC,Bacteria,1007109,PRJNA195650,SRR3938296,SRS1576977,SRX1967564,alpha proteobacterium SCGC AAA076-C03,ILLUMINA,Illumina HiSeq 2000,PAIRED,WGS,RANDOM,10098504.0,3029551000.0,ftp.sra.ebi.ac.uk/vol1/fastq/SRR393/006/SRR393...,ftp.sra.ebi.ac.uk/vol1/srr/SRR393/006/SRR3938296,28211,Alphaproteobacteria
SAMN00768252,GENOMIC,Bacteria,1053179,PRJNA70323,SRR392639,SRS283509,SRX113095,Bacillus cereus BAG2X1-1,ILLUMINA,Illumina HiSeq 2000,PAIRED,WGS,RANDOM,7547932.0,1524682000.0,ftp.sra.ebi.ac.uk/vol1/fastq/SRR392/SRR392639/...,ftp.sra.ebi.ac.uk/vol1/srr/SRR392/SRR392639,91061,Bacilli


## Merge and Save

Merge the subsamples and save them as a single `samples.csv`, with one row per FASTQ file.

In [272]:
# Stack the three subsamples row-wise; each is indexed by sample_accession.
all_samples = pd.concat([euk_wgs, euk_rna, bact_wgs])

In [318]:
# Expand every run into one row per FASTQ URL. `fastq_ftp` holds ';'-separated
# URLs, so split it into a list and let `explode` repeat the remaining columns
# once per URL. This replaces building a small DataFrame per row, which was
# slow and degraded the column dtypes.
# The stable sort groups runs of the same sample together, ordered by sample
# accession, while keeping their original relative order.
samples_df = (
    all_samples
    .sort_index(kind='mergesort')
    .assign(fastq_ftp=lambda df: df.fastq_ftp.str.strip(';').str.split(';'))
    .explode('fastq_ftp')
)

In [323]:
# Label the row index so it is written out as a 'sample_accession' column.
samples_df.index.name = 'sample_accession'

In [342]:
# File name is whatever follows the last '/' in the URL (the whole string when
# there is no '/').
samples_df['fastq_filename'] = samples_df['fastq_ftp'].str.rsplit('/', n=1).str[-1]

In [363]:
# Keep only paired-end mate files (`*_1.fastq*` / `*_2.fastq*`).
# `str.contains` interprets its pattern as a regular expression, so the dot is
# escaped to match a literal '.'; one pattern replaces two separate scans.
samples_df = samples_df[samples_df.fastq_filename.str.contains(r'_[12]\.fastq')]

In [364]:
# Write the per-FASTQ-file table as comma-separated values; the index is
# written as the first column by default.
samples_df.to_csv('samples.csv')

In [348]:
# Spot check: full URL of one known file. `.iloc[0]` takes the first match by
# position; a bare `[0]` on this string-labelled Series relies on pandas'
# deprecated integer-as-position fallback.
samples_df[samples_df['fastq_filename'] == 'DRR008445_1.fastq.gz'].fastq_ftp.iloc[0]

'ftp.sra.ebi.ac.uk/vol1/fastq/DRR008/DRR008445/DRR008445_1.fastq.gz'

In [326]:
# Display-only view keyed by (sample_accession, run_accession); the result is
# not assigned, so `samples_df` itself is unchanged.
samples_df.reset_index().set_index(['sample_accession', 'run_accession'])

Unnamed: 0_level_0,Unnamed: 1_level_0,library_source,toplevel_rank,tax_id,study_accession,secondary_sample_accession,experiment_accession,scientific_name,instrument_platform,instrument_model,library_layout,library_strategy,library_selection,read_count,base_count,fastq_ftp,sra_ftp,class_id,class_name
sample_accession,run_accession,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
SAMD00013667,DRR008445,GENOMIC,Eukaryota,6239,PRJDB2670,DRS007575,DRX007634,Caenorhabditis elegans,ILLUMINA,Illumina HiSeq 2000,PAIRED,WGS,RANDOM,69477252.0,1.389545e+10,ftp.sra.ebi.ac.uk/vol1/fastq/DRR008/DRR008445/...,ftp.sra.ebi.ac.uk/vol1/drr/DRR008/DRR008445,119089,Chromadorea
SAMD00013667,DRR008445,GENOMIC,Eukaryota,6239,PRJDB2670,DRS007575,DRX007634,Caenorhabditis elegans,ILLUMINA,Illumina HiSeq 2000,PAIRED,WGS,RANDOM,69477252.0,1.389545e+10,ftp.sra.ebi.ac.uk/vol1/fastq/DRR008/DRR008445/...,ftp.sra.ebi.ac.uk/vol1/drr/DRR008/DRR008445,119089,Chromadorea
SAMD00026528,DRR030768,GENOMIC,Eukaryota,1928728,PRJDB3528,DRS107612,DRX027778,Paulinella micropora,ILLUMINA,Illumina HiSeq 2500,PAIRED,WGS,RANDOM,219720858.0,6.591626e+10,ftp.sra.ebi.ac.uk/vol1/fastq/DRR030/DRR030768/...,ftp.sra.ebi.ac.uk/vol1/drr/DRR030/DRR030768,,Unclassified
SAMD00026528,DRR030768,GENOMIC,Eukaryota,1928728,PRJDB3528,DRS107612,DRX027778,Paulinella micropora,ILLUMINA,Illumina HiSeq 2500,PAIRED,WGS,RANDOM,219720858.0,6.591626e+10,ftp.sra.ebi.ac.uk/vol1/fastq/DRR030/DRR030768/...,ftp.sra.ebi.ac.uk/vol1/drr/DRR030/DRR030768,,Unclassified
SAMD00026528,DRR030769,GENOMIC,Eukaryota,1928728,PRJDB3528,DRS107612,DRX027779,Paulinella micropora,ILLUMINA,Illumina HiSeq 2500,PAIRED,WGS,RANDOM,235877773.0,7.076333e+10,ftp.sra.ebi.ac.uk/vol1/fastq/DRR030/DRR030769/...,ftp.sra.ebi.ac.uk/vol1/drr/DRR030/DRR030769,,Unclassified
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAMN13471349,SRR10592613,TRANSCRIPTOMIC,Eukaryota,114981,PRJNA593817,SRS5766617,SRX7272457,Dalbergia melanoxylon,ILLUMINA,Illumina HiSeq 4000,PAIRED,RNA-Seq,cDNA,74366734.0,2.231002e+10,ftp.sra.ebi.ac.uk/vol1/fastq/SRR105/013/SRR105...,ftp.sra.ebi.ac.uk/vol1/srr/SRR105/013/SRR10592613,3398,Magnoliopsida
SAMN13531194,SRR10662764,GENOMIC,Eukaryota,220873,PRJNA593692,SRS5801684,SRX7341705,Tegillarca granosa,ILLUMINA,HiSeq X Ten,PAIRED,WGS,RANDOM,437155101.0,1.311465e+11,ftp.sra.ebi.ac.uk/vol1/fastq/SRR106/064/SRR106...,ftp.sra.ebi.ac.uk/vol1/srr/SRR106/064/SRR10662764,6544,Bivalvia
SAMN13531194,SRR10662764,GENOMIC,Eukaryota,220873,PRJNA593692,SRS5801684,SRX7341705,Tegillarca granosa,ILLUMINA,HiSeq X Ten,PAIRED,WGS,RANDOM,437155101.0,1.311465e+11,ftp.sra.ebi.ac.uk/vol1/fastq/SRR106/064/SRR106...,ftp.sra.ebi.ac.uk/vol1/srr/SRR106/064/SRR10662764,6544,Bivalvia
SAMN13866233,SRR10897223,GENOMIC,Eukaryota,4521,PRJNA601325,SRS6002521,SRX7566053,Lolium multiflorum,ILLUMINA,Illumina MiSeq,PAIRED,WGS,RANDOM,17869029.0,1.066131e+10,ftp.sra.ebi.ac.uk/vol1/fastq/SRR108/023/SRR108...,ftp.sra.ebi.ac.uk/vol1/srr/SRR108/023/SRR10897223,3398,Magnoliopsida


In [336]:
# Map each sample accession to the list of its FASTQ URLs.
# Grouping on the index always yields a list of whole URLs. The previous
# per-label `.loc` lookup returned a single row as a Series when a sample had
# only one file, so `list()` split that URL into characters.
# `sort=False` keeps samples in order of first appearance.
samples_df.groupby(level=0, sort=False)['fastq_ftp'].apply(list).to_dict()

{'SAMD00013667': ['ftp.sra.ebi.ac.uk/vol1/fastq/DRR008/DRR008445/DRR008445_1.fastq.gz',
  'ftp.sra.ebi.ac.uk/vol1/fastq/DRR008/DRR008445/DRR008445_2.fastq.gz'],
 'SAMD00026528': ['ftp.sra.ebi.ac.uk/vol1/fastq/DRR030/DRR030768/DRR030768_1.fastq.gz',
  'ftp.sra.ebi.ac.uk/vol1/fastq/DRR030/DRR030768/DRR030768_2.fastq.gz',
  'ftp.sra.ebi.ac.uk/vol1/fastq/DRR030/DRR030769/DRR030769_1.fastq.gz',
  'ftp.sra.ebi.ac.uk/vol1/fastq/DRR030/DRR030769/DRR030769_2.fastq.gz',
  'ftp.sra.ebi.ac.uk/vol1/fastq/DRR030/DRR030770/DRR030770_1.fastq.gz',
  'ftp.sra.ebi.ac.uk/vol1/fastq/DRR030/DRR030770/DRR030770_2.fastq.gz',
  'ftp.sra.ebi.ac.uk/vol1/fastq/DRR030/DRR030771/DRR030771_1.fastq.gz',
  'ftp.sra.ebi.ac.uk/vol1/fastq/DRR030/DRR030771/DRR030771_2.fastq.gz',
  'ftp.sra.ebi.ac.uk/vol1/fastq/DRR030/DRR030772/DRR030772_1.fastq.gz',
  'ftp.sra.ebi.ac.uk/vol1/fastq/DRR030/DRR030772/DRR030772_2.fastq.gz',
  'ftp.sra.ebi.ac.uk/vol1/fastq/DRR030/DRR030773/DRR030773_1.fastq.gz',
  'ftp.sra.ebi.ac.uk/vol1/fastq

In [362]:
# Display the final per-file table (pandas truncates the rendering, as seen below).
samples_df

Unnamed: 0_level_0,library_source,toplevel_rank,tax_id,study_accession,run_accession,secondary_sample_accession,experiment_accession,scientific_name,instrument_platform,instrument_model,library_layout,library_strategy,library_selection,read_count,base_count,fastq_ftp,sra_ftp,class_id,class_name,fastq_filename
sample_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
SAMD00013667,GENOMIC,Eukaryota,6239,PRJDB2670,DRR008445,DRS007575,DRX007634,Caenorhabditis elegans,ILLUMINA,Illumina HiSeq 2000,PAIRED,WGS,RANDOM,69477252.0,1.389545e+10,ftp.sra.ebi.ac.uk/vol1/fastq/DRR008/DRR008445/...,ftp.sra.ebi.ac.uk/vol1/drr/DRR008/DRR008445,119089,Chromadorea,DRR008445_1.fastq.gz
SAMD00013667,GENOMIC,Eukaryota,6239,PRJDB2670,DRR008445,DRS007575,DRX007634,Caenorhabditis elegans,ILLUMINA,Illumina HiSeq 2000,PAIRED,WGS,RANDOM,69477252.0,1.389545e+10,ftp.sra.ebi.ac.uk/vol1/fastq/DRR008/DRR008445/...,ftp.sra.ebi.ac.uk/vol1/drr/DRR008/DRR008445,119089,Chromadorea,DRR008445_2.fastq.gz
SAMD00026528,GENOMIC,Eukaryota,1928728,PRJDB3528,DRR030768,DRS107612,DRX027778,Paulinella micropora,ILLUMINA,Illumina HiSeq 2500,PAIRED,WGS,RANDOM,219720858.0,6.591626e+10,ftp.sra.ebi.ac.uk/vol1/fastq/DRR030/DRR030768/...,ftp.sra.ebi.ac.uk/vol1/drr/DRR030/DRR030768,,Unclassified,DRR030768_1.fastq.gz
SAMD00026528,GENOMIC,Eukaryota,1928728,PRJDB3528,DRR030768,DRS107612,DRX027778,Paulinella micropora,ILLUMINA,Illumina HiSeq 2500,PAIRED,WGS,RANDOM,219720858.0,6.591626e+10,ftp.sra.ebi.ac.uk/vol1/fastq/DRR030/DRR030768/...,ftp.sra.ebi.ac.uk/vol1/drr/DRR030/DRR030768,,Unclassified,DRR030768_2.fastq.gz
SAMD00026528,GENOMIC,Eukaryota,1928728,PRJDB3528,DRR030769,DRS107612,DRX027779,Paulinella micropora,ILLUMINA,Illumina HiSeq 2500,PAIRED,WGS,RANDOM,235877773.0,7.076333e+10,ftp.sra.ebi.ac.uk/vol1/fastq/DRR030/DRR030769/...,ftp.sra.ebi.ac.uk/vol1/drr/DRR030/DRR030769,,Unclassified,DRR030769_1.fastq.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAMN13471349,TRANSCRIPTOMIC,Eukaryota,114981,PRJNA593817,SRR10592613,SRS5766617,SRX7272457,Dalbergia melanoxylon,ILLUMINA,Illumina HiSeq 4000,PAIRED,RNA-Seq,cDNA,74366734.0,2.231002e+10,ftp.sra.ebi.ac.uk/vol1/fastq/SRR105/013/SRR105...,ftp.sra.ebi.ac.uk/vol1/srr/SRR105/013/SRR10592613,3398,Magnoliopsida,SRR10592613_2.fastq.gz
SAMN13531194,GENOMIC,Eukaryota,220873,PRJNA593692,SRR10662764,SRS5801684,SRX7341705,Tegillarca granosa,ILLUMINA,HiSeq X Ten,PAIRED,WGS,RANDOM,437155101.0,1.311465e+11,ftp.sra.ebi.ac.uk/vol1/fastq/SRR106/064/SRR106...,ftp.sra.ebi.ac.uk/vol1/srr/SRR106/064/SRR10662764,6544,Bivalvia,SRR10662764_1.fastq.gz
SAMN13531194,GENOMIC,Eukaryota,220873,PRJNA593692,SRR10662764,SRS5801684,SRX7341705,Tegillarca granosa,ILLUMINA,HiSeq X Ten,PAIRED,WGS,RANDOM,437155101.0,1.311465e+11,ftp.sra.ebi.ac.uk/vol1/fastq/SRR106/064/SRR106...,ftp.sra.ebi.ac.uk/vol1/srr/SRR106/064/SRR10662764,6544,Bivalvia,SRR10662764_2.fastq.gz
SAMN13866233,GENOMIC,Eukaryota,4521,PRJNA601325,SRR10897223,SRS6002521,SRX7566053,Lolium multiflorum,ILLUMINA,Illumina MiSeq,PAIRED,WGS,RANDOM,17869029.0,1.066131e+10,ftp.sra.ebi.ac.uk/vol1/fastq/SRR108/023/SRR108...,ftp.sra.ebi.ac.uk/vol1/srr/SRR108/023/SRR10897223,3398,Magnoliopsida,SRR10897223_1.fastq.gz


Unnamed: 0_level_0,library_source,toplevel_rank,tax_id,study_accession,run_accession,secondary_sample_accession,experiment_accession,scientific_name,instrument_platform,instrument_model,library_layout,library_strategy,library_selection,read_count,base_count,fastq_ftp,sra_ftp,class_id,class_name,fastq_filename
sample_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
SAMD00013667,GENOMIC,Eukaryota,6239,PRJDB2670,DRR008445,DRS007575,DRX007634,Caenorhabditis elegans,ILLUMINA,Illumina HiSeq 2000,PAIRED,WGS,RANDOM,69477252.0,1.389545e+10,ftp.sra.ebi.ac.uk/vol1/fastq/DRR008/DRR008445/...,ftp.sra.ebi.ac.uk/vol1/drr/DRR008/DRR008445,119089,Chromadorea,DRR008445_1.fastq.gz
SAMD00013667,GENOMIC,Eukaryota,6239,PRJDB2670,DRR008445,DRS007575,DRX007634,Caenorhabditis elegans,ILLUMINA,Illumina HiSeq 2000,PAIRED,WGS,RANDOM,69477252.0,1.389545e+10,ftp.sra.ebi.ac.uk/vol1/fastq/DRR008/DRR008445/...,ftp.sra.ebi.ac.uk/vol1/drr/DRR008/DRR008445,119089,Chromadorea,DRR008445_2.fastq.gz
SAMD00026528,GENOMIC,Eukaryota,1928728,PRJDB3528,DRR030768,DRS107612,DRX027778,Paulinella micropora,ILLUMINA,Illumina HiSeq 2500,PAIRED,WGS,RANDOM,219720858.0,6.591626e+10,ftp.sra.ebi.ac.uk/vol1/fastq/DRR030/DRR030768/...,ftp.sra.ebi.ac.uk/vol1/drr/DRR030/DRR030768,,Unclassified,DRR030768_1.fastq.gz
SAMD00026528,GENOMIC,Eukaryota,1928728,PRJDB3528,DRR030768,DRS107612,DRX027778,Paulinella micropora,ILLUMINA,Illumina HiSeq 2500,PAIRED,WGS,RANDOM,219720858.0,6.591626e+10,ftp.sra.ebi.ac.uk/vol1/fastq/DRR030/DRR030768/...,ftp.sra.ebi.ac.uk/vol1/drr/DRR030/DRR030768,,Unclassified,DRR030768_2.fastq.gz
SAMD00026528,GENOMIC,Eukaryota,1928728,PRJDB3528,DRR030769,DRS107612,DRX027779,Paulinella micropora,ILLUMINA,Illumina HiSeq 2500,PAIRED,WGS,RANDOM,235877773.0,7.076333e+10,ftp.sra.ebi.ac.uk/vol1/fastq/DRR030/DRR030769/...,ftp.sra.ebi.ac.uk/vol1/drr/DRR030/DRR030769,,Unclassified,DRR030769_1.fastq.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAMN13471349,TRANSCRIPTOMIC,Eukaryota,114981,PRJNA593817,SRR10592613,SRS5766617,SRX7272457,Dalbergia melanoxylon,ILLUMINA,Illumina HiSeq 4000,PAIRED,RNA-Seq,cDNA,74366734.0,2.231002e+10,ftp.sra.ebi.ac.uk/vol1/fastq/SRR105/013/SRR105...,ftp.sra.ebi.ac.uk/vol1/srr/SRR105/013/SRR10592613,3398,Magnoliopsida,SRR10592613_2.fastq.gz
SAMN13531194,GENOMIC,Eukaryota,220873,PRJNA593692,SRR10662764,SRS5801684,SRX7341705,Tegillarca granosa,ILLUMINA,HiSeq X Ten,PAIRED,WGS,RANDOM,437155101.0,1.311465e+11,ftp.sra.ebi.ac.uk/vol1/fastq/SRR106/064/SRR106...,ftp.sra.ebi.ac.uk/vol1/srr/SRR106/064/SRR10662764,6544,Bivalvia,SRR10662764_1.fastq.gz
SAMN13531194,GENOMIC,Eukaryota,220873,PRJNA593692,SRR10662764,SRS5801684,SRX7341705,Tegillarca granosa,ILLUMINA,HiSeq X Ten,PAIRED,WGS,RANDOM,437155101.0,1.311465e+11,ftp.sra.ebi.ac.uk/vol1/fastq/SRR106/064/SRR106...,ftp.sra.ebi.ac.uk/vol1/srr/SRR106/064/SRR10662764,6544,Bivalvia,SRR10662764_2.fastq.gz
SAMN13866233,GENOMIC,Eukaryota,4521,PRJNA601325,SRR10897223,SRS6002521,SRX7566053,Lolium multiflorum,ILLUMINA,Illumina MiSeq,PAIRED,WGS,RANDOM,17869029.0,1.066131e+10,ftp.sra.ebi.ac.uk/vol1/fastq/SRR108/023/SRR108...,ftp.sra.ebi.ac.uk/vol1/srr/SRR108/023/SRR10897223,3398,Magnoliopsida,SRR10897223_1.fastq.gz
