#### Create conda environment
```bash
conda create -n aws-env python=3.9

conda activate aws-env

python -m pip install boto3 pandas s3fs
```

#### Install AWS CLI
```bash
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"

unzip awscliv2.zip

sudo ./aws/install
```

#### Encyclopedia of DNA Elements (ENCODE)

Provided by: ENCODE Data Coordinating Center, part of the [AWS Open Data Sponsorship Program](https://aws.amazon.com/marketplace/search/results?trk=868d8747-614e-4d4d-9fb6-fd5ac02947a8&sc_channel=el&FULFILLMENT_OPTION_TYPE=DATA_EXCHANGE&CONTRACT_TYPE=OPEN_DATA_LICENSES&filters=FULFILLMENT_OPTION_TYPE%2CCONTRACT_TYPE)

In [1]:
# List the top level of the public ENCODE bucket. --no-sign-request sends the
# request unsigned, so no AWS credentials are required.
!aws s3 ls --no-sign-request s3://encode-public/

                           PRE 2008/
                           PRE 2009/
                           PRE 2010/
                           PRE 2011/
                           PRE 2012/
                           PRE 2013/
                           PRE 2014/
                           PRE 2015/
                           PRE 2016/
                           PRE 2017/
                           PRE 2018/
                           PRE 2019/
                           PRE 2020/
                           PRE 2021/
                           PRE 2022/
                           PRE 2023/
2023-05-25 02:08:46  881843461 encode_file_manifest.tsv
2019-02-10 12:31:24         26 robots.txt


In [2]:
# List the sub-prefixes under 2023/ in the ENCODE bucket.
!aws s3 ls --no-sign-request s3://encode-public/2023/

                           PRE 01/
                           PRE 02/
                           PRE 03/
                           PRE 04/
                           PRE 05/


In [3]:
# List the objects stored under one specific 2023/05/15/<uuid>/ prefix.
!aws s3 ls --no-sign-request s3://encode-public/2023/05/15/a33c3ad2-9631-432a-bfbc-fd5303e8b78c/

2023-05-20 00:09:12   21609504 ENCFF907PGF.bigBed


In [4]:
import pandas as pd

In [5]:
# Load the ENCODE file manifest (tab-separated) straight from S3 via s3fs.
# The bucket is public, so request anonymous access explicitly -- the Python
# counterpart of the CLI's --no-sign-request. Without it s3fs attempts to sign
# the request and may fail on machines with no AWS credentials configured.
# NOTE: the manifest is ~880 MB, so this read takes a while and needs
# a correspondingly large amount of memory.
s3_files = pd.read_csv(
    's3://encode-public/encode_file_manifest.tsv',
    sep='\t',
    storage_options={'anon': True},
)

In [6]:
# Dimensions of the manifest as (rows, columns).
s3_files.shape

(1392439, 19)

In [7]:
# Column labels available in the manifest.
s3_files.columns

Index(['accession', 'status', 'file_format', 'file_type', 'assembly',
       'award.rfa', 's3_uri', 'azure_uri', 'cloud_metadata.url', 'dataset',
       'lab.@id', 'output_type', 'output_category', 'file_size',
       'date_created', 'md5sum', 'cloud_metadata.md5sum_base64',
       'replicate_libraries', 'analysis_step_version.analysis_step.name'],
      dtype='object')

In [8]:
# Preview the first rows of the manifest.
s3_files.head()

Unnamed: 0,accession,status,file_format,file_type,assembly,award.rfa,s3_uri,azure_uri,cloud_metadata.url,dataset,lab.@id,output_type,output_category,file_size,date_created,md5sum,cloud_metadata.md5sum_base64,replicate_libraries,analysis_step_version.analysis_step.name
0,ENCFF645BQN,released,bigWig,bigWig,hg19,ENCODE4,s3://encode-public/2021/02/01/9c26c11a-95c6-4e...,https://datasetencode.blob.core.windows.net/da...,https://encode-public.s3.amazonaws.com/2021/02...,/annotations/ENCSR000CVU/,/labs/manolis-kellis/,signal p-value,signal,217777294,2021-02-01T18:10:32.524390+00:00,0f17a4b1d126dd9db740498d2a9b073d,DxeksdEm3Z23QEmNKpsHPQ==,,chrom-impute-signal-generation-step-v-1
1,ENCFF798HAP,released,bed,bed bed3+,GRCh38,community,s3://encode-public/2022/03/04/1bd9ab9d-f297-4e...,https://datasetencode.blob.core.windows.net/da...,https://encode-public.s3.amazonaws.com/2022/03...,/annotations/ENCSR000FHA/,/labs/alan-boyle/,footprints,annotation,7439,2022-03-04T20:02:51.596006+00:00,f633724be4b8403c39a61902e2747d32,9jNyS+S4QDw5phkC4nR9Mg==,,
2,ENCFF861KNS,released,bigWig,bigWig,GRCh38,ENCODE,s3://encode-public/2020/01/27/7c3d6d72-043c-4a...,https://datasetencode.blob.core.windows.net/da...,https://encode-public.s3.amazonaws.com/2020/01...,/annotations/ENCSR000FHB/,/labs/bill-noble/,signal p-value,signal,620684907,2020-01-27T17:44:54.621879+00:00,747f0f27a1593d6d8dd8a922a7189b56,dH8PJ6FZPW2N2KkipxibVg==,,
3,ENCFF636BGG,released,bed,bed bed3+,GRCh38,community,s3://encode-public/2022/03/05/411f16e8-d05e-45...,https://datasetencode.blob.core.windows.net/da...,https://encode-public.s3.amazonaws.com/2022/03...,/annotations/ENCSR000FHK/,/labs/alan-boyle/,footprints,annotation,51232,2022-03-05T04:03:01.718829+00:00,6ee1a529994fda67ff6a5230fd81066a,buGlKZlP2mf/alIw/YEGag==,,
4,ENCFF530EKT,released,bed,bed bed3+,GRCh38,community,s3://encode-public/2022/03/05/d1847fca-a684-4c...,https://datasetencode.blob.core.windows.net/da...,https://encode-public.s3.amazonaws.com/2022/03...,/annotations/ENCSR000FHZ/,/labs/alan-boyle/,footprints,annotation,10440,2022-03-05T04:56:08.963114+00:00,cd715a80e61bc0b40d551a1a5f2efee9,zXFagOYbwLQNVRoaXy7+6Q==,,


In [9]:
# Number of manifest rows per file format.
s3_files['file_format'].value_counts()

file_format
bed            675521
bigWig         253896
fastq          140308
bam            139806
bigBed         116638
tsv             44217
txt              6623
tar              3758
starch           2398
tagAlign         2254
gtf              1482
hic              1214
bedpe            1195
gff               720
idat              550
bigInteract       268
hdf5              268
pairs             228
rcc               227
sam               188
wig               188
fasta             159
vcf                97
cndb               55
csfasta            49
csv                48
csqual             37
h5ad               36
CEL                 8
idx                 3
Name: count, dtype: int64

In [10]:
# Distinct values found in the output_category column.
s3_files['output_category'].unique()

array(['signal', 'annotation', 'quantification', 'validation',
       'reference', 'raw data', 'alignment'], dtype=object)

In [11]:
# Peek at the dataset column to see the format of its path values.
s3_files['dataset'].head()

0    /annotations/ENCSR000CVU/
1    /annotations/ENCSR000FHA/
2    /annotations/ENCSR000FHB/
3    /annotations/ENCSR000FHK/
4    /annotations/ENCSR000FHZ/
Name: dataset, dtype: object

In [12]:
# Dataset paths look like '/<type>/<accession>/', so field 1 of the
# '/'-split path is the dataset type; collect the distinct types.
set(dataset_path.split('/')[1] for dataset_path in s3_files['dataset'].values)

{'annotations',
 'experiments',
 'functional-characterization-experiments',
 'functional-characterization-series',
 'matched-sets',
 'multiomics-series',
 'projects',
 'reference-epigenomes',
 'references',
 'replication-timing-series',
 'single-cell-units',
 'treatment-time-series',
 'ucsc-browser-composites'}

#### PubSeq - Public Sequence Resource

COVID-19 PubSeq is a free and open online bioinformatics public sequence resource. It provides FASTA or FASTQ files of sequenced SARS-CoV-2 samples to enable a quick turnaround in the identification of new virus strains.

Part of the [AWS Open Data Sponsorship Program](https://aws.amazon.com/marketplace/search/results?trk=868d8747-614e-4d4d-9fb6-fd5ac02947a8&sc_channel=el&FULFILLMENT_OPTION_TYPE=DATA_EXCHANGE&CONTRACT_TYPE=OPEN_DATA_LICENSES&filters=FULFILLMENT_OPTION_TYPE%2CCONTRACT_TYPE)

In [13]:
# List the top level of the public PubSeq bucket (unsigned request, no credentials needed).
!aws s3 ls --no-sign-request s3://pubseq-datasets/

                           PRE 2020/
                           PRE PubSeq-20210407/
2021-05-11 00:24:22      32357 index.html


In [14]:
# Count the entries listed under the PubSeq-20210407/ prefix (one listing line per entry).
!aws s3 ls --no-sign-request s3://pubseq-datasets/PubSeq-20210407/ | wc -l

86378


In [15]:
!aws s3 ls --no-sign-request s3://pubseq-datasets/PubSeq-20210407/ 2>/dev/null | head -5 

2021-05-09 04:32:41      29914 FR988026.fa
2021-05-09 04:32:41      29914 FR988027.fa
2021-05-09 04:32:42      29908 FR988028.fa
2021-05-09 04:32:42      29895 FR988030.fa
2021-05-09 04:32:42      29895 FR988031.fa


In [16]:
# Download a single FASTA file from the PubSeq bucket into the current directory.
!aws s3 cp --no-sign-request s3://pubseq-datasets/PubSeq-20210407/FR988026.fa .

Completed 29.2 KiB/29.2 KiB (48.6 KiB/s) with 1 file(s) remainingdownload: s3://pubseq-datasets/PubSeq-20210407/FR988026.fa to ./FR988026.fa


In [17]:
# Print the downloaded FASTA record (header line followed by the sequence).
# NOTE(review): this dumps the entire ~29 KiB file into the cell output;
# consider `head -c` to show only a preview.
!cat './FR988026.fa'

> FR988026
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCTTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTCTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTGCTTGGTAC