# Making list of SRA accessions to fetch

In this script we will parse Entrez results to choose families for which there are at least 3 taxa with Illumina runs of more than 50M bases, and then we will randomly pick up to 20 of these taxa per family, with one run per taxon. We will only consider Illumina reads. We will export the results as a csv table, which will be used by a bash script later to download data using fastq-dump.

In [166]:
import pandas as pd, numpy as np, xmltodict

In [167]:
# SRA metadata table loaded from CSV.
Eukarya_SRA = pd.read_csv('Eukarya_SRA.csv')

# Parse the taxonomy XML into nested dicts. The context manager closes the
# file handle as soon as parsing is done instead of leaving it open for the
# rest of the session.
with open('Eukarya_taxonomy_clean.xml', 'rb') as taxonomy_xml:
    Eukarya_taxonomy = xmltodict.parse(taxonomy_xml)

First, let's make a table relating each entry of Eukarya_taxonomy to its family

In [171]:
def _as_list(node):
    """Return *node* as a list.

    xmltodict only produces a list when an element is repeated; a lone child
    comes back as a single dict and an empty element as None. Normalising
    here keeps the loops below from iterating over dict keys by mistake.
    """
    if node is None:
        return []
    return node if isinstance(node, list) else [node]


# One record per taxon: its own id and rank plus the family and kingdom found
# in its lineage (NaN when the lineage has no entry of that rank).
taxonomy_data = []
for tax in _as_list(Eukarya_taxonomy['TaxaSet']['Taxon']):
    family = {'ScientificName': np.nan, 'TaxId': np.nan}
    kingdom = {'ScientificName': np.nan}
    # A taxon without lineage information simply keeps the NaN defaults.
    lineage = (tax.get('LineageEx') or {}).get('Taxon')
    for parent in _as_list(lineage):
        if parent['Rank'] == 'family':
            family = parent
        if parent['Rank'] == 'kingdom':
            kingdom = parent

    taxonomy_data.append(dict(TaxID=tax['TaxId'],
                              Rank=tax['Rank'],
                              FamilyName=family['ScientificName'],
                              FamilyID=family['TaxId'],
                              Kingdom=kingdom['ScientificName']))

In [172]:
# Turn the list of per-taxon dicts into a table; the dict keys become the
# column names.
taxonomy_data = pd.DataFrame(data=taxonomy_data)

# Bare expression so the notebook renders the table.
taxonomy_data

Unnamed: 0,TaxID,Rank,FamilyName,FamilyID,Kingdom
0,100019,species,Didymellaceae,683158,Fungi
1,1000413,species,Sapindaceae,23672,Viridiplantae
2,1000416,species,Poaceae,4479,Viridiplantae
3,1000418,species,Lamiaceae,4136,Viridiplantae
4,100047,species,Melanommataceae,45307,Fungi
...,...,...,...,...,...
24002,999564,species,Clusiaceae,55961,Viridiplantae
24003,999595,species,Clusiaceae,55961,Viridiplantae
24004,9997,species,Sciuridae,55153,Metazoa
24005,9999,species,Sciuridae,55153,Metazoa


Done. Now let's find which families contain at least 3 taxa with Illumina data. First, let's keep only Illumina runs with more than 50M base pairs.

In [173]:
# Keep Illumina runs with more than 50M bases.
#
# The `bases != 'bases'` test discards rows whose 'bases' cell is the literal
# column name (header lines repeated inside the CSV). The numeric comparison
# uses pd.to_numeric with errors='coerce' so that a missing or malformed
# 'bases' value is treated as "does not pass the threshold" instead of
# raising, and so the result does not depend on the platform's default
# integer width (base counts here exceed the 32-bit range).
Eukarya_illumina = (
    Eukarya_SRA
    .loc[lambda x: (x['Platform'] == 'ILLUMINA') & (x['bases'] != 'bases')]
    .loc[lambda x: pd.to_numeric(x['bases'], errors='coerce') > 50_000_000]
)

# Bare expression so the notebook renders the table.
Eukarya_illumina

Unnamed: 0,Run,ReleaseDate,LoadDate,spots,bases,spots_with_mates,avgLength,size_MB,AssemblyName,download_path,...,Affection_Status,Analyte_Type,Histological_Type,Body_Site,CenterName,Submission,dbgap_study_accession,Consent,RunHash,ReadHash
2,SRR23456104,2023-02-14 14:07:26,2023-02-14 14:05:15,20507732,4108229833,20507732,200,1802,,https://sra-pub-run-odp.s3.amazonaws.com/sra/S...,...,,,,,UNIVERSIDAD MIGUEL HERNANDEZ DE ELCHE,SRA1590205,,public,9F8C2DBCDDADE87A80B4911E4B0E00ED,83DBC8A95A6FFD10C4CC2820A44CB4E4
3,SRR23456103,2023-02-14 14:23:02,2023-02-14 14:09:20,28604938,5609916031,28604938,196,2474,,https://sra-pub-run-odp.s3.amazonaws.com/sra/S...,...,,,,,UNIVERSIDAD MIGUEL HERNANDEZ DE ELCHE,SRA1590205,,public,BBA52B72031251C433B69E337E25E9A8,F8B229FA37419D3B7809B1DDAFADA1F5
4,SRR23456708,2023-02-14 19:25:58,2023-02-14 17:25:31,239175255,26028423817,239175255,108,13279,,https://sra-pub-run-odp.s3.amazonaws.com/sra/S...,...,,,,,TEXAS A&M UNIVERSITY,SRA1590262,,public,9E4030F31BB1C10BFC516C2C8132F7E3,B8809611A93461D6A22DBD7291EF7A9E
5,SRR23456707,2023-02-14 16:31:07,2023-02-14 16:10:26,106897386,13867013483,106897386,129,7092,,https://sra-pub-run-odp.s3.amazonaws.com/sra/S...,...,,,,,TEXAS A&M UNIVERSITY,SRA1590262,,public,C872803832B08AE183A2B02520EE40D5,23CE303E68DC0EF9550301D5665907F7
6,SRR23456706,2023-02-14 19:25:58,2023-02-14 17:37:23,281934610,34674308367,281934610,122,17656,,https://sra-pub-run-odp.s3.amazonaws.com/sra/S...,...,,,,,TEXAS A&M UNIVERSITY,SRA1590262,,public,09BFBC6C77AA0C92AECDA90BA8A48671,F10575BCA517F35F7A9542DD5E26FFBD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
657768,SRR8873051,2019-04-09 22:53:12,2019-04-09 22:51:14,9417471,2844076242,9417471,302,889,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,...,,,,,JGI,SRA871723,,public,A8B46B1D317516291098503EE71A7CA6,3B5B38A56BC4EFF660E86ECE02FEB70A
657769,SRR9945544,2022-12-20 05:05:39,2019-08-11 05:57:49,316440553,63288110600,316440553,200,21469,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,...,,,,,"KUNMING INSTITUTE OF ZOOLOGY, CHINESE ACADEMY ...",SRA937931,,public,4083EE6BA927651ADDD49C37D21308D8,6318B3B84A0ECA49D04C7EA8AA4BE2C6
657770,SRR9945545,2022-12-20 05:05:40,2019-08-11 03:35:44,52936555,10587311000,52936555,200,4089,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,...,,,,,"KUNMING INSTITUTE OF ZOOLOGY, CHINESE ACADEMY ...",SRA937931,,public,2A5F7C57C9CE673CB993C9F2BCC85690,AC35773D2578D5A2E166064CB014DCFF
657771,SRR9945550,2022-12-20 05:05:40,2019-08-11 03:45:24,68035329,13607065800,68035329,200,4830,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,...,,,,,"KUNMING INSTITUTE OF ZOOLOGY, CHINESE ACADEMY ...",SRA937931,,public,ECA5CC8AAFCD0BE099D75BF29F74D7EA,915EFB39F79C5F77D6161BEC40F8044A


Now let's count the number of taxa per family and retain only families with at least 3 taxa:

In [179]:
# Count the distinct taxa per family among the filtered Illumina runs and
# keep only the families represented by at least 3 taxa.
families_to_keep = (
    Eukarya_illumina[['BioSample', 'TaxID']]
    .merge(taxonomy_data, how='left', on='TaxID')
    # runs whose taxon has no family assignment cannot be counted
    .dropna(subset=['FamilyID'])
    [['TaxID', 'FamilyID', 'FamilyName', 'Kingdom']]
    # one row per (taxon, family) so each taxon is counted once
    .drop_duplicates()
    [['Kingdom', 'FamilyID', 'FamilyName']]
    .value_counts()
    .reset_index()
    # label the counts column 'count' when it comes back named 0
    .rename(columns={0: 'count'})
    .loc[lambda counts: counts['count'] >= 3]
)

# Bare expression so the notebook renders the table.
families_to_keep

Unnamed: 0,Kingdom,FamilyID,FamilyName,count
0,Viridiplantae,4345,Ericaceae,951
1,Viridiplantae,4479,Poaceae,645
2,Metazoa,33415,Nymphalidae,581
3,Viridiplantae,4210,Asteraceae,499
4,Metazoa,8113,Cichlidae,473
...,...,...,...,...
856,Fungi,2793945,Drepanopezizaceae,3
857,Metazoa,2969676,Cardinalidae,3
858,Metazoa,224313,Odontophoridae,3
859,Metazoa,215350,Bramidae,3


How many families in each kingdom?

In [180]:
# Number of retained families in each kingdom.
families_to_keep.loc[:, 'Kingdom'].value_counts()

Metazoa          517
Viridiplantae    197
Fungi            147
Name: Kingdom, dtype: int64

Ok, now let's randomly choose up to 20 taxa per family, with one accession randomly chosen per taxon. To do this, first we randomly choose one accession per taxon and then we randomly choose up to 20 rows per family.

In [181]:
# Attach family information to every filtered run and keep only the runs
# whose family passed the ">= 3 taxa" threshold. Computed once and reused by
# both sampling steps below.
_family_runs = (
    Eukarya_illumina
    .merge(taxonomy_data, on='TaxID', how='left')
    .loc[lambda x: x['FamilyID'].isin(families_to_keep['FamilyID'])]
)

# Step 1: one randomly chosen run accession per taxon (fixed seed).
runs_to_keep = (
    _family_runs
    .groupby('TaxID')
    .sample(n=1, random_state=2948763)['Run']
)

_chosen_runs = _family_runs.loc[lambda x: x['Run'].isin(runs_to_keep)]

# Step 2: at most 20 rows per family (fixed seed). Families with 20 rows or
# fewer are kept whole. The groups are sampled explicitly rather than through
# groupby.apply so that the 'FamilyID' column is always retained in the
# result (the export cells below select it); groupby.apply's handling of
# grouping columns changes across pandas releases.
_per_family = [
    family_rows.sample(20, random_state=87635)
    if len(family_rows) > 20 else family_rows
    for _, family_rows in _chosen_runs.groupby('FamilyID')
]

# pd.concat rejects an empty list, so fall back to an empty frame with the
# same columns when no family qualified.
final_runs = (
    pd.concat(_per_family) if _per_family else _chosen_runs.iloc[0:0]
).reset_index(drop=True)

# Bare expression so the notebook renders the table.
final_runs

Unnamed: 0,Run,ReleaseDate,LoadDate,spots,bases,spots_with_mates,avgLength,size_MB,AssemblyName,download_path,...,CenterName,Submission,dbgap_study_accession,Consent,RunHash,ReadHash,Rank,FamilyName,FamilyID,Kingdom
0,SRR3948160,2017-07-20 00:01:37,2016-07-30 01:39:00,75932671,15186534200,75932671,200,9457,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,...,"BEIJING INSTITUTES OF LIFE SCIENCE, CHINESE AC...",SRA442292,,public,A2404DC53D0D24D1807A75FEB00882B7,23B3302D74B5A43308405A3AA1BB3242,species,Muridae,10066,Metazoa
1,SRR6031630,2017-09-13 03:46:12,2017-09-13 03:21:32,97172362,29151708600,97172362,300,11844,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,...,"UNIVERSITY OF CALIFORNIA, SAN FRANCISCO",SRA607731,,public,CCF68A917BE92EC48C6F9665B9EC60E6,515589F91A78405C82671F61732593E5,species,Muridae,10066,Metazoa
2,SRR6031625,2017-09-13 02:37:10,2017-09-13 02:31:17,1587349,396837250,1587349,250,187,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,...,"UNIVERSITY OF CALIFORNIA, SAN FRANCISCO",SRA607731,,public,AF19E0FEBE6C8A02FAFBF484A17F2AF2,269E9A4A6582D53F8578F554317D71A4,species,Muridae,10066,Metazoa
3,SRR22515611,2022-12-04 02:10:14,2022-12-04 01:00:40,523279209,158030321118,523279209,302,52138,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,...,BAYLOR COLLEGE OF MEDICINE,SRA1551893,,public,0BD264B5B5F4CE3483BDF6528E993B31,B3FE5BAFAF6157FAE49CE175287A10A5,species,Muridae,10066,Metazoa
4,SRR5192212,2018-02-01 00:09:40,2017-01-20 06:02:50,37196546,3333293227,0,89,1590,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,...,UNIVERSITY OF COPENHAGEN,SRA529177,,public,AD9CDB0BA681B90CDCED700B94E82A0C,DDD5C5469C58CB4B86C698591F2F417F,species,Muridae,10066,Metazoa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8259,SRR11020276,2021-05-04 13:34:09,2020-02-04 23:01:24,266923681,53918583562,266923681,202,20466,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,...,UNIVERSITY OF MONTANA,SRA1037672,,public,57F415913AC68AEB9EAE4E0DF3BDE078,DABFEC352A9055E453C8A0DE42F56563,species,Leporidae,9979,Metazoa
8260,SRR10082128,2019-10-24 00:42:54,2019-09-06 17:56:03,7759607,1955420964,7759607,252,656,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,...,UNIVERSITY OF PORTO,SRA957806,,public,9D4FE835632F9FFE266BF174F5AB7432,2D46D702AF943F5E5F5007068A81646A,species,Leporidae,9979,Metazoa
8261,SRR10023741,2019-10-23 15:49:57,2020-10-07 13:15:30,136299990,34347597480,136299990,252,13603,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,...,CIBIO,SRA949620,,public,29B44AB4D8BAE9B237E2DE35A3B87DCA,93C34BD1DB185CAF0B5D52CAE3951039,species,Leporidae,9979,Metazoa
8262,SRR10012545,2019-10-23 15:49:57,2020-10-07 14:11:53,22551034,5682860568,22551034,252,1858,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,...,CIBIO,SRA946777,,public,578E0EC01524116B7548C0B58DE6FEC4,E84322247BA7A37AB27DED06E3A0AEBE,species,Leporidae,9979,Metazoa


Just to verify, let's see how many taxa were selected per family:

In [182]:
# Sanity check: number of distinct taxa selected in each family.
(
    final_runs[['TaxID', 'FamilyID', 'FamilyName', 'Kingdom']]
    .drop_duplicates()
    [['Kingdom', 'FamilyID', 'FamilyName']]
    .value_counts()
    .reset_index()
    # label the counts column 'count' when it comes back named 0
    .rename(columns={0: 'count'})
)

Unnamed: 0,Kingdom,FamilyID,FamilyName,count
0,Viridiplantae,91896,Orobanchaceae,20
1,Viridiplantae,14101,Juncaceae,20
2,Metazoa,82593,Geometridae,20
3,Metazoa,8256,Pleuronectidae,20
4,Metazoa,8247,Labridae,20
...,...,...,...,...
856,Viridiplantae,22063,Monimiaceae,3
857,Metazoa,124286,Megachilidae,3
858,Metazoa,402692,Oreohelicidae,3
859,Viridiplantae,23097,Hydrangeaceae,3


Now let's save the table with samples to download as a csv file:

In [184]:
# Write the complete table of selected runs, without the row index.
final_runs.to_csv(path_or_buf='runs_to_download_data.csv', index=False)

And now let's save a simplified version of this table with just the information that we need for fastq-dump

In [189]:
# Write only the run accession and family id, with no header row and no
# row index.
(
    final_runs
    .loc[:, ['Run', 'FamilyID']]
    .to_csv('runs_to_download.txt', header=False, index=False)
)

In [190]:
# Show the two columns that were written to runs_to_download.txt.
final_runs.loc[:, ['Run', 'FamilyID']]

Unnamed: 0,Run,FamilyID
0,SRR3948160,10066
1,SRR6031630,10066
2,SRR6031625,10066
3,SRR22515611,10066
4,SRR5192212,10066
...,...,...
8259,SRR11020276,9979
8260,SRR10082128,9979
8261,SRR10023741,9979
8262,SRR10012545,9979
