In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import time

In [2]:
# Stream the BioSample XML dump and write one TSV row per <BioSample> record.
# The XML is parsed incrementally and rows are appended to the output file in
# chunks, so neither the whole XML tree nor the whole table is held in memory.
xml_path = '../data/metadata/sra_metadata/biosample_set.210324.xml'
out_path = '../data/metadata/sra_metadata/biosample_metadata_210324.full.new.tsv'

# Output columns, in the order they are written to the TSV
key_list = ['biosample', 'biosample_accession', 'biosample_title',
            'taxonomy_name', 'taxonomy_id', 'strain', 
            'isolate', 'gisaid_virus_name', 'GISAID_accession',
            'collection_date', 'host', 'host_taxid', 
            'isolation_source', 'lab_host', 'geo_loc_name', 
            'lat_lon', 'pubmed_id', 'owner']
key_set = set(key_list)  # constant-time membership test for harmonized names

start_time = time.time()

# Write the header row once; data chunks are appended below without headers
header_df = pd.DataFrame(columns=key_list)
header_df.to_csv(out_path, mode='w', header=True, sep='\t', index=False)

# Get generator
gen = ET.iterparse(xml_path, events=("start", "end"))

# Initialise
root = None       # first element seen; emptied regularly to release parsed records
bs_dict = None    # row being built for the BioSample currently open (None outside one)
dict_list = []    # rows collected since the last write
chunk_size = 5000

# Parse entries
for event, elem in gen:
    tag = elem.tag

    if event == 'start':
        if root is None:
            root = elem

        # Start new entry for each biosample. Attributes are already available
        # on "start"; element text is not guaranteed until "end", so every
        # text value is read on the "end" event below.
        if tag == 'BioSample':
            bs_dict = {key: None for key in key_list}
            bs_dict['biosample'] = elem.attrib.get('accession')
        continue

    # From here on: event == 'end', the element is fully parsed
    if bs_dict is not None:
        value = elem.text
        attrib_dict = elem.attrib

        if tag == 'Id':
            # Get IDs
            if attrib_dict.get('db') == 'SRA':
                bs_dict['biosample_accession'] = value

        elif tag == 'Title':
            # Get Biosample title
            bs_dict['biosample_title'] = value

        elif tag == 'Link':
            # Get pubmed link
            if attrib_dict.get('target') == 'pubmed':
                bs_dict['pubmed_id'] = value

        elif tag == 'Name':
            # Get owner name
            # NOTE(review): every <Name> element inside the record overwrites
            # this field, so the last one wins -- confirm that is the intended
            # owner name for this schema.
            bs_dict['owner'] = value

        elif tag == 'Organism':
            # Get organism name and taxonomy id (both are attributes of the tag)
            bs_dict['taxonomy_name'] = attrib_dict.get('taxonomy_name')
            bs_dict['taxonomy_id'] = attrib_dict.get('taxonomy_id')

        elif tag == 'Attribute':
            # Other attributes: keep those whose harmonized name is a requested column
            attrib_key = attrib_dict.get("harmonized_name")
            if attrib_key in key_set:
                bs_dict[attrib_key] = value

        elif tag == 'BioSample':
            dict_list.append(bs_dict)
            bs_dict = None

            # Write file every $chunk_size samples
            if len(dict_list) >= chunk_size:
                temp = pd.DataFrame(dict_list, columns=key_list)
                temp.to_csv(out_path, mode='a', header=False, sep='\t', index=False)
                dict_list = []

            # elem.clear() empties an element but leaves it attached to its
            # parent; emptying the root drops finished records entirely.
            root.clear()

    elem.clear()

# Final chunk (skipped when there is nothing left to write)
temp = pd.DataFrame(dict_list, columns=key_list)
if not temp.empty:
    temp.to_csv(out_path, mode='a', header=False, sep='\t', index=False)

time_taken = (time.time()-start_time) / 60 / 60 
print(f'Script took: {time_taken} hrs')

Script took: 1.255691616071595 hrs


In [None]:
# Display `temp`, the DataFrame of the most recently written chunk; it only
# exists as kernel state left behind by an earlier parsing cell.
temp

## With sample titles

In [None]:
# Stream the BioSample XML dump and write one TSV row per <BioSample> record
# that carries host information (a `host` or `host_taxid` attribute).
# Rows are appended to the output file in chunks to keep memory use bounded.
xml_path = '../data/metadata/sra_metadata/NCBI_Biosample_metadata_130923/biosample_set.xml'
out_path = '../data/metadata/sra_metadata/NCBI_Biosample_metadata_130923/biosample_metadata_130923.with_title.tsv'

# Output columns, in the order they are written to the TSV
key_list = ['biosample', 'biosample_accession', 'biosample_title', 'collection_date', 
            'host', 'host_taxid', 'isolation_source', 
            'lab_host', 'geo_loc_name', 'lat_lon']
key_set = set(key_list)  # constant-time membership test for harmonized names

start_time = time.time()

# Write the header row once; data chunks are appended below without headers
header_df = pd.DataFrame(columns=key_list)
header_df.to_csv(out_path, mode='w', header=True, sep='\t', index=False)

# Get generator
gen = ET.iterparse(xml_path, events=("start", "end"))

# Initialise
root = None       # first element seen; emptied regularly to release parsed records
bs_dict = None    # row being built for the BioSample currently open (None outside one)
dict_list = []    # rows collected since the last write
chunk_size = 5000

# Parse entries
for event, elem in gen:
    tag = elem.tag

    if event == 'start':
        if root is None:
            root = elem

        # Start new entry for each biosample. Element text is not guaranteed
        # to be available on "start", so all values are read on "end" below.
        if tag == 'BioSample':
            bs_dict = {key: None for key in key_list}
        continue

    # From here on: event == 'end', the element is fully parsed
    if bs_dict is not None:
        value = elem.text
        attrib_dict = elem.attrib

        if tag == 'Id':
            # Get IDs
            db_name = attrib_dict.get('db')
            if db_name == "BioSample":
                bs_dict['biosample'] = value
            elif db_name == 'SRA':
                bs_dict['biosample_accession'] = value

        elif tag == 'Title':
            # Get Biosample title
            bs_dict['biosample_title'] = value

        elif tag == 'Attribute':
            # Other attributes: keep those whose harmonized name is a requested column
            attrib_key = attrib_dict.get("harmonized_name")
            if attrib_key in key_set:
                bs_dict[attrib_key] = value

        elif tag == 'BioSample':
            # Keep only samples that have some host information
            if bs_dict['host'] is not None or bs_dict['host_taxid'] is not None:
                # Append first, then flush, so the sample that fills the
                # chunk is written rather than discarded.
                dict_list.append(bs_dict)

                # Write file every $chunk_size samples
                if len(dict_list) >= chunk_size:
                    temp = pd.DataFrame(dict_list, columns=key_list)
                    temp.to_csv(out_path, mode='a', header=False, sep='\t', index=False)
                    dict_list = []

            bs_dict = None

            # elem.clear() empties an element but leaves it attached to its
            # parent; emptying the root drops finished records entirely.
            root.clear()

    elem.clear()

# Final chunk (skipped when there is nothing left to write)
temp = pd.DataFrame(dict_list, columns=key_list)
if not temp.empty:
    temp.to_csv(out_path, mode='a', header=False, sep='\t', index=False)

time_taken = (time.time()-start_time) / 60 / 60 
print(f'Script took: {time_taken} hrs')

In [None]:
# Stream the BioSample XML dump and write one TSV row per <BioSample> record
# that carries host information (a `host` or `host_taxid` attribute), with the
# additional `env_medium`, `project_name` and `sample_name` columns.
# Rows are appended to the output file in chunks to keep memory use bounded.
xml_path = '../data/metadata/sra_metadata/NCBI_Biosample_metadata_130923/biosample_set.xml'
out_path = '../data/metadata/sra_metadata/NCBI_Biosample_metadata_130923/biosample_metadata_130923.with_attrs.tsv'

# Output columns, in the order they are written to the TSV
key_list = ['biosample', 'biosample_accession', 'biosample_title', 'collection_date', 
            'host', 'host_taxid', 'isolation_source', 
            'lab_host', 'geo_loc_name', 'lat_lon', 
            'env_medium', 'project_name', 'sample_name']
key_set = set(key_list)  # constant-time membership test for harmonized names

start_time = time.time()

# Write the header row once; data chunks are appended below without headers
header_df = pd.DataFrame(columns=key_list)
header_df.to_csv(out_path, mode='w', header=True, sep='\t', index=False)

# Get generator
gen = ET.iterparse(xml_path, events=("start", "end"))

# Initialise
root = None       # first element seen; emptied regularly to release parsed records
bs_dict = None    # row being built for the BioSample currently open (None outside one)
dict_list = []    # rows collected since the last write
chunk_size = 5000

# Parse entries
for event, elem in gen:
    tag = elem.tag

    if event == 'start':
        if root is None:
            root = elem

        # Start new entry for each biosample. Element text is not guaranteed
        # to be available on "start", so all values are read on "end" below.
        if tag == 'BioSample':
            bs_dict = {key: None for key in key_list}
        continue

    # From here on: event == 'end', the element is fully parsed
    if bs_dict is not None:
        value = elem.text
        attrib_dict = elem.attrib

        if tag == 'Id':
            # Get IDs
            db_name = attrib_dict.get('db')
            if db_name == "BioSample":
                bs_dict['biosample'] = value
            elif db_name == 'SRA':
                bs_dict['biosample_accession'] = value

        elif tag == 'Title':
            # Get Biosample title
            bs_dict['biosample_title'] = value

        elif tag == 'Attribute':
            # Other attributes: keep those whose harmonized name is a requested column
            attrib_key = attrib_dict.get("harmonized_name")
            if attrib_key in key_set:
                bs_dict[attrib_key] = value

        elif tag == 'BioSample':
            # Keep only samples that have some host information
            if bs_dict['host'] is not None or bs_dict['host_taxid'] is not None:
                # Append first, then flush, so the sample that fills the
                # chunk is written rather than discarded.
                dict_list.append(bs_dict)

                # Write file every $chunk_size samples
                if len(dict_list) >= chunk_size:
                    temp = pd.DataFrame(dict_list, columns=key_list)
                    temp.to_csv(out_path, mode='a', header=False, sep='\t', index=False)
                    dict_list = []

            bs_dict = None

            # elem.clear() empties an element but leaves it attached to its
            # parent; emptying the root drops finished records entirely.
            root.clear()

    elem.clear()

# Final chunk (skipped when there is nothing left to write)
temp = pd.DataFrame(dict_list, columns=key_list)
if not temp.empty:
    temp.to_csv(out_path, mode='a', header=False, sep='\t', index=False)

time_taken = (time.time()-start_time) / 60 / 60 
print(f'Script took: {time_taken} hrs')