In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import time
# Wall-clock reference taken when the notebook starts; the run-time cell further down subtracts it.
start_time = time.time()

In [2]:
# Tab-separated output file: it first receives the header row, then each chunk of parsed samples is appended.
out_path = 'biosample_metadata.with_title.tsv'

In [4]:
# Columns of the output table, in output order. 'biosample' and 'sra_accession'
# come from <Id> elements, 'Title' from <Title>, and the rest from <Attribute>
# elements whose harmonized_name matches the column name.
key_list = ['biosample', 'Title', 'sra_accession', 'collection_date',
            'host', 'host_taxid', 'isolation_source',
            'lab_host', 'geo_loc_name', 'lat_lon']

# Number of retained samples buffered in memory before each append to disk
chunk_size = 10000


def _append_records(records):
    """Append the buffered sample dicts to `out_path` as header-less TSV rows.

    Does nothing when `records` is empty so that no blank row is written.
    `columns=key_list` pins the column order to the header written earlier.
    """
    if not records:
        return
    pd.DataFrame(records, columns=key_list).to_csv(
        out_path, mode='a', header=False, sep='\t', index=False)


# Create new file containing only the header row
pd.DataFrame(columns=key_list).to_csv(out_path, sep='\t', index=False)

# Get generator. "start" is only needed to spot the root element and the
# opening of each BioSample; all content is read on "end".
gen = ET.iterparse('biosample_set.xml', events=("start", "end"))

# Initialise
sample_count = 0   # running total of samples retained (host or host_taxid present)
dict_list = []     # buffer of retained samples not yet written to disk
bs_dict = None     # record for the BioSample currently being parsed, if any
root = None        # document root, kept so finished samples can be detached from it

# Parse entries
for event, elem in gen:
    tag = elem.tag

    if event == 'start':
        if root is None:
            root = elem
        # Start new entry for each biosample
        if tag == 'BioSample':
            bs_dict = {key: None for key in key_list}
        continue

    # Everything below handles 'end' events. An element's text is only
    # guaranteed to be complete once its end tag has been seen; on 'start'
    # it may still be None or truncated.
    if bs_dict is None:
        # Element closed outside of any BioSample; nothing to record
        continue

    value = elem.text

    if tag == 'Id':
        # Get IDs
        if value is not None:
            if value.startswith('SAM'):
                bs_dict['biosample'] = value
            elif value.startswith("SR"):
                bs_dict['sra_accession'] = value

    elif tag == 'Title':
        # Biosample title
        bs_dict[tag] = value

    elif tag == 'Attribute':
        # Other attributes
        attrib_key = elem.attrib.get("harmonized_name")
        if attrib_key in key_list:
            bs_dict[attrib_key] = value

    elif tag == 'BioSample':
        # Keep only samples that carry host information
        if bs_dict['host'] is not None or bs_dict['host_taxid'] is not None:
            # Always buffer the current sample first, then flush once the
            # buffer is full, so no sample is dropped at a chunk boundary.
            dict_list.append(bs_dict)
            sample_count += 1
            if len(dict_list) >= chunk_size:
                _append_records(dict_list)
                dict_list = []

        bs_dict = None
        # Release the finished subtree, and detach processed children from
        # the root so memory stays flat over the whole file.
        elem.clear()
        root.clear()

# Final chunk
_append_records(dict_list)

In [5]:
# Elapsed wall-clock time since start_time, converted from seconds to hours.
elapsed_seconds = time.time() - start_time
time_taken = elapsed_seconds / 60 / 60

In [6]:
# Report the total run time (time_taken is in hours).
print(f'Script took: {time_taken} hrs')

Script took: 1.031504081222746 hrs


In [7]:
# NOTE(review): leftover scratch arithmetic, presumably 0.85 hrs expressed in minutes — confirm its purpose or delete this cell.
0.85*60

51.0