### This script adds the sample_id, biome and exptype columns to the taxonomy_aggregated file and writes the result to a new file, taxonomy_aggregated_full.tsv

In [2]:
import csv
import json
import os
import pandas as pd
from tqdm.notebook import tqdm, trange

In [3]:
# Both TSV files sit in the same relative data directory.
_data_dir = "data"
# Existing aggregated taxonomy table that the notebook reads.
prev_aggregated_file_path = _data_dir + "/taxonomy_aggregated.tsv"
# Extended table that the notebook writes.
new_aggregated_file_path = _data_dir + "/taxonomy_aggregated_full.tsv"

In [3]:
# Survey the existing table: count its data rows and print one example id
# for every distinct 3-character id prefix, in order of first appearance.
id_prefix = set()
row_count = 0
with open(prev_aggregated_file_path, 'r') as tsv_in:
    reader = csv.reader(tsv_in, delimiter="\t")
    header = next(reader)
    # Name of the first column.
    print(header[0])
    for row_count, row in enumerate(reader, start=1):
        prefix = row[0][:3]
        if prefix in id_prefix:
            continue
        # First id seen with this prefix: remember the prefix, show the id.
        id_prefix.add(prefix)
        print(row[0])
print("Row number is:", row_count)

id
SRR3650981
ERR2226418
OBJI01
OBLL01
ERZ477605
DRR060270
OGVH01
OGWA01
OKRQ01
OBHY01
OBIR01
OCZS01
OEID01
OCVQ01
OJMR01
OJNA01
OLIN01
OCTA01
ODAK01
ONZS01
OOAA01
OOBB01
OOCA01
OODA01
OOIF01
OBKB01
OMGG01
OGCN01
OGDF01
OBCF01
OBDG01
OCLA01
OCQI01
OCRJ01
OGRV01
OGSA01
OGTA01
OGUE01
OIEW01
OLVV01
OLWC01
OLXA01
OLYA01
OLZA01
OMAA01
OMBA01
OMCA01
OMDA01
OMEA01
OMFA01
OMNP01
OCMB01
OKVE01
OFMY01
OFNA01
OFOB01
OFPA01
OFRP01
OFSL01
OMHA01
OMIA01
OJOC01
OJPA01
OJQA01
OJRA01
OJAF01
OJBA01
OJCA01
OJDA01
OJEA01
OJFA01
OJGA01
OJHA01
OJIA01
OLGP01
OLHB01
OMOM01
OFHC01
OENZ01
OFEU01
OHUH01
OHVA01
OHWA01
OHXA01
OHYA01
OHZA01
OIAA01
OIBA01
OICA01
OIDA01
OKTV01
OKUB01
OIFB01
OIGD01
OKWC01
OKXB01
OAOO01
OAPA01
OBAJ01
OBNF01
OBOA01
OBPA01
OBQA01
OCOK01
OCPA01
OCSY01
OEBK01
OEFK01
OFDN01
OHAR01
OHBA01
OHCA01
OHDA01
OHEA01
OHFA01
OHGA01
OHHA01
OHIA01
OHJA01
OHKA01
OHLA01
OHMA01
OHNA01
OHOA01
OHPA01
OHQA01
OHRA01
OHSA01
OHTA01
OKSG01
OJKN01
OJLB01
OLKE01
OGXA01
OGYA01
OGZA01
OFCZ01
OJSR01
OJTA01
OJUA01
OJV

In [4]:
def read_analyses_json(study, id_type, studies_dir_path="../mgnify/data/mgnify/studies"):
    """
    Build a lookup from run/assembly id to sample id and experiment type for one study.

    Reads <studies_dir_path>/<study>/analyses.json and walks its 'data' list.

    study: string: the id of a study (also the name of the study's directory)
    id_type: string: name of the relationship that carries the id, e.g. 'run' or 'assembly'
    studies_dir_path: string: directory that holds one sub-directory per study
    return analyses_hash: hashmap: {id: {"sample_id": sample_id, "exptype": exptype}}

    Entries that lack an id of the requested type, or lack a sample id, are
    reported on stdout and skipped. Entries that lack 'experiment-type' are
    kept, with exptype set to 'undefined'.
    """
    analyses_path = os.path.join(studies_dir_path, study, 'analyses.json')
    analyses_hash = {}
    # read analyses.json
    with open(analyses_path, encoding="utf-8") as f:
        analyses = json.load(f)
    # create analyses_hash
    for data in analyses['data']:
        # get run_id or assembly_id
        try:
            _id = data['relationships'][id_type]['data']['id']
        except KeyError as e:
            print("study: {} does not have {}_id in analyses.json. Error msg:{}".format(study,id_type, e))
            continue

        # get experiment type; a missing value is not a reason to drop the
        # record, so fall back to the 'undefined' placeholder
        try:
            exptype = data['attributes']['experiment-type']
        except KeyError:
            exptype = 'undefined'

        # get sample id
        try:
            sample_id = data['relationships']['sample']['data']['id']
        except KeyError as e:
            print("study: {} does not have smaple_id in analyses.json. Error msg:{}".format(study, e))
            continue
        # empty ids are useless as lookup keys/values
        if _id and sample_id:
            analyses_hash[_id] = {"sample_id": sample_id, "exptype": exptype}
    return analyses_hash

In [5]:
def read_samples_json(study, studies_dir_path="../mgnify/data/mgnify/studies"):
    """
    Build a lookup from sample id to biome id for one study.

    Reads <studies_dir_path>/<study>/samples.json and walks its 'data' list.

    study: string: the id of a study (also the name of the study's directory)
    studies_dir_path: string: directory that holds one sub-directory per study
    return samples_hash: hashmap: {sample_id: biome}

    Entries that lack an id or a biome are reported on stdout and skipped;
    entries whose biome id is empty are skipped silently.
    """
    samples_path = os.path.join(studies_dir_path, study, 'samples.json')
    samples_hash = {}
    # read samples.json
    with open(samples_path, encoding="utf-8") as f:
        samples = json.load(f)
    # create samples_hash
    for data in samples['data']:
        try:
            sample_id = data['id']
        except KeyError as e:
            print("study: {} does not have sampoe_id in samples.json. Error msg:{}".format(study, e))
            continue
        try:
            biome = data['relationships']['biome']['data']['id']
        except KeyError as e:
            print("study: {} does not have biome in samples.json. Error msg:{}".format(study, e))
            continue
        if biome:
            samples_hash[sample_id] = biome
    return samples_hash

In [7]:
%%time
# Copy the aggregated table to a new file, inserting sample_id, biome and
# exptype columns after the first two columns (id, study id).
# Only rows whose id starts with SRR/ERR (run) or ERZ (assembly) are looked
# up; any other row, and any row whose lookup fails, is left out of the output.

# Expected number of data rows; only used to size the progress bar.
row_count = 277023
# newline='' is what the csv module asks for on both reading and writing.
with open(prev_aggregated_file_path, 'r', newline='') as f_in, \
        open(new_aggregated_file_path, 'w', newline='') as f_out:
    reader = csv.reader(f_in, delimiter="\t")
    writer = csv.writer(f_out, delimiter="\t")
    header = next(reader)
    header[2:2] = ['sample_id', 'biome', 'exptype']
    writer.writerow(header)
    # Per-study cache so each study's JSON files are parsed only once:
    # {study_id: {'analyses_hash': ..., 'samples_hash': ..., 'loaded_id_types': ...}}
    study_hash = {}
    # Iterate the reader itself so a shorter file does not raise StopIteration
    # and a longer file is not silently truncated.
    for row in tqdm(reader, total=row_count):
        if len(row) < 2:
            # blank or malformed line: no id / study id to look up
            continue
        _id, study_id = row[0], row[1]

        # determine the id type, run_id or assembly_id
        if _id.startswith(("SRR", "ERR")):
            id_type = 'run'
        elif _id.startswith("ERZ"):
            id_type = 'assembly'
        else:
            continue

        entry = study_hash.setdefault(
            study_id,
            {'analyses_hash': {}, 'samples_hash': None, 'loaded_id_types': set()},
        )
        # A study can contain both run and assembly rows. The analyses lookup
        # is built per id type, so load each type the first time it is needed
        # instead of reusing whatever the first row of the study produced.
        if id_type not in entry['loaded_id_types']:
            entry['analyses_hash'].update(read_analyses_json(study_id, id_type))
            entry['loaded_id_types'].add(id_type)
        if entry['samples_hash'] is None:
            entry['samples_hash'] = read_samples_json(study_id)

        # try to extract the sample_id, exptype and biome for this id
        try:
            analysis = entry['analyses_hash'][_id]
            sample_id = analysis['sample_id']
            exp_type = analysis['exptype']
            biome = entry['samples_hash'][sample_id]
        except KeyError:
            print("{}_id({}) doesn't have sample id or biome".format(id_type, _id))
            continue

        row[2:2] = [sample_id, biome, exp_type]
        writer.writerow(row)
print("new file is generated")

HBox(children=(IntProgress(value=0, max=277023), HTML(value='')))

study: MGYS00003447 does not have assembly_id in analyses.json. Error msg:'id'
study: MGYS00003447 does not have assembly_id in analyses.json. Error msg:'id'
study: MGYS00003447 does not have assembly_id in analyses.json. Error msg:'id'
study: MGYS00003447 does not have assembly_id in analyses.json. Error msg:'id'
study: MGYS00003447 does not have assembly_id in analyses.json. Error msg:'id'
study: MGYS00003447 does not have assembly_id in analyses.json. Error msg:'id'
study: MGYS00003447 does not have assembly_id in analyses.json. Error msg:'id'
study: MGYS00003447 does not have assembly_id in analyses.json. Error msg:'id'
study: MGYS00003447 does not have assembly_id in analyses.json. Error msg:'id'
study: MGYS00003447 does not have assembly_id in analyses.json. Error msg:'id'
study: MGYS00003447 does not have assembly_id in analyses.json. Error msg:'id'
study: MGYS00003447 does not have assembly_id in analyses.json. Error msg:'id'
study: MGYS00003447 does not have assembly_id in ana

In [8]:
# Sanity check: show the first data row of the newly written file.
with open(new_aggregated_file_path, 'r') as preview_file:
    reader = csv.reader(preview_file, delimiter="\t")
    # Discard the header line.
    next(reader)
    first_row = next(reader)
    print(first_row)

['SRR3650981', 'MGYS00002076', 'SRS1493754', 'root:Host-associated:Mammals:Respiratory system:Nasopharyngeal', 'unknown', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0'