In [1]:
import json
import pandas as pd
import csv
from pandas import read_excel
import ftputil
import time
from Bio import Entrez
import sys

In [3]:
def get_assembly_id(acc_id):
    """Look up the first NCBI Assembly ID matching an accession.

    Runs ``Entrez.esearch`` against the ``assembly`` database with ``acc_id``
    as the search term and returns the first entry of the result's ``IdList``.

    Parameters
    ----------
    acc_id : str
        Search term handed to ``Entrez.esearch``.

    Returns
    -------
    The first element of ``record["IdList"]`` as parsed by ``Entrez.read``.

    Raises
    ------
    IndexError
        If the search result contains no IDs.
    """
    from Bio import Entrez
    handle = Entrez.esearch(db='assembly', term=acc_id)
    try:
        record = Entrez.read(handle)
    finally:
        # Close the handle even when parsing the response fails.
        handle.close()
    id_list = record["IdList"]
    if not id_list:
        # Same exception type as indexing an empty list, but with a message
        # that names the accession that could not be resolved.
        raise IndexError("No assembly ID found for accession: " + str(acc_id))
    return id_list[0]

def get_assembly_summary(acc_id):
    """Fetch the assembly ID and its GenBank/RefSeq synonyms for an accession.

    Resolves ``acc_id`` to an assembly ID with ``get_assembly_id``, requests
    the full ``esummary`` for it from the ``assembly`` database and reads the
    ``Synonym`` entry of the first document summary.

    Parameters
    ----------
    acc_id : str
        Accession forwarded to ``get_assembly_id``.

    Returns
    -------
    list
        ``[assembly, genbank_id, refseq_id]``.

    Raises
    ------
    Whatever ``get_assembly_id`` or the Entrez calls raise; a missing key or
    an empty summary list in the response surfaces as ``KeyError`` /
    ``IndexError``.
    """
    from Bio import Entrez
    # Entrez contact address - provide your own mail here.
    Entrez.email = "carmen.ausejo@sund.ku.dk" #email
    assembly = get_assembly_id(acc_id)
    esummary_handle = Entrez.esummary(db="assembly", id=assembly, report="full")
    try:
        esummary_record = Entrez.read(esummary_handle)
    finally:
        # The original never closed this handle; release it in every case.
        esummary_handle.close()
    # Both accessions live under the same 'Synonym' entry; look it up once.
    synonyms = esummary_record['DocumentSummarySet']['DocumentSummary'][0]['Synonym']
    genbank_id = synonyms['Genbank']
    refseq_id = synonyms['RefSeq']
    return [assembly, genbank_id, refseq_id]

In [6]:
# Sample sheet; it is parsed as tab-separated even though the name ends in .csv.
INPUT_TSV_FILENAME = "../../CF_isolates_BioSampleID.csv"
# Only the leading rows returned by DataFrame.head() are kept for the rest
# of the notebook.
tsv_df = pd.read_csv(INPUT_TSV_FILENAME, sep='\t').head()

In [5]:
INPUT_JSON_FILENAME = "../../paired_datarecord_preview_id.json"

# Labels copied into every genome-metabolome link record built later in the
# notebook; the trailing comments keep the previously used values.
# NOTE(review): presumably these must match the method names declared in the
# template JSON - confirm against the template.
INSTRUMENT_METHOD_LABEL ='qToF' #"Seed Grant TOF"
EXTRACTION_METHOD = 'Methanol' # "SeedGrant_Extraction"
SAMPLE_PREP = 'THA media' #"metagenomic_mouse_fecal"

# Use a context manager so the file handle is closed deterministically, and
# decode explicitly as UTF-8 instead of relying on the platform default.
with open(INPUT_JSON_FILENAME, encoding="utf-8") as template_file:
    template_json = json.load(template_file)

In [7]:
# Resolve every BioSample accession to [assembly ID, GenBank ID, RefSeq ID].
list_acc = tsv_df['Biosample Accession'].tolist()
genbank_dic = {}

for idx,acc_id in enumerate(list_acc):
    try:
        genbank_dic[acc_id] = (get_assembly_summary(acc_id))
    except Exception as err:
        # Best-effort lookup: keep going with a placeholder entry, but do not
        # trap KeyboardInterrupt/SystemExit (a bare ``except`` would), and
        # report why the lookup failed. str() keeps the message from raising
        # when the accession is not a string (e.g. NaN from an empty cell).
        print("Accesion ID not found: " + str(acc_id) + " (" + repr(err) + ")")
        genbank_dic[acc_id] = (['-', '-', '-'])

    # Progress report: current accession and position in the list.
    print(acc_id)
    print( str(idx+1) + "/" + str(len(list_acc)))

    # Pause between accessions - presumably to stay within NCBI request
    # limits; TODO confirm the required delay.
    time.sleep(4)

SAMN02415141
1/5
SAMN02603849
2/5
SAMEA3138432
3/5
SAMEA1705934
4/5
SAMEA1705916
5/5


In [8]:
# Turn the {accession: [assembly, genbank, refseq]} mapping into a table and
# join it onto the sample sheet by BioSample accession.
print(genbank_dic)
# Build the four columns in a single constructor call. The previous
# two-step version (temporary 'esummary_record' column expanded through a
# multi-column assignment) raised "Columns must be same length as key" when
# genbank_dic was empty.
entrez_df = pd.DataFrame(
    [[acc_id, *ids] for acc_id, ids in genbank_dic.items()],
    columns=['Biosample Accession', 'Assembly_ID', 'GenBank_ID', 'RefSeq_ID'],
)
# Default (inner) merge: only accessions present in both tables are kept.
pairing_df = tsv_df.merge(entrez_df, on=['Biosample Accession'])
print(pairing_df.head())

{'SAMN02415141': ['87511', 'GCA_000510305.1', 'GCF_000510305.1'], 'SAMN02603849': ['376268', 'GCA_000226155.1', 'GCF_000226155.1'], 'SAMEA3138432': ['530308', 'GCA_000334515.1', 'GCF_000334515.1'], 'SAMEA1705934': ['91088', 'GCA_000072485.1', 'GCF_000072485.1'], 'SAMEA1705916': ['45568', 'GCA_000026645.1', 'GCF_000026645.1']}
                     Genome Name / Sample Name  Unnamed: 0  Sales order  \
0    Pseudomonas_aeruginosa_SCV20265_uid232358         NaN          NaN   
1         Pseudomonas_aeruginosa_M18_uid162089         NaN          NaN   
2     Enterobacter_aerogenes_EA1509E_uid187411         NaN          NaN   
3  Stenotrophomonas_maltophilia_K279a_uid61647         NaN          NaN   
4       Pseudomonas_aeruginosa_LESB58_uid59275         NaN          NaN   

   Item number                                         Name  Product  Level  \
0          NaN    Pseudomonas_aeruginosa_SCV20265_uid232358      NaN    NaN   
1          NaN         Pseudomonas_aeruginosa_M18_uid162089    

In [9]:
# Preparing the genome tables

# One genome record per row of the paired table. The BioSample accession is
# used both as the genome label and as the BioSample reference; the GenBank
# accession comes from the Entrez lookup columns.
all_genome_links = [
    {
        "genome_label": row["Biosample Accession"],
        "genome_ID": {
            "genome_type": "metagenome",
            "GenBank_NCBI_accession": row["GenBank_ID"],
        },
        "BioSample_accession": row["Biosample Accession"],
    }
    for row in pairing_df.to_dict(orient="records")
]

In [10]:
# Print the list of genome records held in all_genome_links for inspection.
print(all_genome_links)

[{'genome_label': 'SAMN02415141', 'genome_ID': {'genome_type': 'metagenome', 'GenBank_NCBI_accession': 'GCA_000510305.1'}, 'BioSample_accession': 'SAMN02415141'}, {'genome_label': 'SAMN02603849', 'genome_ID': {'genome_type': 'metagenome', 'GenBank_NCBI_accession': 'GCA_000226155.1'}, 'BioSample_accession': 'SAMN02603849'}, {'genome_label': 'SAMEA3138432', 'genome_ID': {'genome_type': 'metagenome', 'GenBank_NCBI_accession': 'GCA_000334515.1'}, 'BioSample_accession': 'SAMEA3138432'}, {'genome_label': 'SAMEA1705934', 'genome_ID': {'genome_type': 'metagenome', 'GenBank_NCBI_accession': 'GCA_000072485.1'}, 'BioSample_accession': 'SAMEA1705934'}, {'genome_label': 'SAMEA1705916', 'genome_ID': {'genome_type': 'metagenome', 'GenBank_NCBI_accession': 'GCA_000026645.1'}, 'BioSample_accession': 'SAMEA1705916'}]


In [11]:
# Preparing the join table

# One link record per row of the paired table: the genome label (BioSample
# accession), the metabolomics file taken from the MS-data column, and the
# three method labels defined as notebook constants.
all_paired_links = [
    {
        "genome_label": row["Biosample Accession"],
        "metabolomics_file": row["MS data accession, Sputum/ASM Media (data in MSV000080251)"],
        "sample_preparation_label": SAMPLE_PREP,
        "extraction_method_label": EXTRACTION_METHOD,
        "instrumentation_method_label": INSTRUMENT_METHOD_LABEL,
    }
    for row in pairing_df.to_dict(orient="records")
]

In [12]:
# Print the list of genome-metabolome link records for inspection.
print(all_paired_links)

[{'genome_label': 'SAMN02415141', 'metabolomics_file': 'ftp://massive.ucsd.edu/MSV000080251/peak/CFStrains/Strains/VVP011_TH_GA11_01_27002.mzXML', 'sample_preparation_label': 'metagenomic_mouse_fecal', 'extraction_method_label': 'SeedGrant_Extraction', 'instrumentation_method_label': 'Seed Grant TOF'}, {'genome_label': 'SAMN02603849', 'metabolomics_file': 'ftp://massive.ucsd.edu/MSV000080251/peak/CFStrains/Strains/VVP012_TH_GA12_01_27003.mzXML', 'sample_preparation_label': 'metagenomic_mouse_fecal', 'extraction_method_label': 'SeedGrant_Extraction', 'instrumentation_method_label': 'Seed Grant TOF'}, {'genome_label': 'SAMEA3138432', 'metabolomics_file': 'ftp://massive.ucsd.edu/MSV000080251/peak/CFStrains/Strains/VVP016_TH_GB4_01_27009.mzXML', 'sample_preparation_label': 'metagenomic_mouse_fecal', 'extraction_method_label': 'SeedGrant_Extraction', 'instrumentation_method_label': 'Seed Grant TOF'}, {'genome_label': 'SAMEA1705934', 'metabolomics_file': 'ftp://massive.ucsd.edu/MSV000080251/

In [13]:
# Merging it all together
# Attach the genome records and the genome-metabolome links to the template
# document (in place), then serialise the whole document to the output file.
template_json.update(
    genomes=all_genome_links,
    genome_metabolome_links=all_paired_links,
)
with open("../../CF_isolates_json.json", "w") as output_file:
    serialized = json.dumps(template_json)
    output_file.write(serialized)

In [14]:
# Print the template document, including its "genomes" and
# "genome_metabolome_links" entries, for a final visual check.
print(template_json)

{'version': '1', 'personal': {'submitter_name': 'Alexander Aksenov', 'submitter_orcid': 'https://orcid.org/0000-0002-9445-2248', 'submitter_email': 'aaaksenov@ucsd.edu', 'PI_name': 'Pieter Dorrestein', 'PI_institution': 'University of California, San Diego', 'PI_email': 'pdorrestein@health.ucsd.edu'}, 'metabolomics': {'project': {'GNPSMassIVE_ID': 'MSV000080251', 'MaSSIVE_URL': 'https://gnps.ucsd.edu/ProteoSAFe/result.jsp?task=476c1e7fef5c4f7692f55d298c81a6d7&view=advanced_view'}}, 'experimental': {'sample_preparation': [{'medium_details': {'medium_type': 'liquid', 'medium': 'other', 'Other_medium': 'Todd Hewitt Agar', 'Other_medium_link': 'https://www.sigmaaldrich.com/catalog/product/sial/t1438?lang=en&region=US'}, 'growth_parameters': {}, 'aeration': {}, 'sample_preparation_method': 'THA media'}], 'extraction_methods': [{'solvents': [{'ratio': 1, 'solvent': 'http://purl.obolibrary.org/obo/CHEBI_17790'}], 'extracted_material': 'cells_supernatant', 'extraction_method': 'Methanol'}], 'i