In [106]:
# Imports
import json
import os
import sys
import time

import pandas as pd
from Bio import Entrez
from ete3 import NCBITaxa, Tree

In [107]:
# Declares
# Path to the orthologs CSV, taken from the first command-line argument.
# Example: CSV_FILE = "../data/TP53_orthologs.csv"
# NOTE(review): inside a live Jupyter kernel sys.argv carries the kernel's own
# arguments, so this presumably expects the notebook to be run as a script -- confirm.
CSV_FILE = sys.argv[1]
# Filled in by main(): maps a running counter (as a string) to one fetched record.
DATA_DICT = {}
# The code below looks up the NCBI Taxon ID for each accession ID,
# then uses that Taxon ID to get the complete taxonomic lineage of that species.
Entrez.email = "aglucaci@gmail.com"


In [108]:
# Path to the tree file, taken from the second command-line argument.
# Example: TREE_FILE = "../results/TP53/TP53_codons.fasta.treefile"

TREE_FILE = sys.argv[2]
# Read the whole file into one string; it is later handed to
# match_transcript_to_tree(), which parses it as a tree.
with open(TREE_FILE, "r") as fh:
    TREE_NEWICK = fh.read()
#end with

In [109]:
# Load the orthologs table; its 'RefSeq Transcript accessions' column is read below.
df = pd.read_csv(CSV_FILE)
#df.index += 1
#df

In [110]:
# Accessions to look up, in the order they appear in the CSV.
transcript_accessions = df['RefSeq Transcript accessions'].tolist()
#len(transcript_accessions)

In [111]:
# Helper function
def match_transcript_to_tree(TREE_NEWICK, accession):
    """Return the first tree leaf whose name contains ``accession``.

    Parameters
    ----------
    TREE_NEWICK : str or tree-like
        Newick text (parsed with ete3 ``Tree(..., format=1)``), or an object
        that already exposes ``get_leaf_names()``. Passing a parsed tree lets
        a caller avoid re-parsing the same text for every lookup.
    accession : str
        Substring to look for in the leaf names.

    Returns
    -------
    str or None
        The first matching leaf name, or ``None`` when no leaf contains
        ``accession``.
    """
    if hasattr(TREE_NEWICK, "get_leaf_names"):
        # Already a parsed tree: use it as is.
        tree = TREE_NEWICK
    else:
        tree = Tree(TREE_NEWICK, format=1)
    #end if
    for leafname in tree.get_leaf_names():
        # Substring match: the leaf label may carry more than the accession.
        if accession in leafname:
            return leafname
        #end if
    #end for
    return None
#end match
    

In [None]:
def _fetch_summaries(accession, retries=1, delay=5):
    """Fetch and parse the NCBI esummary records for one accession.

    The request is attempted up to ``retries + 1`` times, sleeping ``delay``
    seconds between attempts. The handle is always closed. If every attempt
    fails, the last exception is re-raised.
    """
    for attempt in range(retries + 1):
        try:
            handle = Entrez.esummary(db="nucleotide", id=accession, rettype="gb", retmode="text", retmax=1)
            try:
                # Materialise the records while the handle is still open so
                # that parsing problems surface here and can be retried.
                return list(Entrez.parse(handle))
            finally:
                handle.close()
            #end try
        except Exception:
            if attempt == retries:
                raise
            #end if
            print("# error, sleeping")
            time.sleep(delay)
        #end try
    #end for
#end method


def main(transcript_accessions):
    """Look up the taxonomy of each transcript accession and store it in DATA_DICT.

    For every accession not already present in the global ``DATA_DICT``, the
    NCBI summary is fetched, the record's ``TaxId`` is expanded into its
    lineage names with ``NCBITaxa``, and the matching tree leaf is looked up
    with ``match_transcript_to_tree``. Each record is stored under a string
    counter key with the fields ACCESSION, TAXON_ID, LINEAGE, TITLE and
    LEAFNAME.

    An accession that fails is reported and skipped, so one bad lookup does
    not abort the run. Calling ``main`` again only processes the accessions
    that are still missing.

    Parameters
    ----------
    transcript_accessions : iterable of str
        Accessions to process.

    Returns
    -------
    None
        Results are written into the global ``DATA_DICT``.
    """
    global DATA_DICT
    count = 1
    ncbi = None  # created on first use; one taxonomy handle serves every record
    seen = {entry["ACCESSION"] for entry in DATA_DICT.values()}

    for ACCESSION in transcript_accessions:
        if ACCESSION in seen:
            count += 1
            continue
        #end if

        try:
            records = _fetch_summaries(ACCESSION)
            if ncbi is None:
                ncbi = NCBITaxa()
            #end if
            for record in records:
                TAXON_ID = record["TaxId"]
                # On a re-run after earlier failures the counter can land on a
                # key that is already taken; move past it rather than
                # overwriting that entry.
                while str(count) in DATA_DICT:
                    count += 1
                #end while
                print("#", count, "Processing transcript accession:", str(ACCESSION), "with NCBI Taxon ID:", str(TAXON_ID))
                lineage = ncbi.get_lineage(TAXON_ID)
                names = ncbi.get_taxid_translator(lineage)

                # Accessions are matched with "." replaced by "_" in the leaf names.
                leafname = match_transcript_to_tree(TREE_NEWICK, ACCESSION.replace(".", "_"))

                DATA_DICT[str(count)] = {"ACCESSION":ACCESSION, "TAXON_ID": TAXON_ID, "LINEAGE": [names[taxid] for taxid in lineage], "TITLE":record["Title"], "LEAFNAME": leafname}
                seen.add(ACCESSION)
                count += 1
            #end inner for
        except Exception as error:
            # Best effort: report the failure and carry on with the next accession.
            print("Error, skipping:", ACCESSION, "-", repr(error))
        #end try
    #end outer for
#end method



# 1 Processing transcript accession: NM_000546.6 with NCBI Taxon ID: 9606
# 2 Processing transcript accession: NM_011640.3 with NCBI Taxon ID: 10090
# 3 Processing transcript accession: NM_030989.3 with NCBI Taxon ID: 10116
# 4 Processing transcript accession: NM_001271820.1 with NCBI Taxon ID: 7955
# 5 Processing transcript accession: NM_174201.2 with NCBI Taxon ID: 9913
# 6 Processing transcript accession: NM_205264.1 with NCBI Taxon ID: 9031
# 7 Processing transcript accession: NM_213824.3 with NCBI Taxon ID: 9823
# 8 Processing transcript accession: NM_001389218.1 with NCBI Taxon ID: 9615
# 9 Processing transcript accession: NM_001001903.1 with NCBI Taxon ID: 8364
# 10 Processing transcript accession: NM_001009403.1 with NCBI Taxon ID: 9940
# 11 Processing transcript accession: XM_016931470.2 with NCBI Taxon ID: 9598
# 12 Processing transcript accession: NM_001009294.1 with NCBI Taxon ID: 9685
# 13 Processing transcript accession: XM_007483356.1 with NCBI Taxon ID: 13616
# 14 Proce

In [None]:
# Populate DATA_DICT for every accession (main() queries NCBI through Entrez).
main(transcript_accessions)

In [None]:
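# Check for errors here: every accession should have produced an entry, so
# len(transcript_accessions) and len(DATA_DICT.keys()) need to match.
#len(transcript_accessions)
#len(DATA_DICT.keys())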

In [None]:
# One row per DATA_DICT entry: the keys become the index, the record fields become columns.
df2 = pd.DataFrame.from_dict(DATA_DICT, orient="index")
#df2

In [None]:
# Pick the taxonomic depth used to group the sequences: walk down the lineage
# one level at a time and stop at the first level that splits the data set
# into more than MIN_DISTINCT_GROUPS distinct names. The names at that level
# are stored in df2["Annotation"].
MIN_DISTINCT_GROUPS = 5

lineages = df2['LINEAGE'].tolist()

# The first lineage sets how deep the walk goes (an empty table means no walk).
max_depth = len(lineages[0]) if lineages else 0

for depth in range(max_depth):
    # Clamp the index so a lineage shorter than the first one falls back to
    # its most specific name instead of raising IndexError.
    level_names = [lineage[min(depth, len(lineage) - 1)] for lineage in lineages]
    df2["Annotation"] = level_names
    if df2["Annotation"].nunique() > MIN_DISTINCT_GROUPS:
        break
    #end if
#end for
        
    
    

In [None]:
# Write one CSV and one leaf-name list per annotation group.
for taxon in set(df2["Annotation"].to_list()):
    # Rows of this group; rows with any missing value are dropped, so every
    # remaining LEAFNAME is a string that can be joined.
    members = df2.loc[df2["Annotation"] == taxon].dropna()

    members.to_csv(taxon + ".csv", index=True)

    leaf_list_path = taxon + ".txt"
    print("# Saving to:", leaf_list_path)
    leaf_names = members["LEAFNAME"].to_list()
    with open(leaf_list_path, "w") as fh:
        fh.write("\n".join(leaf_names))

In [None]:
# There may be errors above: some LEAFNAMEs do not populate. Check the search subroutine (match_transcript_to_tree) to make sure it works.