In [1]:
# Imports
import sys, os
from Bio import Entrez
import json
from ete3 import NCBITaxa
import pandas as pd
from ete3 import Tree

In [2]:
# Declares
CSV_FILE = "../data/BDNF/BDNF_orthologs.csv"
#CSV_FILE = sys.argv[1]

DATA_DICT = {}

# The following looks up NCBI Taxon ID from Accession IDs
# Then uses that Taxon ID to get the complete taxnomic lineage of of that species.
Entrez.email = "aglucaci@gmail.com"


In [3]:
TREE_FILE = "../results/BDNF/Recombinants/BDNF_codons_RDP_recombinationFree.fas.treefile"

#TREE_FILE = sys.argv[2]

# Read file, get newick
with open(TREE_FILE, "r") as fh:
    TREE_NEWICK = fh.read()
#end with

In [4]:
TREE_NEWICK

'(NM_001709_5_Homo_sapiens_brain_derived_neurotrophi:0.0072488191,((((((((((((((NM_001270630_1_Rattus_norvegicus_brain_derived_neu:0.0002067937,(((NM_001048142_1_Mus_musculus_brain_derived_neurotro:0.0000025173,XM_021151236_2_PREDICTED_Mus_caroli_brain_derived_:0.0038244124):0.0029655172,XM_021192829_2_PREDICTED_Mus_pahari_brain_derived_:0.0146613311):0.0371460124,XM_032903861_1_PREDICTED_Rattus_rattus_brain_deriv:0.0042058297):0.0054785752):0.0777307590,(XM_007653166_4_PREDICTED_Cricetulus_griseus_brain_:0.0674809283,XM_037204438_1_PREDICTED_Peromyscus_leucopus_brain:0.0301779458):0.0262228836):0.0995228554,XM_029555090_1_PREDICTED_Nannospalax_galili_brain_:0.0659379589):0.0612090200,XM_004660846_2_PREDICTED_Jaculus_jaculus_brain_der:0.1959495753):0.0652542305,(((XM_007497196_2_PREDICTED_Monodelphis_domestica_bra:0.1208032517,((XM_020977976_1_PREDICTED_Phascolarctos_cinereus_br:0.1175236731,XM_027852578_1_PREDICTED_Vombatus_ursinus_brain_de:0.0381565277):0.0257429950,XM_036763631_1_PR

In [5]:
df = pd.read_csv(CSV_FILE)
df.index += 1
df

Unnamed: 0,Gene ID,Gene symbol,Description,Scientific name,Common name,RefSeq Transcript accessions,RefSeq Protein accessions,SPARCLE ID,Curation Status,RefseqSelect,Default sequence
1,627,BDNF,brain derived neurotrophic factor,Homo sapiens,human,NM_001709.5,NP_001700.2,10639753.0,curated,RefseqSelect,default-seq
2,12064,Bdnf,brain derived neurotrophic factor,Mus musculus,house mouse,NM_001048142.1,NP_001041607.1,10639753.0,curated,RefseqSelect,default-seq
3,24225,Bdnf,brain-derived neurotrophic factor,Rattus norvegicus,Norway rat,NM_001270630.1,NP_001257559.1,10639753.0,curated,,default-seq
4,397495,BDNF,brain derived neurotrophic factor,Sus scrofa,pig,XM_005654684.3,XP_005654741.1,10639753.0,model,,default-seq
5,403461,BDNF,brain derived neurotrophic factor,Canis lupus familiaris,dog,XM_038429434.1,XP_038285362.1,,model,,default-seq
...,...,...,...,...,...,...,...,...,...,...,...
158,120876831,BDNF,brain derived neurotrophic factor,Oryx dammah,scimitar-horned oryx,XM_040258919.1,XP_040114853.1,,model,,default-seq
159,121043625,BDNF,brain derived neurotrophic factor,Puma yagouaroundi,jaguarundi,XM_040495572.1,XP_040351506.1,,model,,default-seq
160,121156520,BDNF,brain derived neurotrophic factor,Ochotona curzoniae,black-lipped pika,XM_040980297.1,XP_040836231.1,,model,,default-seq
161,121465693,Bdnf,brain derived neurotrophic factor,Microtus oregoni,creeping vole,XM_041679035.1,XP_041534969.1,,model,,default-seq


In [6]:
transcript_accessions = df['RefSeq Transcript accessions'].tolist()

#len(transcript_accessions)

In [7]:
# Helper function
def match_transcript_to_tree(TREE_NEWICK, accession):
    t = Tree(TREE_NEWICK, format=1)
    
    # uncomment to debug
    #print("# in 'match_transcript_to_tree'", accession)
    
    for leafname in t.get_leaf_names():
        if accession in leafname:
            # uncomment to debug
            #print("# Returning:", leafname)
            return leafname
        #end if
    #end for
#end match
    

In [8]:
def main(transcript_accessions):
    #DATA_DICT = {}
    global DATA_DICT, TREE_NEWICK
    count = 1
    for ACCESSION in transcript_accessions:
        #print("# Checking ACCESION:", ACCESSION)
        skip = False
        for i in DATA_DICT.keys():
            if ACCESSION == DATA_DICT[i]["ACCESSION"]:
                skip = True
                break
            #end if
        #end for
        if skip == True: 
            count += 1
            continue
        #end if
        
        try:
            handle = Entrez.esummary(db="nucleotide", id=ACCESSION, rettype="gb", retmode="text", retmax=1)
            records = Entrez.parse(handle)
        except Exception as e:
            print("# Error, sleeping", e)
            time.sleep(5)
            handle = Entrez.esummary(db="nucleotide", id=ACCESSION, rettype="gb", retmode="text", retmax=1)
            records = Entrez.parse(handle)
        #end try
        
        try:
            for record in records:
                TAXON_ID = record["TaxId"]
                
                print("#", count, "Processing transcript accession:", str(ACCESSION), "with NCBI Taxon ID:", str(TAXON_ID))
                ncbi = NCBITaxa()
                
                lineage = ncbi.get_lineage(TAXON_ID)
                # uncomment to debug
                #print(lineage) # returns a list of taxon id's
                
                names = ncbi.get_taxid_translator(lineage)
                # uncomment to debug
                #print(names) # translates the taxon id's into clade names
                
                # uncomment to debug
                #print("# ACCESSION:", ACCESSION)
                leafname = ""
                leafname = match_transcript_to_tree(TREE_NEWICK, ACCESSION.replace(".", "_"))
                
                # uncomment to debug
                #print(leafname)
                
                DATA_DICT[str(count)] = {"ACCESSION": ACCESSION, "TAXON_ID": TAXON_ID, 
                                         "LINEAGE": [names[taxid] for taxid in lineage], 
                                         "TITLE":record["Title"], "LEAFNAME": leafname}
                count += 1
            #end inner for
            handle.close
        except Exception as e:
            print("# Error (main):", ACCESSION, e, "\n")
            #print(names)
            #print(e)
            #main(TREE_NEWICK) # Recursive.
            #break
        #end try
        
    #end outer for
    
    #return DATA_DICT
#end method



In [9]:
main(transcript_accessions)

# 1 Processing transcript accession: NM_001709.5 with NCBI Taxon ID: 9606
# 2 Processing transcript accession: NM_001048142.1 with NCBI Taxon ID: 10090
# 3 Processing transcript accession: NM_001270630.1 with NCBI Taxon ID: 10116
# 4 Processing transcript accession: XM_005654684.3 with NCBI Taxon ID: 9823
# 5 Processing transcript accession: XM_038429434.1 with NCBI Taxon ID: 9615
# 6 Processing transcript accession: NM_001009828.1 with NCBI Taxon ID: 9685
# 7 Processing transcript accession: NM_001012441.1 with NCBI Taxon ID: 9598
# 8 Processing transcript accession: XM_007497196.2 with NCBI Taxon ID: 13616
# 9 Processing transcript accession: XM_005216334.4 with NCBI Taxon ID: 9913
# 10 Processing transcript accession: XM_015114598.2 with NCBI Taxon ID: 9544
# 11 Processing transcript accession: NM_001081787.1 with NCBI Taxon ID: 9796
# 12 Processing transcript accession: XM_029059317.2 with NCBI Taxon ID: 9258
# 13 Processing transcript accession: XM_017345633.1 with NCBI Taxon ID: 

# 104 Processing transcript accession: XM_022507574.1 with NCBI Taxon ID: 391180
# 105 Processing transcript accession: XM_022583733.1 with NCBI Taxon ID: 9749
# 106 Processing transcript accession: XM_023230347.2 with NCBI Taxon ID: 591936
# 107 Processing transcript accession: XM_024558903.1 with NCBI Taxon ID: 9430
# 108 Processing transcript accession: XM_024764694.1 with NCBI Taxon ID: 1706337
# 109 Processing transcript accession: XM_025358305.1 with NCBI Taxon ID: 9565
# 110 Processing transcript accession: XM_025459816.2 with NCBI Taxon ID: 286419
# 111 Processing transcript accession: XM_025879326.1 with NCBI Taxon ID: 34884
# 112 Processing transcript accession: XM_025929035.1 with NCBI Taxon ID: 9696
# 113 Processing transcript accession: XM_026018600.1 with NCBI Taxon ID: 9627
# 114 Processing transcript accession: XM_026412272.1 with NCBI Taxon ID: 9999
# 115 Processing transcript accession: XM_026512203.1 with NCBI Taxon ID: 116960
# 116 Processing transcript accession: X

In [10]:
# Check for errors here
print(len(transcript_accessions)) # Total
print(len(DATA_DICT.keys())) # What we have identified
# They need to match

162
161


In [11]:
df2 = pd.DataFrame.from_dict(DATA_DICT, orient="index")
#df2

In [12]:
#df2["LINEAGE"].to_list()

In [13]:
# use explode to expand the lists into separate rows
dfe = df2.LINEAGE.explode().to_frame().reset_index(drop=True)

## groupby the values in the column, get the count and sort
#dfg = dfe.groupby('LINEAGE').count() \
#                               .reset_index(name='count') \
#                               .sort_values(['count'], ascending=False) #

#display(dfg)
#dfg = dfe.groupby('LINEAGE').count().reset_index().sort_values(['count'], ascending=False) 
#dfg 

# plot the dataframe
#dfg.plot.bar(x='LINEAGE')

dfe

dfg = dfe.groupby('LINEAGE').LINEAGE.count() \
                               .reset_index(name='count') \
                               .sort_values(['count'], ascending=False) \
                               .head(50).reset_index(drop=True)

dfg.plot.barh(x='LINEAGE', figsize=(12, 12))


<AxesSubplot:ylabel='LINEAGE'>

In [14]:
lineages = df2['LINEAGE'].tolist()
num_taxa = 20
for i in range(len(lineages[0])):
    to_add = []
    for species in lineages:
        to_add.append(species[i])
    #end inner for
    df2["Annotation"] = to_add
    a = df2["Annotation"].nunique()
    if a >= num_taxa: break
    print("#",i, "=", df2["Annotation"].nunique())
#end outer for



0 1
1 1
2 1
3 1
4 1
5 1
6 1
7 1
8 1
9 1
10 1
11 1
12 1
13 1
14 1
15 1
16 1
17 1
18 1
19 2
20 3
21 8
22 17


In [15]:
df2.groupby('Annotation').Annotation.count().plot.pie(figsize=(10, 10), autopct='%1.1f%%', fontsize=12)

<AxesSubplot:ylabel='Annotation'>

In [16]:
df2.groupby('Annotation').Annotation.count().plot.pie(figsize=(10, 10), autopct='%1.1f%%', fontsize=12, subplots=True, wedgeprops={"edgecolor":"0",'linewidth': 0.5})

array([<AxesSubplot:ylabel='Annotation'>], dtype=object)

In [19]:
for item in set(df2["Annotation"].to_list()):
    #print(item)
    df3 = df2[df2["Annotation"] == item]
    #print(df3)
    df3 = df3.dropna()
    
    #if len(df3["ACCESSION"].to_list()) < 3: 
    #    continue
    
    #print(df3["LEAFNAME"].to_list())
    #df3.to_csv(item + ".csv", index=True)
    
    print("# Saving", str(len(df3["LEAFNAME"].to_list())), "to:", item + ".clade")
    with open(item + ".clade", "w") as fh:
        fh.write("\n".join(df3["LEAFNAME"].to_list()))
        #for row in df3["LEAFNAME"].to_list():
        #    fh.write(row)

# Saving  19 to: Carnivora.clade
# Saving  1 to: Macroscelididae.clade
# Saving  1 to: Trichosurus.clade
# Saving  1 to: Didelphinae.clade
# Saving  1 to: Folivora.clade
# Saving  23 to: Glires.clade
# Saving  1 to: Ornithorhynchus anatinus.clade
# Saving  17 to: Chiroptera.clade
# Saving  1 to: Tenrecinae.clade
# Saving  25 to: Primates.clade
# Saving  1 to: Scandentia.clade
# Saving  1 to: Dermoptera.clade
# Saving  1 to: Chrysochlorinae.clade
# Saving  1 to: Trichechidae.clade
# Saving  23 to: Artiodactyla.clade
# Saving  3 to: Perissodactyla.clade
# Saving  0 to: Elephantidae.clade
# Saving  1 to: Phascolarctos.clade
# Saving  1 to: Dasypodidae.clade
# Saving  1 to: Sarcophilus.clade
# Saving  3 to: Eulipotyphla.clade
# Saving  2 to: Pholidota.clade
# Saving  1 to: Orycteropodidae.clade
# Saving  1 to: Vombatus.clade
# Saving  1 to: Tachyglossus aculeatus.clade


In [18]:
df3["Annotation"]

154    Tachyglossus aculeatus
Name: Annotation, dtype: object

In [None]:
# There may be errors above, some LEAFNAMEs do not populate. Check the search for subroutine to make sure it works.