In [1]:
import pandas as pd
import numpy as np
import time
import os

In [3]:
##MAIN
def nodes_indexation(PATH, df_nodes = []):
##This function takes as an input PATH to the nodes.dmp file
##and empty out list, and returns dataframe with parsed taxonomy levels
    with open(PATH) as myfile:
        for line in myfile:
            line = line.split("\t")
            df_nodes.append([line[0], line[2], line[4]])
    #columns of taxonomy_df - tax_id, parent_id, tax_level
    df_nodes = pd.DataFrame(df_nodes, columns = ["tax_id", "parent_id", "tax_level"])
    df_nodes.set_index(["tax_id"], inplace = True)
    return df_nodes

def names_indexation(PATH, df_names = []):
##This function takes as an input PATH to the names.dmp file
##and empty out list, and returns dataframe with tax_id and names
    with open(PATH) as myfile:
        for line in myfile:
            line = line.split("\t")
            df_names.append([line[0], line[2]])
    #columns of taxonomy_df - tax_id, name
    df_names = pd.DataFrame(df_names, columns = ["tax_id", "name"])
    return df_names

def taxonomy_collector(df_taxonomy, initial_taxid, desired_level):
    ##This function takes as an input taxonomy levels dataframe,
    ##tax_id of initial group and desired level of subgroups
    ##and returns dataframe with tax_id, parent_id and tax_level of subgroups
    ## from given initial_taxid
    ## check that tax_level is valid
    available_taxonomy = pd.unique(df_taxonomy["tax_level"])
    print(available_taxonomy)
    if desired_level not in available_taxonomy:
        raise ValueError("desired_level must be one of %r." % available_taxonomy)
    initial_level = df_taxonomy.loc[initial_taxid]["tax_level"]
    desired_df = df_taxonomy[df_taxonomy["tax_level"] == desired_level]
    collected_taxonomy = []
    
    ##obtain all groups from desired level, and then go up 
    ##until root or initial_taxid achieved,
    ##return only groups fron initial_taxid
    for ind in desired_df.index:
        acc_initial = df_taxonomy.loc[str(ind)]
        acc = acc_initial
        while acc["tax_level"] != initial_level:
            acc = df_initial.loc[str(acc["parent_id"])]
            if acc["parent_id"] == "1":
                break
        if acc.name == initial_taxid:
            collected_taxonomy.append([acc_initial.name, acc_initial["parent_id"], acc_initial["tax_level"]])
    #columns of collected_taxonomy_df - tax_id, parent_id, tax_level
    collected_taxonomy = pd.DataFrame(collected_taxonomy, columns = ["tax_id", "parent_id", "tax_level"])
    collected_taxonomy.set_index(["tax_id"], inplace = True)
    return collected_taxonomy

def taxid_to_mnemonics_species(PATH, nheader):
    ####This function reads speclist.txt file
    ##It takes as an input path to file with tax_id <-> mnemonic mapping
    ##Number of header lines to exclude from mapping file 
    ##And dataframe to store the output data
    taxid_to_mnemonics_df = []
    with open(PATH, "r") as inp:
        for i in range(nheader):
            next(inp)
        for line in inp:
            line_n = line
            line = line.split(" ")
            #remove bad lines
            if len(line) != 1 and len([x for x in line if x.endswith(":")]) == 1:
                taxid_to_mnemonics_df.append([line[0], [x for x in line if x.endswith(":")][0][0:-1]])
        taxid_to_mnemonics_df = pd.DataFrame(taxid_to_mnemonics_df, columns = ["mnemonic", "tax_id"])
        taxid_to_mnemonics_df.set_index(["tax_id"], inplace = True)
    return taxid_to_mnemonics_df

def find_tax_level_for_tax_id(df_taxonomy, initial_taxid, desired_level):
    ##This function takes as an input df with taxonomical relations,
    ##taxid of entity and desired level of output
    ##and outputs the tax_id of the desired level
    desired_df = df_taxonomy[df_taxonomy["tax_level"] == desired_level]
    acc = df_taxonomy.loc[str(initial_taxid)]
    while acc["tax_level"] != desired_level:
        acc = df_initial.loc[str(acc["parent_id"])]
        if acc["parent_id"] == "1":
            break
    if acc["tax_level"] == desired_level:
        return acc.name



## SCRIPTS FOR METAZOA

In [4]:
#get main df
if __name__ == "__main__":
    PATH_nodes = "/data/DBs/Taxonomy/13-05-2022/nodes.dmp"
    PATH_speclist = "/data/DBs/Uniprot/speclist.txt"
    
    df_initial = nodes_indexation(PATH_nodes)
    taxid_to_mnemonics_df = taxid_to_mnemonics_species(PATH_speclist, 59)

In [6]:
#get list of mnemonics from parsed swisspfam:
swisspfam_mnemonics = []
for file in os.listdir("/data/ruslan_gumerov/pfam_parsed/"):
    swisspfam_mnemonics.append(os.path.splitext(file)[0])

In [7]:
#convert mnemonics to tax_ids
swisspfam_id = []
no_taxid = []
for x in swisspfam_mnemonics:
    if not taxid_to_mnemonics_df[taxid_to_mnemonics_df["mnemonic"] == x].empty:
        swisspfam_id.append(taxid_to_mnemonics_df[taxid_to_mnemonics_df["mnemonic"] == x].index[0])
    else:
        swisspfam_id.append(None)

In [156]:
# obtain orders for given tax_ids
swisspfam_orders = []
no_order = []
for x in swisspfam_id:
    if x:
        try:
            swisspfam_orders.append(find_tax_level_for_tax_id(df_initial, x, "order"))
        except KeyError:
            swisspfam_orders.append(None)
            no_order.append(x)
    else:
        swisspfam_orders.append(None)

In [167]:
# create and save final DF
total_order_df = pd.DataFrame({"swisspfam_mnemonic" : swisspfam_mnemonics,
                       "tax_id" : swisspfam_id.astype(str), 
                       "order" : swisspfam_orders.astype(str)})
total_order_df.to_csv("/home/ruslan_gumerov/noisy/swisspfam_mnemonic_taxid_order_df.csv")

In [9]:
# read total_order_df
total_order_df = pd.read_csv("/home/ruslan_gumerov/noisy/swisspfam_mnemonic_taxid_order_df.csv")

In [27]:
# fix some errors in previous save
#total_order_df["tax_id"] = total_order_df["tax_id"].astype(str).str[0:-2]
#total_order_df["order"] = total_order_df["order"].astype(str).str[0:-2]

In [8]:
# get all metazoan orders
metazoa_orders = taxonomy_collector(df_initial, "33208", "order")

['no rank' 'superkingdom' 'genus' 'species' 'order' 'family' 'subspecies'
 'subfamily' 'strain' 'serogroup' 'biotype' 'tribe' 'phylum' 'class'
 'species group' 'forma' 'clade' 'suborder' 'subclass' 'varietas'
 'kingdom' 'subphylum' 'forma specialis' 'isolate' 'infraorder'
 'superfamily' 'infraclass' 'superorder' 'subgenus' 'superclass'
 'parvorder' 'serotype' 'species subgroup' 'subcohort' 'cohort' 'genotype'
 'subtribe' 'section' 'series' 'morph' 'subkingdom' 'superphylum'
 'subsection' 'pathogroup']


In [39]:
#create order directories and move pfam jsons in them
for tax_id in metazoa_orders.index:
    if not (total_order_df.dropna()[total_order_df.dropna()["order"] == tax_id]).empty:
        os.makedirs(f"/data/ruslan_gumerov/pfam_metazoa/{tax_id}", exist_ok = True)
        for mnemonic in total_order_df.dropna()[total_order_df.dropna()["order"] == tax_id]["swisspfam_mnemonic"]:
            os.system(f"cp /data/ruslan_gumerov/pfam_parsed/{mnemonic}.json /data/ruslan_gumerov/pfam_metazoa/{tax_id}/{mnemonic}.json")


## SCRIPTS FOR METAZOA END

In [21]:
for i in collected_orders.index:
    time1 = time.time()
    tax_level = "species"
    tax_target = i
    collected_species = taxonomy_collector(df_initial, tax_target, tax_level)
    result_df = pd.concat([collected_orders, taxid_to_mnemonics_df], axis=1, join = "inner")
    time2 = time.time()
    print(f"Tax target: {i}, tax mnemonics: {''.join(result_df['mnemonic'])}, time: {time1-time2/60} min")

['no rank' 'superkingdom' 'genus' 'species' 'order' 'family' 'subspecies'
 'subfamily' 'strain' 'serogroup' 'biotype' 'tribe' 'phylum' 'class'
 'species group' 'forma' 'clade' 'suborder' 'subclass' 'varietas'
 'kingdom' 'subphylum' 'forma specialis' 'isolate' 'infraorder'
 'superfamily' 'infraclass' 'superorder' 'subgenus' 'superclass'
 'parvorder' 'serotype' 'species subgroup' 'subcohort' 'cohort' 'genotype'
 'subtribe' 'section' 'series' 'morph' 'subkingdom' 'superphylum'
 'subsection' 'pathogroup']


KeyboardInterrupt: 

In [55]:
#len(pd.unique(taxid_to_mnemonics_df.index))
df[taxid_to_mnemonics_df.isin(taxid_to_mnemonics_df[taxid_to_mnemonics_df.duplicated()])].sort_values("ID")

12401

In [108]:
collected_orders

Unnamed: 0_level_0,parent_id,tax_level
tax_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6049,1779146,order
6103,6102,order
6125,6102,order
6133,6132,order
6143,6142,order
...,...,...
2899740,37902,order
2933416,6381,order
2937013,6381,order
2937014,6381,order


In [1]:
taxid_to_mnemonics_df

NameError: name 'taxid_to_mnemonics_df' is not defined

In [134]:
def taxid_to_mnemonics(PATH, nheader):
    ##This function takes as an input path to file with tax_id <-> mnemonic mapping
    ##Number of header lines to exclude from mapping file 
    ##And dataframe to store the output data
    taxid_to_mnemonics_df = []
    with open(PATH, "r") as inp:
        for i in range(nheader):
            next(inp)
        for line in inp:
            line_n = line
            line = line.split(" ")
            #remove bad lines
            if line[0] in no_taxid:
                print(line)
            if len(line) != 1 and len([x for x in line if x.endswith(":")]) == 1:
                taxid_to_mnemonics_df.append([line[0], [x for x in line if x.endswith(":")][0][0:-1]])
        taxid_to_mnemonics_df = pd.DataFrame(taxid_to_mnemonics_df, columns = ["mnemonic", "tax_id"])
        taxid_to_mnemonics_df.set_index(["tax_id"], inplace = True)
    return taxid_to_mnemonics_df

taxid_to_mnemonics(PATH_speclist, 59)

Unnamed: 0_level_0,mnemonic
tax_id,Unnamed: 1_level_1
648330,AADNV
10804,AAV2
648242,AAV2S
118452,ABAMA
72259,ABANI
...,...
662,9VIBR
33090,9VIRI
10239,9VIRU
338,9XANT


In [87]:
test = pd.concat([collected_orders, taxid_to_mnemonics_df], axis=1, join='inner')

In [102]:
taxid_to_mnemonics_df.index

Index(['648330', '648242', '118452', '102642', '392897', '515833', '180068',
       '183219', '105883', '183220',
       ...
       '206389', '675072', '200666', '204457', '203691', '188709', '186814',
       '155616', '166384', '216275'],
      dtype='object', name='tax_id', length=12401)

In [15]:
def speclist_table_creation()

with open("/data/DBs/Uniprot/speclist.txt", "r") as inp:
    #skip header
    for i in range(59):
        next(inp)
    for line in inp:
        line = line.split(" ")
        if line[0] != "" :
            print(line)

['AADNV', 'V', '', '648330:', 'N=Aedes', 'albopictus', 'densovirus', '(isolate', 'Boublik/1994)\n']
['AAV2', '', 'V', '', '', '10804:', 'N=Adeno-associated', 'virus', '2\n']
['AAV2S', 'V', '', '648242:', 'N=Adeno-associated', 'virus', '2', '(isolate', 'Srivastava/1982)\n']
['ABAMA', 'E', '', '118452:', 'N=Abacion', 'magnum\n']
['ABANI', 'E', '', '', '72259:', 'N=Abaeis', 'nicippe\n']
['ABAPA', 'E', '', '102642:', 'N=Abax', 'parallelepipedus\n']
['ABAST', 'E', '', '392897:', 'N=Abalistes', 'stellaris\n']
['ABBRI', 'E', '', '', '75332:', 'N=Abbottina', 'rivularis\n']
['ABDAC', 'E', '', '515833:', 'N=Abdopus', 'aculeatus\n']
['ABDS2', 'B', '', '', '56673:', 'N=Antarctic', 'bacterium', 'DS2-3R\n']
['ABECH', 'E', '', '180068:', 'N=Abelia', 'chinensis\n']
['ABEFI', 'E', '', '183219:', 'N=Abelmoschus', 'ficulneus\n']
['ABEGR', 'E', '', '105883:', 'N=Abelia', 'grandiflora\n']
['ABEMA', 'E', '', '183220:', 'N=Abelmoschus', 'manihot\n']
['ABHV', '', 'V', '1241371:', 'N=Abalone', 'herpesvirus', '

In [11]:
desired_df = df_initial[df_initial["tax_level"] == desired_level]
for ind in desired_df.index:
    acc_initial = df_initial.loc[str(ind)]
    acc = acc_initial
    while acc["tax_level"] != initial_level:
        acc = df_initial.loc[str(acc["parent_id"])]
        if acc["parent_id"] == "1":
            break
    if acc.name == initial_taxid:
        print(acc_initial)


parent_id    2731619
tax_level      order
Name: 28883, dtype: object
parent_id    2731363
tax_level      order
Name: 548681, dtype: object
parent_id    2731619
tax_level      order
Name: 1978007, dtype: object


True

In [38]:
f = {1, 2, "asdf"}
f.remove(1)
f

{2, 'asdf'}

In [12]:
def nodes_indexation(PATH, df):
    with open(PATH) as myfile:
        for line in myfile:
            line = line.split("\t")
            df_initial.append([line[0], line[2], line[4]])
    return df_initial
            
df_initial = []
PATH = "/data/DBs/Taxonomy/13-05-2022/nodes.dmp"
df_initial = nodes_indexation(PATH, df_initial)

df_initial = pd.DataFrame(df_initial, columns = ["tax_id", "parent_id", "tax_level"])
df_initial.set_index(["tax_id"], inplace = True)
df_initial
#df_initial.to_csv("/home/ruslan_gumerov/noisy/nodes_df.csv")



Unnamed: 0_level_0,parent_id,tax_level
tax_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,no rank
2,131567,superkingdom
6,335928,genus
7,6,species
9,32199,species
...,...,...
2943286,90627,genus
2943291,2066491,genus
2943312,543,genus
2943317,543,genus
