In [1]:
'''
This builds a dataframe and a hierarchical tree from MeSH terms. Functions to support finding distance between nodes
'''

import pandas as pd
from treelib import Node, Tree
import numpy as np

In [2]:
# Top level terms entered manually because they are not included in headings xlsx
top_terms = ['Anatomy', 'Organisms', 'Diseases', 'Chemicals and Drugs','Analytical, Diagnostic and Therapeutic Techniques, and Equipment', \
 'Psychiatry and Psychology', 'Phenomena and Processes', 'Disciplines and Occupations', \
 'Anthropology, Education, Sociology, and Social Phenomena', 'Technology, Industry, and Agriculture', 'Humanities',\
'Information Science', 'Named Groups', 'Health Care', 'Publication Characteristics', 'Geographicals']

locations = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'V', 'Z']
top_locations = [[loc] for loc in locations]

top_df = pd.DataFrame({
    "Location": top_locations,
    "Term": top_terms
})

In [3]:
headings_df = pd.read_csv('MeSH_headings.csv')

# function for stratifying location strings
def parse_location(location):
    top_char = [str(location)[0]]
    loc_list = str(location).lstrip('ABCDEFGHIJKLMNVZ')
    loc_list = loc_list.split('.')
    return top_char + loc_list

headings_df['Location'] = headings_df['Location'].apply(parse_location) #apply to df

#add headings to df of top level headings and reset index
mesh_df = top_df.append(headings_df)


# depth as measured as num of edges from the top
mesh_df['Depth'] = mesh_df['Location'].apply(lambda location: len(location))
mesh_df = mesh_df.sort_values('Depth').reset_index()
del mesh_df['index']

In [4]:
mesh_np = mesh_df.to_numpy() #convert to np array for speed, test with first 100 entries

tree = Tree() #make tree object
tree.create_node("Root", "root") #make a root node

#loop through entries to make nodes
for row in mesh_np:
    location = row[0]
    term = row[1]
    node_id = '.'.join(location) #node id is concatenated string of locations
    
    #if directly under root node, use specific parent node settings
    if len(location) == 1:
        tree.create_node(term, node_id, parent='root')
    else:
        parent = '.'.join(location[:-1]) #parent node id is node_id minus current current location level
        tree.create_node(term, node_id, parent=parent)
        
#use nodes to calculate complexity of each node's subtree

# function to find number of nodes in subtree given a node location (includes original node)
def subtree_complexity(entry):
    loc = '.'.join(list(entry))
    return len(tree.subtree(loc).nodes)

mesh_df['Complexity'] = mesh_df['Location'].apply(subtree_complexity) #write complexity values to a new column


In [5]:
#test to find subtree by id
subtree = tree.subtree('A')
subtree.show()

Anatomy
├── Animal Structures
│   ├── Air Sacs
│   ├── Anal Sacs
│   ├── Animal Fins
│   ├── Animal Fur
│   ├── Animal Scales
│   ├── Animal Shells
│   ├── Arthropod Antennae
│   │   └── Sensilla
│   ├── Beak
│   ├── Bursa of Fabricius
│   ├── Cloaca
│   ├── Comb and Wattles
│   ├── Compound Eye, Arthropod
│   ├── Corpora Allata
│   ├── Crop, Avian
│   ├── Egg Shell
│   ├── Electric Organ
│   ├── Embryo, Nonmammalian
│   │   ├── Chick Embryo
│   │   └── Chorioallantoic Membrane
│   ├── Fat Body
│   ├── Feathers
│   ├── Forelimb
│   │   ├── Carpus, Animal
│   │   └── Wings, Animal
│   ├── Ganglia, Invertebrate
│   ├── Gills
│   ├── Gizzard, Non-avian
│   ├── Harderian Gland
│   ├── Head Kidney
│   ├── Hemolymph
│   ├── Hepatopancreas
│   ├── High Vocal Center
│   ├── Hindlimb
│   │   ├── Stifle
│   │   └── Tarsus, Animal
│   ├── Hoof and Claw
│   ├── Horns
│   │   └── Antlers
│   ├── Imaginal Discs
│   ├── Interrenal Gland
│   ├── Lateral Line System
│   ├── Malpighian Tubules
│   ├── M

In [6]:
#test for querying subtrees with a keyword

query = input("What would you like to search for? \n")
result = mesh_df.loc[mesh_df['Term']==query.title()]['Location'] #search for row that has querry
loc = '.'.join(list(result)[0]) #concatenate location list into node_id
print(loc)
# tree.depth(tree.get_node(loc))
len(tree.subtree(loc).leaves())

What would you like to search for? 
bleach


IndexError: list index out of range

In [7]:
def find_loc(query):
    result = mesh_df.loc[mesh_df['Term']==query.title()]['Location'] #search for row that has querry
    loc = '.'.join(list(result)[0]) #concatenate location list into node_id
    return list(result)

find_loc("Garlic")

[['B',
  '01',
  '650',
  '940',
  '800',
  '575',
  '912',
  '250',
  '618',
  '100',
  '050',
  '060',
  '300']]

mesh_df.head(20)

In [None]:
#function to score all keywords
def node_score(node_location):
    subtree = tree.subtree(node_id)
    node1_list = find_loc(node1)
    
    #check if either node is a child of the other
    child = False
    if node1_list[-1] in node2_list or node2_list[-1] in node1_list:
        child = True
    
 
    

In [25]:
#function to pad out small querries with additional keywords
def pad_query(keyword_list):
    for keyword in keyword_list:
        loc = find_loc(keyword)
        subtree = tree.subtree(loc)

43851

In [5]:
mesh_df.sort_values('Complexity', ascending=False)

Unnamed: 0,Location,Term,Depth,Complexity
12,[D],Chemicals and Drugs,1,23373
13,[C],Diseases,1,12050
95,"[D, 12]","Amino Acids, Peptides, and Proteins",2,8858
909,"[D, 12, 776]",Proteins,3,7423
11,[E],"Analytical, Diagnostic and Therapeutic Techniq...",1,5232
...,...,...,...,...
25578,"[D, 12, 644, 456, 345, 575]",Glycylglycine,6,1
25580,"[D, 12, 644, 456, 345, 331]",Carnosine,6,1
25581,"[D, 12, 644, 456, 345, 190]",Aspartame,6,1
25582,"[D, 12, 644, 456, 345, 159]",Anserine,6,1


In [43]:
mesh_df['Depth'].mean()

6.130862529250849