# Imports

In [1]:
# Import pandas
import pandas as pd

# Import matplotlib
import matplotlib.pyplot as plt

# Import numpy
import numpy as np 

# Import Network X
import networkx as nx

# Paths for in/out files

In [2]:
# Path of IN-labels
mesh_path = '../../data/final/mesh.pkl'

# Path for IN-tags
geo_path = '../../data/final/geo.pkl'

# Read geo_df and mesh_df

In [3]:
# Read
geo_df = pd.read_pickle(geo_path)

# Echo
geo_df.head()

Unnamed: 0,geo_id,nsamples,date,mesh_id,mesh_heading,category,method
0,200000001,38,2001/01/22,D011379,Prognosis,E,pmid
1,200000001,38,2001/01/22,D016000,Cluster Analysis,E,pmid
2,200000001,38,2001/01/22,D016000,Cluster Analysis,N,pmid
3,200000001,38,2001/01/22,D008297,Male,Sex,pmid
4,200000001,38,2001/01/22,D012333,"RNA, Messenger",D,pmid


In [4]:
# Read
mesh_df = pd.read_pickle(mesh_path)

# Echo
mesh_df.head()

Unnamed: 0,category,mesh_heading,mesh_id,mesh_treenumbers
0,D,Calcimycin,D000001,D03.633.100.221.173
1,D,Temefos,D000002,D02.705.400.625.800
2,J,Abattoirs,D000003,J01.576.423.200.700.100
3,L,Abbreviations as Topic,D000004,L01.559.598.400.556.131
4,A,Abdomen,D000005,A01.923.047


# Compute category-depth

In [59]:
# Construct grand AstraZeneca dataframe
az_df = pd.merge(geo_df, mesh_df, on='mesh_id')

# Drop extra columns from merge
az_df.drop(columns='mesh_heading_y category_y method'.split(), inplace=True)

# Calculate category - Again
az_df['category']=az_df['mesh_treenumbers'].str.split('.').str[0].str[0]

# Report on propperly classified MeSH-ids category-wise
Propper_Tags = list(az_df['category_x']==az_df['category']).count(True)
Total_Tags = az_df['category_x'].shape[0]
print('Correctly categorized MeSH ids: {:4.1f}%'.format(100*Propper_Tags/Total_Tags))

# Calculate category depth
az_df['depth']=az_df['mesh_treenumbers'].str.split('.').str.len()

# Drop old-category column as well as mesh_treenumbers
az_df.drop(columns='category_x mesh_treenumbers'.split(), inplace=True)

# Echo
az_df.head()

Correctly categorized MeSH ids: 83.6%


Unnamed: 0,geo_id,nsamples,date,mesh_id,mesh_heading_x,category,depth
0,200000001,38,2001/01/22,D011379,Prognosis,E,2.0
1,200000088,31,2002/12/08,D011379,Prognosis,E,2.0
2,200000089,40,2002/12/08,D011379,Prognosis,E,2.0
3,200000349,14,2003/06/26,D011379,Prognosis,E,2.0
4,200000350,10,2003/06/26,D011379,Prognosis,E,2.0


# Filter and Clean geo DataFrame

In [5]:
# Construct date filter 
mask_date = az_df['date']==az_df['date'] # Take all studies

# Construct category filter
mask_category = ((az_df['category']=='C') | (az_df['category']=='D')) # Drugs and Diseases

# Construct mask to avoid C28's
mask_c23 = geo_df['mesh_id'].isin(mesh_df[~mesh_df['mesh_treenumbers'].str.startswith('C23', na=False)]['mesh_id'])

# Filter data, and eliminate filterning columns
filtered_geo_df = geo_df[mask_date & mask_category & mask_c28].drop(columns='date method'.split())

# Drop NaNs and duplicates
filtered_geo_df.dropna(axis=0, inplace=True)
filtered_geo_df.drop_duplicates(inplace=True)

# Only select summaries with +1 tag
tags_by_summary = filtered_geo_df['geo_id mesh_id'.split()].groupby('geo_id').count().reset_index() # Count tags per summary
good_summaries = tags_by_summary[tags_by_summary['mesh_id']>1] # Select abstracts with more than one tag
clean_geo = pd.merge(filtered_geo_df, good_summaries, on='geo_id') # Inner Join
clean_geo = clean_geo.drop(columns='mesh_id_y') # Drop column from inner join
clean_geo = clean_geo.rename(columns={'mesh_id_x':'mesh_id'}) # Rename key column

# Write info
print('Number of Records: ',clean_geo.shape[0])

# Echo
clean_geo.head()

Number of Records:  219953


Unnamed: 0,geo_id,nsamples,mesh_id,mesh_heading,category
0,200000001,38,D012333,"RNA, Messenger",D
1,200000001,38,D014604,Uveal Neoplasms,C
2,200000001,38,D012878,Skin Neoplasms,C
3,200000001,38,D008545,Melanoma,C
4,200000001,38,D018450,Disease Progression,C


# Construct Nodes

In [6]:
# Select only relevant columns
nodes = pd.DataFrame(clean_geo['mesh_id category mesh_heading'.split()])

# Drop duplicates
nodes.drop_duplicates(inplace=True)

# Echo
nodes.head()

Unnamed: 0,mesh_id,category,mesh_heading
0,D012333,D,"RNA, Messenger"
1,D014604,C,Uveal Neoplasms
2,D012878,C,Skin Neoplasms
3,D008545,C,Melanoma
4,D018450,C,Disease Progression


# Construct Edges

In [7]:
# Construct all-with-all links inside same geoid-nsample-date record
links = pd.merge(clean_geo, clean_geo, on='geo_id nsamples'.split())

# Rename to Source-Target
links.rename(columns={'mesh_id_x':'source', 'mesh_id_y':'target'}, inplace=True)

# Delete self-linkage
links.drop(links[links['source']==links['target']].index, inplace=True)

# Collapse repetitions while calculating weights
edges = links.groupby('source target'.split()).sum().reset_index()

# Rename sum(nsamples) to 'weight'
edges.rename(columns={'nsamples':'weight'}, inplace=True)

# Account for mirror-duplicates
edges['weight']/=2

# Normalize weights
edges['weight']/=edges['weight'].max()

# Head
edges.head()

Unnamed: 0,source,target,weight
0,D000001,D000230,9.3e-05
1,D000001,D002110,9.3e-05
2,D000001,D002118,0.000124
3,D000001,D003029,0.000124
4,D000001,D006474,0.000124


# Construct Graph

In [8]:
# Construct Directed Graph
dd = nx.from_pandas_edgelist(edges, 
                             source='source', 
                             target='target', 
                             edge_attr='weight', 
                             create_using=nx.DiGraph()
                            )

# Transform to undirected graph
dd = nx.to_undirected(dd)

# Add nodes attributes - Category
nx.set_node_attributes(dd, nodes['mesh_id category'.split()].set_index('mesh_id').to_dict()['category'], 'category')

# Add nodes attributes - Mesh Heading
nx.set_node_attributes(dd, nodes['mesh_id mesh_heading'.split()].set_index('mesh_id').to_dict()['mesh_heading'], 'mesh_heading')

# Save as pickle
nx.write_gpickle(dd, 'Disease_Drugs_Graph.pkl')

# Echo info
print('  Size (Nodes): ', dd.size())
print(' Order (Edges): ', dd.order())
print(' Graph Density: ', nx.density(dd))

  Size (Nodes):  439836
 Order (Edges):  6916
 Graph Density:  0.018393890616747108


# Functions over the tree

In [9]:
def get_categories(graph):
    """
    Get a dictionary with the categories of all the nodes
    """
    return nx.get_node_attributes(graph, 'category')
   
    
def get_mesh_headings(graph):
    """
    Get a dictionary with the mesh-headings of all the nodes
    """
    return nx.get_node_attributes(graph, 'mesh_heading')
    

def get_neighbors(graph, node, cats):
    """
    Get the neighbors of the node such that they have the same/opposite category
    """
        
    # Define empty lists
    same = list()
    oppo = list()
        
    # Select only those with same category
    for neigh in nx.neighbors(dd, node):
            
        # Check for same neighbors
        if cats[neigh]==cats[node]:
            same.append(neigh)
        else:
            oppo.append(neigh)
                
    # Return the tuples same and oppo
    return same, oppo
    
    
def get_top(dictionary_metric, top):
    """
    Find the top-n nodes according to some metric
    """
    
    # Get the items in the metric dictionary
    items = list(dictionary_metric.items())
    
    # Sort them out
    items.sort(reverse=True, key=lambda x: x[1])
    
    # Return the keys
    return list(map(lambda x:x[0], items[:top]))


def get_only(graph, cats, specific_category):
    """
    Select the nodes of the graph where category==category and returns a subgraph
    """
    
    # Define empty list
    only_nodes = list()
    
    # Cycle through the nodes
    for node in graph.nodes():
        if cats[node]==specific_category:
            only_nodes.append(node)
    
    # Return the subgraph
    return nx.subgraph(graph, only_nodes)  

# Recomend drugs for top diseases ['C']

In [10]:
# Read full graph
ee = nx.read_gpickle('Disease_Drugs_Graph.pkl')

# Read categories and labels
cats = get_categories(graph=ee)
labs = get_mesh_headings(graph=ee)

# Choose only disease-nodes
diseases = get_only(graph=ee, cats=cats, specific_category='C')

In [11]:
# Disease eigenvector centrality
diseases_eig = nx.eigenvector_centrality(diseases, max_iter=500, weight='weight')

# Disease PageRank 
diseases_pgn = nx.pagerank(diseases, alpha=0.9, weight='weight')

# Disease Degree 
diseases_deg = nx.degree_centrality(diseases)

In [12]:
# Find top-diseases
top = 250
top_eig = get_top(dictionary_metric=diseases_eig, top=top)
top_pgn = get_top(dictionary_metric=diseases_pgn, top=top)
top_deg = get_top(dictionary_metric=diseases_deg, top=top)
top_diseases = top_eig

In [13]:
# Define containers of important recommendations
rs = list()

# Choose a node
for disease in top_diseases:

    # Get neighbors diseases and neighboring drugs
    nei_dis, nei_dru = get_neighbors(graph=dd, node=disease, cats=cats)

    # Get max possible weight
    ww_max = sum([dd.get_edge_data(disease, nei, 'weight')['weight'] for nei in nei_dis])
    
    # For every neighboring disease
    for n_disease in nei_dis:

        # Find all the neighboring drugs 
        _ , nei_nei_dru = get_neighbors(graph=dd, node=n_disease, cats=cats) 

        # Chose drugs not in nei_dru
        not_in_nei_dru = list(set(nei_nei_dru) - set(nei_dru))

        # Add them to rs with weight
        c1 = [disease]*len(not_in_nei_dru)
        c2 = not_in_nei_dru
        ww = dd.get_edge_data(disease, n_disease, 'weight')['weight']
        c3 = [ww/ww_max]*len(not_in_nei_dru)
        
        rs.extend(zip(c1, c2, c3))

# Get into a DF
rs = pd.DataFrame(data=rs, columns='Disease Drug Recommendation_Strenght'.split())

# Group by disease-drug pairs and add the weights
rs = pd.DataFrame(rs.groupby('Disease Drug'.split()).sum().reset_index())

# Echo
rs.head()

Unnamed: 0,Disease,Drug,Recommendation_Strenght
0,D000077192,D000001,0.256587
1,D000077192,D000017,0.160544
2,D000077192,D000067596,0.205
3,D000077192,D000067616,0.002928
4,D000077192,D000067759,0.170011


# Label all the mesh_id's

In [14]:
# Merge on Disease=Mesh_id
rs_named_1 = pd.merge(rs.rename(columns={'Disease':'mesh_id'}), mesh_df, on='mesh_id')

# Remove unwanted columns
rs_named_1.drop(columns='category mesh_treenumbers'.split(), inplace=True)

# Rename mesh_headings
rs_named_1.rename(columns={'mesh_heading':'Disease', 'mesh_id':'Disease_id'}, inplace=True)

# Re-Organize
rs_named_1 = rs_named_1['Disease Disease_id Drug Recommendation_Strenght'.split()]



# Merge on Drug=Mesh_id
rs_named_2 = pd.merge(rs_named_1.rename(columns={'Drug':'mesh_id'}), mesh_df, on='mesh_id')

# Remove unwanted columns
rs_named_2.drop(columns='category mesh_treenumbers'.split(), inplace=True)

# Rename mesh_headings
rs_named_2.rename(columns={'mesh_heading':'Drug', 'mesh_id':'Drug_id'}, inplace=True)

# Re-Organize
rs_named_2 = rs_named_2['Disease Disease_id Drug_id Drug Recommendation_Strenght'.split()]



# Clean duplicates
rs_clean = rs_named_2.drop_duplicates().reset_index(drop=True)

# Sort by r-strenght
rs_clean.sort_values(by='Recommendation_Strenght Disease Drug'.split(), inplace=True, ascending=False)

# Reset index
rs_clean.reset_index(inplace=True, drop=True)

# Echo
rs_clean.head(50)

Unnamed: 0,Disease,Disease_id,Drug_id,Drug,Recommendation_Strenght
0,Catastrophic Illness,D002388,D035683,MicroRNAs,1.0
1,Pancreatic Diseases,D010182,D016159,Tumor Suppressor Protein p53,1.0
2,Pancreatic Diseases,D010182,D002352,Carrier Proteins,1.0
3,Dysgeusia,D004408,D014408,"Biomarkers, Tumor",0.998946
4,"Carcinoma, Transitional Cell",D002295,D025521,Tumor Suppressor Proteins,0.996169
5,Gastroschisis,D020139,D009363,Neoplasm Proteins,0.99439
6,Gastroschisis,D020139,D014157,Transcription Factors,0.994337
7,"Neoplastic Cells, Circulating",D009360,D035683,MicroRNAs,0.992756
8,"Carcinoma, Transitional Cell",D002295,D034741,"RNA, Small Interfering",0.991934
9,Gastroschisis,D020139,D000970,Antineoplastic Agents,0.991797


# Recomend disease for top drug ['D']

In [15]:
# Read full graph
ee = nx.read_gpickle('Disease_Drugs_Graph.pkl')

# Read categories and labels
cats = get_categories(graph=ee)
labs = get_mesh_headings(graph=ee)

# Choose only drug-nodes
drugs = get_only(graph=ee, cats=cats, specific_category='D')

In [16]:
# Drugs eigenvector centrality
drugs_eig = nx.eigenvector_centrality(drugs, max_iter=500, weight='weight')

# Drugs PageRank 
drugs_pgn = nx.pagerank(drugs, alpha=0.9, weight='weight')

# Drugs Degree 
drugs_deg = nx.degree_centrality(drugs)

In [17]:
# Find top-diseases
top = 250
top_eig = get_top(dictionary_metric=drugs_eig, top=top)
top_pgn = get_top(dictionary_metric=drugs_pgn, top=top)
top_deg = get_top(dictionary_metric=drugs_deg, top=top)
top_drugs = top_eig

In [18]:
# Define containers of important recommendations
rs = list()

# Choose a node
for drug in top_drugs:

    # Get neighbors diseases and neighboring drugs
    nei_dis, nei_dru = get_neighbors(graph=dd, node=drug, cats=cats)

    # Get max possible weight
    ww_max = sum([dd.get_edge_data(drug, nei, 'weight')['weight'] for nei in nei_dru])
    
    # For every neighboring drug
    for n_drug in nei_dru:

        # Find all the neighboring diseases 
        nei_nei_dis, _ = get_neighbors(graph=dd, node=n_drug, cats=cats) 

        # Chose disease not in nei_dis
        not_in_nei_dis = list(set(nei_nei_dis) - set(nei_dis))

        # Add them to rs with weight
        c1 = [drug]*len(not_in_nei_dis)
        c2 = not_in_nei_dis
        ww = dd.get_edge_data(drug, n_drug, 'weight')['weight']
        c3 = [ww/ww_max]*len(not_in_nei_dis)
        
        rs.extend(zip(c1, c2, c3))

# Get into a DF
rs = pd.DataFrame(data=rs, columns='Drug Disease Recommendation_Strenght'.split())

# Group by disease-drug pairs and add the weights
rs = pd.DataFrame(rs.groupby('Drug Disease'.split()).sum().reset_index())

# Echo
rs.head()

Unnamed: 0,Drug,Disease,Recommendation_Strenght
0,D000069283,D000008,0.140059
1,D000069283,D000012,0.250248
2,D000069283,D000013,0.23116
3,D000069283,D000015,0.150471
4,D000069283,D000022,0.088374


# Label all the mesh_id's

In [19]:
# Merge on Disease=Mesh_id
rs_named_1 = pd.merge(rs.rename(columns={'Drug':'mesh_id'}), mesh_df, on='mesh_id')

# Remove unwanted columns
rs_named_1.drop(columns='category mesh_treenumbers'.split(), inplace=True)

# Rename mesh_headings
rs_named_1.rename(columns={'mesh_heading':'Drug', 'mesh_id':'Drug_id'}, inplace=True)

# Re-Organize
rs_named_1 = rs_named_1['Drug Drug_id Disease Recommendation_Strenght'.split()]



# Merge on Drug=Mesh_id
rs_named_2 = pd.merge(rs_named_1.rename(columns={'Disease':'mesh_id'}), mesh_df, on='mesh_id')

# Remove unwanted columns
rs_named_2.drop(columns='category mesh_treenumbers'.split(), inplace=True)

# Rename mesh_headings
rs_named_2.rename(columns={'mesh_heading':'Disease', 'mesh_id':'Disease_id'}, inplace=True)

# Re-Organize
rs_named_2 = rs_named_2['Drug Drug_id Disease_id Disease Recommendation_Strenght'.split()]



# Clean duplicates
rs_clean = rs_named_2.drop_duplicates().reset_index(drop=True)

# Sort by r-strenght
rs_clean.sort_values(by='Recommendation_Strenght Drug Disease'.split(), inplace=True, ascending=False)

# Reset index
rs_clean.reset_index(inplace=True, drop=True)

# Echo
rs_clean.head(50)

Unnamed: 0,Drug,Drug_id,Disease_id,Disease,Recommendation_Strenght
0,Interleukin-3 Receptor alpha Subunit,D053650,D054198,Precursor Cell Lymphoblastic Leukemia-Lymphoma,1.0
1,Interleukin-3 Receptor alpha Subunit,D053650,D009369,Neoplasms,1.0
2,Interleukin-3 Receptor alpha Subunit,D053650,D018365,"Neoplasm, Residual",1.0
3,Interleukin-3 Receptor alpha Subunit,D053650,D007249,Inflammation,1.0
4,Interleukin-3 Receptor alpha Subunit,D053650,D020022,Genetic Predisposition to Disease,1.0
5,Interleukin-3 Receptor alpha Subunit,D053650,D018450,Disease Progression,1.0
6,Interleukin-3 Receptor alpha Subunit,D053650,D004195,"Disease Models, Animal",1.0
7,RUNX1 Translocation Partner 1 Protein,D000075142,D018450,Disease Progression,1.0
8,Pre-B Cell Receptors,D054420,D015275,Tumor Lysis Syndrome,1.0
9,Pre-B Cell Receptors,D054420,D014178,"Translocation, Genetic",1.0
