# Import the usual suspects

In [7]:
# Import pandas
import pandas as pd

# Import matplotlib
import matplotlib.pyplot as plt

# Import Network X
import networkx as nx

# Run script to construct Disease/Drug graphs 

In [8]:
# Execute the graph-construction script inside this kernel (IPython %run magic).
# NOTE(review): DISEASE_GRAPH and DRUGS_GRAPH, which the next cell reads, are not
# defined in any cell shown here -- presumably this script defines them; verify.
%run ./41_0_L_Create_DD_Graph.py

# Compute centrality statistics (eigenvector, PageRank, degree) on the graphs

In [9]:
def _load_graph(path):
    """Load a pickled NetworkX graph from ``path``.

    ``nx.read_gpickle`` was deprecated in NetworkX 2.6 and removed in 3.0.
    When the installed NetworkX still provides it, it is used unchanged;
    otherwise the graph is read with the standard-library ``pickle`` module,
    which is the replacement recommended by the NetworkX migration notes.

    Parameters
    ----------
    path : str or path-like
        Location of the pickled graph file.

    Returns
    -------
    The unpickled graph object.
    """
    if hasattr(nx, 'read_gpickle'):
        return nx.read_gpickle(path)
    # Function-scope import keeps this cell self-contained.
    import pickle
    # NOTE: unpickling can execute arbitrary code -- only load graph files
    # produced by this project's own scripts.
    # NOTE(review): this fallback reads a plain pickle; if the graph files are
    # gzip/bz2-compressed (read_gpickle handled that by extension), open them
    # with the matching module instead.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Read Disease
disease_g = _load_graph(DISEASE_GRAPH)

# Read Drug
drug_g = _load_graph(DRUGS_GRAPH)

In [10]:
# Weighted eigenvector centrality for the disease graph (at most 500 iterations)
disease_eig = nx.eigenvector_centrality(
    disease_g,
    weight='weight',
    max_iter=500,
)

# Same measure, same settings, for the drug graph
drug_eig = nx.eigenvector_centrality(
    drug_g,
    weight='weight',
    max_iter=500,
)

In [11]:
# Weighted PageRank of the disease graph (alpha=0.9)
disease_pgn = nx.pagerank(
    disease_g,
    weight='weight',
    alpha=0.9,
)

# Weighted PageRank of the drug graph, same settings
drug_pgn = nx.pagerank(
    drug_g,
    weight='weight',
    alpha=0.9,
)

In [12]:
# Degree centrality for both graphs: disease first, then drug
disease_deg, drug_deg = map(nx.degree_centrality, (disease_g, drug_g))

# Get Top Diseases/Drugs for time-series analysis

In [13]:
# Get n-top nodes according to a specific metric
def get_top(dictionary, top):
    """Return the keys of the ``top`` highest-valued entries, best first.

    Parameters
    ----------
    dictionary : dict
        Mapping of node -> score.
    top : int
        Number of keys to keep (applied as a slice bound on the ranking).

    Returns
    -------
    list
        Keys ordered by descending score; equal scores keep the mapping's
        own iteration order.
    """
    ranked = sorted(dictionary.items(), key=lambda pair: pair[1], reverse=True)
    return [node for node, _score in ranked[:top]]

In [14]:
# Number of top-ranked nodes to keep per metric
n_top = 20

# Create the two-level column index: (entity type, metric)
outside = ['Diseases'] * 3 + ['Drugs'] * 3
inside = 2 * ['EigenCentrality', 'PageRank', 'Degree']
hier_index = pd.MultiIndex.from_arrays([outside, inside])

# One row of top node ids per (entity, metric) pair, in hier_index order;
# transposing turns each pair into a column with one rank per row
all_time_top = pd.DataFrame(
    data=[get_top(scores, n_top)
          for scores in (disease_eig, disease_pgn, disease_deg,
                         drug_eig, drug_pgn, drug_deg)],
    index=hier_index,
).T

# Persist as a pickle file for Claire's time series analysis
all_time_top.to_pickle('All_Time_Top.pkl')

# Echo the first rows
all_time_top.head()

Unnamed: 0_level_0,Diseases,Diseases,Diseases,Drugs,Drugs,Drugs
Unnamed: 0_level_1,EigenCentrality,PageRank,Degree,EigenCentrality,PageRank,Degree
0,D009369,D009369,D009369,D014157,D012333,D012333
1,D001943,D001943,D007249,D012333,D014157,D035683
2,D001932,D020022,D020022,D014408,D014408,D014157
3,D000230,D018450,D004195,D004268,D035683,D034741
4,D009362,D000230,D018450,D035683,D000970,D000970


In [32]:
# Top diseases by PageRank, as a Series named 'mesh_id' so it can be merged on that key
df1 = all_time_top['Diseases']['PageRank'].rename('mesh_id')

In [34]:
# Attach MeSH metadata to the top-PageRank diseases (inner join on 'mesh_id').
# NOTE(review): mesh_df is not defined in any cell shown here -- presumably it
# comes from the %run script or an unshown cell; confirm it exists on a fresh kernel.
pd.merge(left=df1, right=mesh_df, how='inner', on='mesh_id')

Unnamed: 0,mesh_id,category,mesh_heading,mesh_treenumbers
0,D009369,C,Neoplasms,C04
1,D001943,C,Breast Neoplasms,C04.588.180
2,D001943,C,Breast Neoplasms,C17.800.090.500
3,D020022,C,Genetic Predisposition to Disease,C23.550.291.687.500
4,D020022,G,Genetic Predisposition to Disease,G05.380.355
5,D018450,C,Disease Progression,C23.550.291.656
6,D000230,C,Adenocarcinoma,C04.557.470.200.025
7,D002869,C,Chromosome Aberrations,C23.550.210
8,D002869,G,Chromosome Aberrations,G05.365.590.175
9,D001932,C,Brain Neoplasms,C04.588.614.250.195


In [41]:
# Non-null counts of every column per mesh_id (first five groups).
# NOTE(review): geo_df is not defined in any cell shown here -- confirm its origin.
geo_df.groupby(by='mesh_id').count().head(5)

Unnamed: 0_level_0,geo_id,nsamples,date,mesh_heading,category,method
mesh_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C483997,2,2,2,2,2,2
C531600,2,2,2,2,2,2
C531621,1,1,1,1,1,1
C531623,2,2,2,0,0,2
C531629,4,4,4,0,0,4


In [42]:
# Inspect all rows recorded for a single MeSH id
geo_df.loc[geo_df['mesh_id'].eq('C483997')]

Unnamed: 0,geo_id,nsamples,date,mesh_id,mesh_heading,category,method
14322,200003369,6,2006/03/31,C483997,"Wis heavy-chain disease protein, human",D,dnorm
14323,200003369,6,2006/03/31,C483997,"Wis heavy-chain disease protein, human",C,dnorm
