# Imports

In [24]:
# Import pandas
import pandas as pd

# Import matplotlib
import matplotlib.pyplot as plt

# Import Network X
import networkx as nx

# Paths for in/out files

In [25]:
# Path of IN-labels
names_path = '../src/visualization/mesh.pkl'

# Path for IN-tags
tags_path = '../src/visualization/geo.pkl'

# Path of OUT-labels - for Gephi
gephi_node_labels = '../s2ds19_AZ/Graphs/Gephi/InputData/gephi_node_labels_C.csv'

# Path of OUT-edges - for Gephi
gephi_weighted_edges = '../s2ds19_AZ/Graphs/Gephi/InputData/gephi_weighted_edges_C.csv'

# Read names_df

In [26]:
# Read
names_df = pd.read_pickle(names_path)

# Echo
names_df.head()

Unnamed: 0,category,mesh_heading,mesh_id,mesh_treenumbers
0,D,Calcimycin,D000001,D03.633.100.221.173
1,D,Temefos,D000002,D02.705.400.625.800
2,J,Abattoirs,D000003,J01.576.423.200.700.100
3,L,Abbreviations as Topic,D000004,L01.559.598.400.556.131
4,A,Abdomen,D000005,A01.923.047


# Construct Gephi labels

In [27]:
# Check all entries
print('Shape -Before-: ', names_df.shape)

# Check non-None entries
gephi_labels = names_df.dropna(axis=0)

# Check for duplicates
gephi_labels = gephi_labels.drop_duplicates(subset='mesh_id', keep='first')

# Check all entries - again
print('Shape  -After-: ', gephi_labels.shape)

# Copy only id and label
gephi_labels = gephi_labels.drop(columns='mesh_treenumbers category'.split())

# Rename for csv-file
gephi_labels = gephi_labels.rename(columns={'mesh_id':'id', 'mesh_heading':'label'})

# Save to .csv
gephi_labels['id label'.split()].to_csv('Gephi_Labels.csv', index=False)

# Echo
gephi_labels.head()

Shape -Before-:  (59748, 4)
Shape  -After-:  (29349, 4)


Unnamed: 0,label,id
0,Calcimycin,D000001
1,Temefos,D000002
2,Abattoirs,D000003
3,Abbreviations as Topic,D000004
4,Abdomen,D000005


# Read tags_df

In [40]:
# Read
tags_df = pd.read_pickle(tags_path)

# Echo
tags_df.head()

Unnamed: 0,geo_id,nsamples,date,mesh_id,mesh_heading,category,method
0,200000001,38,2001/01/22,D011379,Prognosis,E,pmid
1,200000001,38,2001/01/22,D016000,Cluster Analysis,E,pmid
2,200000001,38,2001/01/22,D016000,Cluster Analysis,N,pmid
3,200000001,38,2001/01/22,D008297,Male,Sex,pmid
4,200000001,38,2001/01/22,D012333,"RNA, Messenger",D,pmid


In [46]:
tags_df[tags_df['category']=='G']

Unnamed: 0,geo_id,nsamples,date,mesh_id,mesh_heading,category,method
17,200000003,342,2001/07/19,D052138,"Genes, Neoplasm",G,pmid
18,200000003,342,2001/07/19,D009928,Organ Specificity,G,pmid
19,200000003,342,2001/07/19,D015536,Down-Regulation,G,pmid
22,200000003,342,2001/07/19,D015972,"Gene Expression Regulation, Neoplastic",G,pmid
25,200000003,342,2001/07/19,D015398,Signal Transduction,G,pmid
33,200000010,4,2001/10/03,D015723,Gene Library,G,pmid
35,200000010,4,2001/10/03,D015870,Gene Expression,G,pmid
38,200000010,4,2001/10/03,D015536,Down-Regulation,G,pmid
43,200000010,4,2001/10/03,D015854,Up-Regulation,G,pmid
51,200000014,765,2001/11/29,D002869,Chromosome Aberrations,G,pmid


# Filter DataFrame - Do it here since it still looks like the original thing...

In [37]:
# Construct date filter
mask_date = tags_df['date']==tags_df['date']

# Construct category filter
mask_category = tags_df['category']=='C'#tags_df['category']

# Filter data
tags_df = tags_df[mask_date & mask_category]

# Eliminate filterning columns
tags_df.drop(columns='date category method'.split(), inplace=True)

# Clean Data

In [7]:
# Check shape
print('Shape -before-: ', tags_df.shape)

# Drop NaNs
tags_df.dropna(axis=0,inplace=True)

# Delete duplicates
tags_df = tags_df.drop_duplicates()

# Only select summaries with +1 tag
tags_by_summary = tags_df['geo_id mesh_id'.split()].groupby('geo_id').count().reset_index() # Count tags per summary
good_summaries = tags_by_summary[tags_by_summary['mesh_id']>1] # Select abstracts with more than one tag
clean_tags = pd.merge(tags_df, good_summaries, on='geo_id') # Inner Join
clean_tags = clean_tags.drop(columns='mesh_id_y') # Drop column from inner join
clean_tags = clean_tags.rename(columns={'mesh_id_x':'mesh_id'}) # Rename key column
                             
# Check shape -again
print('Shape  -after-: ', tags_df.shape)

# Echo
tags_df.head()

Shape -before-:  (90162, 3)
Shape  -after-:  (90084, 3)


Unnamed: 0,geo_id,nsamples,mesh_id
2,200000001,38.0,D008545
4,200000001,38.0,D009361
6,200000001,38.0,D018450
10,200000001,38.0,D012878
12,200000001,38.0,D014604


# Construct DataFrame with Weighted Links

In [8]:
# Construct all-with-all links inside same geoid-nsample-date record
links = pd.merge(tags_df, tags_df, on='geo_id nsamples'.split())

# Echo info
print('     All links: ',links.shape[0])

# Rename to Source-Target
links.rename(columns={'mesh_id_x':'source', 'mesh_id_y':'target'}, inplace=True)

# Delete self-linkage
links.drop(links[links['source']==links['target']].index, inplace=True)

# Collapse repetitions while calculating weights
links_weights = links.groupby('source target'.split()).sum().reset_index()

# Rename sum(nsamples) to 'weight'
links_weights.rename(columns={'nsamples':'weight'}, inplace=True)

# Account for mirror-duplicates
links_weights['weight']/=2

# Normalize weights
links_weights['weight']/=links_weights['weight'].max()

# Save to .csv
links_weights.to_csv(gephi_weighted_edges, index=False)

# Echo info
print('Weighted links: ', links_weights.shape[0])

# Head
links_weights.head()

     All links:  322536
Weighted links:  61332


Unnamed: 0,source,target,weight
0,D000008,D000208,0.000411
1,D000008,D000310,0.000555
2,D000008,D001064,0.000411
3,D000008,D002471,0.000555
4,D000008,D006623,0.000555


# Construct Graph

In [10]:
# Construct Directed Graph
az = nx.from_pandas_edgelist(links_weights, 
                             source='source', 
                             target='target', 
                             edge_attr='weight', 
                             create_using=nx.DiGraph()
                            )

# Check for perfectly balanced links
#suma = 0.0
#n0='D000001'
#for n1 in az.neighbors(n0):
#    for n2 in az.neighbors(n1):
#        suma += az[n1][n2]['weight'] - az[n2][n1]['weight']
#print('Balanced graph: ', suma==0.0)

# Transform to undirected graph
azud = nx.to_undirected(az)

# Echo info
print('  Size (Nodes): ', azud.size())
print(' Order (Edges): ', azud.order())

  Size (Nodes):  61332
 Order (Edges):  2322


# Run statistics

In [11]:
# Eigenvector centrality
eigencentrality = nx.eigenvector_centrality(azud, max_iter=500, weight='weight')

In [12]:
# PageRank
pagerank = nx.pagerank(azud, alpha=0.9)