# Imports

In [1]:
# Pandas, Numpy and Matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# NOTE(review): the recorded output of this cell is an ImportError
# (libboost_python3.dylib could not be loaded), so no graph_tool names are
# actually available afterwards. None of the cells visible here appear to use
# graph_tool -- confirm whether this wildcard import is still needed.
from graph_tool.all import *

ImportError: dlopen(/Users/luvela/anaconda3/lib/python3.6/site-packages/graph_tool/libgraph_tool_core.so, 9): Library not loaded: @rpath/libboost_python3.dylib
  Referenced from: /Users/luvela/anaconda3/lib/python3.6/site-packages/graph_tool/libgraph_tool_core.so
  Reason: image not found

# Get tagged words

In [6]:
# Path to the pickled table of disease tags
filename = '../data/interim/disease_tags.pkl'

# Load the pickle into a DataFrame.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle(filename)

# Report the table size and the number of distinct summaries, then preview
print('Total entries in df: ', len(df))
print('Unique summaries in df: ', df['Id'].nunique())
df.head()

Total entries in df:  124502
Unique summaries in df:  29417


Unnamed: 0,Id,start,end,disease_tag,ont,unique_id
0,200000001,56,85,cutaneous malignant melanomas,OMIM,155600
1,200000003,0,20,Renal Cell Carcinoma,MESH,D002292
2,200000003,141,154,kidney cancer,MESH,D007680
3,200000003,245,290,neoplastic and non-cancerous renal epithelium,MESH,D007680
4,200000003,656,668,renal cancer,MESH,D007680


# Extract titles

In [8]:
# Build one (id, label) row per tag: id is '<ont>:<unique_id>' and label is
# the tagged text. The whole pipeline is one chain, so re-running the cell
# always starts again from df.
names = (
    df[['ont', 'unique_id', 'disease_tag']]
    .assign(id=lambda frame: frame['ont'] + ':' + frame['unique_id'])
    .rename(columns={'disease_tag': 'label'})
    .loc[:, ['id', 'label']]
    # Keep only the first label seen for each id
    .drop_duplicates(subset='id', keep='first')
    # Discard rows where the id or the label is missing
    .dropna(axis=0)
)

# Write the id/label table to disk
names.to_csv('titles.csv', index=False)

# Preview
names.head()

Unnamed: 0,id,label
0,OMIM:155600,cutaneous malignant melanomas
1,MESH:D002292,Renal Cell Carcinoma
2,MESH:D007680,kidney cancer
6,MESH:D007674,kidney tumor
7,MESH:D002277,epithelial tumors


# Extract Links 1: Clean Data

In [10]:
# Build the (year, summary_id, disease_id) table in one chain:
#   - keep only the Id / ont / unique_id columns and drop incomplete rows
#   - rename Id to summary_id and combine ont and unique_id into disease_id
#   - attach a constant year column
#   - collapse repeated (summary, disease) rows
tags = (
    df[['Id', 'ont', 'unique_id']]
    .dropna(axis=0)
    .rename(columns={'Id': 'summary_id'})
    .assign(
        disease_id=lambda frame: frame['ont'] + ':' + frame['unique_id'],
        year=2017,
    )
    .loc[:, ['year', 'summary_id', 'disease_id']]
    .drop_duplicates()
)

# Report what is left after cleaning, then preview
print('Unique summaries in tags: ', tags['summary_id'].nunique())
print('Total tags after cleaning: ', len(tags))
tags.head()

Unique summaries in tags:  28346
Total tags after cleaning:  61625


Unnamed: 0,year,summary_id,disease_id
0,2017,200000001,OMIM:155600
1,2017,200000003,MESH:D002292
2,2017,200000003,MESH:D007680
6,2017,200000003,MESH:D007674
7,2017,200000003,MESH:D002277


# Extract Links 2: Choose summaries with more than one tag

In [11]:
# Number of tags attached to each summary
tags_by_summary = tags.groupby('summary_id')['disease_id'].count().reset_index()

# Keep the summaries that carry more than one tag
good_summaries = tags_by_summary.loc[tags_by_summary['disease_id'] > 1]

# Inner join on the key column alone: only tags belonging to those summaries
# survive, and no '_x' / '_y' suffixed columns are produced
clean_tags = tags.merge(good_summaries[['summary_id']], on='summary_id')

# Fix the column order
clean_tags = clean_tags[['year', 'summary_id', 'disease_id']]

# Preview
clean_tags.head()

Unnamed: 0,year,summary_id,disease_id
0,2017,200000003,MESH:D002292
1,2017,200000003,MESH:D007680
2,2017,200000003,MESH:D007674
3,2017,200000003,MESH:D002277
4,2017,200000014,MESH:D009369


# Extract Links 3: Construct (Source, Target) tuples

In [13]:
%%time 
# About 4 minutes

# Import library for combinatorics
import itertools

# Define list of tuples
links = list()

# Cycle through summary ids
for summary in clean_tags['summary_id']:
    
    # Select all tags for this summary
    tagos = clean_tags[clean_tags['summary_id']==summary]['disease_id']
    
    # Append all possible pairs
    links.extend(list(itertools.combinations(tagos,2)))

CPU times: user 3min 41s, sys: 436 ms, total: 3min 42s
Wall time: 3min 42s


# Extract Links 4: Construct Weights

In [14]:
# Turn the list of pairs into a weighted edge table:
#   - one row per pair, each carrying a unit Weight
#   - count how often each (Source, Target) pair occurs
#   - rescale so the most frequent pair has Weight 1
lynx = (
    pd.DataFrame(links, columns=['Source', 'Target'])
    .assign(Weight=1)
    .groupby(['Source', 'Target'])
    .count()
    .reset_index()
    .assign(Weight=lambda edges: edges['Weight'] / edges['Weight'].max())
)

# Write the edge table to disk
lynx.to_csv('links.csv', index=False)

# Preview
lynx.head()

Unnamed: 0,Source,Target,Weight
0,MESH:C531600,MESH:C537901,0.000694
1,MESH:C531600,MESH:D002276,0.001735
2,MESH:C531600,MESH:D009369,0.001735
3,MESH:C531623,MESH:C538007,0.002776
4,MESH:C531623,MESH:D006330,0.002776
