In [1]:
import getpass
import itertools as itt
import json
import os
import sys
import time

import igraph
import networkx as nx
from tqdm import tqdm_notebook as tqdm

import bio2bel_hgnc
import bio2bel_hippie
import bio2bel_hippie.parser
import bio2bel_phewascatalog
import embeddingdb
import pybel
import nrl.model
from pybel.dsl import ComplexAbundance

In [2]:
# Record the Python interpreter version used for this run (provenance).
print(sys.version)

3.7.3 (default, Mar 27 2019, 09:23:15) 
[Clang 10.0.1 (clang-1001.0.46.3)]


In [3]:
# Record the local wall-clock time at which the notebook was executed.
print(time.asctime())

Sat Jun 22 21:24:39 2019


In [4]:
# Record the login name of the user who executed the notebook.
print(getpass.getuser())

cthoyt


In [5]:
# Report the versions of the key packages used below, one per line.
# The trailing empty entry reproduces the blank line at the end of the banner.
version_lines = [
    f'PyBEL Version: {pybel.get_version()}',
    f'NRL Version: {nrl.get_version()}',
    f'embeddingdb Version: {embeddingdb.get_version()}',
    '',
]
print('\n'.join(version_lines))

PyBEL Version: 0.13.3-dev
NRL Version: 0.0.2-dev
embeddingdb Version: 0.0.1-dev



## Learning with HIPPIE

In [6]:
# Load the HIPPIE protein-protein interaction table as a data frame.
hippie_df = bio2bel_hippie.parser.get_df()
# Preview the first rows (source/target identifiers, confidence, metadata).
hippie_df.head()

Unnamed: 0,source_uniprot_id,source_entrez_id,target_uniprot_id,target_entrez_id,confidence,metadata
0,AL1A1_HUMAN,216,AL1A1_HUMAN,216,0.76,"experiments:in vivo,Two-hybrid;pmids:12081471,..."
1,ITA7_HUMAN,3679,ACHA_HUMAN,1134,0.73,"experiments:in vivo,Affinity Capture-Western,a..."
2,NEB1_HUMAN,55607,ACTG_HUMAN,71,0.65,"experiments:in vitro,in vivo;pmids:9362513,120..."
3,SRGN_HUMAN,5552,CD44_HUMAN,960,0.63,"experiments:in vivo;pmids:9334256,16189514,167..."
4,GRB7_HUMAN,2886,ERBB2_HUMAN,2064,0.9,"experiments:in vitro,in vivo,Reconstituted Com..."


In [7]:
# Build an undirected PPI graph whose nodes are CURIE-style strings
# ("ncbigene:<entrez id>"), one edge per row of the HIPPIE table.
# Repeated pairs collapse into a single edge because nx.Graph is a simple graph.
ppi_pairs = hippie_df[['source_entrez_id', 'target_entrez_id']].values
hippie_graph = nx.Graph()
for source_id, target_id in tqdm(ppi_pairs, desc='PPIs'):
    hippie_graph.add_edge(f'ncbigene:{source_id}', f'ncbigene:{target_id}')

# Show (node count, edge count) as the cell output.
hippie_graph.number_of_nodes(), hippie_graph.number_of_edges()

HBox(children=(IntProgress(value=0, description='PPIs', max=411430, style=ProgressStyle(description_width='ini…




(18166, 410143)

In [8]:
# Configure a DeepWalk model for the HIPPIE graph; only the word2vec worker
# count is overridden, every other setting is left at the library default.
hippie_word2vec_parameters = nrl.model.Word2VecParameters(workers=7)
hippie_deepwalk_model = nrl.model.DeepWalkModel(
    word2vec_parameters=hippie_word2vec_parameters,
)
hippie_deepwalk_model

<nrl.model.deepwalk.DeepWalkModel at 0x10cb4d400>

In [9]:
%%time
# Train the DeepWalk model on the HIPPIE PPI graph; %%time reports the cost.
# The return value of fit() is not kept here.
hippie_deepwalk_model.fit(hippie_graph)

CPU times: user 26min 12s, sys: 10.2 s, total: 26min 22s
Wall time: 5min 46s


In [10]:
# Write the model's metadata to 'hippie_metadata.json' (path relative to the
# working directory).
hippie_deepwalk_model.dump_metadata('hippie_metadata.json')

In [16]:
# Persist the trained HIPPIE model to 'hippie_word2vec.model'.
# NOTE(review): the HIPPIE+ section saves through `.model.save(...)` rather than
# `.save(...)` - confirm both calls write the same kind of artifact.
hippie_deepwalk_model.save('hippie_word2vec.model')

## Learning with HIPPIE+ (HIPPIE PPIs plus PheWAS Catalog SNP-gene and SNP-phenotype edges)

In [17]:
# Build a lookup from HGNC gene symbol to Entrez gene identifier; it is used
# below to translate PheWAS Catalog gene names into Entrez IDs.
hgnc_manager = bio2bel_hgnc.Manager()
hgnc_symbol_entrez_id_mapping = hgnc_manager.build_hgnc_symbol_entrez_id_mapping()

In [18]:
# Load the PheWAS Catalog (SNP / phenotype / gene associations) as a data frame.
phewascatalog_df = bio2bel_phewascatalog.parser.get_df()
# Preview the first rows.
phewascatalog_df.head()

Unnamed: 0,chromosome,snp,phewas phenotype,cases,p-value,odds-ratio,gene_name,phewas code,gwas-associations
0,19 45395619,rs2075650,Alzheimer's disease,737,5.237e-28,2.41,TOMM40,290.11,"Alzheimer's disease, Alzheimer's disease bioma..."
1,19 45395619,rs2075650,Dementias,1170,2.409e-26,2.114,TOMM40,290.1,"Alzheimer's disease, Alzheimer's disease bioma..."
2,6 396321,rs12203592,Actinic keratosis,2505,4.1409999999999996e-26,1.691,IRF4,702.1,"Eye color, Hair color, Freckling, Progressive ..."
3,6 26093141,rs1800562,Iron metabolism disorder,40,3.409e-25,12.27,HFE,275.1,"Mean corpuscular hemoglobin, Glycated hemoglob..."
4,19 45395619,rs2075650,Delirium dementia and amnestic disorders,1566,8.027e-24,1.841,TOMM40,290.0,"Alzheimer's disease, Alzheimer's disease bioma..."


In [19]:
# Add an 'entrez_id' column by looking each gene symbol up in the HGNC mapping.
# `.get` yields None for symbols absent from the mapping, so unmapped rows end up
# with a missing value (how pandas stores it depends on the column dtype).
phewascatalog_df['entrez_id'] = phewascatalog_df['gene_name'].map(hgnc_symbol_entrez_id_mapping.get)
phewascatalog_df.head()

Unnamed: 0,chromosome,snp,phewas phenotype,cases,p-value,odds-ratio,gene_name,phewas code,gwas-associations,entrez_id
0,19 45395619,rs2075650,Alzheimer's disease,737,5.237e-28,2.41,TOMM40,290.11,"Alzheimer's disease, Alzheimer's disease bioma...",10452
1,19 45395619,rs2075650,Dementias,1170,2.409e-26,2.114,TOMM40,290.1,"Alzheimer's disease, Alzheimer's disease bioma...",10452
2,6 396321,rs12203592,Actinic keratosis,2505,4.1409999999999996e-26,1.691,IRF4,702.1,"Eye color, Hair color, Freckling, Progressive ...",3662
3,6 26093141,rs1800562,Iron metabolism disorder,40,3.409e-25,12.27,HFE,275.1,"Mean corpuscular hemoglobin, Glycated hemoglob...",3077
4,19 45395619,rs2075650,Delirium dementia and amnestic disorders,1566,8.027e-24,1.841,TOMM40,290.0,"Alzheimer's disease, Alzheimer's disease bioma...",10452


In [20]:
# Extend a copy of the HIPPIE PPI graph with PheWAS Catalog edges, leaving
# `hippie_graph` itself untouched.
hippie_and_phewas_graph = hippie_graph.copy()

# SNP-gene edges. Rows whose gene symbol could not be mapped to an Entrez ID are
# removed with dropna() first: a bare truthiness test lets a float NaN through
# (bool(float('nan')) is True), which would add a bogus 'ncbigene:nan' node.
snp_gene_pairs = phewascatalog_df[['snp', 'entrez_id']].dropna(subset=['entrez_id']).values
it = tqdm(snp_gene_pairs, desc='SNP-gene')
hippie_and_phewas_graph.add_edges_from(
    (f'dbsnp:{snp}', f'ncbigene:{entrez_id}')
    for snp, entrez_id in it
    if entrez_id  # still skip any other falsy placeholder (e.g. an empty string)
)

# SNP-phenotype edges: one edge per catalog row; the phenotype label is wrapped
# in double quotes inside the node name.
it = tqdm(phewascatalog_df[['snp', 'phewas phenotype']].values, desc='SNP-phenotype')
hippie_and_phewas_graph.add_edges_from(
    (f'dbsnp:{snp}', f'phenotype:"{phenotype}"', )
    for snp, phenotype in it
)

# Show (node count, edge count) of the combined graph as the cell output.
hippie_and_phewas_graph.number_of_nodes(), hippie_and_phewas_graph.number_of_edges()

HBox(children=(IntProgress(value=0, description='SNP-gene', max=215107, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='SNP-phenotype', max=215107, style=ProgressStyle(description_w…




(22750, 627219)

In [21]:
# Configure a DeepWalk model for the combined HIPPIE + PheWAS graph; only the
# word2vec worker count is overridden, everything else uses library defaults.
hippie_plus_word2vec_parameters = nrl.model.Word2VecParameters(workers=7)
hippie_plus_deepwalk_model = nrl.model.DeepWalkModel(
    word2vec_parameters=hippie_plus_word2vec_parameters,
)
hippie_plus_deepwalk_model

<nrl.model.deepwalk.DeepWalkModel at 0x139b93f28>

In [22]:
%%time
# Train the DeepWalk model on the combined graph; %%time reports the cost.
# The value returned by fit() is kept (presumably the trained word2vec model,
# going by the variable name - TODO confirm against the nrl API).
hippie_plus_word2vec_model = hippie_plus_deepwalk_model.fit(hippie_and_phewas_graph)
hippie_plus_word2vec_model

CPU times: user 44min 20s, sys: 16.7 s, total: 44min 37s
Wall time: 9min 5s


In [23]:
# Write the model's metadata to 'hippie_plus_metadata.json' (path relative to
# the working directory).
hippie_plus_deepwalk_model.dump_metadata('hippie_plus_metadata.json')

In [24]:
# Persist the trained HIPPIE+ model to 'hippie_plus_word2vec.model' via the
# wrapper's `.model` attribute.
# NOTE(review): the HIPPIE section calls `.save(...)` on the wrapper directly -
# confirm which form is intended so both sections write the same artifact type.
hippie_plus_deepwalk_model.model.save('hippie_plus_word2vec.model')