In [1]:
import numpy as np
import networkx as nx
import pandas as pd
from node2vec import Node2Vec

In [2]:
file_path = '../data/dgidb/preprocessed_34_10.tsv'
interaction_matrix = pd.read_csv(file_path, sep='\t', index_col=0)

In [3]:
# Generate node lists
drugs = list(interaction_matrix.columns)
genes = list(interaction_matrix.index)

In [4]:
G = nx.Graph()
G.add_nodes_from(drugs, bipartite=0)
G.add_nodes_from(genes, bipartite=1)

for i in range (len(drugs)):
    for j in range (len(genes)):
        if interaction_matrix.iloc[j,i] == 0:
            G.add_edge(drugs[i], genes[j])

In [5]:
node2vec = Node2Vec(G, dimensions=128, walk_length=10, num_walks=80, workers=5)
model = node2vec.fit(window=5, min_count=1, batch_words=4)

Computing transition probabilities: 100%|██████████| 1636/1636 [26:09<00:00,  1.04it/s]
Generating walks (CPU: 1): 100%|██████████| 16/16 [00:29<00:00,  1.82s/it]
Generating walks (CPU: 2): 100%|██████████| 16/16 [00:24<00:00,  1.56s/it]
Generating walks (CPU: 3): 100%|██████████| 16/16 [00:28<00:00,  1.76s/it]
Generating walks (CPU: 4): 100%|██████████| 16/16 [00:22<00:00,  1.43s/it]
Generating walks (CPU: 5): 100%|██████████| 16/16 [00:17<00:00,  1.11s/it]


In [6]:
embeddings = {node: model.wv[node] for node in G.nodes()}

In [7]:
# Extract embeddings for drugs and genes
drug_embeddings = {drug: embeddings[drug] for drug in drugs}
gene_embeddings = {gene: embeddings[gene] for gene in genes}

# Convert to pandas DataFrames
drug_embeddings_df = pd.DataFrame.from_dict(drug_embeddings, orient='index')
gene_embeddings_df = pd.DataFrame.from_dict(gene_embeddings, orient='index')

# Export to CSV files
save_path = '../data/dgidb/embeddings'

drug_embeddings_df.to_csv(save_path+'/node2vec_drug_embeddings.csv', header=None)
gene_embeddings_df.to_csv(save_path+'/node2vec_gene_embeddings.csv', header=None)