# Hetionet-Instruct Dataset Builder

Author: [Khairi Abidi](https://github.com/abidikhairi)

This notebook builds a dataset for drug discovery based on the hetionet knowledge graph.

In [2]:
import pandas as pd
from tqdm import tqdm

In [14]:
edges_file = '../../../data/hetionet/edges.sif'
nodes_file = '../../../data/hetionet/nodes.tsv'

In [15]:
def get_id2name(nodes_file):
    nodes_df = pd.read_csv(nodes_file, sep='\t')
    id2name = dict(zip(nodes_df['id'], nodes_df['name']))
    return id2name

In [20]:
id2name = get_id2name(nodes_file)

In [21]:
edges_df = pd.read_csv(edges_file, sep='\t')

edges_df.head(2)

Unnamed: 0,source,metaedge,target
0,Gene::9021,GpBP,Biological Process::GO:0071357
1,Gene::51676,GpBP,Biological Process::GO:0098780


In [22]:
compound_drug_edges = edges_df[edges_df['metaedge'] == 'CbG']

compound_drug_edges.head(2)

Unnamed: 0,source,metaedge,target
728491,Compound::DB00514,CbG,Gene::1136
728492,Compound::DB00686,CbG,Gene::2246


In [23]:
compound_drug_edges1 = compound_drug_edges.groupby('target').agg({'source': list}).reset_index()

compound_drug_edges1.head(2)

Unnamed: 0,target,source
0,Gene::10,"[Compound::DB00951, Compound::DB00250, Compoun..."
1,Gene::100,"[Compound::DB00552, Compound::DB00640, Compoun..."


In [24]:
allowed_types = ['GpBP', 'GpMF', 'GpCC']

helper_dict = {
    'GpBP': 'process',
    'GpMF': 'molecular function',
    'GpCC': 'location'
}

In [25]:
max_nodes_per_hop = 5
num_heldout_genes = 100

validation_genes = compound_drug_edges1['target'].sample(num_heldout_genes).tolist()
train_genes_df = compound_drug_edges1[~compound_drug_edges1['target'].isin(validation_genes)]
validation_genes_df = compound_drug_edges1[compound_drug_edges1['target'].isin(validation_genes)]

In [26]:
len(validation_genes_df), len(train_genes_df), len(validation_genes) 

(100, 1589, 100)

In [27]:
def get_graph_structure(df):
    rows = [] 
    
    for idx, row in tqdm(enumerate(df.itertuples()), total=len(df)):
        source = row.source
        target = row.target
        
        compounds = map(id2name.get, source)
        compounds = " \n".join(map(lambda x: f'- {x}', compounds))        
        
        gene_df = edges_df[edges_df['source'] == target]
        gene_df = gene_df[gene_df['metaedge'].isin(allowed_types)]
        metaedges = gene_df['metaedge'].unique().tolist()
        
        xs = []
        for metaedge in metaedges:
            sub_df = gene_df[gene_df['metaedge'] == metaedge]
            num_nodes = len(sub_df)
            targets = sub_df.sample(n=min(max_nodes_per_hop, num_nodes))['target'].tolist()
            y = helper_dict[metaedge]
            targets = ', '.join(list(map(lambda x: f'<{y}, {id2name[x]}>', targets)))
            xs.append(targets)
        
        rows.append({
            'CentralNode': id2name[target],
            'Compounds': compounds,
            '1HopNeighbors': xs
        })
            
    return rows

In [28]:
train_rows = get_graph_structure(train_genes_df)
validation_rows = get_graph_structure(validation_genes_df)

100%|██████████| 1589/1589 [02:17<00:00, 11.55it/s]
100%|██████████| 100/100 [00:08<00:00, 11.63it/s]


In [44]:
task_prefix = 'Predict drugs for the central gene based on its neighborhood:'

In [45]:
PROMPT_TEMPLATE = """Central node: {{central node}}  
One-hop neighbors: {{1-hop neighbor list}}  
    
Answer:"""

In [46]:
def get_dataset(rows,):
    dataset = []
    for row in rows:
        central_node = row['CentralNode']
        compounds = row['Compounds']
        one_hop_neighbors = '\n'.join(row['1HopNeighbors'])
        
        prompt = PROMPT_TEMPLATE.replace('{{central node}}', central_node)
        prompt = prompt.replace('{{1-hop neighbor list}}', one_hop_neighbors)
        prompt = f'{task_prefix}\n{prompt}'
        dataset.append({
            'input': prompt,
            'target': compounds
        })
    return pd.DataFrame(dataset)

In [48]:
train_df = get_dataset(train_rows)
validation_df = get_dataset(validation_rows)

In [49]:
train_df.shape, validation_df.shape

((1589, 2), (100, 2))

In [50]:
from huggingface_hub import HfApi

In [51]:
api = HfApi()
api.create_repo(repo_id="khairi/drug-discovery-hetionet", exist_ok=True, repo_type='dataset')

RepoUrl('https://huggingface.co/datasets/khairi/drug-discovery-hetionet', endpoint='https://huggingface.co', repo_type='dataset', repo_id='khairi/drug-discovery-hetionet')

In [52]:
train_df.to_parquet(f'hf://datasets/khairi/drug-discovery-hetionet/train.parquet', index=False)

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/hffs-mjq3aubv                    : 100%|##########|  315kB /  315kB            

In [53]:
validation_df.to_parquet(f'hf://datasets/khairi/drug-discovery-hetionet/validation.parquet', index=False)

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/hffs-gm1xqe2u                    : 100%|##########| 30.3kB / 30.3kB            

In [54]:
# TODO: document the notebook