# Hetionet-Instruct Dataset Builder

Author: [Khairi Abidi](https://github.com/abidikhairi)

This notebook builds an instruction-style dataset for drug discovery from the Hetionet knowledge graph.

In [None]:
import pandas as pd
from tqdm import tqdm

In [None]:
# Locations of the Hetionet inputs, relative to this notebook:
# a tab-separated node table and a tab-separated edge list.
nodes_file = '../../../data/hetionet/nodes.tsv'
edges_file = '../../../data/hetionet/edges.sif'

In [None]:
def get_id2name(nodes_file):
    """Build a lookup from node identifier to node name.

    Args:
        nodes_file: Path (or file-like object) of a tab-separated node table
            that has at least an ``id`` and a ``name`` column.

    Returns:
        dict mapping every value of the ``id`` column to the ``name`` value
        found on the same row. If an id occurs more than once, the last row
        wins.
    """
    node_table = pd.read_csv(nodes_file, sep='\t')
    id_name_pairs = zip(node_table['id'], node_table['name'])
    return {node_id: node_name for node_id, node_name in id_name_pairs}

In [None]:
# Global id -> name lookup, used later to turn gene and compound ids into names.
id2name = get_id2name(nodes_file)

In [None]:
# Load the edge list (tab-separated). Later cells rely on its
# `source`, `metaedge` and `target` columns.
edges_df = pd.read_csv(edges_file, sep='\t')

# Preview the first two rows.
edges_df.head(2)

In [None]:
# Keep only the edges whose metaedge is 'CbG'. Downstream cells treat the
# `source` of these edges as a compound and the `target` as a gene.
# NOTE(review): the variable name says "compound_drug" but the edges appear to
# be compound -> gene; the name is kept because later cells reference it.
compound_drug_edges = edges_df[edges_df['metaedge'] == 'CbG']

# Preview the first two rows.
compound_drug_edges.head(2)

In [None]:
# One row per `target` (gene), with `source` holding the list of all
# compound ids linked to that gene.
compound_drug_edges1 = compound_drug_edges.groupby('target').agg({'source': list}).reset_index()

# Preview the first two rows.
compound_drug_edges1.head(2)

In [None]:
# Metaedge type -> label written in front of each neighbor in the prompt.
helper_dict = {
    'GpBP': 'process',
    'GpMF': 'molecular function',
    'GpCC': 'location',
}

# Metaedge types considered when collecting a gene's one-hop neighbors;
# derived from the mapping above so the two cannot drift apart.
allowed_types = list(helper_dict)

In [None]:
# Maximum number of neighbors sampled per metaedge type for each gene.
max_nodes_per_hop = 5
# Number of genes held out for the validation split.
num_heldout_genes = 100
# Fixed seed so that re-running the notebook reproduces the same
# train/validation split (an unseeded sample changes on every run).
split_seed = 42

# Draw the held-out genes. `sample` raises ValueError if fewer than
# `num_heldout_genes` genes are available.
validation_genes = compound_drug_edges1['target'].sample(
    num_heldout_genes, random_state=split_seed
).tolist()

# Compute the membership mask once and use it for both halves of the split.
is_heldout = compound_drug_edges1['target'].isin(validation_genes)
train_genes_df = compound_drug_edges1[~is_heldout]
validation_genes_df = compound_drug_edges1[is_heldout]

In [None]:
# Sanity check: sizes of the validation frame, the training frame and the held-out gene list.
len(validation_genes_df), len(train_genes_df), len(validation_genes) 

In [None]:
def get_graph_structure(df):
    """Collect, for every gene in ``df``, its compounds and a sample of its neighbors.

    Relies on the notebook-level names ``edges_df``, ``id2name``,
    ``allowed_types``, ``helper_dict`` and ``max_nodes_per_hop``.

    Args:
        df: DataFrame with a ``target`` column (gene id) and a ``source``
            column (iterable of compound ids linked to that gene).

    Returns:
        list of dicts, one per row of ``df`` and in the same order, with keys:

        * ``CentralNode``: the gene's name looked up in ``id2name``.
        * ``Compounds``: the compound names rendered as ``- name`` bullets.
        * ``1HopNeighbors``: one string per allowed metaedge type present for
          the gene, each holding up to ``max_nodes_per_hop`` randomly sampled
          ``[label, target]`` entries.

    Raises:
        KeyError: if a gene id of ``df`` is missing from ``id2name``.
    """
    # Loop-invariant work, done once instead of rescanning the full edge
    # table for every gene: keep only the allowed metaedges that start at one
    # of the requested genes, then index those edges by source gene.
    # Row order inside each group is preserved by groupby.
    neighbor_edges = edges_df[edges_df['metaedge'].isin(allowed_types)]
    neighbor_edges = neighbor_edges[neighbor_edges['source'].isin(df['target'])]
    edges_by_gene = dict(iter(neighbor_edges.groupby('source', sort=False)))
    no_edges = neighbor_edges.iloc[0:0]  # used for genes without any allowed edge

    rows = []

    for row in tqdm(df.itertuples(), total=len(df)):
        gene_id = row.target

        # Unknown compound ids are rendered as "None" (dict.get default).
        compound_names = map(id2name.get, row.source)
        compounds = " \n".join(f'- {name}' for name in compound_names)

        gene_df = edges_by_gene.get(gene_id, no_edges)

        neighbor_groups = []
        for metaedge in gene_df['metaedge'].unique().tolist():
            sub_df = gene_df[gene_df['metaedge'] == metaedge]
            # Random sample (unseeded), capped at the number of available edges.
            sample_size = min(max_nodes_per_hop, len(sub_df))
            sampled_targets = sub_df.sample(n=sample_size)['target'].tolist()
            label = helper_dict[metaedge]
            # NOTE(review): neighbors are emitted as raw `target` values, while
            # the central node and compounds go through id2name - confirm
            # whether names were intended here as well.
            neighbor_groups.append(
                ', '.join(f'[{label}, {node}]' for node in sampled_targets)
            )

        rows.append({
            'CentralNode': id2name[gene_id],
            'Compounds': compounds,
            '1HopNeighbors': neighbor_groups,
        })

    return rows

In [None]:
# Build the per-gene records (gene name, compounds, sampled neighbors) for both splits.
train_rows = get_graph_structure(train_genes_df)
validation_rows = get_graph_structure(validation_genes_df)

In [None]:
# Instruction line placed in front of every prompt (see get_dataset).
task_prefix = 'Predict drugs for the central gene based on its neighborhood:'

In [None]:
# Prompt skeleton. The `{{central node}}` and `{{1-hop neighbor list}}`
# placeholders are substituted with str.replace in get_dataset.
# The whitespace inside the literal (including trailing spaces) is part of the
# emitted prompt, so the literal must not be reformatted.
PROMPT_TEMPLATE = """Central node: {{central node}}  
One-hop neighbors: {{1-hop neighbor list}}  
    
Answer:"""

In [None]:
def get_dataset(rows):
    """Turn per-gene records into an input/target DataFrame.

    Relies on the notebook-level ``PROMPT_TEMPLATE`` and ``task_prefix``.

    Args:
        rows: Iterable of dicts with the keys ``CentralNode`` (str),
            ``Compounds`` (str) and ``1HopNeighbors`` (list of str).

    Returns:
        pandas.DataFrame with one row per record and two columns:
        ``input`` (the task prefix, a newline, then the filled-in prompt
        template) and ``target`` (the record's ``Compounds`` string).
    """
    def _to_example(record):
        gene_name = record['CentralNode']
        answer = record['Compounds']
        neighbor_text = '\n'.join(record['1HopNeighbors'])

        # Fill the central node first, then the neighbor list.
        filled = PROMPT_TEMPLATE.replace('{{central node}}', gene_name)
        filled = filled.replace('{{1-hop neighbor list}}', neighbor_text)
        return {
            'input': f'{task_prefix}\n{filled}',
            'target': answer,
        }

    return pd.DataFrame([_to_example(record) for record in rows])

In [None]:
# Render the records of each split into input/target DataFrames.
train_df = get_dataset(train_rows)
validation_df = get_dataset(validation_rows)

In [None]:
# Sanity check: (rows, columns) of both splits.
train_df.shape, validation_df.shape

## Push dataset to Hugging Face Hub

In [None]:
from huggingface_hub import HfApi

In [None]:
# Create the dataset repository on the Hub; exist_ok=True makes the call a
# no-op if the repository already exists.
# NOTE(review): presumably requires a logged-in Hub token with write access - confirm.
api = HfApi()
api.create_repo(repo_id="khairi/drug-discovery-hetionet", exist_ok=True, repo_type='dataset')

In [None]:
# Write the training split as Parquet straight into the dataset repository
# through the `hf://` path (no DataFrame index column is stored).
# The string has no placeholders, so it is a plain literal rather than an f-string.
train_df.to_parquet('hf://datasets/khairi/drug-discovery-hetionet/train.parquet', index=False)

In [None]:
# Write the validation split as Parquet straight into the dataset repository
# through the `hf://` path (no DataFrame index column is stored).
# The string has no placeholders, so it is a plain literal rather than an f-string.
validation_df.to_parquet('hf://datasets/khairi/drug-discovery-hetionet/validation.parquet', index=False)

In [None]:
# TODO: document the notebook