In [None]:
####

In [None]:
### Generate Embeddings for the nodes in the graph (Google colab T4 or A100 GPU use)

### upload the files with same names from folder and the folder containing this notebook(Question_1)

In [3]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
import pandas as pd
from tqdm import tqdm





In [None]:

class BiomedEmbedder(nn.Module):
    def __init__(self, output_dim=200):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')
        self.model = AutoModel.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')
        # Add projection layer to reduce dimensions
        self.projection = nn.Linear(768, output_dim)

    def forward(self, text):
        # Tokenize
        inputs = self.tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")

        # Get BERT embeddings
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Get CLS token embedding and project to lower dimension
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # [batch_size, 768]
        reduced_embedding = self.projection(cls_embedding)   # [batch_size, 200]

        # Normalize the embeddings (optional but recommended)
        normalized_embedding = torch.nn.functional.normalize(reduced_embedding, p=2, dim=1)

        return normalized_embedding

# Create the embedder
embedder = BiomedEmbedder(output_dim=200)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
node_data = pd.read_csv('/content/Nodes.csv')

In [None]:
node_data.head(1).values

array([[0, 'UniProtKB:P53805', 'RCAN1', 'biolink:Protein',
        'RCAN1 GeneǂGenetic locus associated with RCAN1ǂRCAN1 geneǂcalcipressin-1 (human)ǂRcan1 (rat)ǂRCAN1 (human)ǂcalcipressin-1ǂCalcipressin-1ǂRcan1 (mouse)ǂRCAN1',
        'biolink:NucleicAcidEntityǂbiolink:Geneǂbiolink:Polypeptideǂbiolink:Protein',
        'https://identifiers.org/uniprot:P53805',
        'Calcipressin-1 (252 aa, ~28 kDa) is encoded by the human RCAN1 gene. This protein is involved in the regulation of transcription through the modulation of transcription factor phosphorylation.; UMLS Semantic Type: UMLSSC:T116',
        'HGNC:3040ǂRGD:631338ǂNCIT:C82947ǂENSEMBL:ENSG00000159200ǂOMIM:602917ǂMGI:1890564ǂUniProtKB:P53805ǂPR:P53805ǂUMLS:C1414160ǂPR:000013829ǂNCBIGene:1827ǂUMLS:C2826585ǂNCIT:C82944',
        'DOI:10.1042/bj20030267ǂPMID:12809556ǂDOI:10.1038/35012518ǂDOI:10.1016/j.abb.2005.05.002ǂPMID:15935327ǂPMID:14702039ǂPMID:15489334ǂDOI:10.1006/geno.1997.4866ǂDOI:10.1093/hmg/9.11.1681ǂDOI:10.1093/hmg/4.10.1

In [None]:
#features used for building intial embedding example
print("feature 1: ",node_data['all_names'][1])
print("feature 2: ",node_data['description'][1])
print("feature 3: ",node_data['label'][1])


feature 1:  1-phosphatidylinositol 4,5-bisphosphate phosphodiesterase eta-1 (human)ǂPlch1 (mouse)ǂPLCH1ǂ1-phosphatidylinositol 4,5-bisphosphate phosphodiesterase eta-1ǂGenetic locus associated with PLCH1ǂPLCH1 (human)ǂPLCH1 [plasma membrane]ǂPLCH1 gene
feature 2:  A protein that is a translation product of the human PLCH1 gene or a 1:1 ortholog thereof. // COMMENTS: Category=gene.
feature 3:  biolink:NucleicAcidEntityǂbiolink:MolecularEntityǂbiolink:ChemicalOrDrugOrTreatmentǂbiolink:OntologyClassǂbiolink:BiologicalEntityǂbiolink:NamedThingǂbiolink:GeneOrGeneProductǂbiolink:ChemicalEntityOrProteinOrPolypeptideǂbiolink:GeneProductMixinǂbiolink:MacromolecularMachineMixinǂbiolink:ChemicalEntityǂbiolink:ChemicalEntityOrGeneOrGeneProductǂbiolink:Proteinǂbiolink:ThingWithTaxonǂbiolink:PhysicalEssenceOrOccurrentǂbiolink:GenomicEntityǂbiolink:Polypeptideǂbiolink:Geneǂbiolink:PhysicalEssence


In [None]:
text = node_data['all_names'][1] + " "  + node_data['description'][1] + " " + node_data['label'][1]
embedding = embedder(text)
print(embedding) ### how To built initial node embeddings instead of random initalization

tensor([[ 0.0110, -0.0424, -0.1000, -0.0613,  0.0810, -0.0204, -0.0873, -0.0058,
          0.0452,  0.0011, -0.0709, -0.0567,  0.1070, -0.0695, -0.0639,  0.1239,
          0.0809,  0.0557, -0.0164, -0.0819,  0.0649,  0.0457,  0.0579, -0.0266,
          0.0693,  0.1380, -0.0065, -0.0310,  0.0769, -0.0100, -0.0167, -0.0711,
          0.0305, -0.0205,  0.0936, -0.1172, -0.1164,  0.0256, -0.0745, -0.0678,
         -0.0678,  0.1394, -0.0702,  0.0423,  0.0609, -0.1457, -0.0226, -0.0441,
          0.0554,  0.0384,  0.0168, -0.0759, -0.0871, -0.0397,  0.1171, -0.0129,
         -0.0060,  0.0133, -0.0266,  0.0072,  0.0185, -0.0353, -0.0845,  0.0193,
          0.0020, -0.0723, -0.0613, -0.1265,  0.0080, -0.0203,  0.0128, -0.1012,
         -0.1241, -0.0776, -0.0473,  0.0053,  0.0685, -0.0243,  0.0592, -0.0562,
          0.0638,  0.0618, -0.0594, -0.0843,  0.1302, -0.0364, -0.0268, -0.1495,
         -0.0907, -0.0504,  0.0298, -0.1106, -0.0640,  0.0442, -0.0404, -0.0538,
          0.0371,  0.0651, -

In [None]:


class BiomedEmbedder(nn.Module):
    def __init__(self, output_dim=200):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')
        self.model = AutoModel.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')
        # Add projection layer to reduce dimensions
        self.projection = nn.Linear(768, output_dim)

    def forward(self, text, device):
        # Tokenize
        inputs = self.tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")

        # Move inputs to the correct device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Get BERT embeddings
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Get CLS token embedding and project to lower dimension
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # [batch_size, 768]
        reduced_embedding = self.projection(cls_embedding)   # [batch_size, 200]

        # Normalize the embeddings
        normalized_embedding = torch.nn.functional.normalize(reduced_embedding, p=2, dim=1)

        return normalized_embedding

def create_node_embeddings(node_data):
    # Initialize device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Initialize embedder
    embedder = BiomedEmbedder(output_dim=200)
    embedder = embedder.to(device)  # Move entire model to device
    embedder.model = embedder.model.to(device)  # Ensure BERT model is on the correct device
    embedder.eval()  # Set to evaluation mode

    embeddings_list = []

    # Generate embeddings for each row
    for idx in tqdm(range(len(node_data))):
        # Combine text fields
        text = str(node_data['all_names'][idx]) + " " + str(node_data['description'][idx]) + " " + str(node_data['label'][idx])

        # Generate embedding
        with torch.no_grad():
            embedding = embedder(text, device)
            # Move embedding to CPU before converting to numpy
            embedding = embedding.cpu().numpy()

        embeddings_list.append(embedding[0])

    # Create DataFrame
    embedding_cols = [f'embedding_{i}' for i in range(200)]
    embeddings_df = pd.DataFrame(embeddings_list, columns=embedding_cols)

    return embeddings_df

# Usage
embeddings_df = create_node_embeddings(node_data)

100%|██████████| 170009/170009 [26:58<00:00, 105.04it/s]


In [None]:
# Save
# embeddings_df.to_parquet("intial_embeddings.parquet")
# Load
# embeddings_df = pd.read_parquet("intial_embeddings.parquet")

In [None]:
embeddings_df = pd.read_parquet("/content/intial_embeddings.parquet")

In [None]:
embeddings_df

Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_190,embedding_191,embedding_192,embedding_193,embedding_194,embedding_195,embedding_196,embedding_197,embedding_198,embedding_199
0,-0.065050,0.052441,0.066256,-0.095235,0.062256,-0.101860,-0.039117,0.048280,-0.073613,-0.028794,...,-0.084606,-0.097289,-0.082331,-0.120398,-0.060827,-0.101922,-0.037814,-0.022881,-0.045825,-0.059099
1,-0.073818,0.042275,0.056639,-0.084617,0.060079,-0.107990,-0.074784,0.085110,-0.077811,-0.003581,...,-0.079674,-0.096880,-0.065922,-0.118254,-0.034069,-0.088112,-0.031660,-0.022860,-0.054555,-0.056793
2,-0.088355,0.051239,0.061465,-0.079984,0.058764,-0.109659,-0.059310,0.092959,-0.075921,-0.017833,...,-0.094287,-0.099682,-0.071652,-0.100319,-0.044037,-0.089869,-0.038200,-0.022832,-0.045239,-0.052873
3,-0.078723,0.044490,0.055423,-0.082926,0.056779,-0.108701,-0.065673,0.086695,-0.079611,-0.011326,...,-0.095015,-0.090733,-0.066025,-0.115431,-0.042431,-0.082074,-0.035606,-0.022070,-0.058043,-0.047620
4,-0.065127,0.044258,0.059040,-0.091809,0.062338,-0.097432,-0.040006,0.059490,-0.080162,-0.021707,...,-0.084547,-0.096006,-0.087863,-0.117698,-0.046943,-0.100802,-0.036992,-0.028012,-0.063876,-0.050279
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170004,-0.084227,0.050556,0.073219,-0.045889,0.085087,-0.045673,-0.023777,0.076699,-0.107897,-0.011489,...,-0.080544,-0.074876,-0.087662,-0.135485,-0.112203,-0.133465,-0.044816,-0.038606,-0.080218,-0.024912
170005,-0.078099,0.056143,0.064074,-0.010196,0.095619,-0.079900,-0.047217,0.119001,-0.110159,-0.000269,...,-0.103512,-0.078047,-0.073658,-0.116785,-0.084542,-0.107023,-0.045979,-0.030095,-0.062928,-0.008692
170006,-0.086429,0.040883,0.067615,-0.031543,0.081785,-0.072048,-0.035457,0.085105,-0.108085,-0.021263,...,-0.085876,-0.080495,-0.085256,-0.144030,-0.105964,-0.136824,-0.050860,-0.047969,-0.081800,-0.020154
170007,-0.081469,0.060719,0.058369,0.004406,0.100114,-0.082131,-0.043994,0.116099,-0.109580,0.003496,...,-0.087504,-0.079209,-0.068384,-0.114230,-0.085524,-0.119032,-0.038846,-0.036889,-0.068401,0.015301


In [None]:
# node_data.shape

(170009, 11)

In [None]:
node_data_with_initial_embedding = pd.concat([node_data[['id','name','category']], embeddings_df], axis=1)


In [None]:
node_data_with_initial_embedding.head(4)

Unnamed: 0,id,name,category,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,...,embedding_190,embedding_191,embedding_192,embedding_193,embedding_194,embedding_195,embedding_196,embedding_197,embedding_198,embedding_199
0,UniProtKB:P53805,RCAN1,biolink:Protein,-0.06505,0.052441,0.066256,-0.095235,0.062256,-0.10186,-0.039117,...,-0.084606,-0.097289,-0.082331,-0.120398,-0.060827,-0.101922,-0.037814,-0.022881,-0.045825,-0.059099
1,UniProtKB:Q4KWH8,PLCH1,biolink:Protein,-0.073818,0.042275,0.056639,-0.084617,0.060079,-0.10799,-0.074784,...,-0.079674,-0.09688,-0.065922,-0.118254,-0.034069,-0.088112,-0.03166,-0.02286,-0.054555,-0.056793
2,UniProtKB:Q9UPU7,TBC1D2B,biolink:Protein,-0.088355,0.051239,0.061465,-0.079984,0.058764,-0.109659,-0.05931,...,-0.094287,-0.099682,-0.071652,-0.100319,-0.044037,-0.089869,-0.0382,-0.022832,-0.045239,-0.052873
3,UniProtKB:P35462,DRD3,biolink:Protein,-0.078723,0.04449,0.055423,-0.082926,0.056779,-0.108701,-0.065673,...,-0.095015,-0.090733,-0.066025,-0.115431,-0.042431,-0.082074,-0.035606,-0.02207,-0.058043,-0.04762


In [None]:
node_data_with_initial_embedding.head(3)

Unnamed: 0,id,name,category,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,...,embedding_190,embedding_191,embedding_192,embedding_193,embedding_194,embedding_195,embedding_196,embedding_197,embedding_198,embedding_199
0,UniProtKB:P53805,RCAN1,biolink:Protein,-0.06505,0.052441,0.066256,-0.095235,0.062256,-0.10186,-0.039117,...,-0.084606,-0.097289,-0.082331,-0.120398,-0.060827,-0.101922,-0.037814,-0.022881,-0.045825,-0.059099
1,UniProtKB:Q4KWH8,PLCH1,biolink:Protein,-0.073818,0.042275,0.056639,-0.084617,0.060079,-0.10799,-0.074784,...,-0.079674,-0.09688,-0.065922,-0.118254,-0.034069,-0.088112,-0.03166,-0.02286,-0.054555,-0.056793
2,UniProtKB:Q9UPU7,TBC1D2B,biolink:Protein,-0.088355,0.051239,0.061465,-0.079984,0.058764,-0.109659,-0.05931,...,-0.094287,-0.099682,-0.071652,-0.100319,-0.044037,-0.089869,-0.0382,-0.022832,-0.045239,-0.052873


In [None]:
edge_data = pd.read_csv('/content/Edges.csv')

edge_data.head(4)

Unnamed: 0.1,Unnamed: 0,subject,object,predicate,knowledge_source,publications,publications_info,type,start_id,end_id
0,0,NCBIGene:8483,FMA:70022,biolink:related_to,infores:ensembl-gene,,{},biolink:related_to,NCBIGene:8483,FMA:70022
1,1,NCBIGene:390650,CHEMBL.TARGET:CHEMBL372,biolink:in_taxon,infores:ncbi-geneǂinfores:ensembl-gene,,{},biolink:in_taxon,NCBIGene:390650,CHEMBL.TARGET:CHEMBL372
2,2,UMLS:C1158823,ENSEMBL:ENST00000485267,biolink:has_participant,infores:ensembl-gene,,{},biolink:has_participant,UMLS:C1158823,ENSEMBL:ENST00000485267
3,3,ENSEMBL:ENST00000464141,CHEMBL.TARGET:CHEMBL372,biolink:in_taxon,infores:ensembl-gene,,{},biolink:in_taxon,ENSEMBL:ENST00000464141,CHEMBL.TARGET:CHEMBL372


In [None]:
### Generate Embeddings for the nodes in the graph

## For more on Adding in information and buiding knowledge graph embedding, It was require some time on some errors
## here is my explanation instead on how I would go about solving it

###

In [None]:
# Required libraries for knowledge graph embeddings
import torch
from pykeen.pipeline import pipeline
import pandas as pd
from pykeen.triples import TriplesFactory

# ====================================================================================
# Knowledge Graph Embeddings Generation Process
# ====================================================================================

# There are multiple knowledge graph completion algorithms available:
# - TransE
# - RotatE
# and others...

# We have two implementation options:
# 1. Write everything from scratch using PyTorch
# 2. Use PyKEEN library (recommended due to extensive functionality)

# ====================================================================================
# Known Issues and Solutions
# ====================================================================================

# PyKEEN can be buggy with special characters in names
# Solution: Create a mapping dictionary
# Example:
# node_mapping = {
#     "CHEMBL.TARGET:CHEMBL372": 0,  # Convert problematic names to indices
#     "Drug_Name": 1
# }

# Similarly for relationships:
# relation_mapping = {
#     "treated": 2,
#     "effects": 3
# }

# ====================================================================================
# Input Data Format
# ====================================================================================

# Input should be formatted as triples with numerical indices:
# Head | Tail | Relation
#  3   | 45   |    9
#  7   | 89   |    7

# Using integers instead of strings improves performance
# Each number corresponds to a mapped term from your dictionaries

# ====================================================================================
# Model Training Example
# ====================================================================================

# Pipeline configuration example - parameters can be adjusted based on needs
'''
result = pipeline(
    training=training,
    testing=testing,
    model='RotatE',  # Can use other models like TransE
    model_kwargs={
        # Half the dimension for RotatE due to its complex space nature
        'embedding_dim': len(embedding_cols) // 2,
        # Can initialize with pre-trained embeddings from Hugging Face
        'entity_initializer': InitialEmbeddings(initial_embeddings),
        'relation_dim': len(embedding_cols) // 2,
    },
    training_kwargs={
        'num_epochs': 50,        # Adjust based on convergence
        'batch_size': 32,        # Adjust based on memory
        'learning_rate': 0.001,  # Typical starting point
        'optimizer': 'Adam'      # Could use others like AdamW
    }
)
'''

# ====================================================================================
# Model Evaluation
# ====================================================================================

# Evaluate performance using metrics like:
# - hits@N: Shows how many positive connections appear in top N predictions
# - Other metrics available based on your specific needs

# ====================================================================================
# Extracting Final Embeddings
# ====================================================================================

# Extract trained model
# model = result.model

# Get final embeddings and convert to DataFrame
'''
# Extract embeddings
final_embeddings = result.model.entity_representations[0]()
entity_ids = result.training.entity_to_id

# Convert to more usable DataFrame format
final_df = pd.DataFrame([
    {
        'id': entity,
        **{f'final_embedding_{i}': emb[i].item() for i in range(len(emb))}
    }
    for entity, emb in zip(entity_ids.keys(), final_embeddings)
])

# Print results
print("Final embeddings shape:", final_df.shape)
print("\nSample of final embeddings:")
print(final_df.head())
'''

In [None]:
#### End ####