In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from rdkit import Chem
import requests
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
import umap
from sklearn.cluster import DBSCAN

from tkgdti.data.GraphBuilder import GraphBuilder
import os 

from tkgdti.data.utils import get_protein_sequence_uniprot
from tkgdti.embed.AA2EMB import AA2EMB


import re
import pandas as pd

# Fix the global RNG state of both torch and numpy with one shared seed value.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)


  from .autonotebook import tqdm as notebook_tqdm


# ProtBert: protein-protein similarity

In [2]:
# Quantile of the pairwise cosine-similarity distribution that is used as the
# similarity cutoff: only pairs scoring strictly above this quantile are
# flagged as similar further down in the notebook.
SIM_QUANTILE = 0.999

In [3]:
# Load the precomputed protein-embedding bundle.
#
# NOTE(security): torch.load unpickles arbitrary Python objects, so this must
# only ever be pointed at a trusted, locally produced file.
#
# `weights_only=False` is passed explicitly: the bundle stores non-tensor
# objects (`meta_df` is accessed like a table below — presumably a pandas
# DataFrame, TODO confirm), which the restricted `weights_only=True` loader
# (the default in newer torch releases) would refuse to unpickle. Being explicit
# also silences the FutureWarning about the changing default.
embed_dict = torch.load('../../extdata/meta/aas_dict.pt', weights_only=False)

aas = embed_dict['amino_acids']     # sequences, indexed by position below
gene2aa = embed_dict['meta_df']     # exposes `.sequence` and `.gene_name`
z_prot = embed_dict['embeddings']   # 2-D embeddings; assumed one row per entry of `aas`

  embed_dict = torch.load('../../extdata/meta/aas_dict.pt')


In [4]:
# Pairwise cosine similarity between all protein embeddings (upper triangle only).
#
# NOTE: the output columns are called 'drug_i' / 'drug_j' for compatibility with
# the cells below, but the indices refer to positions in `aas` / rows of `z_prot`.
N = len(aas)  # total number of sequences

# Fail fast if the sequence list and the embedding matrix are out of sync;
# otherwise the row/column indices produced below would be meaningless.
if len(z_prot) != N:
    raise ValueError(
        f'aas has {N} entries but z_prot has {len(z_prot)} rows; they must match'
    )

# Number of unordered pairs (i < j). Preallocating typed buffers avoids building
# three Python lists with one boxed object per pair.
n_pairs = N * (N - 1) // 2
pair_i = np.empty(n_pairs, dtype=np.int64)
pair_j = np.empty(n_pairs, dtype=np.int64)
pair_sim = np.empty(n_pairs, dtype=np.float64)

offset = 0
for i in range(N - 1):
    # Print progress
    print(f'progress: {i}/{N}', end='\r')

    # Cosine similarities of the i-th row ONLY with rows i+1 to N-1,
    # i.e. a result of shape (1, N - i - 1) flattened to 1-D.
    a = z_prot[[i]]
    b = z_prot[i+1:].reshape(-1, z_prot.shape[1])
    row_sims = cosine_similarity(a, b).ravel()

    # The "j" indices for the upper triangle are i+1 ... N-1.
    stop = offset + (N - i - 1)
    pair_i[offset:stop] = i
    pair_j[offset:stop] = np.arange(i + 1, N, dtype=np.int64)
    pair_sim[offset:stop] = row_sims  # upcast to float64, as the list-based version did
    offset = stop

# Assemble the long-format table of pairs.
res = pd.DataFrame({'drug_i': pair_i, 'drug_j': pair_j, 'cos_sim': pair_sim})
res.head()


progress: 13051/13053

Unnamed: 0,drug_i,drug_j,cos_sim
0,0,1,0.641422
1,0,2,0.94792
2,0,3,0.949097
3,0,4,0.863938
4,0,5,0.863159


In [5]:
# Debug check: shape of `b` as left behind by the final iteration of the
# similarity loop (the slice z_prot[i+1:] for the last i, reshaped to 2-D).
b.shape

(1, 1024)

In [6]:
# `res` was already converted to a DataFrame at the end of the pairwise-similarity
# cell, so re-wrapping it is unnecessary; just preview the first rows.
res.head()

Unnamed: 0,drug_i,drug_j,cos_sim
0,0,1,0.641422
1,0,2,0.94792
2,0,3,0.949097
3,0,4,0.863938
4,0,5,0.863159


In [7]:


# Map each amino-acid sequence to its gene name. If a sequence occurs more than
# once in the metadata, the last occurrence wins (plain dict semantics).
aa2gene = {aa:g for aa,g in zip(gene2aa.sequence, gene2aa.gene_name)}

# Resolve the gene name once per sequence index instead of once per pair row:
# `res` holds one row per (i, j) pair, so a per-row dict lookup repeats the same
# work many times over. An object array keeps the names as plain Python objects.
idx2gene = np.array([aa2gene[seq] for seq in aas], dtype=object)

# Broadcast the per-index names onto the pair table via integer-array indexing.
res = res.assign(gene_i = idx2gene[res['drug_i'].to_numpy(dtype=np.int64)],
                 gene_j = idx2gene[res['drug_j'].to_numpy(dtype=np.int64)])

res.head()

Unnamed: 0,drug_i,drug_j,cos_sim,gene_i,gene_j
0,0,1,0.641422,A1BG,A1CF
1,0,2,0.94792,A1BG,A2M
2,0,3,0.949097,A1BG,A3GALT2
3,0,4,0.863938,A1BG,A4GALT
4,0,5,0.863159,A1BG,A4GNT


In [8]:
# Discard pairs whose two sequences map to the same gene name.
res = res.loc[res['gene_i'] != res['gene_j']]

In [9]:
# Cutoff = the SIM_QUANTILE quantile of the remaining pairwise scores.
pair_scores = res['cos_sim']
cos_sim_thresh = np.quantile(pair_scores, SIM_QUANTILE)
print(f'Cosine similarity threshold: {cos_sim_thresh:.4f}')

# A pair counts as similar only when its score is strictly above the cutoff.
res = res.assign(is_similar=pair_scores.gt(cos_sim_thresh))

Cosine similarity threshold: 0.9781


In [10]:
# Turn the flagged pairs into src -> dst gene edges with their type/relation labels.
forward_edges = (
    res.loc[res['is_similar'], ['gene_i', 'gene_j']]
    .rename(columns={'gene_i': 'src', 'gene_j': 'dst'})
    .assign(src_type='gene', dst_type='gene', relation='protbert_similarity')
)

# Only the upper triangle was computed and cosine similarity is symmetric, so
# add the mirrored dst -> src edge for every pair (concat aligns on column names).
backward_edges = forward_edges.rename(columns={'src': 'dst', 'dst': 'src'})
sim_relations = pd.concat([forward_edges, backward_edges])

# Persist the edge list without the (non-unique) row index.
sim_relations.to_csv('../../extdata/relations/protbert__gene_gene_similarity.csv', index=False)

In [None]:
# Sanity check: print the (rows, columns) shape of the exported edge table,
# then preview its first rows.
print(sim_relations.shape)
sim_relations.head() 

(170368, 5)


Unnamed: 0,src,dst,src_type,dst_type,relation
1693,A1BG,CD7,gene,gene,protbert_similarity
10309,A1BG,SIGLEC1,gene,gene,protbert_similarity
22522,A1CF,RBM47,gene,gene,protbert_similarity
27418,A2M,C3,gene,gene,protbert_similarity
27428,A2M,C7,gene,gene,protbert_similarity


: 