In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Directory to the project folder

deep_learning_dir = '/content/gdrive/MyDrive/BMI 707 Project/target_embeddings' 

In [3]:
# Import packages

import os
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Install ELMo embedder
# Reference: https://pypi.org/project/simple-elmo/

!pip install --upgrade simple_elmo

In [None]:
from simple_elmo import ElmoModel

# Load model weights to ELMo
model_dir = deep_learning_dir + '/uniref50_v2'

model = ElmoModel()
model.load(model_dir)

In [None]:
def seq2emb(sequence, model):
  '''
  Retrieving an embedding from a sequence using ELMO model.

  Input: 
    sequence (str): string with sequence
    model (ElmoModel): elmo model with preloaded weights

  Returns:
    emb (tensor): tensor with shape ([1024]) with embedding of protein.
  '''
  emb = torch.tensor(model.get_elmo_vectors(sequence))
  emb = torch.tensor(emb).sum(dim=0).mean(dim=0) 
  return emb

## Target Dataset Processing:


In [6]:
targets_df

Unnamed: 0.1,Unnamed: 0,drugbank_id,target_id,target_sequence
0,0,DB00001,BE0000048,MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANT...
1,1,DB00002,BE0000767,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...
2,1,DB00002,BE0000901,MWQLLLPTALLLLVSAGMRTEDLPKAVVFLEPQWYSVLEKDSVTLK...
3,1,DB00002,BE0002093,MWLLYLLVPALFCRAGGSIPIPQKLFGEVTSPLFPKPYPNNFETTT...
4,1,DB00002,BE0002094,MEGPRGWLVLCVLAISLASMVTEDLCRAPDGKKGEAGRPGRRGRPG...
...,...,...,...,...
19201,13445,DB15569,BE0004071,MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...
19202,13446,DB15570,BE0009787,MNPTDIADTTLDESIYSNYYLYESIPKPCTKEGIKAFGELFLPPLY...
19203,13468,DB15593,BE0009794,MLWWEEVEDCYEREDVQKKTFTKWVNAQFSKFGKQHIENLFSDLQD...
19204,13469,DB15594,BE0009797,none


In [5]:
targets_df = pd.read_csv("/content/gdrive/MyDrive/BMI 707 Project/embeddings/target_embeddings/target_mappings.csv")

# Clean and sort data
clean_targets_df = targets_df[targets_df.target_sequence != 'none'] # remove entries with values 'none'
clean_targets_df = clean_targets_df.drop_duplicates(subset = ["target_id"]) # drop duplicates 
clean_targets_df = clean_targets_df.sort_values(by="target_sequence", key=lambda x: x.str.len()) # sort by sequence length
clean_targets_df

Unnamed: 0.1,Unnamed: 0,drugbank_id,target_id,target_sequence
2023,358,DB00370,BE0005582,MLLWVQQALLA
7754,2251,DB02379,BE0002017,AGVPFNTKTPYGPT
3041,604,DB00619,BE0001104,GEGDVRCRGAASAVAAAAAAARQ
12279,4870,DB05194,BE0002482,MIWEEFTPEEGKGYREEVLTVKEIT
11351,4195,DB04464,BE0004086,MAQDIISTIGDLVKWIIDTVNKFTKK
...,...,...,...,...
1356,190,DB00201,BE0000739,MGDAEGEDEVQFLRTDDEVVLQCSATVLKEQLKLCLAAEGFGNRLC...
5850,1349,DB01411,BE0001034,MGLPLARLAAVCLALSLAGGSELQTEGRTRYHGRNVCSTWGNFHYK...
12064,4665,DB04959,BE0002460,MATSGGEEAAAAAPAPGTPATGADTTPGWEVAVRPLLSASYSAFEM...
13987,6395,DB07293,BE0003801,MESLVLGVNEKTHVQLSLPVLQVRDVLVRGFGDSVEEALSEAREHL...


In [None]:
# Generate target embeddings as a list

target_embeddings = []

for emb in tqdm(clean_targets_df['target_sequence']):
  one_emb = seq2emb(emb, model) # use seq2emb function defined above
  target_embeddings.append(one_emb)

print('Done!')

In [None]:
# Convert list to array for the dataframe (readability) 

embs_array = []
for i in target_embeddings:
  a = i.numpy()
  embs_array.append(a)

# Add embeddings to dataframe
clean_targets_df['embeddings'] = embs_array

In [None]:
# Export to pickle
clean_targets_df.to_pickle('drugbank_target_embedddings.pkl')

# Copy to another directory to save file
# !cp -r "/content/drugbank_target_embedddings.pkl" "/content/gdrive/MyDrive/BMI 707 Project/embeddings"

## Load and view the final file: 

In [None]:
# Load and view
final_df = pd.read_pickle(deep_learning_dir + "/drugbank_target_embedddings.pkl")
final_df

Unnamed: 0.1,Unnamed: 0,drugbank_id,target_id,target_sequence,embeddings
2023,358,DB00370,BE0005582,MLLWVQQALLA,"[7.6123374360613525, 7.972399162128568, 1.5261..."
7754,2251,DB02379,BE0002017,AGVPFNTKTPYGPT,"[7.6123374360613525, 7.972399162128568, 1.5261..."
3041,604,DB00619,BE0001104,GEGDVRCRGAASAVAAAAAAARQ,"[7.6123374360613525, 7.972399162128568, 1.5261..."
12279,4870,DB05194,BE0002482,MIWEEFTPEEGKGYREEVLTVKEIT,"[7.6123374360613525, 7.972399162128568, 1.5261..."
11351,4195,DB04464,BE0004086,MAQDIISTIGDLVKWIIDTVNKFTKK,"[7.6123374360613525, 7.972399162128568, 1.5261..."
...,...,...,...,...,...
1356,190,DB00201,BE0000739,MGDAEGEDEVQFLRTDDEVVLQCSATVLKEQLKLCLAAEGFGNRLC...,"[387.6335365786217, 532.5416541863233, 167.123..."
5850,1349,DB01411,BE0001034,MGLPLARLAAVCLALSLAGGSELQTEGRTRYHGRNVCSTWGNFHYK...,"[444.92687717976514, 738.1280948370695, -60.24..."
12064,4665,DB04959,BE0002460,MATSGGEEAAAAAPAPGTPATGADTTPGWEVAVRPLLSASYSAFEM...,"[392.5545535045676, 573.7628478072584, 160.352..."
13987,6395,DB07293,BE0003801,MESLVLGVNEKTHVQLSLPVLQVRDVLVRGFGDSVEEALSEAREHL...,"[530.1488111329963, 766.3823193386197, 209.408..."


In [None]:
# View one example
print("Sample Target Embedding\nTarget ID:", final_df['target_id'][1], 
      "\nTarget Embedding:", final_df['embeddings'][1], 
      "\nTarget Sequence:", final_df['target_sequence'][1])

Sample Target Embedding
Target ID: BE0000767 
Target Embedding: [ 91.97858468 125.20936934  41.84956151 ...  45.29450765 -88.95176846
 -41.53821779] 
Target Sequence: MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYVQRNYDLSFLKTIQEVAGYVLIALNTVERIPLENLQIIRGNMYYENSYALAVLSNYDANKTGLKELPMRNLQEILHGAVRFSNNPALCNVESIQWRDIVSSDFLSNMSMDFQNHLGSCQKCDPSCPNGSCWGAGEENCQKLTKIICAQQCSGRCRGKSPSDCCHNQCAAGCTGPRESDCLVCRKFRDEATCKDTCPPLMLYNPTTYQMDVNPEGKYSFGATCVKKCPRNYVVTDHGSCVRACGADSYEMEEDGVRKCKKCEGPCRKVCNGIGIGEFKDSLSINATNIKHFKNCTSISGDLHILPVAFRGDSFTHTPPLDPQELDILKTVKEITGFLLIQAWPENRTDLHAFENLEIIRGRTKQHGQFSLAVVSLNITSLGLRSLKEISDGDVIISGNKNLCYANTINWKKLFGTSGQKTKIISNRGENSCKATGQVCHALCSPEGCWGPEPRDCVSCRNVSRGRECVDKCNLLEGEPREFVENSECIQCHPECLPQAMNITCTGRGPDNCIQCAHYIDGPHCVKTCPAGVMGENNTLVWKYADAGHVCHLCHPNCTYGCTGPGLEGCPTNGPKIPSIATGMVGALLLLLVVALGIGLFMRRRHIVRKRTLRRLLQERELVEPLTPSGEAPNQALLRILKETEFKKIKVLGSGAFGTVYKGLWIPEGEKVKIPVAIKELREATSPKANKEILDEAYVMASVDNPHVCRLLGICLTSTVQLITQLMPFGCLLDYVREHKDNIGSQYLLNWCVQIAKGMNYLEDRRL