# Creating vector representations of protein sequences and SMILES
*Siew Wei Feng*

## Required libraries

In [1]:
import pickle as pkl
import pandas as pd
import numpy as np
import gensim 
from gensim.models import Word2Vec 
import random 
import re

## Functions

### Function to obtain vector representations given protein sequences / SMILES

In [2]:
def word2vec(dims,data,window_size,negative_size):
    """Train a Word2Vec model on 3-character tokens and return one row per token.

    Each document in ``data`` is cut into consecutive, non-overlapping
    3-character tokens. A trailing remainder of one or two characters is
    dropped, exactly as before. A non-empty document that is shorter than
    3 characters used to produce no tokens at all and therefore disappeared
    from the result; it is now kept as a single token so that every
    non-empty document contributes at least one row.

    NOTE: an empty string still produces no rows, so callers that slice the
    aggregated result by position must not pass empty documents.

    Parameters
    ----------
    dims : int
        Size of the embedding vectors (passed to gensim as ``vector_size``).
    data : sequence of str
        Documents to embed (protein sequences or SMILES strings).
    window_size : int
        Context window, passed to gensim as ``window``.
    negative_size : int
        Number of negative samples, passed to gensim as ``negative``.

    Returns
    -------
    pandas.DataFrame
        One row per token occurrence, in document order, with columns
        ``Id`` (position of the document in ``data``), ``Word`` (the token)
        and ``vec_0`` .. ``vec_{dims-1}`` (the token's embedding).
    """
    texts = []
    for document in list(data):
        tokens = re.findall(r'.{3}',document)
        if not tokens and document:
            # Too short for a single 3-mer: keep the whole string as one
            # token instead of silently losing the document.
            tokens = [document]
        texts.append(tokens)

    # gensim 4 API: `vector_size` replaces the old `size` argument and
    # `wv.key_to_index` replaces the old `wv.vocab`.
    model = Word2Vec(texts,vector_size=dims,window=window_size,min_count=1,negative=negative_size,sg=1,sample=0.001,hs=1,workers=4)

    # Lookup table: one row per vocabulary word, columns 0..dims-1 plus 'Word'.
    word_to_indexes = model.wv.key_to_index
    vocabulary = list(word_to_indexes)
    vectors = pd.DataFrame([model.wv.vectors[word_to_indexes[word]] for word in vocabulary])
    vectors['Word'] = vocabulary

    # One row per token occurrence, tagged with the position of its document.
    word_vec = pd.DataFrame({
        'Id': [doc_id for doc_id, tokens in enumerate(texts) for _ in tokens],
        'Word': [token for tokens in texts for token in tokens],
    })

    # The left merge keeps the token order and attaches each token's vector.
    word_vec = word_vec.merge(vectors,on='Word', how='left')
    word_vec.columns = ['Id']+['Word']+["vec_{0}".format(i) for i in range(0,dims)]

    return word_vec

In [3]:
def feature_embeddings(word_vec,dims):
    """Average the token vectors of each document into a single embedding row.

    Parameters
    ----------
    word_vec : pandas.DataFrame
        Frame with an ``Id`` column, a ``Word`` column and the vector columns
        ``vec_0`` .. ``vec_{dims-1}``. It is not modified.
    dims : int
        Number of vector columns to average.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``Id`` (ascending), with columns ``Index`` (the
        document id) and ``mean_ci_0`` .. ``mean_ci_{dims-1}`` (column means).
    """
    vector_columns = ["vec_{0}".format(i) for i in range(dims)]
    per_document_mean = (
        word_vec
        .drop(columns='Word')
        .groupby('Id')[vector_columns]
        .mean()
        .reset_index()
    )
    per_document_mean.columns = ["Index"] + ["mean_ci_{0}".format(i) for i in range(dims)]
    return per_document_mean

### Load train and test datasets

In [4]:
# Load the full training and test tables from their pickle files.
train_df = pd.read_pickle('pdb_train.p')
test_df = pd.read_pickle('pdb_test.p')

# One-off preprocessing step, kept commented out. Uncomment it to build and save
# a dictionary where key = pdb_id and value = array of the lig_ids of the ligands
# that bind to that protein. It reads the full (un-sampled) tables, so it has to
# run before the sampling below. The file it writes,
# 'pdb_id_binds_to_lig_id.pckl', is read back later in this notebook.

#pdb_ids= list(train_df['pdb_id']) + list(test_df['pdb_id'])
#lig_ids= list(train_df['lig_id']) + list(test_df['lig_id'])
#unique_pdb_ids= np.unique(pdb_ids)
#pdb_id_binds_to_lig_id= dict()

#for pdb_id in unique_pdb_ids:
    #indices = [i for i, x in enumerate(pdb_ids) if x == pdb_id]
    #pdb_id_binds_to_lig_id[pdb_id]= np.take(lig_ids, indices)

#f = open('pdb_id_binds_to_lig_id.pckl', 'wb')
#pkl.dump(pdb_id_binds_to_lig_id, f)
#f.close()

# Work on random subsets: 6000 training rows and 2000 test rows. No random_state
# is passed, so a different subset is drawn on every run.
train_df = train_df.sample(n=6000)
test_df = test_df.sample(n=2000)

In [5]:
# Load the pdb_id -> bound lig_ids lookup. The `with` block closes the file
# even if unpickling raises.
with open('pdb_id_binds_to_lig_id.pckl', 'rb') as f:
    pdb_id_binds_to_lig_id = pkl.load(f)

### Create negative samples (i.e. non-binding protein-ligand pairs)

In [6]:
def generate_negative_samples(df, negative_samples_per_positive_sample, pdb_id_binds_to_lig_id):
    """Create non-binding protein-ligand rows by re-pairing rows of ``df``.

    Two distinct row positions are drawn at random. If the ligand of the
    second row is not listed as binding to the protein of the first row, a
    negative sample is built: a copy of the first row whose ligand columns
    (``lig_id``, ``ligand_xyz``, ``ligand_xyz_2d``, ``ligand_bonds``,
    ``smiles``) are replaced by those of the second row. ``lig_id`` is
    replaced too, so the identifier always matches the ligand data in the
    row; previously it kept the id of the protein row's own ligand.

    Parameters
    ----------
    df : pandas.DataFrame
        Positive samples; must contain ``pdb_id`` and the ligand columns
        listed above.
    negative_samples_per_positive_sample : int
        Number of negatives to create per row of ``df``.
    pdb_id_binds_to_lig_id : mapping
        ``pdb_id`` -> collection of ``lig_id`` values that bind to it.

    Returns
    -------
    pandas.DataFrame
        ``negative_samples_per_positive_sample * len(df)`` rows with the same
        columns as ``df``. Row labels are those of the source protein rows.

    Notes
    -----
    Uses the global ``np.random`` state. The loop only ends once enough
    valid pairs have been drawn, so it does not terminate if no non-binding
    pair exists in ``df``.
    """
    n_rows = len(df)
    n_negative_samples = negative_samples_per_positive_sample*n_rows

    # Materialise the id columns once instead of on every draw.
    pdb_ids = list(df['pdb_id'])
    lig_ids = list(df['lig_id'])
    ligand_columns = ('lig_id', 'ligand_xyz', 'ligand_xyz_2d', 'ligand_bonds', 'smiles')

    negative_samples = []
    while len(negative_samples) < n_negative_samples:
        idx1, idx2 = np.random.choice(n_rows, 2, replace=False)
        if lig_ids[idx2] in pdb_id_binds_to_lig_id[pdb_ids[idx1]]:
            # This ligand really binds this protein: not a negative pair.
            continue
        negative_sample = df.iloc[idx1].copy()
        ligand_row = df.iloc[idx2]
        for column in ligand_columns:
            negative_sample[column] = ligand_row[column]
        negative_samples.append(negative_sample)
    return pd.DataFrame(negative_samples)

In [7]:
# Add one negative (non-binding) pair per positive training pair.
negative_samples_per_positive_sample = 1
n_positive_samples = len(train_df)
n_negative_samples = negative_samples_per_positive_sample* n_positive_samples
negative_samples = generate_negative_samples(train_df, negative_samples_per_positive_sample, pdb_id_binds_to_lig_id)
train_df = pd.concat([train_df, negative_samples], ignore_index=True)
# Labels follow the row order of train_df: positives (1) first, then negatives (0).
# Building one frame gives a unique 0..N-1 index that matches the re-indexed
# train_df; concatenating two frames without ignore_index repeated the labels
# 0..n-1 twice.
y_train = pd.DataFrame({'labels': [1]*n_positive_samples + [0]*n_negative_samples})

In [8]:
y_train.shape

(12000, 1)

In [9]:
# Add one negative (non-binding) pair per positive test pair.
negative_samples_per_positive_sample = 1
n_positive_samples = len(test_df)
n_negative_samples = negative_samples_per_positive_sample * n_positive_samples
negative_samples = generate_negative_samples(test_df, negative_samples_per_positive_sample, pdb_id_binds_to_lig_id)
test_df = pd.concat([test_df, negative_samples], ignore_index=True)
# Labels follow the row order of test_df: positives (1) first, then negatives (0).
# Building one frame gives a unique 0..N-1 index that matches the re-indexed
# test_df; concatenating two frames without ignore_index repeated the labels
# 0..n-1 twice.
y_test = pd.DataFrame({'labels': [1]*n_positive_samples + [0]*n_negative_samples})

In [10]:
y_test.shape

(4000, 1)

In [11]:
n_positive_samples

2000

In [12]:
n_negative_samples

2000

### Vectorize protein sequences in train and test dataset

In [13]:
# Embed train and test sequences with one shared model: training sequences
# come first in `seqs`, test sequences after them.
seqs = [*train_df['seq'], *test_df['seq']]
prot_vec = word2vec(
    dims=100,
    data=seqs,
    window_size=5,
    negative_size=5,
)
prot_feature_embeddings = feature_embeddings(word_vec=prot_vec, dims=100)

In [14]:
# Split the embeddings back into train and test by position and drop the first
# column ('Index'). This relies on there being exactly one embedding row per
# sequence, in input order.
seq_train = prot_feature_embeddings.iloc[:len(train_df['seq']), 1:]
seq_test = prot_feature_embeddings.iloc[len(train_df['seq']):, 1:]

### Vectorize ligand SMILES in train and test dataset

In [15]:
# Embed train and test SMILES with one shared model: training SMILES come
# first in `smiles`, test SMILES after them.
smiles = [*train_df['smiles'], *test_df['smiles']]
smiles_vec = word2vec(
    dims=100,
    data=smiles,
    window_size=5,
    negative_size=5,
)
smiles_feature_embeddings = feature_embeddings(word_vec=smiles_vec, dims=100)

In [16]:
# Split the embeddings back into train and test by position and drop the first
# column ('Index'). This relies on there being exactly one embedding row per
# SMILES string, in input order.
smiles_train = smiles_feature_embeddings.iloc[:len(train_df['smiles']), 1:]
smiles_test = smiles_feature_embeddings.iloc[len(train_df['smiles']):, 1:]

### Create vector representation of protein-ligand pair by concatenating their vector representations together

In [17]:
# Place the protein and ligand embeddings side by side (rows are matched on
# their index labels); ignore_index renumbers the resulting columns 0..N-1.
x_train = pd.concat((seq_train, smiles_train), axis='columns', ignore_index=True)
x_test = pd.concat((seq_test, smiles_test), axis='columns', ignore_index=True)

In [18]:
x_test.shape

(4000, 200)

### Save x_train, x_test, y_train and y_test

In [19]:
# Persist the feature matrices and labels. Each `with` block closes its file
# even if pickling raises.
with open('x_train.pckl', 'wb') as f:
    pkl.dump(x_train, f)

with open('x_test.pckl', 'wb') as f:
    pkl.dump(x_test, f)

with open('y_train.pckl', 'wb') as f:
    pkl.dump(y_train, f)

with open('y_test.pckl', 'wb') as f:
    pkl.dump(y_test, f)


In [20]:
# Reload the saved feature matrices and labels (e.g. in a fresh session).
# Each `with` block closes its file even if unpickling raises.
with open('x_train.pckl', 'rb') as f:
    x_train = pkl.load(f)

with open('x_test.pckl', 'rb') as f:
    x_test = pkl.load(f)

with open('y_train.pckl', 'rb') as f:
    y_train = pkl.load(f)

with open('y_test.pckl', 'rb') as f:
    y_test = pkl.load(f)