In [1]:
!pip install rdkit-pypi -q
# !pip install pybel -q
# !pip install PyBioMed -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
import torch
from tqdm import tqdm
# from transformers import BertModel, BertTokenizer
# from PyBioMed.PyProtein import CTD
import warnings
warnings.filterwarnings('ignore')

In [4]:
working_directory = "/content/drive/MyDrive/Protacs_new_dataset/"
os.chdir(working_directory)
print("current working directory is: ", os.getcwd())

current working directory is:  /content/drive/MyDrive/Protacs_new_dataset


In [5]:
# Project layout, all rooted at the current working directory.
data_dir = os.path.join(os.getcwd(), 'data')
src_dir = os.path.join(os.getcwd(), 'src')
fig_dir = os.path.join(data_dir, 'figures')
checkpoint_dir = os.path.join(os.getcwd(), 'checkpoints')
dirs_to_make = [
    data_dir,
    os.path.join(data_dir, 'raw'),
    os.path.join(data_dir, 'processed'),
    os.path.join(data_dir, 'train'),
    os.path.join(data_dir, 'val'),
    os.path.join(data_dir, 'test'),
    src_dir,
    fig_dir,
    checkpoint_dir,
]
for d in dirs_to_make:
    # exist_ok=True keeps the cell idempotent without a separate existence check,
    # which could race with another process creating the same folder.
    os.makedirs(d, exist_ok=True)

In [6]:
# Per-split folders under the data root; the scaffold-split CSVs are read from these.
train_dir, val_dir, test_dir = (
    os.path.join(data_dir, split_name) for split_name in ('train', 'val', 'test')
)

In [7]:
selected_columns = ['Smiles', 'Cell Type', 'POI Sequence', 'E3 Ligase Sequence', 'Activity']

In [8]:
def load_data(directory, columns, prefix):
    """Load and stack every ``<prefix>*.csv`` file found directly in ``directory``.

    Args:
        directory: Folder to scan (not recursive).
        columns: Column names to read from each file (passed to ``pd.read_csv``
            as ``usecols``).
        prefix: Only files whose name starts with this string and ends with
            ``.csv`` are read.

    Returns:
        One DataFrame with a fresh 0..n-1 index holding the rows of all matching
        files, or an empty DataFrame carrying ``columns`` when nothing matches.
    """
    # os.listdir() returns names in arbitrary order. Sort them so the row order of
    # the result is reproducible: the descriptor, fingerprint and embedding files
    # produced later in this notebook are matched to these rows purely by position.
    matching = sorted(
        name for name in os.listdir(directory)
        if name.startswith(prefix) and name.endswith('.csv')
    )
    frames = [pd.read_csv(os.path.join(directory, name), usecols=columns) for name in matching]
    if frames:
        return pd.concat(frames, ignore_index=True)
    return pd.DataFrame(columns=columns)

In [9]:
train_data = load_data(train_dir, selected_columns, 'scaffold_train_data')
val_data = load_data(val_dir, selected_columns, 'scaffold_val_data')
test_data = load_data(test_dir, selected_columns, 'scaffold_test_data')

In [10]:
print("Train Data:")
print(train_data.shape)
print("\nValidation Data:")
print(val_data.shape)
print("\nTest Data:")
print(test_data.shape)

Train Data:
(1262, 5)

Validation Data:
(245, 5)

Test Data:
(97, 5)


In [11]:
train_data.head()

Unnamed: 0,Smiles,Cell Type,POI Sequence,E3 Ligase Sequence,Activity
0,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,LnCaP95,MSTATTVAPAGIPATPGPVNPPPPEVSNPSKPGRKTNQLQYMQNVV...,MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...,True
1,Cc1ccsc1C(=O)n1nc(Nc2ccc(S(=O)(=O)NCCOCc3cn(CC...,HBL-1,MENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTETEGVPS...,MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI...,True
2,COc1ccc(Cl)c(S(=O)(=O)Nc2ccc(-c3nc(OC[C@H]4CN(...,HEK293,MQRDHTMDYKESCPSVSIPSSDEHREKKKRFTVYKVLVSVGRSEWF...,MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...,False
3,CN(c1cccc(CNc2nc(Nc3ccc(OCCOCCOCCC(=O)NCCOc4cc...,PC3,MAAAYLDPNLNHTPNSSTKTHLGTGMERSPGAMERVLKVFHYFESN...,MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI...,True
4,COc1cc2c(Oc3ccc(NC(=O)C4(C(=O)Nc5ccc(F)cc5)CC4...,MDA-MB-231,MSLIRKKGFYKQDVNKTAWELPKTYVSPTHVGSGAYGSVCSAIDKR...,MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...,True


# **Calculating Descriptors and fingerprints for Scaffold splits**

### These descriptors were selected by applying the Mann–Whitney U test and adjusting the p-values with the Benjamini–Hochberg procedure

In [12]:
descriptor_names = ['SMR_VSA9', 'fr_NH0', 'fr_piperzine',
       'NumRotatableBonds', 'SlogP_VSA3', 'BCUT2D_MWHI', 'EState_VSA10',
       'PEOE_VSA14', 'fr_Al_OH_noTert', 'NumHDonors', 'SlogP_VSA8', 'Kappa2',
       'NumSaturatedRings', 'BCUT2D_CHGLO', 'Kappa3', 'SPS', 'PEOE_VSA3',
       'qed', 'PEOE_VSA1', 'BCUT2D_MRHI', 'fr_Al_OH', 'SMR_VSA1',
       'NumSaturatedHeterocycles', 'BalabanJ', 'VSA_EState3',
       'NumAliphaticRings', 'EState_VSA2', 'NumAliphaticHeterocycles',
       'SlogP_VSA10', 'SMR_VSA6', 'fr_sulfonamd', 'SlogP_VSA7', 'EState_VSA8',
       'fr_piperdine', 'NHOHCount', 'fr_aniline', 'fr_COO2', 'fr_Al_COO',
       'fr_COO']

In [13]:
import os
import pandas as pd
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors

# Map each split name to its folder under <cwd>/data
data_dirs = {
    split_name: os.path.join(os.getcwd(), 'data', split_name)
    for split_name in ('train', 'val', 'test')
}

calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

# Function to compute molecular descriptors
def compute_descriptors(smiles):
    """Compute the selected RDKit descriptors for a single SMILES string.

    Args:
        smiles: SMILES text. Any non-string value (for example the float NaN that
            pandas uses for an empty CSV cell) is treated as unparseable.

    Returns:
        Whatever ``calculator.CalcDescriptors`` returns for the parsed molecule, or
        a list of ``None`` with one entry per name in ``descriptor_names`` when no
        molecule could be built.
    """
    # Only hand real strings to RDKit; a missing value would otherwise abort the
    # whole DataFrame.apply() instead of producing one empty descriptor row.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return [None] * len(descriptor_names)
    return calculator.CalcDescriptors(mol)

# Function to process datasets and save descriptors
def compute_and_save_descriptors(data_dirs):
    """Compute molecular descriptors for each split and write them next to the data.

    For every ``scaffold*.csv`` file in each split folder that has a ``Smiles``
    column, ``compute_descriptors`` is applied to each SMILES and the result is
    written to ``scaffold_<split>_descriptors.csv`` in the same folder (no index
    column). Files without a ``Smiles`` column are reported and skipped.

    Args:
        data_dirs: Mapping of split name -> folder path.

    Note:
        The output name depends only on the split, so if a folder holds more than
        one SMILES-bearing ``scaffold*.csv`` file the last one processed wins.
        Files are visited in sorted name order so that outcome is deterministic.
    """
    for split, data_dir in data_dirs.items():
        for file in sorted(os.listdir(data_dir)):
            if not (file.startswith('scaffold') and file.endswith('.csv')):
                continue
            file_path = os.path.join(data_dir, file)

            # Read the header row only. The folder also contains wide derived
            # tables (fingerprints, earlier descriptor output) that match the same
            # name pattern; parsing them in full just to test for a column is wasted work.
            if 'Smiles' not in pd.read_csv(file_path, nrows=0).columns:
                print(f'Skipping {file}: No Smiles column found')
                continue

            # Compute descriptors (only the SMILES column is needed)
            smiles = pd.read_csv(file_path, usecols=['Smiles'])['Smiles']
            descriptor_values = smiles.apply(compute_descriptors)
            descriptor_df = pd.DataFrame(descriptor_values.tolist(), columns=descriptor_names)

            # Save to new CSV file
            output_file_path = os.path.join(data_dir, f'scaffold_{split}_descriptors.csv')
            descriptor_df.to_csv(output_file_path, index=False)
            print(f'Saved descriptors to {output_file_path}')

# Run the function to compute and save descriptors
compute_and_save_descriptors(data_dirs)


Saved descriptors to /content/drive/MyDrive/Protacs_new_dataset/data/train/scaffold_train_descriptors.csv
Skipping scaffold_train_fingerprints.csv: No Smiles column found
Skipping scaffold_train_protein_descriptors.csv: No Smiles column found
Skipping scaffold_train_e3_descriptors.csv: No Smiles column found
Saved descriptors to /content/drive/MyDrive/Protacs_new_dataset/data/val/scaffold_val_descriptors.csv
Skipping scaffold_val_fingerprints.csv: No Smiles column found
Skipping scaffold_val_protein_descriptors.csv: No Smiles column found
Skipping scaffold_val_e3_descriptors.csv: No Smiles column found
Saved descriptors to /content/drive/MyDrive/Protacs_new_dataset/data/test/scaffold_test_descriptors.csv
Skipping scaffold_descriptors.csv: No Smiles column found
Skipping scaffold_test_fingerprints.csv: No Smiles column found
Skipping scaffold_test_protein_descriptors.csv: No Smiles column found
Skipping scaffold_test_e3_descriptors.csv: No Smiles column found


In [None]:
# `descriptor_df_test` is not assigned by any earlier cell shown in this notebook, so
# displaying it only works with leftover kernel state. Reload the test-split descriptors
# that compute_and_save_descriptors() wrote to disk, then display them.
descriptor_df_test = pd.read_csv(os.path.join(data_dirs['test'], 'scaffold_test_descriptors.csv'))
descriptor_df_test

Unnamed: 0,SMR_VSA9,fr_NH0,fr_piperzine,NumRotatableBonds,SlogP_VSA3,BCUT2D_MWHI,EState_VSA10,PEOE_VSA14,fr_Al_OH_noTert,NumHDonors,...,SMR_VSA6,fr_sulfonamd,SlogP_VSA7,EState_VSA8,fr_piperdine,NHOHCount,fr_aniline,fr_COO2,fr_Al_COO,fr_COO
0,11.818733,5,1,8,9.589074,35.495693,23.972686,0.000000,0,2,...,49.068358,0,5.022633,31.402528,1,2,2,0,0,0
1,34.038075,8,0,13,20.804433,35.495695,24.752031,0.000000,1,2,...,32.723782,0,5.022633,69.456636,2,2,0,0,0,0
2,16.819815,7,0,18,23.857337,35.496773,14.383612,0.000000,0,2,...,81.438015,0,5.022633,34.678415,0,2,2,0,0,0
3,5.749512,8,1,16,16.133831,35.495695,23.972686,0.000000,0,4,...,60.180940,0,5.022633,51.263431,1,4,5,0,0,0
4,11.840869,2,0,12,37.004525,35.495694,41.938468,5.969305,0,3,...,6.606882,0,5.022633,27.458424,1,3,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,16.814289,7,0,37,62.301806,32.233233,32.390483,0.000000,0,3,...,153.083699,1,0.000000,25.323629,2,3,3,0,0,0
93,11.840869,8,0,17,48.351903,32.233260,45.561728,6.176299,0,4,...,59.986551,1,0.000000,58.372026,1,4,5,0,0,0
94,5.749512,1,0,19,61.925264,16.593113,38.356297,23.877221,0,1,...,59.964853,0,0.000000,5.316789,1,1,0,0,0,0
95,22.756403,6,0,17,14.383612,32.133485,37.143931,0.000000,0,4,...,52.049878,0,0.000000,20.517465,2,6,1,0,0,0


## **Fingerprints**

In [None]:
# Split name -> folder under <cwd>/data (same mapping as in the descriptor section)
data_dirs = {
    split_name: os.path.join(os.getcwd(), 'data', split_name)
    for split_name in ('train', 'val', 'test')
}

In [None]:
# Compute Morgan fingerprints
def compute_morgan_fingerprints(smiles_list, radius=4, n_bits=2048):
    """Return one Morgan fingerprint bit string per input SMILES, in input order.

    Args:
        smiles_list: Iterable of SMILES strings.
        radius: Radius passed to ``AllChem.GetMorganFingerprintAsBitVect``.
        n_bits: Bit-vector length (passed as ``nBits``).

    Returns:
        A list of strings. A SMILES that ``Chem.MolFromSmiles`` cannot parse
        contributes a string of ``n_bits`` zeros rather than being dropped.
    """
    def _bit_string(smiles):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Keep row alignment with the input by emitting an all-zero vector.
            return '0' * n_bits
        bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        return bit_vect.ToBitString()

    return [_bit_string(smiles) for smiles in smiles_list]

# Process each dataset
# The loop variable is deliberately NOT named `data_dir`: this loop runs at module level,
# so that name would rebind the notebook-wide `data_dir` (the data root) to the last split
# folder and silently change every later path built from it.
fp_n_bits = 2048  # bit-vector length; also the number of FP_<i> columns written
for split, split_dir in data_dirs.items():
    for file in sorted(os.listdir(split_dir)):
        if not (file.startswith('scaffold') and file.endswith('.csv')):
            continue
        file_path = os.path.join(split_dir, file)

        # Header-only probe: other derived tables in this folder match the same name
        # pattern and do not need to be parsed in full just to be skipped.
        if 'Smiles' not in pd.read_csv(file_path, nrows=0).columns:
            print(f'Skipping {file}: No Smiles column found')
            continue

        # Compute Morgan fingerprints
        smiles = pd.read_csv(file_path, usecols=['Smiles'])['Smiles']
        fingerprints = compute_morgan_fingerprints(smiles, n_bits=fp_n_bits)

        fingerprints_int = [[int(bit) for bit in fp] for fp in fingerprints]

        # Column names come from the configured width, not from the first row, so a
        # file with zero rows yields an empty table instead of an IndexError.
        fingerprints_df = pd.DataFrame(fingerprints_int, columns=[f'FP_{i}' for i in range(fp_n_bits)])

        output_file_path = os.path.join(split_dir, f'scaffold_{split}_fingerprints.csv')
        fingerprints_df.to_csv(output_file_path, index=False)
        print(f'Saved fingerprints to {output_file_path}')

print("Morgan fingerprints calculation and saving complete.")

Saved fingerprints to /content/drive/MyDrive/Protacs_new_dataset/data/train/scaffold_train_fingerprints.csv
Saved fingerprints to /content/drive/MyDrive/Protacs_new_dataset/data/val/scaffold_val_fingerprints.csv
Saved fingerprints to /content/drive/MyDrive/Protacs_new_dataset/data/test/scaffold_test_fingerprints.csv
Skipping scaffold_descriptors.csv: No Smiles column found
Morgan fingerprints calculation and saving complete.


In [None]:
# check the shape of saved feature tables (fingerprints by default)
def check_fingerprint_shapes(data_dirs, suffix='_fingerprints.csv'):
    """Print the ``(rows, columns)`` shape of every saved ``scaffold*<suffix>`` CSV.

    Args:
        data_dirs: Mapping of split name -> folder path.
        suffix: File-name ending that selects which tables to inspect. Defaults to
            the fingerprint tables.

    NOTE(review): this notebook defines ``check_fingerprint_shapes`` again in two
    later cells that differ only in the suffix; the ``suffix`` argument lets a
    single definition serve all three checks.
    """
    for data_dir in data_dirs.values():
        for file in sorted(os.listdir(data_dir)):
            if file.startswith('scaffold') and file.endswith(suffix):
                df = pd.read_csv(os.path.join(data_dir, file))
                print(f'Shape of {file}: {df.shape}')

check_fingerprint_shapes(data_dirs)

Shape of scaffold_train_fingerprints.csv: (1262, 2048)
Shape of scaffold_val_fingerprints.csv: (245, 2048)
Shape of scaffold_test_fingerprints.csv: (97, 2048)


# **Descriptors for POI sequences**

In [None]:
train_data.head(3)

Unnamed: 0,Smiles,Cell Type,POI Sequence,E3 Ligase Sequence,Activity
0,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,LnCaP95,MSTATTVAPAGIPATPGPVNPPPPEVSNPSKPGRKTNQLQYMQNVV...,MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...,True
1,Cc1ccsc1C(=O)n1nc(Nc2ccc(S(=O)(=O)NCCOCc3cn(CC...,HBL-1,MENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTETEGVPS...,MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI...,True
2,COc1ccc(Cl)c(S(=O)(=O)Nc2ccc(-c3nc(OC[C@H]4CN(...,HEK293,MQRDHTMDYKESCPSVSIPSSDEHREKKKRFTVYKVLVSVGRSEWF...,MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...,False


In [None]:
poi_seq = train_data['POI Sequence'].to_list()

In [None]:
poi_seq

['MSDVTIVKEGWVQKRGEYIKNWRPRYFLLKTDGSFIGYKEKPQDVDLPYPLNNFSVAKCQLMKTERPKPNTFIIRCLQWTTVIERTFHVDTPEEREEWTEAIQAVADRLQRQEEERMNCSPTSQIDNIGEEEMDASTTHHKRKTMNDFDYLKLLGKGTFGKVILVREKASGKYYAMKILKKEVIIAKDEVAHTLTESRVLKNTRHPFLTSLKYSFQTKDRLCFVMEYVNGGELFFHLSRERVFSEDRTRFYGAEIVSALDYLHSGKIVYRDLKLENLMLDKDGHIKITDFGLCKEGITDAATMKTFCGTPEYLAPEVLEDNDYGRAVDWWGLGVVMYEMMCGRLPFYNQDHEKLFELILMEDIKFPRTLSSDAKSLLSGLLIKDPNKRLGGGPDDAKEIMRHSFFSGVNWQDVYDKKLVPPFKPQVTSETDTRYFDEEFTAQTITITPPEKYDEDGMDCMDNERRPHFPQFSYSASGRE',
 'MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPEELGAEEEMEAGRPRPVLRSVNSREPSQVIFCNRSPRVVLPVWLNFDGEPQPYPTLPPGTGRRIHSYRGHLWLFRDAGTHDGLLVNQTELFVPSLNVDGQPIFANITLPVYTLKERCLQVVRSLVKPENYRRLDIVRSLYEDLEDHPNVQKDLERLTQERIAHQRMGD',
 'MNGEAICSALPTIPYHKLADLRYLSRGASGTVSSARHADWRVQVAVKHLHIHTPLLDSERKDVLREAEILHKARFSYILPILGICNEPEFLGIVTEYMPNGSLNELLHRKTEYPDVAWPLRFRILHEIALGVNYLHNMTPPLLHHDLKTQNILLDNEFHVKIADFGLSKWRMMSLSQSRSSKSAPEGGTIIYMPPENYEPGQKSRASIKHDIYSYAVITWEVLSRKQPFEDVTNPLQIMYSVSQGHRPVINEESLPYDIPHRARMISLIESGWAQNPDERPSFLKCLIELEPVLRT

In [None]:
# @title **Calc POI seq Descriptors**

def compute_protein_descriptors(data_dirs):
    """Compute ``CTD.CalculateC`` for every POI sequence and save one table per split.

    For each ``scaffold*.csv`` file in each split folder that has a
    ``POI Sequence`` column, ``CTD.CalculateC`` is called on every sequence and
    the collected results are written to ``scaffold_<split>_protein_descriptors.csv``
    in the same folder (no index column). Other matching files are reported and skipped.

    Args:
        data_dirs: Mapping of split name -> folder path.

    NOTE(review): ``CTD`` must already be in scope. The only import of it visible in
    this notebook (``from PyBioMed.PyProtein import CTD``) is commented out in the
    import cell, so on a fresh kernel this function raises NameError until that
    import is restored.
    """
    for split, data_dir in data_dirs.items():
        # Sorted so that, if several inputs share the single per-split output
        # path, which one is written last does not depend on the filesystem.
        for file in sorted(os.listdir(data_dir)):
            if not (file.startswith('scaffold') and file.endswith('.csv')):
                continue
            file_path = os.path.join(data_dir, file)

            # Header-only probe; avoids fully parsing wide derived tables
            # (e.g. the fingerprint CSVs) that are only going to be skipped.
            if 'POI Sequence' not in pd.read_csv(file_path, nrows=0).columns:
                print(f'Skipping {file}: No POI Sequence column found')
                continue

            # Compute descriptors
            sequences = pd.read_csv(file_path, usecols=['POI Sequence'])['POI Sequence']
            descriptors_df = pd.DataFrame([CTD.CalculateC(protein) for protein in sequences])

            output_file_path = os.path.join(data_dir, f'scaffold_{split}_protein_descriptors.csv')
            descriptors_df.to_csv(output_file_path, index=False)
            print(f'Saved protein descriptors to {output_file_path}')

compute_protein_descriptors(data_dirs)


Saved protein descriptors to /content/drive/MyDrive/Protacs_new_dataset/data/train/scaffold_train_protein_descriptors.csv
Skipping scaffold_train_fingerprints.csv: No POI Sequence column found
Saved protein descriptors to /content/drive/MyDrive/Protacs_new_dataset/data/val/scaffold_val_protein_descriptors.csv
Skipping scaffold_val_fingerprints.csv: No POI Sequence column found
Saved protein descriptors to /content/drive/MyDrive/Protacs_new_dataset/data/test/scaffold_test_protein_descriptors.csv
Skipping scaffold_descriptors.csv: No POI Sequence column found
Skipping scaffold_test_fingerprints.csv: No POI Sequence column found


In [None]:
# Function to check the shape of saved feature tables (POI protein descriptors by default)
def check_fingerprint_shapes(data_dirs, suffix='_protein_descriptors.csv'):
    """Print the ``(rows, columns)`` shape of every saved ``scaffold*<suffix>`` CSV.

    Despite its name, this definition inspects the protein-descriptor tables by
    default; pass another ``suffix`` to inspect a different family of saved tables.

    Args:
        data_dirs: Mapping of split name -> folder path.
        suffix: File-name ending that selects which tables to inspect.

    NOTE(review): this replaces the same-named function defined earlier in the
    notebook (whose filter was ``_fingerprints.csv``).
    """
    for data_dir in data_dirs.values():
        for file in sorted(os.listdir(data_dir)):
            if file.startswith('scaffold') and file.endswith(suffix):
                df = pd.read_csv(os.path.join(data_dir, file))
                print(f'Shape of {file}: {df.shape}')

check_fingerprint_shapes(data_dirs)

Shape of scaffold_train_protein_descriptors.csv: (1262, 21)
Shape of scaffold_val_protein_descriptors.csv: (245, 21)
Shape of scaffold_test_protein_descriptors.csv: (97, 21)


In [None]:
train_data.columns

Index(['Smiles', 'Cell Type', 'POI Sequence', 'E3 Ligase Sequence',
       'Activity'],
      dtype='object')

In [None]:
# @title **Calc E3 Descriptors**

def compute_E3_descriptors(data_dirs):
    """Compute ``CTD.CalculateC`` for every E3 ligase sequence and save one table per split.

    For each ``scaffold*.csv`` file in each split folder that has an
    ``E3 Ligase Sequence`` column, ``CTD.CalculateC`` is called on every sequence
    and the collected results are written to ``scaffold_<split>_e3_descriptors.csv``
    in the same folder (no index column). Other matching files are reported and skipped.

    Args:
        data_dirs: Mapping of split name -> folder path.

    NOTE(review): ``CTD`` must already be in scope. The only import of it visible in
    this notebook (``from PyBioMed.PyProtein import CTD``) is commented out in the
    import cell, so on a fresh kernel this function raises NameError until that
    import is restored.
    """
    for split, data_dir in data_dirs.items():
        # Sorted so that, if several inputs share the single per-split output
        # path, which one is written last does not depend on the filesystem.
        for file in sorted(os.listdir(data_dir)):
            if not (file.startswith('scaffold') and file.endswith('.csv')):
                continue
            file_path = os.path.join(data_dir, file)

            # Header-only probe; avoids fully parsing wide derived tables
            # (e.g. the fingerprint CSVs) that are only going to be skipped.
            if 'E3 Ligase Sequence' not in pd.read_csv(file_path, nrows=0).columns:
                print(f'Skipping {file}: No E3 ligase Sequence column found')
                continue

            e3_sequences = pd.read_csv(file_path, usecols=['E3 Ligase Sequence'])['E3 Ligase Sequence']
            descriptors_df = pd.DataFrame([CTD.CalculateC(protein) for protein in e3_sequences])

            output_file_path = os.path.join(data_dir, f'scaffold_{split}_e3_descriptors.csv')
            descriptors_df.to_csv(output_file_path, index=False)
            print(f'Saved E3 ligase descriptors to {output_file_path}')

compute_E3_descriptors(data_dirs)


Saved E3 ligase descriptors to /content/drive/MyDrive/Protacs_new_dataset/data/train/scaffold_train_e3_descriptors.csv
Skipping scaffold_train_fingerprints.csv: No E3 ligase Sequence column found
Skipping scaffold_train_protein_descriptors.csv: No E3 ligase Sequence column found
Saved E3 ligase descriptors to /content/drive/MyDrive/Protacs_new_dataset/data/val/scaffold_val_e3_descriptors.csv
Skipping scaffold_val_fingerprints.csv: No E3 ligase Sequence column found
Skipping scaffold_val_protein_descriptors.csv: No E3 ligase Sequence column found
Saved E3 ligase descriptors to /content/drive/MyDrive/Protacs_new_dataset/data/test/scaffold_test_e3_descriptors.csv
Skipping scaffold_descriptors.csv: No E3 ligase Sequence column found
Skipping scaffold_test_fingerprints.csv: No E3 ligase Sequence column found
Skipping scaffold_test_protein_descriptors.csv: No E3 ligase Sequence column found


In [None]:
# Function to check the shape of saved feature tables (E3 ligase descriptors by default)
def check_fingerprint_shapes(data_dirs, suffix='_e3_descriptors.csv'):
    """Print the ``(rows, columns)`` shape of every saved ``scaffold*<suffix>`` CSV.

    Despite its name, this definition inspects the E3-ligase descriptor tables by
    default; pass another ``suffix`` to inspect a different family of saved tables.

    Args:
        data_dirs: Mapping of split name -> folder path.
        suffix: File-name ending that selects which tables to inspect.

    NOTE(review): this replaces the same-named functions defined earlier in the
    notebook (whose filters were ``_fingerprints.csv`` and ``_protein_descriptors.csv``).
    """
    for data_dir in data_dirs.values():
        for file in sorted(os.listdir(data_dir)):
            if file.startswith('scaffold') and file.endswith(suffix):
                df = pd.read_csv(os.path.join(data_dir, file))
                print(f'Shape of {file}: {df.shape}')

check_fingerprint_shapes(data_dirs)

Shape of scaffold_train_e3_descriptors.csv: (1262, 21)
Shape of scaffold_val_e3_descriptors.csv: (245, 21)
Shape of scaffold_test_e3_descriptors.csv: (97, 21)


# **Now we need to calculate Embeddings for POI Sequence and E3 ligase**

## **Embeddings for POI Sequence**

In [None]:
poi_seq = train_data['POI Sequence'].to_list()
poi_seq

['MSTATTVAPAGIPATPGPVNPPPPEVSNPSKPGRKTNQLQYMQNVVVKTLWKHQFAWPFYQPVDAIKLNLPDYHKIIKNPMDMGTIKKRLENNYYWSASECMQDFNTMFTNCYIYNKPTDDIVLMAQALEKIFLQKVAQMPQEEVELLPPAPKGKGRKPAAGAQSAGTQQVAAVSSVSPATPFQSVPPTVSQTPVIAATPVPTITANVTSVPVPPAAAPPPPATPIVPVVPPTPPVVKKKGVKRKADTTTPTTSAITASRSESPPPLSDPKQAKVVARRESGGRPIKPPKKDLEDGEVPQHAGKKGKLSEHLRYCDSILREMLSKKHAAYAWPFYKPVDAEALELHDYHDIIKHPMDLSTVKRKMDGREYPDAQGFAADVRLMFSNCYKYNPPDHEVVAMARKLQDVFEMRFAKMPDEPVEAPALPAPAAPMVSKGAESSRSSEESSSDSGSSDSEEERATRLAELQEQLKAVHEQLAALSQAPVNKPKKKKEKKEKEKKKKDKEKEKEKHKVKAEEEKKAKVAPPAKQAQQKKAPAKKANSTTTAGRQLKKGGKQASASYDSEEEEEGLPMSYDEKRQLSLDINRLPGEKLGRVVHIIQSREPSLRDSNPDEIEIDFETLKPTTLRELERYVKSCLQKKQRKPFSASGKKQAAKSKEELAQEKKKELEKRLQDVSGQLSSSKKPARKEKPGSAPSGGPSRLSSSSSSESGSSSSSGSSSDSSDSE',
 'MENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTETEGVPSTAIREISLLKELNHPNIVKLLDVIHTENKLYLVFEFLHQDLKKFMDASALTGIPLPLIKSYLFQLLQGLAFCHSHRVLHRDLKPQNLLINTEGAIKLADFGLARAFGVPVRTYTHEVVTLWYRAPEILLGCKYYSTAVDIWSLGCIFAEMVTRRALFPGDSEIDQLFRIFRTLGTPDEVVWPGVTSMPDYKPSFPKWARQDFSKVVPPLDEDGRSLLSQML

In [None]:
# from transformers import T5Tokenizer, T5Model
# import torch

# # Load pre-trained ProtT5 model and tokenizer
# model_name = "Rostlab/prot_t5_xl_uniref50"
# tokenizer = T5Tokenizer.from_pretrained(model_name)
# model = T5Model.from_pretrained(model_name)


# # Function to generate embeddings for a single protein sequence
# def get_protein_embedding(sequence):
#     # Tokenize and generate embeddings
#     inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True)
#     with torch.no_grad():
#         outputs = model(**inputs)
#         embeddings = outputs.last_hidden_state
#     # To get a single per-protein embedding, take the mean along the sequence length axis
#     protein_embedding = embeddings.mean(dim=1)
#     return protein_embedding

# # Generate embeddings for all sequences
# embeddings_list = [get_protein_embedding(seq) for seq in poi_seq]

# # Print embeddings for each protein sequence
# for i, embedding in enumerate(embeddings_list):
#     print(f"Protein Sequence {i+1} Embedding:", embedding)


In [None]:
# Load ProtBERT Model
# NOTE(review): no active line in the cells shown here imports BertTokenizer or BertModel
# (the `from transformers import BertModel, BertTokenizer` line in the import cell is
# commented out), so on a fresh kernel this cell raises NameError until that is restored.
print("Load ProtBERT Model...")
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
model = BertModel.from_pretrained("Rostlab/prot_bert")
# Run on the GPU when one is visible to torch, otherwise stay on the CPU.
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')


Load ProtBERT Model...


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

In [None]:
# Get ProtBERT embeddings
def get_bert_embedding(sequence: str, len_seq_limit: int = 1200):
    """Embed one protein sequence with the notebook-level ``tokenizer`` and ``model``.

    The characters of ``sequence`` are joined with single spaces before
    tokenization. The tokenizer is asked to truncate and to pad to
    ``len_seq_limit``, and the model runs without gradient tracking.

    Args:
        sequence: Protein sequence, one character per residue.
        len_seq_limit: ``max_length`` handed to the tokenizer.

    Returns:
        NumPy array holding ``last_hidden_state[:, 0, :]``, i.e. the hidden state
        at token position 0 (presumably the [CLS] token - confirm against the
        tokenizer), copied to the CPU.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    spaced_residues = ' '.join(sequence)
    tokens = tokenizer(
        spaced_residues,
        truncation=True,
        max_length=len_seq_limit,
        padding='max_length',
        return_tensors='pt',
    )
    tokens = tokens.to(device)
    with torch.no_grad():
        last_hidden = model(**tokens)['last_hidden_state']
    first_position = last_hidden[:, 0, :]
    return first_position.cpu().numpy()

In [None]:
# process sequences and save embeddings
def process_sequences(data_path, save_path, sequence_column='POI Sequence'):
    """Embed one sequence column of a CSV with ``get_bert_embedding`` and save as ``.npy``.

    Args:
        data_path: CSV file to read.
        save_path: Destination passed to ``np.save``.
        sequence_column: Name of the column holding the sequences. Defaults to the
            protein-of-interest column; pass ``'E3 Ligase Sequence'`` to embed the
            ligase instead.

    The saved array stacks the per-sequence results of ``get_bert_embedding`` in
    row order, so row ``i`` of the array belongs to row ``i`` of the CSV.

    NOTE(review): a later cell in this notebook defines ``process_sequences`` again
    for the E3 column and shadows this one. Passing ``sequence_column`` explicitly
    makes a call independent of which definition ran last.
    """
    data = pd.read_csv(data_path)
    sequences = data[sequence_column].tolist()
    embeddings_list = [get_bert_embedding(sequence=seq) for seq in tqdm(sequences)]
    embeddings_array = np.array(embeddings_list)
    np.save(save_path, embeddings_array)
    print(f"Embeddings saved to {save_path}")

In [None]:
# Define paths for train, val, and test sets
# Pass the folder and the file name as separate components: with a single, already
# concatenated argument os.path.join() does nothing and the '/' separator is hard-coded.
train_data_path = os.path.join(train_dir, 'scaffold_train_data.csv')
val_data_path = os.path.join(val_dir, 'scaffold_val_data.csv')
test_data_path = os.path.join(test_dir, 'scaffold_test_data.csv')


In [None]:
data_dir

'/content/drive/MyDrive/Protacs_new_dataset/data'

In [None]:
# Define paths to save embeddings
# Built from train_dir / val_dir / test_dir, which are each assigned once, rather than
# from `data_dir`: that name is also a common loop-variable name in this notebook, and a
# module-level loop that uses it rebinds the global, so its value depends on which cells ran.
train_save_path = os.path.join(train_dir, 'scaffold_protbert_train_embeddings.npy')
val_save_path = os.path.join(val_dir, 'scaffold_protbert_val_embeddings.npy')
test_save_path = os.path.join(test_dir, 'scaffold_protbert_test_embeddings.npy')

In [None]:
# Process and save embeddings for train, val, and test sets
print("Processing train set...")
process_sequences(train_data_path, train_save_path)

print("Processing validation set...")
process_sequences(val_data_path, val_save_path)

print("Processing test set...")
process_sequences(test_data_path, test_save_path)

Processing train set...


100%|██████████| 1262/1262 [06:47<00:00,  3.09it/s]


Embeddings saved to /content/drive/MyDrive/Protacs_new_dataset/data/train/scaffold_protbert_train_embeddings.npy
Processing validation set...


100%|██████████| 245/245 [01:18<00:00,  3.11it/s]


Embeddings saved to /content/drive/MyDrive/Protacs_new_dataset/data/val/scaffold_protbert_val_embeddings.npy
Processing test set...


100%|██████████| 97/97 [00:31<00:00,  3.11it/s]

Embeddings saved to /content/drive/MyDrive/Protacs_new_dataset/data/test/scaffold_protbert_test_embeddings.npy





## **Embeddings For E3 ligase**

In [None]:
# process sequences and save embeddings
def process_sequences(data_path, save_path, sequence_column='E3 Ligase Sequence'):
    """Embed one sequence column of a CSV with ``get_bert_embedding`` and save as ``.npy``.

    Args:
        data_path: CSV file to read.
        save_path: Destination passed to ``np.save``.
        sequence_column: Name of the column holding the sequences. Defaults to the
            E3 ligase column; pass ``'POI Sequence'`` to embed the protein of
            interest instead.

    The saved array stacks the per-sequence results of ``get_bert_embedding`` in
    row order, so row ``i`` of the array belongs to row ``i`` of the CSV.

    NOTE(review): this shadows the same-named function defined earlier in the
    notebook for the POI column. Passing ``sequence_column`` explicitly makes a
    call independent of which definition ran last.
    """
    data = pd.read_csv(data_path)
    sequences = data[sequence_column].tolist()
    embeddings_list = [get_bert_embedding(sequence=seq) for seq in tqdm(sequences)]
    embeddings_array = np.array(embeddings_list)
    np.save(save_path, embeddings_array)
    print(f"Embeddings saved to {save_path}")

In [None]:
# Define paths to save embeddings
# Built from train_dir / val_dir / test_dir, which are each assigned once, rather than
# from `data_dir`: that name is also a common loop-variable name in this notebook, and a
# module-level loop that uses it rebinds the global, so its value depends on which cells ran.
# NOTE(review): these assignments reuse the variable names of the POI save paths, so
# re-running the POI embedding cell after this one would write into the E3 files.
train_save_path = os.path.join(train_dir, 'scaffold_protbert_E3_train_embeddings.npy')
val_save_path = os.path.join(val_dir, 'scaffold_protbert_E3_val_embeddings.npy')
test_save_path = os.path.join(test_dir, 'scaffold_protbert_E3_test_embeddings.npy')

In [None]:
# Process and save embeddings for train, val, and test sets
print("Processing train set...")
process_sequences(train_data_path, train_save_path)

print("Processing validation set...")
process_sequences(val_data_path, val_save_path)

print("Processing test set...")
process_sequences(test_data_path, test_save_path)

Processing train set...


100%|██████████| 1262/1262 [06:43<00:00,  3.13it/s]


Embeddings saved to /content/drive/MyDrive/Protacs_new_dataset/data/train/scaffold_protbert_E3_train_embeddings.npy
Processing validation set...


100%|██████████| 245/245 [01:18<00:00,  3.13it/s]


Embeddings saved to /content/drive/MyDrive/Protacs_new_dataset/data/val/scaffold_protbert_E3_val_embeddings.npy
Processing test set...


100%|██████████| 97/97 [00:31<00:00,  3.13it/s]

Embeddings saved to /content/drive/MyDrive/Protacs_new_dataset/data/test/scaffold_protbert_E3_test_embeddings.npy



