In [1]:
!pip install rdkit-pypi -q
!pip install pybel -q
!pip install PyBioMed -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m387.8/387.8 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.4/46.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for ratelimit (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m65.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for PyBioMed (setup.py) ... [?25l[?25hdone


In [2]:
# Colab-only: mount Google Drive at /content/drive so the project folder used below is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
import torch
from tqdm import tqdm
from transformers import BertModel, BertTokenizer
from PyBioMed.PyProtein import CTD
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Later cells build their paths from os.getcwd(), so switch into the project folder first.
working_directory = "/content/drive/MyDrive/Protacs_new_dataset/"
os.chdir(working_directory)
print("current working directory is: ", os.getcwd())

current working directory is:  /content/drive/MyDrive/Protacs_new_dataset


In [5]:
# Project layout, rooted at the current working directory.
data_dir = os.path.join(os.getcwd(), 'data')
src_dir = os.path.join(os.getcwd(), 'src')
fig_dir = os.path.join(data_dir, 'figures')
checkpoint_dir = os.path.join(os.getcwd(), 'checkpoints')
dirs_to_make = [
    data_dir,
    os.path.join(data_dir, 'raw'),
    os.path.join(data_dir, 'processed'),
    os.path.join(data_dir, 'train'),
    os.path.join(data_dir, 'val'),
    os.path.join(data_dir, 'test'),
    src_dir,
    fig_dir,
    checkpoint_dir,
]
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of `if not os.path.exists(d): os.makedirs(d)`.
for d in dirs_to_make:
    os.makedirs(d, exist_ok=True)

In [6]:
data_dir

'/content/drive/MyDrive/Protacs_new_dataset/data'

In [7]:
# Per-split folders of the cluster-based split, located under <data_dir>/cluster/.
train_dir = os.path.join(data_dir, 'cluster/train')
val_dir = os.path.join(data_dir, 'cluster/val')
test_dir = os.path.join(data_dir, 'cluster/test')

In [8]:
!ls {train_dir}

cluster_train_data.csv		  cluster_train_fingerprints.csv
cluster_train_descriptors.csv	  cluster_train_protein_descriptors.csv
cluster_train_e3_descriptors.csv


In [9]:
selected_columns = ['Smiles', 'Cell Type', 'POI Sequence', 'E3 Ligase Sequence', 'Activity']

In [10]:
def load_data(directory, columns, prefix):
    """Load and concatenate the CSV files in ``directory`` whose names start with ``prefix``.

    Args:
        directory: Folder to scan (not recursive).
        columns: Column names to read from each file (passed to ``usecols``).
        prefix: Only files named ``<prefix>*.csv`` are loaded.

    Returns:
        A single DataFrame with a fresh 0..n-1 index, or an empty DataFrame
        carrying ``columns`` when no file matches.
    """
    # os.listdir returns entries in arbitrary order; sort so the row order of the
    # concatenated frame is reproducible across runs and filesystems.
    matching_files = sorted(
        name for name in os.listdir(directory)
        if name.startswith(prefix) and name.endswith('.csv')
    )
    frames = [
        pd.read_csv(os.path.join(directory, name), usecols=columns)
        for name in matching_files
    ]
    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)

In [11]:
# Read only `selected_columns` from each split's main data file; the prefix
# (e.g. 'cluster_train_data') keeps the other CSVs in the same folder out of the load.
train_data = load_data(train_dir, selected_columns, 'cluster_train_data')
val_data = load_data(val_dir, selected_columns, 'cluster_val_data')
test_data = load_data(test_dir, selected_columns, 'cluster_test_data')

In [12]:
# Report (rows, columns) for each split; the leading "\n" keeps a blank line between splits.
for heading, frame in (
    ("Train Data:", train_data),
    ("\nValidation Data:", val_data),
    ("\nTest Data:", test_data),
):
    print(heading)
    print(frame.shape)

Train Data:
(1260, 5)

Validation Data:
(251, 5)

Test Data:
(93, 5)


In [13]:
train_data.head()

Unnamed: 0,Smiles,Cell Type,POI Sequence,E3 Ligase Sequence,Activity
0,CN(CCOCCOCCOCCNc1cccc2c1C(=O)N(C1CCC(=O)NC1=O)...,MOLT-4,MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME...,MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI...,True
1,C=CC(=O)Nc1cccc(-n2c(=O)cc(C)c3cnc(Nc4ccc(N5CC...,H1975,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...,False
2,C=CC(=O)Nc1cccc(-n2c(=O)cc(C)c3cnc(Nc4ccc(N5CC...,H1975,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...,True
3,C=CC(=O)Nc1cccc(-n2c(=O)cc(C)c3cnc(Nc4ccc(N5CC...,H1975,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...,True
4,C=CC(=O)Nc1cccc(-n2c(=O)cc(C)c3cnc(Nc4ccc(N5CC...,H1975,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...,False


# **Calculating Descriptors and fingerprints for cluster splits**

### These descriptors were selected by applying the Mann-Whitney U test and adjusting the resulting p-values with the Benjamini-Hochberg procedure

In [23]:
# Names of the molecular descriptors to compute; this list is handed to
# MoleculeDescriptors.MolecularDescriptorCalculator and also used as the column
# names of the saved descriptor tables.
# NOTE(review): spellings such as 'fr_piperzine', 'fr_piperdine' and 'fr_sulfonamd'
# look like typos but appear to be RDKit's own descriptor identifiers - do not "correct" them.
descriptor_names = ['SMR_VSA9', 'fr_NH0', 'fr_piperzine',
       'NumRotatableBonds', 'SlogP_VSA3', 'BCUT2D_MWHI', 'EState_VSA10',
       'PEOE_VSA14', 'fr_Al_OH_noTert', 'NumHDonors', 'SlogP_VSA8', 'Kappa2',
       'NumSaturatedRings', 'BCUT2D_CHGLO', 'Kappa3', 'PEOE_VSA3',
       'qed', 'PEOE_VSA1', 'BCUT2D_MRHI', 'fr_Al_OH', 'SMR_VSA1',
       'NumSaturatedHeterocycles', 'BalabanJ', 'VSA_EState3',
       'NumAliphaticRings', 'EState_VSA2', 'NumAliphaticHeterocycles',
       'SlogP_VSA10', 'SMR_VSA6', 'fr_sulfonamd', 'SlogP_VSA7', 'EState_VSA8',
       'fr_piperdine', 'NHOHCount', 'fr_aniline', 'fr_COO2', 'fr_Al_COO',
       'fr_COO']

In [14]:
# Define directories
# Maps split name -> folder holding that split's CSVs. The split name is also
# interpolated into the output file names (e.g. cluster_<split>_descriptors.csv).
# Paths are rebuilt from os.getcwd(), so they assume the working directory is
# still the project root.
data_dirs = {
    'train': os.path.join(os.getcwd(), 'data', 'cluster/train'),
    'val': os.path.join(os.getcwd(), 'data', 'cluster/val'),
    'test': os.path.join(os.getcwd(), 'data', 'cluster/test')
}

In [24]:
import os
import pandas as pd
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors



# One shared calculator, configured once with the selected descriptor names.
calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

def compute_descriptors(smiles):
    """Compute the selected RDKit descriptors for one SMILES string.

    Args:
        smiles: SMILES string of the molecule. Non-string values (e.g. the NaN
            pandas uses for a missing cell) are treated as unparsable.

    Returns:
        The descriptor values from ``calculator.CalcDescriptors`` in
        ``descriptor_names`` order, or a list of ``None`` of the same length
        when the input is missing or cannot be parsed.
    """
    # Chem.MolFromSmiles raises on non-string input instead of returning None,
    # so guard against missing values before calling it.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return [None] * len(descriptor_names)
    return calculator.CalcDescriptors(mol)

# Function to process datasets and save descriptors
def compute_and_save_descriptors(data_dirs):
    """Write one molecular-descriptor table per split.

    For every ``(split, folder)`` pair in ``data_dirs``, each file in the folder
    named ``cluster*.csv`` is read. A file with a ``Smiles`` column has
    ``compute_descriptors`` applied to every row, and the result is written to
    ``cluster_{split}_descriptors.csv`` in the same folder (a later matching
    file overwrites an earlier one). Files without the column are reported and
    skipped.
    """
    for split, split_dir in data_dirs.items():
        candidates = [
            name for name in os.listdir(split_dir)
            if name.startswith('cluster') and name.endswith('.csv')
        ]
        for file in candidates:
            table = pd.read_csv(os.path.join(split_dir, file))

            if 'Smiles' not in table.columns:
                print(f'Skipping {file}: No Smiles column found')
                continue

            # One row of descriptor values per molecule, in input order.
            rows = table['Smiles'].apply(compute_descriptors).tolist()
            descriptor_df = pd.DataFrame(rows, columns=descriptor_names)

            output_file_path = os.path.join(split_dir, f'cluster_{split}_descriptors.csv')
            descriptor_df.to_csv(output_file_path, index=False)
            print(f'Saved descriptors to {output_file_path}')

# Run the function to compute and save descriptors
compute_and_save_descriptors(data_dirs)


Saved descriptors to /content/drive/MyDrive/Protacs_new_dataset/data/cluster/train/cluster_train_descriptors.csv
Saved descriptors to /content/drive/MyDrive/Protacs_new_dataset/data/cluster/val/cluster_val_descriptors.csv
Saved descriptors to /content/drive/MyDrive/Protacs_new_dataset/data/cluster/test/cluster_test_descriptors.csv


## **Fingerprints**

In [26]:
# Compute Morgan fingerprints
def compute_morgan_fingerprints(smiles_list, radius=4, n_bits=2048):
    """Return one Morgan fingerprint bit string per SMILES, in input order.

    Args:
        smiles_list: Iterable of SMILES strings.
        radius: Morgan radius passed to RDKit.
        n_bits: Length of each fingerprint.

    Returns:
        A list of ``'0'``/``'1'`` strings of length ``n_bits``. A SMILES that
        RDKit cannot parse yields an all-zero string so the output stays
        aligned with the input rows.
    """
    def _bit_string(smiles):
        molecule = Chem.MolFromSmiles(smiles)
        if molecule is None:
            return '0' * n_bits
        bit_vector = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=n_bits)
        return bit_vector.ToBitString()

    return [_bit_string(smiles) for smiles in smiles_list]

# Process each dataset split.
# NOTE: this loop runs at notebook (module) level, so its loop variables become
# globals. The folder variable is therefore called `split_dir`, not `data_dir`:
# reusing `data_dir` here would overwrite the project-wide `data_dir` that later
# cells join their save paths onto.
FP_N_BITS = 2048  # fingerprint length; also fixes the column count for files with no rows
for split, split_dir in data_dirs.items():
    for file in os.listdir(split_dir):
        if file.startswith('cluster') and file.endswith('.csv'):
            file_path = os.path.join(split_dir, file)
            df = pd.read_csv(file_path)

            # Compute Morgan fingerprints
            if 'Smiles' in df.columns:
                fingerprints = compute_morgan_fingerprints(df['Smiles'], n_bits=FP_N_BITS)

                # Bit strings ('0101...') -> lists of 0/1 ints, one list per molecule.
                fingerprints_int = [[int(bit) for bit in fp] for fp in fingerprints]

                # Column names come from FP_N_BITS rather than fingerprints_int[0],
                # which would raise IndexError for a file with a header but no rows.
                fingerprints_df = pd.DataFrame(fingerprints_int, columns=[f'FP_{i}' for i in range(FP_N_BITS)])

                output_file_path = os.path.join(split_dir, f'cluster_{split}_fingerprints.csv')
                fingerprints_df.to_csv(output_file_path, index=False)
                print(f'Saved fingerprints to {output_file_path}')
            else:
                print(f'Skipping {file}: No Smiles column found')

print("Morgan fingerprints calculation and saving complete.")

Saved fingerprints to /content/drive/MyDrive/Protacs_new_dataset/data/cluster/train/cluster_train_fingerprints.csv
Skipping cluster_train_descriptors.csv: No Smiles column found
Saved fingerprints to /content/drive/MyDrive/Protacs_new_dataset/data/cluster/val/cluster_val_fingerprints.csv
Skipping cluster_val_descriptors.csv: No Smiles column found
Saved fingerprints to /content/drive/MyDrive/Protacs_new_dataset/data/cluster/test/cluster_test_fingerprints.csv
Skipping cluster_test_descriptors.csv: No Smiles column found
Morgan fingerprints calculation and saving complete.


In [16]:
# check the shape of saved fingerprint data
def check_fingerprint_shapes(data_dirs, suffix='_fingerprints.csv'):
    """Print the ``(rows, columns)`` shape of each saved feature table.

    Args:
        data_dirs: Mapping of split name -> folder to scan.
        suffix: File-name ending that selects which tables to report. Defaults
            to the fingerprint tables; pass e.g. ``'_protein_descriptors.csv'``
            to check another feature set without redefining this function.
    """
    for split_dir in data_dirs.values():
        for file in os.listdir(split_dir):
            if file.startswith('cluster') and file.endswith(suffix):
                df = pd.read_csv(os.path.join(split_dir, file))
                print(f'Shape of {file}: {df.shape}')

check_fingerprint_shapes(data_dirs)

Shape of cluster_train_fingerprints.csv: (1260, 2048)
Shape of cluster_val_fingerprints.csv: (251, 2048)
Shape of cluster_test_fingerprints.csv: (93, 2048)


# **Descriptors for POI sequences**

In [15]:
train_data.head(3)

Unnamed: 0,Smiles,Cell Type,POI Sequence,E3 Ligase Sequence,Activity
0,CN(CCOCCOCCOCCNc1cccc2c1C(=O)N(C1CCC(=O)NC1=O)...,MOLT-4,MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME...,MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI...,True
1,C=CC(=O)Nc1cccc(-n2c(=O)cc(C)c3cnc(Nc4ccc(N5CC...,H1975,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...,False
2,C=CC(=O)Nc1cccc(-n2c(=O)cc(C)c3cnc(Nc4ccc(N5CC...,H1975,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...,True


In [17]:
poi_seq = train_data['POI Sequence'].to_list()

In [30]:
# Preview only: displaying the whole list dumps every full-length sequence into the output.
poi_seq[:3]

['MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEMETPSAINGNPSWHLADSPAVNGATGHSSSLDAREVIPMAAVKQALREAGDEFELRYRRAFSDLTSQLHITPGTAYQSFEQVVNELFRDGVNWGRIVAFFSFGGALCVESVDKEMQVLVSRIAAWMATYLNDHLEPWIQENGGWDTFVELYGNNAAAESRKGQERFNRWFLTGMTVAGVVLLGSLFSRK',
 'MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYVQRNYDLSFLKTIQEVAGYVLIALNTVERIPLENLQIIRGNMYYENSYALAVLSNYDANKTGLKELPMRNLQEILHGAVRFSNNPALCNVESIQWRDIVSSDFLSNMSMDFQNHLGSCQKCDPSCPNGSCWGAGEENCQKLTKIICAQQCSGRCRGKSPSDCCHNQCAAGCTGPRESDCLVCRKFRDEATCKDTCPPLMLYNPTTYQMDVNPEGKYSFGATCVKKCPRNYVVTDHGSCVRACGADSYEMEEDGVRKCKKCEGPCRKVCNGIGIGEFKDSLSINATNIKHFKNCTSISGDLHILPVAFRGDSFTHTPPLDPQELDILKTVKEITGFLLIQAWPENRTDLHAFENLEIIRGRTKQHGQFSLAVVSLNITSLGLRSLKEISDGDVIISGNKNLCYANTINWKKLFGTSGQKTKIISNRGENSCKATGQVCHALCSPEGCWGPEPRDCVSCRNVSRGRECVDKCNLLEGEPREFVENSECIQCHPECLPQAMNITCTGRGPDNCIQCAHYIDGPHCVKTCPAGVMGENNTLVWKYADAGHVCHLCHPNCTYGCTGPGLEGCPTNGPKIPSIATGMVGALLLLLVVALGIGLFMRRRHIVRKRTLRRLLQERELVEPLTPSGEAPNQALLRILKETEFKKIKVLGSGAFGTVYKGLWIPEGEKVKIPVAIKELREATSPKANKEIL

In [33]:
# @title **Calc POI seq Descriptors**

def compute_protein_descriptors(data_dirs):
    """Write one POI-sequence descriptor table per split.

    For every ``(split, folder)`` pair in ``data_dirs``, each file in the folder
    named ``cluster*.csv`` is read. A file with a ``POI Sequence`` column has
    ``CTD.CalculateC`` applied to every sequence, and the results are written
    to ``cluster_{split}_protein_descriptors.csv`` in the same folder. Files
    without the column are reported and skipped.
    """
    for split, split_dir in data_dirs.items():
        for file in os.listdir(split_dir):
            is_cluster_csv = file.startswith('cluster') and file.endswith('.csv')
            if not is_cluster_csv:
                continue

            frame = pd.read_csv(os.path.join(split_dir, file))
            if 'POI Sequence' not in frame.columns:
                print(f'Skipping {file}: No POI Sequence column found')
                continue

            # One descriptor record per sequence, in row order.
            records = [CTD.CalculateC(sequence) for sequence in frame['POI Sequence'].to_list()]
            descriptors_df = pd.DataFrame(records)

            output_file_path = os.path.join(split_dir, f'cluster_{split}_protein_descriptors.csv')
            descriptors_df.to_csv(output_file_path, index=False)
            print(f'Saved protein descriptors to {output_file_path}')

compute_protein_descriptors(data_dirs)


Saved protein descriptors to /content/drive/MyDrive/Protacs_new_dataset/data/cluster/train/cluster_train_protein_descriptors.csv
Skipping cluster_train_descriptors.csv: No POI Sequence column found
Skipping cluster_train_fingerprints.csv: No POI Sequence column found
Saved protein descriptors to /content/drive/MyDrive/Protacs_new_dataset/data/cluster/val/cluster_val_protein_descriptors.csv
Skipping cluster_val_descriptors.csv: No POI Sequence column found
Skipping cluster_val_fingerprints.csv: No POI Sequence column found
Saved protein descriptors to /content/drive/MyDrive/Protacs_new_dataset/data/cluster/test/cluster_test_protein_descriptors.csv
Skipping cluster_test_descriptors.csv: No POI Sequence column found
Skipping cluster_test_fingerprints.csv: No POI Sequence column found


In [18]:
# Function to check the shape of the saved POI protein-descriptor tables
def check_fingerprint_shapes(data_dirs, suffix='_protein_descriptors.csv'):
    """Print the ``(rows, columns)`` shape of each saved feature table.

    Args:
        data_dirs: Mapping of split name -> folder to scan.
        suffix: File-name ending that selects which tables to report. Defaults
            to the POI protein-descriptor tables; pass another ending (e.g.
            ``'_fingerprints.csv'``) to check a different feature set.
    """
    for split_dir in data_dirs.values():
        for file in os.listdir(split_dir):
            if file.startswith('cluster') and file.endswith(suffix):
                df = pd.read_csv(os.path.join(split_dir, file))
                print(f'Shape of {file}: {df.shape}')

check_fingerprint_shapes(data_dirs)

Shape of cluster_train_protein_descriptors.csv: (1260, 21)
Shape of cluster_val_protein_descriptors.csv: (251, 21)
Shape of cluster_test_protein_descriptors.csv: (93, 21)


In [35]:
train_data.columns

Index(['Smiles', 'Cell Type', 'POI Sequence', 'E3 Ligase Sequence',
       'Activity'],
      dtype='object')

In [36]:
# @title **Calc E3 Descriptors**

def compute_E3_descriptors(data_dirs):
    """Write one E3-ligase-sequence descriptor table per split.

    For every ``(split, folder)`` pair in ``data_dirs``, each file in the folder
    named ``cluster*.csv`` is read. A file with an ``E3 Ligase Sequence`` column
    has ``CTD.CalculateC`` applied to every sequence, and the results are
    written to ``cluster_{split}_e3_descriptors.csv`` in the same folder. Files
    without the column are reported and skipped.
    """
    for split, split_dir in data_dirs.items():
        cluster_csvs = (
            name for name in os.listdir(split_dir)
            if name.startswith('cluster') and name.endswith('.csv')
        )
        for file in cluster_csvs:
            frame = pd.read_csv(os.path.join(split_dir, file))

            if 'E3 Ligase Sequence' not in frame.columns:
                print(f'Skipping {file}: No E3 ligase Sequence column found')
                continue

            # One descriptor record per E3 ligase sequence, in row order.
            e3_sequences = frame['E3 Ligase Sequence'].to_list()
            descriptors_df = pd.DataFrame([CTD.CalculateC(seq) for seq in e3_sequences])

            output_file_path = os.path.join(split_dir, f'cluster_{split}_e3_descriptors.csv')
            descriptors_df.to_csv(output_file_path, index=False)
            print(f'Saved E3 ligase descriptors to {output_file_path}')

compute_E3_descriptors(data_dirs)


Saved E3 ligase descriptors to /content/drive/MyDrive/Protacs_new_dataset/data/cluster/train/cluster_train_e3_descriptors.csv
Skipping cluster_train_descriptors.csv: No E3 ligase Sequence column found
Skipping cluster_train_fingerprints.csv: No E3 ligase Sequence column found
Skipping cluster_train_protein_descriptors.csv: No E3 ligase Sequence column found
Saved E3 ligase descriptors to /content/drive/MyDrive/Protacs_new_dataset/data/cluster/val/cluster_val_e3_descriptors.csv
Skipping cluster_val_descriptors.csv: No E3 ligase Sequence column found
Skipping cluster_val_fingerprints.csv: No E3 ligase Sequence column found
Skipping cluster_val_protein_descriptors.csv: No E3 ligase Sequence column found
Saved E3 ligase descriptors to /content/drive/MyDrive/Protacs_new_dataset/data/cluster/test/cluster_test_e3_descriptors.csv
Skipping cluster_test_descriptors.csv: No E3 ligase Sequence column found
Skipping cluster_test_fingerprints.csv: No E3 ligase Sequence column found
Skipping cluster_

In [37]:
# Function to check the shape of the saved E3 ligase descriptor tables
def check_fingerprint_shapes(data_dirs, suffix='_e3_descriptors.csv'):
    """Print the ``(rows, columns)`` shape of each saved feature table.

    Args:
        data_dirs: Mapping of split name -> folder to scan.
        suffix: File-name ending that selects which tables to report. Defaults
            to the E3 ligase descriptor tables; pass another ending (e.g.
            ``'_fingerprints.csv'``) to check a different feature set.
    """
    for split_dir in data_dirs.values():
        for file in os.listdir(split_dir):
            if file.startswith('cluster') and file.endswith(suffix):
                df = pd.read_csv(os.path.join(split_dir, file))
                print(f'Shape of {file}: {df.shape}')

check_fingerprint_shapes(data_dirs)

Shape of cluster_train_e3_descriptors.csv: (1260, 21)
Shape of cluster_val_e3_descriptors.csv: (251, 21)
Shape of cluster_test_e3_descriptors.csv: (93, 21)


# **Calculating embeddings for the POI and E3 ligase sequences**

## **Embeddings for POI Sequence**

In [19]:
poi_seq = train_data['POI Sequence'].to_list()
# Preview only: displaying the whole list dumps every full-length sequence into the output.
poi_seq[:3]

['MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEMETPSAINGNPSWHLADSPAVNGATGHSSSLDAREVIPMAAVKQALREAGDEFELRYRRAFSDLTSQLHITPGTAYQSFEQVVNELFRDGVNWGRIVAFFSFGGALCVESVDKEMQVLVSRIAAWMATYLNDHLEPWIQENGGWDTFVELYGNNAAAESRKGQERFNRWFLTGMTVAGVVLLGSLFSRK',
 'MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYVQRNYDLSFLKTIQEVAGYVLIALNTVERIPLENLQIIRGNMYYENSYALAVLSNYDANKTGLKELPMRNLQEILHGAVRFSNNPALCNVESIQWRDIVSSDFLSNMSMDFQNHLGSCQKCDPSCPNGSCWGAGEENCQKLTKIICAQQCSGRCRGKSPSDCCHNQCAAGCTGPRESDCLVCRKFRDEATCKDTCPPLMLYNPTTYQMDVNPEGKYSFGATCVKKCPRNYVVTDHGSCVRACGADSYEMEEDGVRKCKKCEGPCRKVCNGIGIGEFKDSLSINATNIKHFKNCTSISGDLHILPVAFRGDSFTHTPPLDPQELDILKTVKEITGFLLIQAWPENRTDLHAFENLEIIRGRTKQHGQFSLAVVSLNITSLGLRSLKEISDGDVIISGNKNLCYANTINWKKLFGTSGQKTKIISNRGENSCKATGQVCHALCSPEGCWGPEPRDCVSCRNVSRGRECVDKCNLLEGEPREFVENSECIQCHPECLPQAMNITCTGRGPDNCIQCAHYIDGPHCVKTCPAGVMGENNTLVWKYADAGHVCHLCHPNCTYGCTGPGLEGCPTNGPKIPSIATGMVGALLLLLVVALGIGLFMRRRHIVRKRTLRRLLQERELVEPLTPSGEAPNQALLRILKETEFKKIKVLGSGAFGTVYKGLWIPEGEKVKIPVAIKELREATSPKANKEIL

In [None]:
# from transformers import T5Tokenizer, T5Model
# import torch

# # Load pre-trained ProtT5 model and tokenizer
# model_name = "Rostlab/prot_t5_xl_uniref50"
# tokenizer = T5Tokenizer.from_pretrained(model_name)
# model = T5Model.from_pretrained(model_name)


# # Function to generate embeddings for a single protein sequence
# def get_protein_embedding(sequence):
#     # Tokenize and generate embeddings
#     inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True)
#     with torch.no_grad():
#         outputs = model(**inputs)
#         embeddings = outputs.last_hidden_state
#     # To get a single per-protein embedding, take the mean along the sequence length axis
#     protein_embedding = embeddings.mean(dim=1)
#     return protein_embedding

# # Generate embeddings for all sequences
# embeddings_list = [get_protein_embedding(seq) for seq in poi_seq]

# # Print embeddings for each protein sequence
# for i, embedding in enumerate(embeddings_list):
#     print(f"Protein Sequence {i+1} Embedding:", embedding)


In [20]:
# Load ProtBERT Model
print("Load ProtBERT Model...")
# do_lower_case=False: the input residues are upper-case letters and must not be lower-cased.
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
# Put the model on the GPU when one is available; get_bert_embedding moves its inputs
# to a device chosen with the same expression.
model = BertModel.from_pretrained("Rostlab/prot_bert").to('cuda' if torch.cuda.is_available() else 'cpu')


Load ProtBERT Model...


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

In [21]:
# Get ProtBERT embeddings
def get_bert_embedding(sequence: str, len_seq_limit: int = 1200):
    """Embed one protein sequence with the loaded ProtBERT model.

    Args:
        sequence: Amino-acid sequence as a plain string of one-letter codes.
        len_seq_limit: Token budget; the input is padded to and truncated at
            this length, so longer sequences lose their tail.

    Returns:
        NumPy array holding the last-layer hidden state of the first token,
        with a leading batch axis of size 1.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # The tokenizer is fed residues separated by single spaces.
    spaced_residues = ' '.join(sequence)
    model_inputs = tokenizer(
        spaced_residues,
        return_tensors='pt',
        padding='max_length',
        max_length=len_seq_limit,
        truncation=True,
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        model_output = model(**model_inputs)

    first_token_state = model_output['last_hidden_state'][:, 0, :]
    return first_token_state.cpu().numpy()

In [22]:
# process sequences and save embeddings
def process_sequences(data_path, save_path, sequence_column='POI Sequence'):
    """Embed every sequence in one CSV column with ProtBERT and save the result.

    Args:
        data_path: CSV file to read; must contain ``sequence_column``.
        save_path: Destination handed to ``np.save``.
        sequence_column: Column holding the protein sequences. Defaults to
            ``'POI Sequence'``; pass ``'E3 Ligase Sequence'`` to embed the E3
            ligases without redefining this function.

    The saved array stacks the per-sequence results of ``get_bert_embedding``
    in row order. Each result keeps its own leading batch axis of size 1, so
    the saved array has one more axis than a flat (rows, hidden) matrix.
    """
    data = pd.read_csv(data_path)
    sequences = data[sequence_column].tolist()
    embeddings_list = [get_bert_embedding(sequence=seq) for seq in tqdm(sequences)]
    embeddings_array = np.array(embeddings_list)
    np.save(save_path, embeddings_array)
    print(f"Embeddings saved to {save_path}")

In [23]:
# Define paths for train, val, and test sets.
# The file name is passed as its own component: os.path.join with a single,
# already-concatenated string does no joining at all.
train_data_path = os.path.join(train_dir, 'cluster_train_data.csv')
val_data_path = os.path.join(val_dir, 'cluster_val_data.csv')
test_data_path = os.path.join(test_dir, 'cluster_test_data.csv')


In [24]:
train_data_path

'/content/drive/MyDrive/Protacs_new_dataset/data/cluster/train/cluster_train_data.csv'

In [None]:
data_dir

'/content/drive/MyDrive/Protacs_new_dataset/data'

In [25]:
# Define paths to save the POI embeddings.
# Built from train_dir/val_dir/test_dir rather than `data_dir`: the fingerprint cell
# loops with a top-level variable named `data_dir`, so after that cell runs the
# global no longer points at the project's data folder.
train_save_path = os.path.join(train_dir, 'cluster_protbert_train_embeddings.npy')
val_save_path = os.path.join(val_dir, 'cluster_protbert_val_embeddings.npy')
test_save_path = os.path.join(test_dir, 'cluster_protbert_test_embeddings.npy')

In [26]:
# Process and save embeddings for train, val, and test sets
# Same read -> embed -> save step for each split, in train / validation / test order.
for split_label, csv_path, npy_path in (
    ("train", train_data_path, train_save_path),
    ("validation", val_data_path, val_save_path),
    ("test", test_data_path, test_save_path),
):
    print(f"Processing {split_label} set...")
    process_sequences(csv_path, npy_path)

Processing train set...


100%|██████████| 1260/1260 [06:54<00:00,  3.04it/s]


Embeddings saved to /content/drive/MyDrive/Protacs_new_dataset/data/cluster/train/cluster_protbert_train_embeddings.npy
Processing validation set...


100%|██████████| 251/251 [01:22<00:00,  3.06it/s]


Embeddings saved to /content/drive/MyDrive/Protacs_new_dataset/data/cluster/val/cluster_protbert_val_embeddings.npy
Processing test set...


100%|██████████| 93/93 [00:30<00:00,  3.03it/s]

Embeddings saved to /content/drive/MyDrive/Protacs_new_dataset/data/cluster/test/cluster_protbert_test_embeddings.npy





## **Embeddings For E3 ligase**

In [27]:
# process sequences and save embeddings
def process_sequences(data_path, save_path, sequence_column='E3 Ligase Sequence'):
    """Embed every sequence in one CSV column with ProtBERT and save the result.

    Args:
        data_path: CSV file to read; must contain ``sequence_column``.
        save_path: Destination handed to ``np.save``.
        sequence_column: Column holding the protein sequences. Defaults to
            ``'E3 Ligase Sequence'``; pass ``'POI Sequence'`` to embed the
            proteins of interest with the same function.

    The saved array stacks the per-sequence results of ``get_bert_embedding``
    in row order, built with ``np.array``.
    """
    data = pd.read_csv(data_path)
    sequences = data[sequence_column].tolist()
    embeddings_list = [get_bert_embedding(sequence=seq) for seq in tqdm(sequences)]
    embeddings_array = np.array(embeddings_list)
    np.save(save_path, embeddings_array)
    print(f"Embeddings saved to {save_path}")

In [28]:
# Define paths to save the E3 ligase embeddings.
# Built from train_dir/val_dir/test_dir rather than `data_dir`: the fingerprint cell
# loops with a top-level variable named `data_dir`, so after that cell runs the
# global no longer points at the project's data folder.
# NOTE: these reuse the names of the POI save paths, so run this cell before the
# E3 processing cell.
train_save_path = os.path.join(train_dir, 'cluster_protbert_E3_train_embeddings.npy')
val_save_path = os.path.join(val_dir, 'cluster_protbert_E3_val_embeddings.npy')
test_save_path = os.path.join(test_dir, 'cluster_protbert_E3_test_embeddings.npy')

In [29]:
# Process and save embeddings for train, val, and test sets
# Same read -> embed -> save step for each split, in train / validation / test order.
split_jobs = [
    ("train", train_data_path, train_save_path),
    ("validation", val_data_path, val_save_path),
    ("test", test_data_path, test_save_path),
]
for split_label, csv_path, npy_path in split_jobs:
    print(f"Processing {split_label} set...")
    process_sequences(csv_path, npy_path)

Processing train set...


100%|██████████| 1260/1260 [06:50<00:00,  3.07it/s]


Embeddings saved to /content/drive/MyDrive/Protacs_new_dataset/data/cluster/train/cluster_protbert_E3_train_embeddings.npy
Processing validation set...


100%|██████████| 251/251 [01:21<00:00,  3.07it/s]


Embeddings saved to /content/drive/MyDrive/Protacs_new_dataset/data/cluster/val/cluster_protbert_E3_val_embeddings.npy
Processing test set...


100%|██████████| 93/93 [00:30<00:00,  3.05it/s]

Embeddings saved to /content/drive/MyDrive/Protacs_new_dataset/data/cluster/test/cluster_protbert_E3_test_embeddings.npy



