In [1]:
import sys
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Make the project's "scripts" directory (two levels above the current
# working directory) importable, without adding it twice on re-runs.
_scripts_relative = os.path.join(os.pardir, os.pardir, "scripts")
scripts_path = os.path.abspath(os.path.join(os.getcwd(), _scripts_relative))

_already_importable = scripts_path in sys.path
if not _already_importable:
    sys.path.insert(0, scripts_path)

In [3]:
import data_ingestion

In [4]:
# Directory holding the processed per-class binding CSVs.
# The location can be overridden with the ABSOLUT_PROCESSED_DIR environment
# variable so the notebook is not tied to one machine's external drive; the
# original absolute path stays as the default.
dirpath = os.environ.get(
    "ABSOLUT_PROCESSED_DIR",
    "/Volumes/LaCie_d2_Professional_Media/absolut_antibody/PerClass/RawBindingsPerClassMurine/extracted/processed/",
)

# Load the processed CSVs for the '3KR3' entry through the project helper.
# NOTE(review): `df` is rebound by the pd.read_csv call in the next cell, so
# this result is only visible until that cell runs -- confirm both are needed.
df = data_ingestion.load_processed_csvs(dirpath, ['3KR3'])

In [5]:
# Read the consolidated 3KR3_D CSV from the processed-data directory,
# using the file's first column as the DataFrame index.
consolidated_name = '3KR3_D_consolidated.csv'
fpath = os.path.join(dirpath, consolidated_name)
df = pd.read_csv(
    fpath,
    index_col=0,
)

In [6]:
import torch
import torch.nn as nn
import esm

# Instantiate the pretrained ESM-2 checkpoint (swap the loader name to try a
# different variant) together with its alphabet, and switch to inference mode.
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
model.eval()
batch_converter = alphabet.get_batch_converter()

# Example inputs: one (name, sequence) tuple per protein.
data = [
    ("protein1", "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ"),
    ("protein2", "MGSSHHHHHHSSGLVPRGSHMASMTGGQQMGRDLYDDDDKDP"),
    # Add more sequences as needed
]

# Tokenise the inputs into the batch tensors the model consumes.
batch_labels, batch_strs, batch_tokens = batch_converter(data)

# Run on the GPU when one is present, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
batch_tokens = batch_tokens.to(device)

# Request the representations of the final layer (index model.num_layers);
# gradients are not needed for feature extraction.
layer = model.num_layers
with torch.no_grad():
    results = model(
        batch_tokens,
        repr_layers=[layer],
        return_contacts=False,
    )
token_representations = results["representations"][layer]

## Assumptions

We assume that the ESM-2 model has already been loaded and used to generate token representations, for example:

```python
results = model(batch_tokens, repr_layers=[model.num_layers], return_contacts=False)
token_representations = results["representations"][model.num_layers]
```

and that `data` is a list of `(name, sequence)` tuples.

token_representations is a tensor of shape [batch_size, L, D] where L is the number of tokens
(including special tokens) and D is the hidden dimension (assumed to be 1280 here).

•	Simple Average Embedding:
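We average the token embeddings corresponding to the actual amino-acid sequence (excluding the first special token), because the start token might not capture the detailed information in the sequence. We can then optionally project this vector from 1280 to 1024 dimensions with a linear layer. Note that a freshly created `nn.Linear` applies random weights, so the layer must be trained (or loaded from trained weights) before the projected vector is meaningful.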
	
 •	Advanced Embeddings:
 	1.	CLS Token Only:
Use only the model’s [CLS] (or start-of-sequence) token as the sequence representation. This is the standard approach in many transformer models for classification tasks, though it sometimes loses finer-grained information.

	2.	Mean Pooling:
Compute the average of all token representations (typically excluding the special tokens). This approach tends to capture the overall content of the sequence and is often more robust than the CLS token alone.

	3.	CLS, MIN, MAX, MEAN Concatenation:
Concatenate the CLS token with statistics computed over the sequence tokens:
	•	MIN: Elementwise minimum over token embeddings.
	•	MAX: Elementwise maximum.
	•	MEAN: Elementwise average.
This yields a vector with 4×(hidden dimension) dimensions (e.g., 4×1280 = 5120 dims for an ESM model with 1280 hidden units). This strategy is popular because it provides multiple views of the sequence’s distribution.

	4.	CLS, MEAN, and Standard Deviation Concatenation:
Instead of using min and max, you can compute the elementwise standard deviation of the token embeddings along with the CLS token and the mean. This produces a 3×(hidden dimension) vector (e.g., 3×1280 = 3840 dims) and captures both central tendency and dispersion.

	5.	Layer-wise Concatenation:
Instead of taking only the last layer’s output, you can concatenate representations from multiple layers (e.g., last and penultimate layers) to capture multi-scale information. For example, concatenating two layers would double the embedding size.

	6.	Attention-based Pooling:
Learn attention weights for each token and compute a weighted average of token embeddings. You can combine this with the CLS token or other summary statistics.

    In the cells below we implement the CLS, MIN, MAX, MEAN concatenation (option 3 above). We concatenate:

    •	The CLS token (position 0),

    •	The elementwise minimum across the sequence tokens,

    •	The elementwise maximum across the sequence tokens,

    •	The elementwise mean across the sequence tokens.

This yields a vector of size 4 × 1280 = 5120 dimensions. This approach leverages multiple statistical summaries of the token embeddings, potentially capturing richer information about the sequence.

In [7]:
# --- Option 1: Simple Average Embedding (excluding CLS token) ---
# Mean-pool the residue positions of every sequence. Index 0 holds the
# start-of-sequence token, so the residues occupy indices 1..len(seq).
simple_embeddings = [
    token_representations[i, 1:len(seq) + 1].mean(0)
    for i, (name, seq) in enumerate(data)
]

# Optionally project to a fixed dimension (e.g., 1024) if desired.
# NOTE: the projection layer is created here with randomly initialised,
# untrained weights, so the projected values change on every run and carry no
# learned meaning until the layer is trained or loaded from a checkpoint.
hidden_dim = simple_embeddings[0].shape[0]  # e.g., 1280
target_dim = 1024
if hidden_dim != target_dim:
    pooled = torch.stack(simple_embeddings)  # [num_sequences, hidden_dim]
    # Place the layer on the device the embeddings actually live on instead of
    # guessing from CUDA availability, which avoids a device mismatch.
    projector = nn.Linear(hidden_dim, target_dim).to(pooled.device)
    # No gradients are needed: project the whole batch once without building
    # an autograd graph that would only be detached afterwards.
    with torch.no_grad():
        projected = projector(pooled)
    simple_embeddings_1024 = [row.cpu().numpy() for row in projected]
else:
    simple_embeddings_1024 = [e.cpu().numpy() for e in simple_embeddings]

print("Simple embedding (projected to 1024 dims) shape:", simple_embeddings_1024[0].shape)

Simple embedding (projected to 1024 dims) shape: (1024,)


In [8]:
# --- Option 2: Advanced Embedding: CLS, MIN, MAX, MEAN Concatenation ---
# For each sequence, join the start (CLS) token vector with the elementwise
# minimum, maximum and mean taken over the residue positions only.
advanced_embeddings = []
for idx, (_label, residues) in enumerate(data):
    per_sequence = token_representations[idx]
    n_residues = len(residues)

    start_token = per_sequence[0]                # CLS sits at position 0
    residue_block = per_sequence[1:n_residues + 1]  # skip the special tokens

    pooled = torch.cat(
        [
            start_token,
            residue_block.min(dim=0).values,
            residue_block.max(dim=0).values,
            residue_block.mean(dim=0),
        ],
        dim=0,
    )
    advanced_embeddings.append(pooled.cpu().numpy())

In [9]:
# Inspect the shape of the concatenated embedding stored for the second sequence.
advanced_embeddings[1].shape

(5120,)

In [10]:
# ---------------------------
# Option 1: CLS, MIN, MAX, MEAN Concatenation (4 x D dimensions)
# ---------------------------
def _cls_min_max_mean(per_sequence, n_residues):
    """Return [CLS | min | max | mean] for one sequence as a NumPy vector.

    `per_sequence` is the token-representation matrix of a single sequence;
    the statistics are taken over positions 1..n_residues so that the special
    tokens do not contribute.
    """
    residue_block = per_sequence[1:n_residues + 1]
    parts = [
        per_sequence[0],                    # CLS token at index 0
        residue_block.min(dim=0).values,
        residue_block.max(dim=0).values,
        residue_block.mean(dim=0),
    ]
    return torch.cat(parts, dim=0).cpu().numpy()


advanced_embeddings_concat = [
    _cls_min_max_mean(token_representations[i], len(seq))
    for i, (name, seq) in enumerate(data)
]

print("Advanced (CLS, MIN, MAX, MEAN) embedding shape:", advanced_embeddings_concat[0].shape)
# Expected shape: 4 * D, e.g., 4 * 1280 = 5120 dims.

# ---------------------------
# Option 2: CLS, MEAN, STD Concatenation (3 x D dimensions)
# ---------------------------
# For each sequence, join the CLS token with the elementwise mean and standard
# deviation computed over the residue positions (special tokens excluded).
advanced_embeddings_std = []
for i, (name, seq) in enumerate(data):
    cls_token = token_representations[i, 0]  # CLS token at index 0
    seq_tokens = token_representations[i, 1:len(seq)+1]
    mean_vec = seq_tokens.mean(dim=0)
    if seq_tokens.shape[0] > 1:
        std_vec = seq_tokens.std(dim=0)
    else:
        # Tensor.std applies Bessel's correction by default, which yields NaN
        # for a single residue; a lone token has zero dispersion, so use 0.
        std_vec = torch.zeros_like(mean_vec)
    concat_vec = torch.cat([cls_token, mean_vec, std_vec], dim=0)
    advanced_embeddings_std.append(concat_vec.cpu().numpy())

print("Advanced (CLS, MEAN, STD) embedding shape:", advanced_embeddings_std[0].shape)
# Expected shape: 3 * D, e.g., 3 * 1280 = 3840 dims.

Advanced (CLS, MIN, MAX, MEAN) embedding shape: (5120,)
Advanced (CLS, MEAN, STD) embedding shape: (3840,)


In [13]:

# -----------------------
# 1. Generate Sequence Embeddings with ESM-2
# -----------------------

# Fetch the esm2_t33_650M_UR50D checkpoint plus its alphabet and put the
# network into evaluation mode.
esm2_model, esm2_alphabet = esm.pretrained.esm2_t33_650M_UR50D()
esm2_model.eval()
esm2_batch_converter = esm2_alphabet.get_batch_converter()

# Single example protein.
# NOTE: this rebinds `data` and `batch_tokens`, which earlier cells also use.
sequence = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ"
data = [("protein1", sequence)]

# Tokenise, then move model and tokens to the GPU when one is available.
batch_labels, batch_strs, batch_tokens = esm2_batch_converter(data)
if torch.cuda.is_available():
    batch_tokens = batch_tokens.to("cuda")
    esm2_model = esm2_model.to("cuda")

# Forward pass without gradients, requesting only the last layer's output.
final_layer = esm2_model.num_layers
with torch.no_grad():
    esm2_results = esm2_model(
        batch_tokens,
        repr_layers=[final_layer],
        return_contacts=False,
    )

# Mean over the residue positions (1..len(sequence)) of the only batch item
# gives one fixed-size vector for the whole sequence.
residue_reps = esm2_results["representations"][final_layer][0, 1:len(sequence) + 1]
seq_embedding = residue_reps.mean(0).cpu().numpy()
print("ESM-2 sequence embedding shape:", seq_embedding.shape)


# -----------------------
# 2. Generate Structure Prediction (and Structural Features) with ESMFold
# -----------------------

# Load the ESMFold model (ESMFold v1). Unlike the ESM-2 loaders,
# esm.pretrained.esmfold_v1() returns only the model -- there is no alphabet
# to unpack, so tuple-unpacking the result raises.
esmfold_model = esm.pretrained.esmfold_v1()
esmfold_model.eval()
if torch.cuda.is_available():
    # Same device policy as the ESM-2 model in section 1.
    esmfold_model = esmfold_model.to("cuda")

# forward() expects already-encoded residue tensors; infer() is the entry
# point that accepts a raw amino-acid string (or a list of strings).
with torch.no_grad():
    fold_result = esmfold_model.infer(sequence)

# The output dictionary has no 'coords' key. Atom coordinates are stored under
# "positions" in atom14 layout, stacked over the structure-module blocks:
# [num_blocks, batch, L, 14, 3]. Take the final block, the only batch item and
# the C-alpha atom (index 1 in the atom14 order N, CA, C, O, ...).
# NOTE(review): confirm the key name and layout against the installed
# fair-esm version.
ATOM14_CA_INDEX = 1
coords = fold_result["positions"][-1, 0, :, ATOM14_CA_INDEX, :].cpu().numpy()  # [L, 3]
print("Predicted structure coordinates shape:", coords.shape)

# Optionally, ESMFold's per-residue trunk representation can be aggregated
# into a "structural embedding", e.g.:
# struct_embedding = fold_result["s_s"][0].mean(dim=0).cpu().numpy()
# (presumably shape [hidden]; TODO confirm that "s_s" is exposed by the
# installed version before relying on it.)
  
# -----------------------
# 3. (Optional) Fuse Sequence and Structure Embeddings
# -----------------------
import numpy as np  # already imported in the first cell; harmless repeat

# One common strategy is to concatenate the sequence embedding and a structural embedding.
# For demonstration, the structural part here is just the mean coordinate per axis.
# You might also compute summary statistics (e.g., mean, variance) over the coordinates.
# NOTE(review): a mean coordinate presumably depends on the arbitrary frame of
# the predicted structure, so treat it as a placeholder, not a real descriptor.

# np.concatenate cannot mix a NumPy vector with a (possibly GPU-resident)
# torch tensor, so bring the coordinates to a NumPy array first.
if torch.is_tensor(coords):
    coords_np = coords.detach().cpu().numpy()
else:
    coords_np = np.asarray(coords)

struct_embedding = coords_np.mean(axis=0)  # Simple example: mean coordinate per dimension.
print("Mean structural embedding shape:", struct_embedding.shape)

# Concatenate both embeddings into one feature vector.
combined_embedding = np.concatenate([seq_embedding, struct_embedding])
print("Combined embedding shape:", combined_embedding.shape)

ESM-2 sequence embedding shape: (1280,)


ModuleNotFoundError: No module named 'tree'

In [13]:
!pip install git+https://github.com/aqlaboratory/openfold.git

Collecting git+https://github.com/aqlaboratory/openfold.git
  Cloning https://github.com/aqlaboratory/openfold.git to /private/var/folders/lg/dh15rq991nb0srz_jbgtk8f80000gn/T/pip-req-build-vdii9_3i
  Running command git clone --filter=blob:none --quiet https://github.com/aqlaboratory/openfold.git /private/var/folders/lg/dh15rq991nb0srz_jbgtk8f80000gn/T/pip-req-build-vdii9_3i
  Resolved https://github.com/aqlaboratory/openfold.git to commit a1192c8d3a0f3004b1284aaf6437681e6b558c10
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: openfold
  Building wheel for openfold (setup.py) ... [?25ldone
[?25h  Created wheel for openfold: filename=openfold-2.0.0-cp310-cp310-macosx_14_0_arm64.whl size=316729 sha256=5f970f398e5006f5316f3c8dd9a14840a7cf1b8adc11d36cdcd53495a68626be
  Stored in directory: /private/var/folders/lg/dh15rq991nb0srz_jbgtk8f80000gn/T/pip-ephem-wheel-cache-4oav79mj/wheels/c8/68/f7/33ea4cb0f20d1303e1d5cbbdd73fdd0d5147843868ca49a9af
S