In [13]:
import esm
import numpy as np
import pandas as pd
import torch
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [14]:
# Input table and subsample size, named here so they are easy to find and change.
DATA_PATH = r"C:\Users\MSI\Downloads\destress_data\destress_data\destress_data_af2.csv"
N_SAMPLES = 20000  # start small while testing GPU stability

# Keep only the sequence and the regression target, dropping incomplete rows.
df = pd.read_csv(DATA_PATH)
df = df[['full_sequence', 'aggrescan3d_total_value']].dropna()

# Clamp the sample size: DataFrame.sample raises ValueError when n exceeds
# the number of rows (sampling is without replacement by default).
df_small = df.sample(n=min(N_SAMPLES, len(df)), random_state=42).reset_index(drop=True)


df_small.head()


Unnamed: 0,full_sequence,aggrescan3d_total_value
0,MTRLTTLVAAILTLSQRASLGHARELRASKIFHSARDVDSEYDYVI...,-335.1561
1,MSKAQQLMARDQQAARTEEEARRKRAEMRQSYGNKFSSINLHRMRQ...,-311.1749
2,MEHEMMRKDNTFNNIFKDDIQILLEKTIQYGFIDILKYLLDKLGNN...,-277.79
3,MSLSPAAQRYEEHVLATVSGRPYNKTGNQIFFNDHKNYVPHQNPVK...,-670.0939
4,MSKRNQVSYVRPAEPAFLSRFKERVGYKEGPTVETKKIQPQLPDED...,-245.1053


In [15]:
# Keep only sequences shorter than 700 residues (boolean mask, no helper column).
seq_lengths = df_small["full_sequence"].str.len()
df_small = df_small[seq_lengths < 700]

# Pretrained ESM-2 checkpoint plus the converter that turns
# (label, sequence) pairs into token tensors.
model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()
batch_converter = alphabet.get_batch_converter()

# move model to GPU and put it in evaluation mode
model = model.cuda()
model.eval()


ESM2(
  (embed_tokens): Embedding(33, 320, padding_idx=1)
  (layers): ModuleList(
    (0-5): 6 x TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=320, out_features=320, bias=True)
        (v_proj): Linear(in_features=320, out_features=320, bias=True)
        (q_proj): Linear(in_features=320, out_features=320, bias=True)
        (out_proj): Linear(in_features=320, out_features=320, bias=True)
        (rot_emb): RotaryEmbedding()
      )
      (self_attn_layer_norm): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=320, out_features=1280, bias=True)
      (fc2): Linear(in_features=1280, out_features=320, bias=True)
      (final_layer_norm): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
    )
  )
  (contact_head): ContactPredictionHead(
    (regression): Linear(in_features=120, out_features=1, bias=True)
    (activation): Sigmoid()
  )
  (emb_layer_norm_after): LayerNorm((320,), eps=1e-05, elementwis

In [17]:
def embed_sequence(seq, repr_layer=6):
    """Return a mean-pooled ESM embedding for a single protein sequence.

    Relies on the notebook-level ``model`` and ``batch_converter``.

    Parameters
    ----------
    seq : str
        Amino-acid sequence to embed.
    repr_layer : int, default 6
        Index of the transformer layer whose representations are pooled.

    Returns
    -------
    numpy.ndarray or None
        1-D array holding the average of the per-position representations
        (first and last token excluded), or ``None`` when the sequence is
        empty or the forward pass ran out of memory.

    Raises
    ------
    RuntimeError
        Any runtime error other than an out-of-memory condition is re-raised.
    """
    if not seq:
        # With no residues between the first and last token the mean below
        # would be all-NaN, so report "could not embed" instead.
        print("⚠️ Empty sequence — skipping")
        return None

    # prepare data for tokenizer
    _, _, tokens = batch_converter([("protein", seq)])

    # Follow whatever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device

    try:
        with torch.no_grad():
            tokens = tokens.to(device, non_blocking=True)
            output = model(tokens, repr_layers=[repr_layer])
            reps = output["representations"][repr_layer][0]

            # average over all positions excluding CLS/EOS
            # (no explicit cache flush here: emptying the CUDA cache on every
            # call only slows the loop; it is done on OOM below)
            return reps[1:-1].mean(0).cpu().numpy()

    except RuntimeError as e:
        if "out of memory" not in str(e):
            raise  # bare raise keeps the original traceback
        print("⚠️ CUDA OOM — skipping long sequence")
        torch.cuda.empty_cache()
        return None


In [18]:
embeddings = []
kept_rows = []  # positional indices of the sequences that were actually embedded

for row_pos, seq in enumerate(tqdm(df_small["full_sequence"], desc="Embedding sequences")):
    emb = embed_sequence(seq)
    if emb is not None:
        embeddings.append(emb)
        kept_rows.append(row_pos)

if not embeddings:
    raise RuntimeError("No sequence could be embedded; nothing to stack.")

# Drop the rows whose sequence was skipped so that row i of `df_small`
# still corresponds to row i of `embeddings` (labels are read from
# `df_small` afterwards).
df_small = df_small.iloc[kept_rows].reset_index(drop=True)

embeddings = np.vstack(embeddings)
embeddings.shape


Embedding sequences: 100%|███████████████████████████████████████████████████████| 16929/16929 [28:17<00:00,  9.97it/s]


(16929, 320)

In [19]:
# Targets must line up row-for-row with `embeddings`. Truncating with
# .iloc[:len(embeddings)] would hide a mismatch (e.g. sequences skipped
# during embedding) and pair features with the wrong labels, so fail loudly.
if len(embeddings) != len(df_small):
    raise ValueError(
        f"{len(embeddings)} embeddings but {len(df_small)} rows in df_small; "
        "labels would be misaligned with the features."
    )

y = df_small['aggrescan3d_total_value'].values


In [20]:
# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(embeddings, y, test_size=0.2, random_state=42)

# Linear baseline on the embeddings.
reg = Ridge(alpha=1.0)
reg.fit(X_train, y_train)


0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [22]:
# Score the ridge baseline on the held-out split.
y_pred = reg.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)   # no 'squared' argument
rmse = np.sqrt(mse)

for label, value in (("R2:", r2), ("MSE:", mse), ("RMSE:", rmse)):
    print(label, value)

R2: 0.8146734522918654
MSE: 7039.094826721055
RMSE: 83.89931362485068


In [23]:
# Random-forest baseline on the same train/test split.
# (RandomForestRegressor, r2_score, mean_squared_error and np come from the
# import cell at the top of the notebook.)
rf = RandomForestRegressor(
    n_estimators=400,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

# NOTE: `mse` / `rmse` are reused from the previous evaluation cell and
# now hold the random-forest values.
mse = mean_squared_error(y_test, y_pred_rf)
rmse = np.sqrt(mse)

print("Random Forest Results:")
print("R2:", r2_score(y_test, y_pred_rf))
print("RMSE:", rmse)


Random Forest Results:
R2: 0.7474958242308316
RMSE: 97.93183053064256


In [24]:
# Gradient-boosted trees on the same train/test split.
# (`xgb` comes from the import cell at the top of the notebook.)
xg = xgb.XGBRegressor(
    n_estimators=600,
    learning_rate=0.03,
    max_depth=7,
    subsample=0.85,
    colsample_bytree=0.85,
    reg_lambda=1.0,
    objective="reg:squarederror",
    tree_method="hist",     # GPU not used here; safe for CPU
    random_state=42
)

xg.fit(X_train, y_train)

y_pred_xg = xg.predict(X_test)

# NOTE: `mse` / `rmse` are reused from the earlier evaluation cells and
# now hold the XGBoost values.
mse = mean_squared_error(y_test, y_pred_xg)
rmse = np.sqrt(mse)

print("XGBoost Results:")
print("R2:", r2_score(y_test, y_pred_xg))
print("RMSE:", rmse)


XGBoost Results:
R2: 0.8559930967302942
RMSE: 73.95733371873168


In [25]:
# NOTE(review): the literal below still contains the trailing "..." placeholder
# characters, which are passed to the tokenizer as-is; replace the whole string
# with a real sequence before trusting the prediction.
test_sequence = "MKTLLILALLAVALA..."  # YOUR SEQUENCE HERE

# embed_sequence returns None when it could not embed the input; check before
# reshaping so the failure is reported clearly instead of as an AttributeError.
test_emb = embed_sequence(test_sequence)
if test_emb is None:
    raise RuntimeError("embed_sequence returned None for test_sequence; cannot predict.")

seq_emb = test_emb.reshape(1, -1)  # the regressor expects a 2-D (1, n_features) array
pred = xg.predict(seq_emb)
print("Predicted Aggrescan3D total value:", pred[0])


Predicted Aggrescan3D total value: 137.06725
