# AggrescanAI  
A user-friendly Colab notebook for calculating aggregation propensities using protein language models and deep neural networks.
- Input: a protein sequence
- Output: aggregation propensity profile

In [None]:
#@title Input sequence
# Protein sequence to analyse, given as one string of one-letter residue codes.
# The trailing `#@param` marker is kept on the assignment line; it appears to be
# a Colab form-field annotation, so it should not be moved or reformatted.
input_sequence = "MDVFMKGLSKAKEGVVAAAEKTKQGVAEAAGKTKEGVLYVGSKTKEGVVHGVATVAEKTKEQVTNVGGAVVTGVTAVAQKTVEGAGSIAAATGFVKKDQLGKNEEGAPQEGILEDMPVDPDNEAYEMPSEEGYQDYEPEA"  #@param {type:"string"}
# Wrap the sequence in a one-row DataFrame with a single 'sequence' column so
# that later cells can attach per-sequence results as additional columns.
import pandas as pd
df = pd.DataFrame({'sequence': [input_sequence]})

In [None]:
#@title Download models from HuggingFace
import os
import urllib.request
from tqdm import tqdm

os.makedirs("models", exist_ok=True)
base_url = "https://huggingface.co/alvaro-2/aggrescanai/resolve/main"

# Repository-relative paths of the model files to fetch. Both lists are reused
# by the model-loading cell, so their names and contents must stay as they are.
model_names = [
    f"balanced_models/balanced_model_1_1_{i}.h5" for i in range(1, 34)
]
homology_model_names = [
    f"homology_models/cpad_hotidp90_model_cv_{i}.h5" for i in range(1, 6)
]


def _download_models(names, target_dir):
    """Download every file in ``names`` from ``base_url`` into ``target_dir``.

    Files already present in ``target_dir`` are skipped, so re-running the cell
    only fetches what is missing.

    Args:
        names: repository-relative file paths (appended to ``base_url``).
        target_dir: local directory that receives the files; created if needed.

    Raises:
        urllib.error.URLError: if a download fails (propagated from
            ``urllib.request.urlretrieve``).
    """
    os.makedirs(target_dir, exist_ok=True)
    for fname in tqdm(names):
        model_path = f"{target_dir}/{os.path.basename(fname)}"
        if os.path.exists(model_path):
            continue
        # Download to a temporary name and rename only on success. Writing
        # directly to model_path would leave a truncated file behind after an
        # interrupted download, and the existence check above would then skip
        # it on every later run.
        partial_path = f"{model_path}.part"
        try:
            urllib.request.urlretrieve(f"{base_url}/{fname}", partial_path)
            os.replace(partial_path, model_path)
        finally:
            # After a successful rename the partial file no longer exists, so
            # this only cleans up after a failed or interrupted download.
            if os.path.exists(partial_path):
                os.remove(partial_path)


# Download balanced models
print("Downloading balanced models...")
_download_models(model_names, "models/balanced_models")

# Download homology models
print("Downloading homology models...")
_download_models(homology_model_names, "models/homology_models")

In [None]:
#@title Load models
from tensorflow.keras.models import load_model

# Balanced ensemble: one Keras model per entry of `model_names`, read from the
# local download directory. compile=False loads the models without compiling
# them, which is enough when they are only used for prediction.
models = []
for fname in model_names:
    weights_path = f"models/balanced_models/{os.path.basename(fname)}"
    models.append(load_model(weights_path, compile=False))

# Homology ensemble: same procedure for the entries of `homology_model_names`.
homology_models = []
for fname in homology_model_names:
    weights_path = f"models/homology_models/{os.path.basename(fname)}"
    homology_models.append(load_model(weights_path, compile=False))

In [None]:
#@title Generate embedding representations
import torch
from transformers import T5Tokenizer, T5EncoderModel
from tqdm import tqdm
import re

# Load ProtT5 tokenizer and model
# Hugging Face identifier of the ProtT5 encoder checkpoint.
transformer_link = "Rostlab/prot_t5_xl_half_uniref50-enc"

# Run on the first CUDA GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Encoder: load the weights, move them to the chosen device and switch to
# evaluation mode (the model is only used for inference here).
model = T5EncoderModel.from_pretrained(transformer_link, output_hidden_states=True)
model = model.to(device)
model = model.eval()

# Tokenizer matching the checkpoint; residue letters keep their original case.
tokenizer = T5Tokenizer.from_pretrained(
    transformer_link,
    do_lower_case=False,
    legacy=False,
)

def generate_embeddings(sequence: str):
    """Return the per-residue ProtT5 embeddings of one protein sequence.

    Args:
        sequence: protein sequence as a string of one-letter residue codes.

    Returns:
        A numpy array holding the encoder's last hidden state for the sequence,
        one row per token, with the final token position removed.
    """
    # ProtT5 preprocessing as described on the model card: rare/ambiguous
    # residues (U, Z, O, B) are mapped to X before tokenization. The
    # substitution is one-to-one, so the sequence length is unchanged.
    normalized = re.sub(r"[UZOB]", "X", sequence)
    # The tokenizer expects residues separated by single spaces.
    spaced = " ".join(normalized)
    ids = tokenizer(spaced, add_special_tokens=True, return_tensors="pt").to(device)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        out = model(input_ids=ids["input_ids"], attention_mask=ids["attention_mask"])
    # Take the single batch item and drop the last position, which is presumably
    # the end-of-sequence token added because add_special_tokens=True.
    return out.last_hidden_state[0, :-1].cpu().numpy()

# Register tqdm's pandas integration so `progress_map` shows a progress bar.
tqdm.pandas(desc="Generating embeddings")
# Compute one embedding array per sequence and store it next to the sequence.
per_sequence_embeddings = df["sequence"].progress_map(generate_embeddings)
df["embedding"] = per_sequence_embeddings

In [None]:
#@title Apply soft-voting function
import numpy as np
def avg_probs(models, X, batch_size=32):
    """Soft-vote an ensemble by averaging the models' predicted probabilities.

    Args:
        models: iterable of models exposing ``predict(X, batch_size=...)``
            (e.g. ``keras.Model`` instances).
        X: numpy array of shape (n_samples, n_features).
        batch_size: batch size forwarded to each model's ``predict`` call.

    Returns:
        numpy array of shape (n_samples,) with the mean predicted probability
        across all models.

    Raises:
        ValueError: if ``models`` is empty.
    """
    # predict is expected to return shape (n_samples, 1) or (n_samples,);
    # flatten each result so every model contributes a (n_samples,) vector.
    per_model = [
        np.asarray(m.predict(X, batch_size=batch_size)).reshape(-1)
        for m in models
    ]
    # Checked after collecting the predictions so that generators are handled
    # too; np.stack would otherwise fail with a message unrelated to the cause.
    if not per_model:
        raise ValueError("avg_probs requires at least one model")
    stacked = np.stack(per_model, axis=0)  # (n_models, n_samples)
    return np.mean(stacked, axis=0)        # (n_samples,)