# AggrescanAI  
A user-friendly Colab notebook for calculating aggregation propensities using protein language models and deep neural networks.
- Input: a protein sequence
- Output: aggregation propensity profile

In [None]:
#@title Input sequence
# Protein sequence to analyse, given as a single string; the trailing
# "#@param" annotation exposes it as an editable text field in the Colab form.
input_sequence = """MDVFMKGLSKAKEGVVAAAEKTKQGVAEAAGKTKEGVLYVGSKTKEGVVHGVATVAEKTKEQVTNVGGAVVTGVTAVAQKTVEGAGSIAAATGFVKKDQLGKNEEGAPQEGILEDMPVDPDNEAYEMPSEEGYQDYEPEA"""  #@param {type:"string"}

In [None]:
#@title Download models from HuggingFace
import os
import urllib.request

# Local directory that the model-loading cell reads the ".h5" files from.
os.makedirs("models", exist_ok=True)

# "resolve/<revision>" serves raw file contents; the "tree/<revision>" route
# only returns the repository's HTML file listing.
base_url = "https://huggingface.co/alvaro-2/aggrescanai/resolve/main"

# Repository-relative paths of the balanced models (numbered 1 to 33).
model_names = [
    f"balanced_models/balanced_model_1_1_{i}.h5" for i in range(1, 34)
]

# Repository-relative paths of the homology models (numbered 1 to 5).
homology_model_names = [
    f"homology_models/cpad_hotidp90_model_cv_{i}.h5" for i in range(1, 6)
]

# Both groups are fetched the same way, so a single loop handles them.
for fname in model_names + homology_model_names:
    model_url = f"{base_url}/{fname}"
    model_path = f"models/{os.path.basename(fname)}"
    # Files already present from a previous run are not downloaded again.
    if not os.path.exists(model_path):
        print(f"Downloading {fname}...")
        # Download under a temporary name and rename on success, so an
        # interrupted transfer never leaves a truncated file that a later
        # run would mistake for a complete model.
        partial_path = f"{model_path}.part"
        urllib.request.urlretrieve(model_url, partial_path)
        os.replace(partial_path, model_path)

In [None]:
#@title Load models
from tensorflow.keras.models import load_model
def _load_downloaded(remote_names):
    """Load the Keras model stored under ``models/`` for each remote name.

    Only the base name of each entry is used to locate the local file.
    Models are loaded with ``compile=False``.
    """
    loaded = []
    for remote_name in remote_names:
        local_file = f"models/{os.path.basename(remote_name)}"
        loaded.append(load_model(local_file, compile=False))
    return loaded


# One list per model group, in the same order as the corresponding name lists.
models = _load_downloaded(model_names)
homology_models = _load_downloaded(homology_model_names)

In [None]:
#@title Generate embedding representations
import torch
from transformers import T5Tokenizer, T5EncoderModel
from tqdm import tqdm
import pandas as pd
import re

# Hugging Face identifier of the ProtT5 encoder checkpoint used for embeddings.
transformer_link = "Rostlab/prot_t5_xl_half_uniref50-enc"

# Run on the first CUDA GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Load the encoder, move it to the chosen device and switch it to eval mode.
model = T5EncoderModel.from_pretrained(transformer_link, output_hidden_states=True)
model = model.to(device)
model = model.eval()

# Tokenizer for the same checkpoint; input case is preserved.
tokenizer = T5Tokenizer.from_pretrained(
    transformer_link,
    do_lower_case=False,
    legacy=False,
)

def generate_embeddings(sequence: str):
    """Embed a single sequence with the module-level ProtT5 encoder.

    The characters of ``sequence`` are separated by single spaces before
    tokenization, and the encoder is run without gradient tracking.

    Args:
        sequence: The sequence to embed, one character per residue.

    Returns:
        A NumPy array holding the encoder's last hidden state for the first
        (only) batch entry, with the final token position removed.
        NOTE(review): the dropped position is presumably the end-of-sequence
        token added by ``add_special_tokens=True`` -- confirm.
    """
    residue_text = " ".join(sequence)
    encoded = tokenizer(
        residue_text,
        add_special_tokens=True,
        return_tensors="pt",
    ).to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        encoder_output = model(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
        )

    first_entry = encoder_output.last_hidden_state[0]
    return first_entry[:-1].cpu().numpy()
