# AggrescanAI  
User-friendly notebook to calculate aggregation propensities using protein language models and deep neural networks.
- Input: a UniProt ID or a protein sequence.
- Output: aggregation propensity profile.  
- How? Just hit `Runtime` -> `Run all` or press ctrl+F9

# 1. Set up

In [None]:
#@title Input protein data { display-mode: "form" }

#@markdown You can also use the example sequence provided below.
# uniprot_id = ""  #@param {type:"string"}
# input_sequence = "MDVFMKGLSKAKEGVVAAAEKTKQGVAEAAGKTKEGVLYVGSKTKEGVVHGVATVAEKTKEQVTNVGGAVVTGVTAVAQKTVEGAGSIAAATGFVKKDQLGKNEEGAPQEGILEDMPVDPDNEAYEMPSEEGYQDYEPEA"  #@param {type:"string"}
use_example= False  #@param {type:"boolean"}
#@markdown Enter an UniProt ID in the box below. You can input isoforms as well! E.g.: P10636-8
uniprot_id = ""  #@param {type:"string"}
#@markdown Instead you can input a protein sequence directly in the box below. If you leave this empty, the script will attempt to fetch the sequence using the UniProt ID provided above.
input_sequence = ""  #@param {type:"string"}

# Ticking `use_example` discards whatever was typed in the two boxes above and
# substitutes the built-in example ID and sequence.
if use_example:
    # Example sequence for testing
    uniprot_id = "P37840"  # Example UniProt ID
    input_sequence = "MDVFMKGLSKAKEGVVAAAEKTKQGVAEAAGKTKEGVLYVGSKTKEGVVHGVATVAEKTKEQVTNVGGAVVTGVTAVAQKTVEGAGSIAAATGFVKKDQLGKNEEGAPQEGILEDMPVDPDNEAYEMPSEEGYQDYEPEA"
#@markdown ---
# Wall-clock start of the run; the final cell subtracts it from its own
# timestamp to report the total execution time.
import time
start_time = time.time()

In [None]:
#@title Retrieve sequence from UniProt if needed { display-mode: "form" }
#@markdown The sequence is fetched from UniProt only when the sequence box above was left empty.

import pandas as pd

# Normalise the raw form values first, so that whitespace-only input counts as
# empty and pasted sequences lose every space, tab and line break.
uniprot_id = uniprot_id.strip()
input_sequence = "".join(input_sequence.split()).upper()

# A sequence typed into the form takes precedence; UniProt is queried only
# when no sequence was supplied and an ID is available.
if not use_example and uniprot_id and not input_sequence:
    import requests
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    # Bounded wait so an unresponsive server cannot hang the notebook forever.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        raise ValueError(f"Failed to fetch sequence for UniProt ID {uniprot_id}. Status code: {response.status_code}")
    # FASTA layout: the first line is the header, the remaining lines are the
    # sequence wrapped over several lines.
    fasta_lines = response.text.strip().splitlines()
    input_sequence = "".join("".join(fasta_lines[1:]).split()).upper()

# Stop here with a clear message instead of feeding an empty sequence to the
# embedding and prediction cells.
if not input_sequence:
    raise ValueError(
        "No protein sequence available: enter a sequence, provide a valid UniProt ID, "
        "or tick use_example."
    )

# To dataframe: one row per input protein
df = pd.DataFrame({'uniprot_id': [uniprot_id], 'sequence': [input_sequence]})

In [None]:
#@title Download models from HuggingFace { display-mode: "form" }
#@markdown This fetches the AggrescanAI models needed for prediction.
import os
import urllib.request
from tqdm import tqdm

base_url = "https://huggingface.co/alvaro-2/aggrescanai/resolve/main"
# Paths of the model files relative to `base_url`.
model_names = [
    f"balanced_models/balanced_model_1_1_{i}.h5" for i in range(1, 34)
]
homology_model_names = [
    f"homology_models/cpad_hotidp90_model_cv_{i}.h5" for i in range(1, 6)
]


def _download_models(names, target_dir):
    """Download every file in `names` from `base_url` into `target_dir`.

    names: iterable of paths relative to `base_url`
    target_dir: local directory (created if missing) that receives the files

    Files already present in `target_dir` are skipped.
    """
    os.makedirs(target_dir, exist_ok=True)
    for fname in tqdm(names):
        model_path = f"{target_dir}/{os.path.basename(fname)}"
        if os.path.exists(model_path):
            continue
        # Download under a temporary name and rename only on success, so an
        # interrupted transfer never leaves a truncated file that the
        # existence check above would later mistake for a complete model.
        partial_path = f"{model_path}.part"
        urllib.request.urlretrieve(f"{base_url}/{fname}", partial_path)
        os.replace(partial_path, model_path)


os.makedirs("models", exist_ok=True)

# Download balanced models
print("Downloading balanced models...")
_download_models(model_names, "models/balanced_models")

# Download homology models
print("Downloading homology models...")
_download_models(homology_model_names, "models/homology_models")

# 2. Predict

In [None]:
%%capture
#@title Load models { display-mode: "form" }
#@markdown This loads the AggrescanAI models into memory for prediction.
from tensorflow.keras.models import load_model

# Balanced family: one network per entry of `model_names`, loaded without
# compiling because the models are only used for inference here.
models = []
for fname in model_names:
    h5_path = f"models/balanced_models/{os.path.basename(fname)}"
    models.append(load_model(h5_path, compile= False))

# Homology family: one network per entry of `homology_model_names`.
homology_models = []
for fname in homology_model_names:
    h5_path = f"models/homology_models/{os.path.basename(fname)}"
    homology_models.append(load_model(h5_path, compile= False))

In [None]:
#@title Generate embedding representations { display-mode: "form" }
#@markdown This generates embedding representations for the input protein sequence using ProtT5.
import torch
from transformers import T5Tokenizer, T5EncoderModel
from tqdm import tqdm
import re

# ProtT5 checkpoint name, shared by the encoder and its tokenizer.
transformer_link = "Rostlab/prot_t5_xl_half_uniref50-enc"

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Load the encoder, move it to the chosen device and switch it to evaluation mode.
model = T5EncoderModel.from_pretrained(transformer_link, output_hidden_states=True)
model = model.to(device)
model = model.eval()

tokenizer = T5Tokenizer.from_pretrained(transformer_link, do_lower_case=False, legacy=False)

def generate_embeddings(sequence: str):
    """
    Return the per-residue ProtT5 embeddings of one protein sequence.

    sequence: amino-acid sequence in one-letter code
    returns: numpy array with one row per residue, taken from the encoder's
             last hidden state (downstream code documents it as (L, 1024))
    """
    # The ProtT5 model card prescribes mapping the rare/ambiguous residues
    # U, Z, O and B to X before tokenisation. The substitution is one-to-one,
    # so positions stay aligned with the input sequence.
    normalized = re.sub(r"[UZOB]", "X", sequence)
    # The tokenizer expects residues separated by single spaces.
    spaced = " ".join(normalized)
    ids = tokenizer(spaced, add_special_tokens=True, return_tensors="pt").to(device)
    with torch.no_grad():
        out = model(input_ids=ids["input_ids"], attention_mask=ids["attention_mask"])
    # Drop the final position: it belongs to the special token appended by the
    # tokenizer (add_special_tokens=True), not to a residue.
    return out.last_hidden_state[0, :-1].cpu().numpy()

# Enable tqdm's pandas integration (provides `progress_map`), then compute one
# embedding array per row of `df` and store it in a new "embedding" column.
tqdm.pandas(desc="Generating embeddings")
df["embedding"] = df["sequence"].progress_map(generate_embeddings)

In [None]:
#@title Run Predictions { display-mode: "form" }
#@markdown This runs the AggrescanAI models on the generated embeddings to predict aggregation probabilities.

# Apply soft-voting function
import numpy as np
def avg_probs(models, X, batch_size=32):
    """
    Soft-vote a list of models by averaging their predicted probabilities.

    models: list of keras.Model
    X: numpy array of shape (n_samples, n_features)
    batch_size: forwarded unchanged to each model's `predict`
    returns: numpy vector (n_samples,) with mean probability
    """
    # predict may return (n_samples, 1) or (n_samples,); flatten each result
    # so both layouts stack the same way.
    per_model = [
        m.predict(X, batch_size=batch_size).reshape(-1)
        for m in models
    ]
    # Rows are models, columns are samples; average across the model axis.
    return np.stack(per_model, axis=0).mean(axis=0)

# Ensemble function
def ensemble_meta_probs(embedding, weights=(0.1, 0.9)):
    """
    Blend the two model families into a single probability vector.

    embedding: np.array of shape (L, 1024)
    weights: pair (weight of the homology family, weight of the balanced family)
    returns: probability vector of shape (L,)
    """
    # Per-family averages, each of shape (L,): homology family first, then
    # the balanced family.
    family_means = (
        avg_probs(homology_models, embedding),
        avg_probs(models, embedding),
    )
    # Weighted soft-voting across the two families.
    return weights[0] * family_means[0] + weights[1] * family_means[1]

# Re-register tqdm's pandas integration with a new progress-bar label, then
# turn each stored embedding into its ensemble probability vector.
tqdm.pandas(desc="Computing probabilities")
df["prob_vector"] = df["embedding"].progress_map(ensemble_meta_probs)

# 3. Results

In [None]:
#@title Interactive table with AggrescanAI scores { display-mode: "form" }
#@markdown Download the results as a CSV file.
import matplotlib.pyplot as plt
from IPython.display import display

# NOTE: the default `filename` is an f-string evaluated once, when this `def`
# runs, so it captures the value `uniprot_id` has at that moment.
def save_results_as_csv(df, filename= f"aggrescanai_results_{uniprot_id}.csv"):
    """
    Write `df` to a CSV file and trigger a download of that file.

    df: DataFrame to export (written without the index column)
    filename: destination path of the CSV file
    """
    df.to_csv(filename, index=False)
    # `files` is not imported in this cell; the name is looked up when the
    # function is called, by which time a later cell has run
    # `from google.colab import files`.
    files.download(filename)
    print(f"Results saved to {filename}")

# plotly must be imported in this cell: it runs before the plotting cell that
# otherwise provides `go`, so "Run all" would fail here with a NameError.
import plotly.graph_objects as go

# One row per residue: 1-based position, residue letter and predicted score.
result_df = pd.DataFrame({
    "uniprot_id": [uniprot_id] * len(input_sequence),
    "position": list(range(1, len(input_sequence) + 1)),
    "residue": list(input_sequence),
    "aggrescanai_score": df["prob_vector"].values[0]
})

# Interactive table view
fig_table = go.Figure(data=[go.Table(
    header=dict(values=list(result_df.columns), fill_color='paleturquoise', align='left'),
    cells=dict(values=[result_df[col] for col in result_df.columns], fill_color='lavender', align='left')
)])
display(fig_table)

# # Download
# save_results_as_csv(result_df)


# # Save plot and table
# import base64
# from IPython.display import HTML

# # Encode Plotly figure to PNG and create download link
# try:
#     png_bytes = fig.to_image(format="png", scale=2)
#     b64 = base64.b64encode(png_bytes).decode()
#     html_button = f'''
# <a download="aggrescanai_results_{uniprot_id}.png"
#    href="data:image/png;base64,{b64}"
#    style="display:inline-block;padding:10px 20px;background-color:#4CAF50;color:white;font-weight:bold;text-decoration:none;border-radius:5px;margin-top:10px;"
#    target="_blank">
#    📥 Download Plot as PNG
# </a>
# '''
#     display(HTML(html_button))
# except Exception as e:
#    print("Could not generate PNG for download:", e)



# from google.colab import files

# import plotly.io as pio
# fig.write_html(f"aggrescanai_results_{uniprot_id}.html")
# files.download(f"aggrescanai_results_{uniprot_id}.html")
# save_results_as_csv(result_df)
# files.download(f"aggrescanai_results_{uniprot_id}.csv")



# end_time = time.time()
# print(f"Total execution time: {(end_time - start_time:.2f)/60} minutes")

In [None]:
#@title Visualize results { display-mode: "form" }
#@markdown Plot the aggregation prone probability scores across the input sequence.
# Default threshold for aggregation propensity. It is bound as the default of
# `plot_probability_profile`'s `threshold` argument when that function is
# defined, and is drawn as a dashed horizontal line on the profile plot.
threshold = 0.3  #@param {type:"number", min:0, max:1, step:0.1}
import plotly.graph_objects as go
from google.colab import files

def plot_probability_profile(uniprot_id: str, threshold: float = threshold):
    """
    Build the per-position probability plot for one entry of the global `df`.

    uniprot_id: value to look up in the 'uniprot_id' column of `df`
    threshold: height of the dashed red reference line
    returns: a plotly Figure, or None (after printing a message) when
             `uniprot_id` is not present in `df`
    """
    # Select the rows belonging to the requested entry.
    selected = df['uniprot_id'] == uniprot_id
    if not selected.any():
        print(f"UniProt ID '{uniprot_id}' not found.")
        return

    scores = df.loc[selected, 'prob_vector'].values[0]
    # Positions are numbered from 1.
    residue_numbers = np.arange(1, len(scores) + 1)

    fig = go.Figure()

    # Probability profile: one point per position.
    profile_trace = go.Scatter(
        x=residue_numbers,
        y=scores,
        mode='lines+markers',
        name='Probability',
        hovertemplate='Position %{x}<br>Prob: %{y:.3f}<extra></extra>'
    )
    fig.add_trace(profile_trace)

    # Threshold: a horizontal segment from the first to the last position.
    threshold_trace = go.Scatter(
        x=[residue_numbers[0], residue_numbers[-1]],
        y=[threshold, threshold],
        mode='lines',
        name=f'Threshold = {threshold}',
        line=dict(color='red', dash='dash')
    )
    fig.add_trace(threshold_trace)

    fig.update_layout(
        title=f'Aggregation probability profile: {uniprot_id}',
        xaxis_title='Position',
        yaxis_title='Probability',
        hovermode='x unified'
    )
    return fig

fig = plot_probability_profile(uniprot_id)
# plot_probability_profile returns None (after printing a message) when the ID
# is not found in `df`; only render when a figure was actually built.
if fig is not None:
    fig.show()

# # Download figure
# fig.write_html(f"aggrescanai_results_{uniprot_id}.html")
# files.download(f"aggrescanai_results_{uniprot_id}.html")

In [None]:
#@title Download results { display-mode: "form" }
from google.colab import files

import plotly.io as pio

# Export the interactive plot as a standalone HTML file and download it.
html_filename = f"aggrescanai_results_{uniprot_id}.html"
fig.write_html(html_filename)
files.download(html_filename)

# save_results_as_csv writes the CSV and triggers its download itself, so no
# separate files.download call is needed for the CSV.
save_results_as_csv(result_df)

# Report the elapsed wall-clock time in minutes. The format spec must follow
# the complete expression; placing it inside the parentheses is a SyntaxError.
end_time = time.time()
print(f"Total execution time: {(end_time - start_time) / 60:.2f} minutes")