# AggrescanAI  
User-friendly notebook to calculate aggregation propensities using protein language models and deep neural networks.
- **Input**: a UniProt ID or a protein sequence.
- **Output**: aggregation propensity profile table and figure.  
- **How?** Just go to `Runtime` → `Run all` or press `ctrl+F9`

# 1. Set up

In [None]:
#@title Input protein data { display-mode: "form" }

#@markdown - Enter a UniProt ID in the box below. You can input isoforms as well! E.g.: P10636-8
uniprot_id = ""  #@param {type:"string"}
#@markdown - Or input a protein sequence directly in the box below. If you leave this empty, the script will attempt to fetch the sequence using the UniProt ID provided above.
input_sequence = ""  #@param {type:"string"}

#@markdown You can try the example sequence provided below:
use_example= False  #@param {type:"boolean"}

# Any user-supplied ID or sequence wins over the example checkbox.
if uniprot_id or input_sequence:
    use_example = False  # If user provides input, don't use example

# With the checkbox ticked (and no other input), fill both fields with the
# example record so the later cells run without any typing.
if use_example:
    # Example sequence for testing
    uniprot_id = "P37840"  # Example UniProt ID
    input_sequence = "MDVFMKGLSKAKEGVVAAAEKTKQGVAEAAGKTKEGVLYVGSKTKEGVVHGVATVAEKTKEQVTNVGGAVVTGVTAVAQKTVEGAGSIAAATGFVKKDQLGKNEEGAPQEGILEDMPVDPDNEAYEMPSEEGYQDYEPEA"

#@markdown ---
# Wall-clock start; the last cell subtracts it to report the total run time.
import time
start_time = time.time()

In [None]:
#@title Upload FASTA file (optional)
from google.colab import files
from Bio import SeqIO
import pandas as pd

# Example record used when the user asks for the example or enters nothing.
# It is the same ID/sequence pair offered by the input form cell.
EXAMPLE_UNIPROT_ID = "P37840"
EXAMPLE_SEQUENCE = "MDVFMKGLSKAKEGVVAAAEKTKQGVAEAAGKTKEGVLYVGSKTKEGVVHGVATVAEKTKEQVTNVGGAVVTGVTAVAQKTVEGAGSIAAATGFVKKDQLGKNEEGAPQEGILEDMPVDPDNEAYEMPSEEGYQDYEPEA"

uploaded = files.upload()
fasta_sequences = []

# Collect every record of every uploaded file.
for fname in uploaded:
    for record in SeqIO.parse(fname, "fasta"):
        fasta_sequences.append({
            # For pipe-delimited IDs such as "sp|P37840|SYUA_HUMAN", keep the
            # second field; otherwise keep the whole record ID.
            "uniprot_id": record.id.split("|")[1] if "|" in record.id else record.id,
            "sequence": str(record.seq).replace("\n", "").strip().upper()
        })

# If FASTA file, use it instead of manual input
if fasta_sequences:
    df = pd.DataFrame(fasta_sequences)
    print(f"{len(df)} sequences loaded from FASTA.")
else:
    # Use manual input if no FASTA file was uploaded.
    # Fall back to the example only when it was requested or when the user
    # entered neither an ID nor a sequence. A UniProt ID entered on its own must
    # be kept so that the retrieval cell can fetch its sequence.
    nothing_entered = not (input_sequence.strip() or uniprot_id.strip())
    if use_example or nothing_entered:
        # Full example sequence: a truncated placeholder would fail the
        # amino-acid validation performed by the retrieval cell.
        input_sequence = EXAMPLE_SEQUENCE
        uniprot_id = EXAMPLE_UNIPROT_ID
    df = pd.DataFrame([{"uniprot_id": uniprot_id, "sequence": input_sequence.strip().replace("\n", "").upper()}])

In [None]:
#@title Retrieve sequence from UniProt if needed { display-mode: "form" }
#@markdown If you leave the UniProt ID empty, the default UniProt ID will be used.

import pandas as pd

# The 20 standard amino acids; any other character is rejected below.
valid_residues = set('ACDEFGHIKLMNPQRSTVWY')

# Records parsed by the optional FASTA upload cell. Read through globals() so
# this cell also runs when that cell was skipped.
fasta_records = globals().get("fasta_sequences") or []

if fasta_records:
    # An uploaded FASTA file takes precedence over the form fields: keep its
    # records instead of overwriting `df` with the manual input.
    df = pd.DataFrame(fasta_records)
    df["sequence"] = df["sequence"].map(lambda seq: "".join(str(seq).split()).upper())
    for record_id, record_sequence in zip(df["uniprot_id"], df["sequence"]):
        if not record_sequence:
            raise ValueError(f"⚠️ No sequence provided for FASTA record {record_id}.")
        if not valid_residues.issuperset(record_sequence):
            raise ValueError(f"⚠️ Invalid sequence provided for FASTA record {record_id}. Please ensure the sequence contains only valid amino acid characters (A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y).")
    # The results cells select the row whose ID equals `uniprot_id`, so point
    # them (and `input_sequence`) at the first uploaded record.
    uniprot_id = df["uniprot_id"].iloc[0]
    input_sequence = df["sequence"].iloc[0]
else:
    # If a UniProt ID is provided, fetch the sequence from UniProt
    if not use_example and uniprot_id:
        import requests
        uniprot_id = uniprot_id.strip()  # Clean up any whitespace
        url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
        # requests has no default timeout; without one a stalled connection
        # would block the notebook indefinitely.
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            # Parse the FASTA format; splitlines() also copes with "\r\n".
            fasta_lines = response.text.strip().splitlines()
            input_sequence = ''.join(fasta_lines[1:])  # Join all lines except the first (header)
        else:
            raise ValueError(f"Failed to fetch sequence for UniProt ID {uniprot_id}. Status code: {response.status_code}")

    # Clean up the sequence: drop every kind of whitespace (spaces, tabs,
    # newlines, carriage returns) so pasted multi-line input validates.
    input_sequence = "".join(input_sequence.split()).upper()
    if input_sequence == "":
        raise ValueError("⚠️ No sequence provided. Please provide a valid UniProt ID or a protein sequence.")
    # Check if the sequence is valid
    if not valid_residues.issuperset(input_sequence):
        raise ValueError("⚠️ Invalid sequence provided. Please ensure the sequence contains only valid amino acid characters (A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y).")

    # To dataframe
    df = pd.DataFrame({'uniprot_id': [uniprot_id], 'sequence': [input_sequence]})

In [None]:
#@title Download model from HuggingFace { display-mode: "form" }
#@markdown This fetches the AggrescanAI models needed for prediction.
import os
import urllib.request
from tqdm import tqdm

base_url = "https://huggingface.co/alvaro-2/aggrescanai/resolve/main"

# Create the download directory once, up front; this also creates "models".
os.makedirs("models/homology_models", exist_ok=True)

# Download homology models
homology_model_names = [
    f"homology_models/cpad_hotidp90_model_cv_{i}.h5" for i in range(1, 6)
]
print("Downloading homology models...")

for fname in tqdm(homology_model_names):
    model_url = f"{base_url}/{fname}"
    model_path = f"models/homology_models/{os.path.basename(fname)}"
    if os.path.exists(model_path):
        # Already downloaded by a previous run of this cell.
        continue
    # Download to a temporary name and rename only on success. Otherwise an
    # interrupted transfer would leave a truncated .h5 at `model_path` that the
    # existence check above would then skip on every later run.
    partial_path = f"{model_path}.part"
    try:
        urllib.request.urlretrieve(model_url, partial_path)
        os.replace(partial_path, model_path)
    finally:
        # Only present here if the download or rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)

# 2. Predict

In [None]:
%%capture
#@title Load models { display-mode: "form" }
#@markdown This loads the AggrescanAI models into memory for prediction.
from tensorflow.keras.models import load_model

# Load each downloaded model file from the local models directory, in the
# same order as `homology_model_names`. They are loaded with compile=False.
homology_models = []
for model_name in homology_model_names:
    local_path = f"models/homology_models/{os.path.basename(model_name)}"
    homology_models.append(load_model(local_path, compile=False))

In [None]:
#@title Generate embedding representations { display-mode: "form" }
#@markdown This generates embedding representations for the input protein sequence using ProtT5.
import torch
from transformers import T5Tokenizer, T5EncoderModel
from tqdm import tqdm
import re

# Load ProtT5 tokenizer and model
transformer_link = "Rostlab/prot_t5_xl_half_uniref50-enc"

# Run on the first CUDA device when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

tokenizer = T5Tokenizer.from_pretrained(transformer_link, do_lower_case=False, legacy=False)

# Move the encoder to the chosen device and switch it to evaluation mode.
model = T5EncoderModel.from_pretrained(transformer_link, output_hidden_states=True)
model = model.to(device)
model = model.eval()

def generate_embeddings(sequence: str):
    """Embed a single protein sequence with the loaded encoder.

    The residues are joined with single spaces before tokenization and the
    encoder is run without gradient tracking.

    Args:
        sequence: Protein sequence as a string of one-letter residue codes.

    Returns:
        NumPy array with the encoder's last hidden state for the first (only)
        batch entry, minus its final position (presumably the end-of-sequence
        token appended by ``add_special_tokens=True`` -- TODO confirm).
    """
    residue_string = " ".join(sequence)
    encoded = tokenizer(residue_string, add_special_tokens=True, return_tensors="pt").to(device)
    with torch.no_grad():
        encoder_output = model(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
        )
    per_residue = encoder_output.last_hidden_state[0, :-1]
    return per_residue.cpu().numpy()

# Enable tqdm's pandas integration so `progress_map` reports progress.
tqdm.pandas(desc="Generating embeddings")
# Store one embedding array per row in a new "embedding" column.
df["embedding"] = df["sequence"].progress_map(generate_embeddings)

In [None]:
#@title Run Predictions { display-mode: "form" }
#@markdown This runs the AggrescanAI models on the generated embeddings to predict aggregation probabilities.

# Apply soft-voting function
import numpy as np
def avg_probs(models, X, batch_size=32):
    """Soft-vote an ensemble by averaging its per-sample predictions.

    Args:
        models: Iterable of objects exposing ``predict(X, batch_size=...)``
            (Keras models in this notebook).
        X: NumPy array of shape (n_samples, n_features).
        batch_size: Batch size forwarded to every ``predict`` call.

    Returns:
        NumPy vector of shape (n_samples,) holding the mean prediction over
        all models.
    """
    # Flatten each model's output to 1-D so the predictions stack cleanly.
    flattened = [
        np.reshape(net.predict(X, batch_size=batch_size), -1)
        for net in models
    ]
    stacked = np.stack(flattened, axis=0)  # (n_models, n_samples)
    return stacked.mean(axis=0)            # (n_samples,)

def homology_meta_probs(embedding):
    """Score one embedding with the homology model ensemble.

    Args:
        embedding: np.array of shape (L, 1024).

    Returns:
        Probabilities vector of shape (L,), the soft-voting average over
        ``homology_models``.
    """
    return avg_probs(homology_models, embedding)

# Re-register tqdm's pandas integration with a label for this step.
tqdm.pandas(desc="Computing probabilities")
# Store one probability vector per row, computed from that row's embedding.
df["prob_vector_homology"] = df["embedding"].progress_map(homology_meta_probs)

# 3. Results

In [None]:
#@title Visualize results { display-mode: "form" }
#@markdown Visual comparison of raw prediction scores (shaded gray) and smoothed profile (black), with results table on the left and download options.

import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Configurable window size
# Passed as `w` to moving_average below; the slider starts at 1 and steps by 2,
# so only odd window sizes are offered.
window_size = 7  #@param {type:"slider", min:1, max:25, step:2}
#@markdown - Window size for moving average smoothing of the probability profile – Adjust this value to control the level of smoothing. Re-run the cell to apply and visualize the changes.

# Create smoothed vector
def moving_average(x, w=5):
    """Smooth a 1-D score profile with a centered moving average.

    Each output value is the mean of the samples that actually fall inside
    the window, so the window shrinks at both ends instead of being padded
    with zeros (which would pull the terminal values toward 0).

    Args:
        x: 1-D sequence or array of numeric scores.
        w: Window size in samples. Values below 1 are treated as 1 and values
            above ``len(x)`` are clamped to ``len(x)``.

    Returns:
        Float NumPy array with exactly ``len(x)`` elements.
    """
    values = np.asarray(x, dtype=float)
    n = values.size
    if n == 0:
        # np.convolve rejects empty input; there is nothing to smooth.
        return values.copy()
    # With mode='same', np.convolve returns max(len(x), w) samples, so an
    # oversized window would yield a profile longer than the sequence.
    width = max(1, min(int(w), n))
    kernel = np.ones(width)
    window_sums = np.convolve(values, kernel, mode='same')
    # Number of real samples under each window position (less than `width`
    # near the ends); dividing by it removes the zero-padding bias.
    window_counts = np.convolve(np.ones(n), kernel, mode='same')
    return window_sums / window_counts

# Select correct row based on uniprot_id
# (.iloc[0] takes the first match if several rows share the same ID)
row = df[df["uniprot_id"] == uniprot_id].iloc[0]
residues = list(row["sequence"])  # one single-character entry per residue
positions = np.arange(1, len(residues) + 1)  # 1-based residue numbering
prob_vector = np.array(row["prob_vector_homology"])  # raw scores from the prediction cell
smoothed = moving_average(prob_vector, w=window_size)  # used for plotting only

# Prepare result table
# One row per residue; the table holds the raw (unsmoothed) scores.
result_df = pd.DataFrame({
    "uniprot_id": [row["uniprot_id"]] * len(positions),
    "position": positions,
    "residue": residues,
    "aggrescanai_score": prob_vector
})

# Create standalone plot figure (no table)
fig_only = go.Figure()
# Trace 0: raw per-residue scores as a light-gray area filled down to zero;
# hover is disabled for this trace.
fig_only.add_trace(go.Scatter(
    x=positions,
    y=prob_vector,
    fill='tozeroy',
    mode='lines',
    line=dict(color='lightgray'),
    name='Raw Score',
    hoverinfo='skip'
))
# Trace 1: smoothed profile in black; the hover text shows position, residue
# letter and the smoothed value with three decimals.
fig_only.add_trace(go.Scatter(
    x=positions,
    y=smoothed,
    mode='lines+markers',
    name='Smoothed (Moving Average)',
    line=dict(color='black', width=2),
    marker=dict(color='black', size=4),
    text=residues,
    hovertemplate='Position: %{x}<br>Residue: %{text}<br>Smoothed Propensity: %{y:.3f}<extra></extra>'
))
# Trace 2: dashed red horizontal line at 0.3 spanning the first to the last
# residue position.
fig_only.add_trace(go.Scatter(
    x=[positions[0], positions[-1]],
    y=[0.3, 0.3],
    mode='lines',
    name='Threshold = 0.3',
    line=dict(color='red', dash='dash')
))
# Fixed-size layout with the legend boxed in the top-right corner of the plot.
fig_only.update_layout(
    title=f"Aggregation Propensity Profile ({row.uniprot_id})",
    xaxis_title='Residue Position',
    yaxis_title='Aggregation Propensity',
    hovermode='x unified',
    height=700,
    width=1400,
    template='simple_white',
    legend=dict(x=0.98, y=0.95, xanchor='right', yanchor='top', bgcolor='rgba(255,255,255,0.8)', bordercolor='lightgray', borderwidth=1)
)

# Show full combined figure with table (optional, not exported)
# Layout: one row, two columns -- table on the left (30% of the width),
# XY plot on the right (70%).
combined_fig = make_subplots(
    rows=1, cols=2,
    shared_yaxes=False,
    horizontal_spacing=0.1,
    column_widths=[0.3, 0.7],
    specs=[[{"type": "table"}, {"type": "xy"}]]
)
# Reuse the three traces built for fig_only (raw, smoothed, threshold) in the
# right-hand subplot.
combined_fig.add_trace(fig_only.data[0], row=1, col=2)
combined_fig.add_trace(fig_only.data[1], row=1, col=2)
combined_fig.add_trace(fig_only.data[2], row=1, col=2)
# Left-hand table: one column per result_df column; float cells are rendered
# with four decimals, all other cells are shown unchanged.
combined_fig.add_trace(go.Table(
    header=dict(values=list(result_df.columns), fill_color='rgba(0,0,0,0)', align='left'),
    cells=dict(values=[result_df[col].map(lambda x: f"{x:.4f}" if isinstance(x, float) else x) for col in result_df.columns], fill_color='rgba(0,0,0,0)', align='left')
), row=1, col=1)
combined_fig.update_layout(
    title=f"Aggregation Propensity Profile and Table ({row.uniprot_id})",
    hovermode='x unified',
    height=700,
    width=1500,
    template='simple_white',
    legend=dict(x=0.98, y=0.95, xanchor='right', yanchor='top', bgcolor='rgba(255,255,255,0.8)', bordercolor='lightgray', borderwidth=1)
)
combined_fig.show()

In [None]:
#@title Download results { display-mode: "form" }
#@markdown - Download the results as a CSV file and an HTML file for visualization.
from google.colab import files

def save_results_as_csv(df, filename=f"aggrescanai_results_{uniprot_id}.csv"):
    """Write the result table to a CSV file with scores shown to 4 decimals.

    The input frame is left untouched; formatting is applied to a copy.

    Args:
        df: Result table containing an ``aggrescanai_score`` column.
        filename: Output path. The default is built from the value of
            ``uniprot_id`` at the time this function is defined.
    """
    def _format_score(value):
        # Only float values are formatted; anything else is written unchanged.
        if isinstance(value, float):
            return f"{value:.4f}"
        return value

    export_df = df.copy()
    export_df['aggrescanai_score'] = export_df['aggrescanai_score'].map(_format_score)
    export_df.to_csv(filename, index=False)
    
    
# Export the standalone figure (the one without the table) as HTML and pass it
# to Colab's files.download.
fig_only.write_html(f"aggrescanai_results_{uniprot_id}.html")
files.download(f"aggrescanai_results_{uniprot_id}.html")
# Write the per-residue table using the function's default file name, which is
# the same path handed to files.download on the next line.
save_results_as_csv(result_df)
files.download(f"aggrescanai_results_{uniprot_id}.csv")

# start_time was recorded in the input form cell; report elapsed minutes.
end_time = time.time()
print(f"Total execution time: {(end_time - start_time)/60:.2f} minutes")