# ProstT5 Model Demo

This notebook demonstrates how to use the ProstT5 model for protein sequence and structure analysis in Google Colab.

## Setup

First, let's clone the repository and install the required packages (replace `yourusername` in the clone URL with the actual repository owner):

In [None]:
# Clone the repository
!git clone https://github.com/yourusername/llm_research_tool.git
%cd llm_research_tool

# Install requirements
!pip install -r requirements.txt

## Import the Model

Now let's load the Hugging Face API token and import the ProstT5 model:

In [None]:
import os
from getpass import getpass

# Input your Hugging Face API token
# getpass() does not echo what is typed, so the secret is not captured in the
# notebook's saved cell output the way it would be with input().
# strip() drops stray whitespace/newlines that often come along with a paste.
HUGGINGFACE_API_TOKEN = getpass("Enter your Hugging Face API token: ").strip()

# Expose the token to the rest of the process through the environment.
os.environ["HUGGINGFACE_API_TOKEN"] = HUGGINGFACE_API_TOKEN

In [None]:
import sys

# Put the current directory on the module search path so that the
# `models` package can be imported from here.
sys.path.append('.')
from models.prost5 import ProstT5

# Build the model once; the later example cells all reuse this instance.
model = ProstT5()
print(f"Model initialized on device: {model.device}")

## Example 1: Sequence to Structure Translation

Let's convert some amino acid sequences to their corresponding structures:

In [None]:
# Two demo amino-acid sequences of different lengths
sequences = [
    "MLLAVLYCLAVFALSLPGK",  # short
    "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG",  # longer
]

# Translate the whole batch with a single call
structures = model.sequence_to_structure(sequences)

# Show every input next to the structure string returned for it
for aa_sequence, predicted_structure in zip(sequences, structures):
    print(f"\nSequence: {aa_sequence}")
    print(f"Structure: {predicted_structure}")

## Example 2: Structure to Sequence Translation

Now let's convert some structure strings back to amino acid sequences:

In [None]:
# Three homogeneous 20-character structure strings, labelled as in the demo.
# NOTE(review): the upstream ProstT5 model works on Foldseek 3Di tokens, where
# h/e/c are not DSSP helix/strand/coil labels - confirm how models.prost5
# interprets these strings.
structures = [
    "h" * 20,  # labelled "alpha helix"
    "e" * 20,  # labelled "beta sheet"
    "c" * 20,  # labelled "coil"
]

# Translate the whole batch with a single call
sequences = model.structure_to_sequence(structures)

# Show every input structure next to the sequence returned for it
for structure_string, generated_sequence in zip(structures, sequences):
    print(f"\nStructure: {structure_string}")
    print(f"Sequence: {generated_sequence}")

## Example 3: Roundtrip Translation

Let's test the roundtrip translation (sequence → structure → sequence) and calculate accuracy:

In [None]:
def calculate_sequence_accuracy(original, predicted):
    """Score a predicted string against the original, position by position.

    Args:
        original: Reference string (a sequence or a structure string).
        predicted: String to compare against ``original``.

    Returns:
        dict with three keys:
            'exact_match': 1.0 if both strings are identical, otherwise 0.0.
            'per_residue_accuracy': fraction of positions at which the two
                strings carry the same character. Reported as 0.0 when the
                lengths differ (no alignment is attempted) and as 1.0 when
                both strings are empty.
            'length_mismatch': True when the lengths differ, otherwise False.
    """
    if len(original) != len(predicted):
        return {
            'exact_match': 0.0,
            'per_residue_accuracy': 0.0,
            'length_mismatch': True
        }

    # Both inputs are empty: they are identical, and dividing by the length
    # below would raise ZeroDivisionError.
    if len(original) == 0:
        return {
            'exact_match': 1.0,
            'per_residue_accuracy': 1.0,
            'length_mismatch': False
        }

    exact_match = original == predicted
    correct_residues = sum(1 for a, b in zip(original, predicted) if a == b)
    per_residue_accuracy = correct_residues / len(original)

    return {
        'exact_match': float(exact_match),
        'per_residue_accuracy': per_residue_accuracy,
        'length_mismatch': False
    }

# Sequences to send through the roundtrip
test_sequences = [
    "MLLAVLYCLAVFALSLPGK",
    "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG",
]

# sequence -> structure -> sequence, scoring the recovered sequence against
# the one we started from
results = []
for seq in test_sequences:
    structure = model.sequence_to_structure([seq])[0]
    back_translated = model.structure_to_sequence([structure])[0]
    results.append({
        'original_sequence': seq,
        'intermediate_structure': structure,
        'back_translated_sequence': back_translated,
        'accuracy': calculate_sequence_accuracy(seq, back_translated),
    })

# Report each roundtrip
for entry in results:
    scores = entry['accuracy']
    print(f"\nOriginal sequence: {entry['original_sequence']}")
    print(f"Intermediate structure: {entry['intermediate_structure']}")
    print(f"Back-translated sequence: {entry['back_translated_sequence']}")
    print(f"Exact match: {scores['exact_match']}")
    print(f"Per-residue accuracy: {scores['per_residue_accuracy']:.2%}")

## Example 4: Reverse Roundtrip Translation

Let's test the reverse roundtrip (structure → sequence → structure) and calculate accuracy:

In [None]:
# Structure strings to send through the reverse roundtrip.
# NOTE(review): labels are the demo's own; confirm that models.prost5 treats
# h/e/c as helix/sheet/coil rather than as 3Di tokens.
test_structures = [
    "h" * 20,  # labelled "alpha helix"
    "e" * 20,  # labelled "beta sheet"
    "c" * 20,  # labelled "coil"
]

# structure -> sequence -> structure, scoring the recovered structure against
# the one we started from
results = []
for struct in test_structures:
    sequence = model.structure_to_sequence([struct])[0]
    back_translated = model.sequence_to_structure([sequence])[0]
    results.append({
        'original_structure': struct,
        'intermediate_sequence': sequence,
        'back_translated_structure': back_translated,
        'accuracy': calculate_sequence_accuracy(struct, back_translated),
    })

# Report each roundtrip
for entry in results:
    scores = entry['accuracy']
    print(f"\nOriginal structure: {entry['original_structure']}")
    print(f"Intermediate sequence: {entry['intermediate_sequence']}")
    print(f"Back-translated structure: {entry['back_translated_structure']}")
    print(f"Exact match: {scores['exact_match']}")
    print(f"Per-residue accuracy: {scores['per_residue_accuracy']:.2%}")

## Example 5: Working with FASTA Files

Let's demonstrate how to work with FASTA files. Before running the cell below, set `fasta_path` to a FASTA file of your own:

In [None]:
from Bio import SeqIO

def read_fasta_sequences(fasta_path):
    """Return the sequence of every record in a FASTA file as a list of strings.

    Args:
        fasta_path: Location of the FASTA file, handed to ``SeqIO.parse``.

    Returns:
        One ``str`` per parsed record, in the order the parser yields them.
    """
    return [str(entry.seq) for entry in SeqIO.parse(fasta_path, "fasta")]

# Example FASTA file path - replace with the location of your own file
fasta_path = "path/to/your/sequences.fasta"

# Read sequences.
# The path above is a placeholder, so a missing file is reported instead of
# raising and aborting a "Run All" of the notebook.
try:
    sequences = read_fasta_sequences(fasta_path)
except FileNotFoundError:
    sequences = []
    print(f"FASTA file not found: {fasta_path} (set fasta_path to an existing file)")
else:
    print(f"Read {len(sequences)} sequences from {fasta_path}")

# Process sequences: print the length and a short preview of each one
for i, seq in enumerate(sequences, 1):
    print(f"\nSequence {i}:")
    print(f"Length: {len(seq)}")
    print(f"First 20 residues: {seq[:20]}")