# Install Prerequisites

## Install Python 3.10.12

In [None]:
# Download the Miniconda installer and run it non-interactively into /usr/local.
# wget -nc skips the download when the installer file is already present.
# Installer flags: -b = batch (no prompts), -f = do not fail if the prefix
# already exists, -p = install prefix.
# NOTE: every shell command in this cell redirects stdout and stderr to
# /dev/null, so a failed download or install produces no visible error.
!wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh  > /dev/null 2>&1
!bash Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local  > /dev/null 2>&1

# Make packages installed under the conda Python 3.10 importable from this
# kernel. This extends Python's module search path (sys.path); it does not
# change the shell PATH used by the `!` commands.
import sys
sys.path.append('/usr/local/lib/python3.10/site-packages')

# Install Python 3.10.12 and pip with conda (-q quiet, -y auto-confirm)
!conda install -q -y python=3.10.12 pip  > /dev/null 2>&1

# Install ipykernel for Python 3.10 so we can switch kernel
# NOTE(review): `!pip` is resolved through the shell PATH; presumably this is
# the pip installed under /usr/local by the step above — confirm.
!pip install ipykernel  > /dev/null 2>&1

# Restart the runtime so the new installation is picked up.
# The Javascript object is the cell's last expression, so IPython displays it;
# the script it carries calls google.colab.kernel.restart() in the Colab page.
print("Restarting runtime...")
from IPython.display import Javascript
Javascript('google.colab.kernel.restart()')

Restarting runtime...


<IPython.core.display.Javascript object>

## Install requirements

In [None]:
# Install the third-party packages used by the training cell: torch, pandas
# and scikit-learn are imported there directly.
# NOTE(review): fair-esm is presumably what esm_utils.get_esm_embedding relies
# on — confirm against the repository.
# Versions are unpinned, so each run installs whatever is current; the training
# cell calls OneHotEncoder(sparse_output=...), which needs scikit-learn >= 1.2.
!pip install torch pandas scikit-learn fair-esm

## Download training script from repository

In [None]:
!git clone https://github.com/bagusar2906/Protein-Crystallization-Modeling-Pipeline.git /content/training

# Train Model with Dataset

In [None]:
import os

# Work from the protein_cvae folder of the cloned repository: the training
# cell opens crystallization.csv by relative path, so it must run from here.
os.chdir('/content/training/protein_cvae')

# Echo the working directory to confirm the switch took effect
print(f"Current directory: {os.getcwd()}")

Current directory: /content/training/protein_cvae


In [None]:
import sys
sys.path.append('/content/training/protein_cvae')

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from cvae_model import CVAE
from esm_utils import get_esm_embedding
import numpy as np

# Fixed seed shared by the train/test shuffle and by PyTorch, so that a fresh
# "Restart & Run All" reproduces the same split and the same torch-side
# randomness (weight initialisation and any sampling the model does via torch).
SEED = 42
torch.manual_seed(SEED)

# Load dataset and keep only the rows that have every field used below
df = pd.read_csv("crystallization.csv")
df = df.dropna(subset=["sequence", "resolution_class", "details", "pH", "temperature"])
if df.empty:
    # Fail early with a clear message instead of an obscure encoder/stack error
    raise ValueError("crystallization.csv has no rows with all required fields")

# One-hot encode resolution class (used as the CVAE condition vector)
encoder = OneHotEncoder(sparse_output=False)
res_class_onehot = encoder.fit_transform(df[["resolution_class"]])

# TF-IDF vector (at most 50 terms) for the free-text condition details
tfidf = TfidfVectorizer(max_features=50)
condition_vec = tfidf.fit_transform(df["details"]).toarray()

# Embed protein sequences using ESM: one get_esm_embedding call per sequence,
# then stack the per-row results into a single array
df["embedding"] = df["sequence"].apply(get_esm_embedding)
X_embed = np.stack(df["embedding"].values)

# Combine inputs:
#   X = sequence embedding, C = one-hot resolution class,
#   Y = [pH, temperature, TF-IDF of details] (the reconstruction target)
X_input = X_embed
C_cond = res_class_onehot
Y_output = np.hstack([df[["pH", "temperature"]].values, condition_vec])

# Train/test split (80/20); random_state pins the shuffle for reproducibility
X_train, X_test, C_train, C_test, Y_train, Y_test = train_test_split(
    X_input, C_cond, Y_output, test_size=0.2, random_state=SEED
)

# Convert to tensors, ensuring they are float32
X_train, C_train, Y_train = map(lambda x: torch.tensor(x, dtype=torch.float32), (X_train, C_train, Y_train))
X_test, C_test, Y_test = map(lambda x: torch.tensor(x, dtype=torch.float32), (X_test, C_test, Y_test))

# Train CVAE. All three dimensions are read from the data, so a different ESM
# embedding width does not require editing this cell.
model = CVAE(input_dim=X_train.shape[-1], condition_dim=C_train.shape[1], output_dim=Y_train.shape[1])
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Full-batch training: every epoch is a single optimiser step on the whole
# training set.
model.train()
for epoch in range(10):
    optimizer.zero_grad()
    y_pred, mu, logvar = model(X_train, C_train)
    # loss_function returns a 3-tuple; only the first element is optimised here
    loss, _, _ = model.loss_function(y_pred, Y_train, mu, logvar)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}: Loss = {loss.item():.4f}")

# Evaluate on the held-out split, which was otherwise created but never used.
# NOTE(review): assumes the model returns the same (y_pred, mu, logvar) tuple
# in eval mode as in train mode — confirm against cvae_model.CVAE.
model.eval()
with torch.no_grad():
    y_test_pred, mu_test, logvar_test = model(X_test, C_test)
    test_loss, _, _ = model.loss_function(y_test_pred, Y_test, mu_test, logvar_test)
print(f"Held-out test loss = {test_loss.item():.4f}")
