In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# Function to parse PDB files and extract sequence information
def parse_pdb_file(pdb_path):
    """Return the sequence of the first SEQRES record in a PDB file.

    The file is read with Biopython's ``"pdb-seqres"`` parser and every
    record is materialised before the first one is picked.

    Args:
        pdb_path: Path of the PDB file to read.

    Returns:
        The first record's sequence as a ``str``, or ``None`` when the file
        yields no records or any error occurs while reading it (the error is
        printed, not raised).
    """
    first_chain = None
    try:
        chains = list(SeqIO.parse(pdb_path, "pdb-seqres"))
        if len(chains) > 0:
            first_chain = str(chains[0].seq)
    except Exception as e:
        # Best-effort: one unreadable file must not abort the whole run.
        print(f"Error parsing {pdb_path}: {e}")
    return first_chain

# Function to extract features from protein sequences
def extract_features(sequence):
    """Compute a fixed-length physico-chemical feature vector for a protein.

    The sequence is upper-cased and reduced to the 20 standard amino-acid
    letters before analysis. Previously only uppercase ``'X'`` was removed,
    so a single ambiguous or rare residue code (B, Z, J, U, O), a lowercase
    ``'x'``, a gap character or embedded whitespace made the analysis raise
    and the whole protein was discarded.

    Args:
        sequence: One-letter amino-acid sequence as a string.

    Returns:
        A list holding molecular weight, aromaticity, instability index and
        isoelectric point, followed by the values returned by
        ``ProteinAnalysis.secondary_structure_fraction()``; or ``None`` when
        nothing usable remains after cleaning or the analysis fails (the
        error is printed, not raised).
    """
    standard_residues = frozenset("ACDEFGHIKLMNPQRSTVWY")
    try:
        # Keep only residues every ProteinAnalysis method can handle.
        cleaned = "".join(
            residue for residue in sequence.upper() if residue in standard_residues
        )
        if not cleaned:
            raise ValueError("Sequence length is zero after cleaning.")

        analysis = ProteinAnalysis(cleaned)
        return [
            analysis.molecular_weight(),
            analysis.aromaticity(),
            analysis.instability_index(),
            analysis.isoelectric_point(),
            *analysis.secondary_structure_fraction(),
        ]
    except Exception as e:
        # Deliberately broad: callers treat None as "skip this protein", and
        # the analysis library raises several unrelated exception types.
        print(f"Error extracting features: {e}")
        return None

# Input PDB files and the (placeholder) class label assigned to each one.
pdb_dir = "./pdb_files/"
pdb_filenames = [
    "pdb10gs.ent", "pdb1a2b.ent", "pdb1b2c.ent", "pdb1c2d.ent", "pdb1d2e.ent", "pdb1e2f.ent",
    "pdb1f2g.ent", "pdb1g2h.ent", "pdb1h2i.ent", "pdb1k2l.ent", "pdb1l2m.ent", "pdb1m2n.ent",
    "pdb1n2o.ent", "pdb1o2p.ent", "pdb1p2q.ent", "pdb1q2r.ent", "pdb1r2s.ent", "pdb1s2t.ent",
    "pdb1t2u.ent", "pdb1tup.ent", "pdb1u2v.ent", "pdb1v2w.ent", "pdb1w2x.ent", "pdb1x2y.ent",
    "pdb1y2z.ent", "pdb2a2a.ent", "pdb2b2b.ent", "pdb2c2c.ent", "pdb2d2d.ent", "pdb2e2e.ent",
    "pdb2f2f.ent", "pdb2h2h.ent", "pdb2hbb.ent", "pdb2i2i.ent", "pdb2j2j.ent", "pdb2l2l.ent",
    "pdb2m2m.ent", "pdb2n2n.ent", "pdb2o2o.ent", "pdb3ptb.ent", "pdb4hhb.ent", "pdb5ldh.ent",
    "pdb6abp.ent", "pdb7tim.ent", "pdb8tim.ent", "pdb9ins.ent",
]

# Placeholder labels: the three class names are assigned round-robin, one per
# file, in file order.
label_cycle = ("FunctionA", "FunctionB", "FunctionC")
labels = [
    label_cycle[position % len(label_cycle)]
    for position in range(len(pdb_filenames))
]

# Build the feature rows and their labels in lockstep. A file that cannot be
# parsed, or whose features cannot be computed, is skipped so both lists stay
# aligned.
features = []
all_labels = []

for i, pdb_filename in enumerate(pdb_filenames):
    try:
        pdb_path = os.path.join(pdb_dir, pdb_filename)
        sequence = parse_pdb_file(pdb_path)
        if not sequence:
            continue
        protein_features = extract_features(sequence)
        if not protein_features:
            continue
        features.append(protein_features)
        all_labels.append(labels[i])
    except Exception as e:
        print(f"Error processing PDB file {pdb_filename}: {e}")

# Nothing downstream can run without at least one feature row.
if not features:
    raise ValueError("No valid protein sequences found. Please check your PDB files.")

# Encode the string labels as integer class ids.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(all_labels)

# Convert features to a numpy array.
features_np = np.array(features)

# Split BEFORE scaling, and split plain numpy arrays rather than (possibly
# CUDA) tensors. Fitting the scaler on every sample would leak validation
# statistics into training. The same test_size/random_state are used, so the
# same rows end up in each partition.
features_train_np, features_val_np, labels_train_np, labels_val_np = train_test_split(
    features_np, encoded_labels, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it everywhere.
scaler = StandardScaler()
scaler.fit(features_train_np)
features_standardized = scaler.transform(features_np)

# Check for GPU and use it if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Convert to PyTorch tensors. The full-dataset tensors are kept because later
# code reads them (input size, prediction demo).
features_tensor = torch.tensor(features_standardized, dtype=torch.float32).to(device)
labels_tensor = torch.tensor(encoded_labels, dtype=torch.long).to(device)

# Training and validation tensors, scaled with the train-fitted scaler.
features_train = torch.tensor(scaler.transform(features_train_np), dtype=torch.float32).to(device)
features_val = torch.tensor(scaler.transform(features_val_np), dtype=torch.float32).to(device)
labels_train = torch.tensor(labels_train_np, dtype=torch.long).to(device)
labels_val = torch.tensor(labels_val_np, dtype=torch.long).to(device)

# Define the neural network
class ProteinClassifier(nn.Module):
    """Fully connected classifier with two hidden ReLU layers.

    Layer widths are ``input_size -> 128 -> 64 -> num_classes``. The forward
    pass returns raw, unnormalised class scores (no softmax is applied).
    """

    def __init__(self, input_size, num_classes):
        """Create the layers.

        Args:
            input_size: Number of features per sample.
            num_classes: Number of output scores per sample.
        """
        super().__init__()
        # Creation order is kept so random initialisation under a fixed seed
        # stays reproducible.
        self.hidden1 = nn.Linear(input_size, 128)
        self.hidden2 = nn.Linear(128, 64)
        self.output = nn.Linear(64, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a batch of feature rows to one score per class."""
        first_activation = self.relu(self.hidden1(x))
        second_activation = self.relu(self.hidden2(first_activation))
        return self.output(second_activation)

# Size the network from the data: one input per feature column, one output
# per distinct encoded label.
input_size = features_tensor.shape[1]
num_classes = len(np.unique(encoded_labels))
model = ProteinClassifier(input_size, num_classes)
model = model.to(device)

# Cross-entropy on the raw class scores, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Full-batch training: each epoch is one gradient step over the whole
# training split.
num_epochs = 100
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    loss = criterion(model(features_train), labels_train)
    loss.backward()
    optimizer.step()

    # Report progress every tenth epoch.
    completed = epoch + 1
    if completed % 10 != 0:
        continue
    print(f'Epoch [{completed}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluation: score the held-out split with gradients disabled.
model.eval()
with torch.no_grad():
    val_outputs = model(features_val)
    # The predicted class is the index of the highest score in each row.
    predicted = val_outputs.argmax(dim=1)
    accuracy = (predicted == labels_val).sum().item() / labels_val.size(0)
    print(f'Validation Accuracy: {accuracy:.4f}')
    # Pass every class id explicitly. Otherwise the report infers the classes
    # from the validation labels and predictions, and raises when that set is
    # smaller than target_names (easy to hit with a small validation split).
    print(classification_report(
        labels_val.cpu().numpy(),
        predicted.cpu().numpy(),
        labels=np.arange(len(label_encoder.classes_)),
        target_names=label_encoder.classes_,
    ))

# Prediction demo: the first two already-standardised rows stand in for
# unseen samples.
new_data = features_tensor[:2]  # Simulate new data
model.eval()
with torch.no_grad():
    predictions = model(new_data)
    predicted_classes = predictions.argmax(dim=1)

# Show the raw scores, then map the winning indices back to label names.
raw_scores = predictions.cpu().numpy()
class_names = label_encoder.inverse_transform(predicted_classes.cpu().numpy())
print(f"Predictions: {raw_scores}")
print(f"Predicted Classes: {class_names}")
