# Stanford RNA 3D Folding - Baseline Model v3

## 1. Imports & Config

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

# Global hyper-parameters and paths shared by the dataset, model and training
# loop. The device falls back to the CPU when CUDA is not available.
CONFIG = dict(
    data_dir='../input/stanford-rna-3d-folding-2',
    max_len=256,
    batch_size=16,
    epochs=10,
    lr=1e-3,
    device=(
        torch.device('cuda')
        if torch.cuda.is_available()
        else torch.device('cpu')
    ),
)
print(f"Running on {CONFIG['device']}")

## 2. Dataset Implementation

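We need to join `train_sequences.csv` (inputs) with `train_labels.csv` (targets). The label `ID` column appears to have the form `{target_id}_{resid}`, so we strip the residue suffix to recover `target_id` and group the coordinates per target.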

In [None]:
class RNADataset(Dataset):
    """RNA sequences paired (in train mode) with per-residue 3D coordinates.

    Every item has a fixed length of ``max_len`` positions:

    * ``input_ids`` -- ``torch.long`` tensor of shape ``(max_len,)``. Bases are
      encoded ``A=0, C=1, G=2, U=3``; any other character and the padding
      positions are encoded as ``4``.
    * ``target`` (train mode only) -- ``torch.float32`` tensor of shape
      ``(max_len, 3)`` built from the ``x_1, y_1, z_1`` label columns. It is
      zero wherever no usable label exists.
    * ``mask`` -- ``torch.bool`` tensor of shape ``(max_len,)``. In train mode
      it is True only for positions that lie inside the (truncated) sequence
      AND have a finite label, so a masked loss never sees padding, missing
      label rows or NaN coordinates. In any other mode it is True for every
      real (non-padded) residue.

    Labels are matched to residues by position: the i-th label row of a
    target (in file order) is assumed to belong to the i-th base of its
    sequence -- TODO confirm against the data description.

    NOTE: the training loop in this file also hands ``mask`` to the model as
    its attention padding mask, so residues without a usable label are not
    attended to either.

    Args:
        data_path: Directory containing the CSV files.
        max_len: Fixed length every example is truncated / padded to.
        mode: ``'train'`` loads labels and yields
            ``(input_ids, target, mask)``; any other value yields
            ``(input_ids, mask)``.
        sequences_file: Name of the sequence CSV inside ``data_path``; must
            provide ``target_id`` and ``sequence`` columns.
        labels_file: Name of the label CSV inside ``data_path``; must provide
            ``ID``, ``x_1``, ``y_1`` and ``z_1`` columns. Only read in train
            mode.
    """

    # Unknown bases and padding deliberately share one id so that the model's
    # default vocabulary size of 5 keeps working.
    UNKNOWN_ID = 4
    PAD_ID = 4
    COORD_COLUMNS = ('x_1', 'y_1', 'z_1')

    def __init__(self, data_path, max_len=256, mode='train',
                 sequences_file='train_sequences.csv',
                 labels_file='train_labels.csv'):
        self.mode = mode
        self.max_len = max_len
        self.base2int = {c: i for i, c in enumerate('ACGU')}

        print("Loading dataframes...")
        self.seq_df = pd.read_csv(os.path.join(data_path, sequences_file))

        if mode == 'train':
            lbl_df = pd.read_csv(os.path.join(data_path, labels_file))

            # IDs look like '157D_1': everything before the last underscore
            # is the target id, the rest is the residue number.
            lbl_df['target_id'] = lbl_df['ID'].str.rsplit('_', n=1).str[0]

            print("grouping labels...")
            coord_cols = list(self.COORD_COLUMNS)
            # sort=False only affects the order of the groups; rows inside a
            # group always keep their file order, which is what the
            # positional alignment with the sequence relies on.
            self.coords_map = {
                target_id: group[coord_cols].to_numpy(dtype=np.float32)
                for target_id, group in lbl_df.groupby('target_id', sort=False)
            }

            # Keep only sequences with at least one usable label inside the
            # truncated window. A sample with an all-False mask would divide
            # by zero in a masked loss and produce NaN attention rows.
            keep = np.fromiter(
                (
                    self._has_usable_label(target_id, sequence)
                    for target_id, sequence in zip(
                        self.seq_df['target_id'], self.seq_df['sequence']
                    )
                ),
                dtype=bool,
                count=len(self.seq_df),
            )
            self.seq_df = self.seq_df[keep].reset_index(drop=True)

    def _has_usable_label(self, target_id, sequence):
        """Return True if the target has a finite label within the window."""
        coords = self.coords_map.get(target_id)
        if coords is None or not isinstance(sequence, str):
            return False
        window = min(len(sequence), self.max_len)
        return bool(np.isfinite(coords[:window]).all(axis=1).any())

    def __len__(self):
        return len(self.seq_df)

    def __getitem__(self, idx):
        row = self.seq_df.iloc[idx]
        target_id = row['target_id']

        # Truncate first so the encoding work is bounded by max_len.
        seq_str = row['sequence'][:self.max_len]
        seq_ids = [self.base2int.get(c, self.UNKNOWN_ID) for c in seq_str]
        seq_len = len(seq_ids)
        pad_len = self.max_len - seq_len

        input_ids = torch.tensor(
            seq_ids + [self.PAD_ID] * pad_len, dtype=torch.long
        )

        if self.mode != 'train':
            mask = torch.zeros(self.max_len, dtype=torch.bool)
            mask[:seq_len] = True
            return input_ids, mask

        # Label rows beyond the (truncated) sequence cannot belong to any
        # input position, so they are dropped.
        coords = np.asarray(self.coords_map[target_id], dtype=np.float32)
        coords = coords[:seq_len]
        n_labels = len(coords)

        # A residue is usable only if all three of its coordinates are
        # finite. Unusable rows are zeroed, not just masked, because
        # NaN * 0 is still NaN and would poison a masked loss.
        valid = np.isfinite(coords).all(axis=1)
        clean = np.where(valid[:, None], coords, np.float32(0.0))
        clean = clean.astype(np.float32)

        target_tensor = torch.zeros((self.max_len, 3), dtype=torch.float32)
        target_tensor[:n_labels] = torch.from_numpy(clean)

        mask = torch.zeros(self.max_len, dtype=torch.bool)
        mask[:n_labels] = torch.from_numpy(valid)

        return input_ids, target_tensor, mask

## 3. Model Architecture

In [None]:
class RNAModel(nn.Module):
    """Transformer encoder that regresses one (x, y, z) triple per residue.

    Token embeddings are summed with a fixed sinusoidal positional encoding
    before entering the encoder. Without it the encoder is
    permutation-equivariant: two identical bases in one sequence would get
    exactly the same predicted coordinates wherever they sit.

    The positional encoding has no parameters and is computed on the fly, so
    the ``state_dict`` layout is the same as a model without it.

    Args:
        vocab_size: Number of distinct token ids (4 bases + 1 unknown/pad).
        embed_dim: Width of the embeddings and of the encoder.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self, vocab_size=5, embed_dim=128, nhead=4, num_layers=4):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=nhead, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embed_dim, 3)

    @staticmethod
    def _positional_encoding(length, dim, device=None, dtype=torch.float32):
        """Return the ``[length, dim]`` sinusoidal position table."""
        position = torch.arange(length, device=device, dtype=torch.float32).unsqueeze(1)
        exponent = torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim
        inv_freq = 1.0 / (10000.0 ** exponent)
        angles = position * inv_freq  # [length, ceil(dim / 2)]

        encoding = torch.zeros(length, dim, device=device, dtype=torch.float32)
        encoding[:, 0::2] = torch.sin(angles)
        # For an odd dim there is one fewer cosine column than sine columns.
        encoding[:, 1::2] = torch.cos(angles[:, : dim // 2])
        return encoding.to(dtype)

    def forward(self, x, mask=None):
        """Predict coordinates for every position.

        Args:
            x: Token ids of shape ``[B, L]``.
            mask: Optional ``[B, L]`` tensor, truthy for positions to keep and
                falsy for positions to ignore as attention keys.

        Returns:
            Tensor of shape ``[B, L, 3]``.
        """
        x_embed = self.embedding(x)
        x_embed = x_embed + self._positional_encoding(
            x.size(1), x_embed.size(-1),
            device=x_embed.device, dtype=x_embed.dtype,
        )

        # The transformer expects src_key_padding_mask where True means
        # IGNORE, the opposite of our mask. Cast to bool first: `~` on an
        # integer 0/1 mask would be a bitwise NOT, not a logical one.
        padding_mask = None
        if mask is not None:
            padding_mask = mask.to(torch.bool).logical_not()

        out = self.transformer(x_embed, src_key_padding_mask=padding_mask)
        coords = self.fc(out)
        return coords

## 4. Training Loop

In [None]:
def _masked_mse(criterion, preds, targets, mask):
    """Mean squared error over the positions that carry a usable label.

    A position counts only if ``mask`` is truthy there AND all of its target
    coordinates are finite. Excluded positions are removed with
    ``torch.where``, not by multiplying with the mask, because NaN * 0 is
    still NaN.

    Args:
        criterion: Element-wise loss (``reduction='none'``).
        preds: Predictions of shape ``[B, L, 3]``.
        targets: Targets of shape ``[B, L, 3]``; may contain NaN / inf.
        mask: ``[B, L]`` tensor, truthy for positions to score.

    Returns:
        Scalar loss tensor; zero when no position is usable.
    """
    valid = mask.to(torch.bool) & torch.isfinite(targets).all(dim=-1)
    safe_targets = torch.where(
        valid.unsqueeze(-1), targets, torch.zeros_like(targets)
    )
    per_residue = criterion(preds, safe_targets).mean(dim=-1)
    per_residue = torch.where(
        valid, per_residue, torch.zeros_like(per_residue)
    )
    # clamp avoids 0 / 0 when a batch has no usable position at all.
    return per_residue.sum() / valid.sum().clamp(min=1)


def train():
    """Train ``RNAModel`` on ``RNADataset`` and save the weights.

    Reads every hyper-parameter from the module-level ``CONFIG`` and writes
    the final ``state_dict`` to ``model.pth`` in the working directory.

    Raises:
        ValueError: If the dataset contains no training example.
    """
    dataset = RNADataset(CONFIG['data_dir'], max_len=CONFIG['max_len'])
    if len(dataset) == 0:
        # Fail here with a clear message instead of an obscure sampler error.
        raise ValueError(
            f"No training examples found in {CONFIG['data_dir']!r}"
        )
    dataloader = DataLoader(dataset, batch_size=CONFIG['batch_size'], shuffle=True)

    model = RNAModel().to(CONFIG['device'])
    optimizer = torch.optim.Adam(model.parameters(), lr=CONFIG['lr'])
    criterion = nn.MSELoss(reduction='none')  # reduced manually under the mask

    print("Starting training...")
    for epoch in range(CONFIG['epochs']):
        model.train()
        total_loss = 0.0
        count = 0
        skipped = 0

        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}")
        for input_ids, targets, mask in progress_bar:
            input_ids = input_ids.to(CONFIG['device'])
            targets = targets.to(CONFIG['device'])
            mask = mask.to(CONFIG['device'])

            optimizer.zero_grad()
            preds = model(input_ids, mask)

            loss = _masked_mse(criterion, preds, targets, mask)

            # A non-finite loss (e.g. NaN predictions) would corrupt every
            # weight through the optimizer step; skip the batch but count it
            # so the problem stays visible.
            if not torch.isfinite(loss):
                skipped += 1
                continue

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            count += 1
            progress_bar.set_postfix({'loss': total_loss/count})

        avg_loss = total_loss / max(count, 1)
        print(f"Epoch {epoch+1} finished. Avg Loss: {avg_loss:.4f}")
        if skipped:
            print(f"Epoch {epoch+1}: skipped {skipped} batch(es) with non-finite loss")

    torch.save(model.state_dict(), 'model.pth')
    print("Training complete.")

# Script / notebook entry point: run the full training when executed directly.
if __name__ == "__main__":
    train()