# Stanford RNA 3D Folding - Baseline Model

This notebook implements a starter pipeline for predicting RNA 3D structures.

## 1. Setup & Imports

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
_cuda_ready = torch.cuda.is_available()
DEVICE = torch.device('cuda') if _cuda_ready else torch.device('cpu')
print(f"Using device: {DEVICE}")

## 2. Configuration

In [None]:
# Central hyper-parameter / path settings for the notebook.
CONFIG = dict(
    data_dir='../input/stanford-rna-3d-folding-2',
    batch_size=32,
    epochs=10,
    learning_rate=1e-3,
    max_len=256,  # Placeholder max sequence length
    embedding_dim=128,
    hidden_dim=256,
)

## 3. Data Loading

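The dataset class below encodes each RNA sequence as a tensor of integer tokens. Parsing of the real coordinates is not implemented yet: in training mode it returns zero-filled placeholder targets with one (x, y, z) row per nucleotide.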

In [None]:
class RNADataset(Dataset):
    """Dataset that turns rows of a dataframe into integer-encoded sequences.

    Each row must expose a ``'sequence'`` entry. In ``'train'`` mode an item
    is a ``(tokens, targets)`` pair; in any other mode it is ``tokens`` only.
    The targets are currently zero-filled placeholders.
    """

    def __init__(self, df, mode='train'):
        """Keep the dataframe, the mode flag and the nucleotide vocabulary."""
        self.df = df
        self.mode = mode
        # Nucleotide -> integer id; characters outside this table map to 4.
        self.base2int = dict(zip('AGCU', range(4)))

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the encoded sequence at position ``idx`` (plus targets when training)."""
        # TODO: Implement actual parsing logic based on file format
        # This is a placeholder structure
        sequence = self.df.iloc[idx]['sequence']
        lookup = self.base2int
        token_ids = [lookup.get(base, 4) for base in sequence]
        seq_tensor = torch.tensor(token_ids, dtype=torch.long)

        # Non-training modes only need the model input.
        if self.mode != 'train':
            return seq_tensor

        # Placeholder targets: one all-zero (x, y, z) row per sequence character.
        # Real targets are assumed to be stored or linked in the dataframe.
        targets = torch.zeros((len(sequence), 3))
        return seq_tensor, targets

## 4. Model Architecture

A simple bidirectional-LSTM-based model that predicts (x, y, z) coordinates for every position in the input sequence.

In [None]:
class RNAModel(nn.Module):
    """Embedding -> bidirectional LSTM -> linear head producing 3 values per position."""

    def __init__(self, vocab_size=5, embed_dim=128, hidden_dim=256):
        """Build the layers.

        Args:
            vocab_size: Number of distinct token ids the embedding accepts.
            embed_dim: Size of each token embedding.
            hidden_dim: Hidden size of the LSTM in each direction.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        # Both directions are concatenated, hence 2 * hidden_dim input features.
        self.fc = nn.Linear(2 * hidden_dim, 3)  # Predict x, y, z

    def forward(self, x):
        """Map token ids ``[batch, seq_len]`` to coordinates ``[batch, seq_len, 3]``."""
        # The LSTM also returns its final (h, c) state, which is not needed here.
        features, _state = self.lstm(self.embedding(x))
        # features: [batch, seq_len, hidden*2]
        return self.fc(features)