# Stanford RNA 3D Folding - Baseline Model

## 1. Environment & Data Inspection

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

# Location of the competition files when running inside a Kaggle kernel.
DATA_DIR = '../input/stanford-rna-3d-folding-2'
print(f"Files in {DATA_DIR}:")
try:
    data_files = os.listdir(DATA_DIR)
except FileNotFoundError:
    # Keep the notebook runnable outside Kaggle: report the problem and
    # continue with an empty listing instead of crashing in the loop below.
    data_files = []
    print("Data directory not found. Are you running on Kaggle?")
else:
    print(data_files)

# Check for parquet or csv
for f in data_files:
    if f.endswith(('.csv', '.parquet')):
        print(f"Found data file: {f}")

## 2. Config

In [None]:
# Hyper-parameters and runtime settings shared by the cells below.
CONFIG = {
    # Directory holding the competition CSVs; the training cell reads this
    # key. Keep it in sync with DATA_DIR from the inspection cell.
    'data_dir': '../input/stanford-rna-3d-folding-2',
    'batch_size': 16,
    'epochs': 5,
    'lr': 1e-4,
    'max_len': 128,      # sequences are truncated/padded to this length
    'embed_dim': 64,     # must be divisible by 'nhead'
    'nhead': 4,
    'num_layers': 2,
    'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
}
print(f"Running on {CONFIG['device']}")

## 3. Dataset (Draft)
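The dataset looks for `train_sequences.csv`, `train.csv`, or (in test mode) `test_sequences.csv` in the data directory. If none of them is found, it falls back to mock data so that the rest of the pipeline can still run end to end.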

In [None]:
class RNADataset(Dataset):
    """RNA sequences tokenised and padded/truncated to a fixed length.

    The constructor looks for a CSV file inside ``data_path``. In ``'test'``
    mode ``test_sequences.csv`` is preferred, with the training files as a
    fallback; in any other mode only ``train_sequences.csv`` and
    ``train.csv`` are considered. If no candidate file exists, a small mock
    frame is used so the pipeline can still run. If loading raises, the
    error is printed and a single-row frame is used instead.

    Args:
        data_path: Directory that is searched for the CSV files.
        max_len: Length every sequence is truncated or padded to.
        mode: ``'train'`` (default) or ``'test'``; selects the file lookup
            order.

    Each item is a dict with two ``torch.long`` tensors of length
    ``max_len``: ``'input_ids'`` (A/C/G/U mapped to 0..3) and ``'mask'``
    (1 for real positions, 0 for padding). Unknown characters and padding
    both use id 0, the same id as ``'A'``; only the mask tells padding apart.
    """

    # Candidate file names in lookup order.
    _TRAIN_FILES = ('train_sequences.csv', 'train.csv')
    _TEST_FILES = ('test_sequences.csv',)

    def __init__(self, data_path, max_len=128, mode='train'):
        self.mode = mode
        self.max_len = max_len
        self.base_map = {c: i for i, c in enumerate('ACGU')}
        self.df = self._load_frame(data_path, mode)

    @classmethod
    def _load_frame(cls, data_path, mode):
        """Return the data frame for ``mode``, or a fallback frame."""
        candidates = cls._TRAIN_FILES
        if mode == 'test':
            # Test mode must see the test file first; otherwise an existing
            # training file would shadow it.
            candidates = cls._TEST_FILES + cls._TRAIN_FILES
        try:
            for name in candidates:
                path = os.path.join(data_path, name)
                if os.path.exists(path):
                    return pd.read_csv(path)
        except Exception as e:
            # Deliberate best-effort for this draft: report the problem and
            # keep the notebook running with a minimal frame.
            print(f"Error loading data: {e}")
            return pd.DataFrame({'sequence': ['A']})

        # Mock data if file not found (for testing pipeline)
        print("Warning: Data file not found. Using Mock Data.")
        return pd.DataFrame({
            'sequence': ['ACGU' * 10] * 100,
            'target': [np.random.rand(40, 3).tolist()] * 100
        })

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Truncate to max_len, then map each base to its integer id.
        seq_str = self.df.iloc[idx]['sequence'][:self.max_len]
        seq_ids = [self.base_map.get(c, 0) for c in seq_str]

        # Right-pad to max_len; the mask marks which positions are real.
        pad_len = self.max_len - len(seq_ids)
        mask = [1] * len(seq_ids) + [0] * pad_len
        seq_ids = seq_ids + [0] * pad_len

        return {
            'input_ids': torch.tensor(seq_ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long)
        }

## 4. Model (Transformer)

In [None]:
class RNATransformer(nn.Module):
    def __init__(self, vocab_size=5, embed_dim=64, nhead=4, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=nhead, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc_coords = nn.Linear(embed_dim, 3) # x, y, z
        
    def forward(self, x, mask=None):
        # x: [B, L]
        x = self.embedding(x)
        # mask logic can be added here (src_key_padding_mask)
        out = self.transformer(x)
        coords = self.fc_coords(out)
        return coords

# Build the network from the architecture settings in CONFIG, move it to the
# configured device, and print its layer summary.
model = RNATransformer(
    **{key: CONFIG[key] for key in ('embed_dim', 'nhead', 'num_layers')}
)
model = model.to(CONFIG['device'])
print(model)

## 5. Training Loop (Dummy Run)

In [None]:
# Initialize Dataset
# Use CONFIG['data_dir'] when it is set; otherwise fall back to the DATA_DIR
# constant from the inspection cell so this cell never fails on a missing key.
dataset = RNADataset(CONFIG.get('data_dir', DATA_DIR), max_len=CONFIG['max_len'])
dataloader = DataLoader(dataset, batch_size=CONFIG['batch_size'], shuffle=True)

optimizer = torch.optim.Adam(model.parameters(), lr=CONFIG['lr'])
criterion = nn.MSELoss()

print("Starting training loop...")
for epoch in range(CONFIG['epochs']):
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in tqdm(dataloader):
        input_ids = batch['input_ids'].to(CONFIG['device'])
        mask = batch['mask'].to(CONFIG['device'])
        valid = mask.bool()  # True at real (non-padded) positions
        if not valid.any():
            # Nothing to learn from a batch made only of padding; skipping
            # also avoids a NaN loss from averaging over zero elements.
            continue

        # Dummy Targets: random values shaped like the predictions, only to
        # exercise the pipeline until real coordinates are wired in.
        targets = torch.randn(*input_ids.shape, 3, device=CONFIG['device'])

        optimizer.zero_grad()
        preds = model(input_ids, mask)
        # Score real positions only so padding does not dilute the loss.
        loss = criterion(preds[valid], targets[valid])
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    # max(..., 1) keeps the report well-defined when no batch was processed.
    print(f"Epoch {epoch+1}/{CONFIG['epochs']}, Loss: {total_loss/max(num_batches, 1):.4f}")