# Stanford RNA 3D Folding - Inference & Submission

**This notebook generates predictions for the test set and creates submission.csv**

Requirements:
- Must output 5 structures per target
- Format: ID, resname, resid, x_1, y_1, z_1, ..., x_5, y_5, z_5
- Coordinates clipped to [-999.999, 9999.999]
- Runtime must be < 8 hours
- No internet access during inference

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import warnings
# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Run on the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

# Directory holding test_sequences.csv and sample_submission.csv.
DATA_DIR = '../input/stanford-rna-3d-folding-2'
MODEL_PATH = '../input/your-model-path/model.pth'  # Update with your trained model path
# Fixed input length: sequences are truncated/padded to this many residues and
# the model's positional encoding table is built with this many positions.
MAX_LEN = 512
# Number of sequences per DataLoader batch during inference.
BATCH_SIZE = 16
# Number of prediction heads in the model and of x/y/z column triplets per
# submission row.
NUM_PREDICTIONS = 5

## 1. Model Architecture (Must match training)

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    A table of shape ``(1, max_len, d_model)`` is precomputed once and stored
    as the buffer ``pe``. Even feature columns hold sines and odd feature
    columns hold cosines of the position scaled by a geometric frequency
    progression.

    Args:
        d_model: Size of the feature dimension. Both even and odd sizes are
            supported.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer cosine column than sine
        # columns, so trim the angles to fit; for an even d_model the slice
        # is a no-op and the table is unchanged.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        # Registered as a buffer so it follows the module across devices and
        # is stored in the state dict under the key 'pe'.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Assumes ``x`` is laid out as (batch, seq_len, d_model) with
        ``seq_len <= max_len``; longer inputs fail on the broadcasted add.
        """
        return x + self.pe[:, :x.size(1)]


class RNAStructurePredictor(nn.Module):
    """Transformer encoder with several independent 3D-coordinate heads.

    Token ids are embedded, given sinusoidal positional encodings, passed
    through a pre-norm Transformer encoder, and then fed to
    ``num_predictions`` separate MLP heads that each emit an (x, y, z)
    triplet per position.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Model width used throughout the encoder.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        num_predictions: Number of independent coordinate heads.
        dropout: Dropout probability in the encoder layers and heads.
        max_len: Number of positions in the positional encoding table.
    """

    def __init__(self, vocab_size=5, embed_dim=256, nhead=8, num_layers=6,
                 num_predictions=5, dropout=0.1, max_len=512):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_predictions = num_predictions

        # Token id 4 is the padding index, so its embedding receives no gradient.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=4)
        self.pos_encoder = PositionalEncoding(embed_dim, max_len=max_len)

        layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=nhead,
            dim_feedforward=embed_dim * 4,
            dropout=dropout,
            batch_first=True,
            norm_first=True
        )
        self.transformer = nn.TransformerEncoder(layer, num_layers=num_layers)

        self.prediction_heads = nn.ModuleList(
            [self._make_head(embed_dim, dropout) for _ in range(num_predictions)]
        )

    @staticmethod
    def _make_head(embed_dim, dropout):
        """Build one MLP head mapping ``embed_dim`` features to 3 coordinates."""
        hidden_dim = embed_dim // 2
        return nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 3)
        )

    def forward(self, x, mask=None):
        """Predict coordinates for every position of every sequence.

        Args:
            x: Integer token ids; assumed to be (batch, seq_len).
            mask: Optional boolean tensor that is True at valid positions.
                It is inverted to form the encoder's key-padding mask.

        Returns:
            Tensor with the head outputs stacked on a new last axis, i.e.
            (batch, seq_len, 3, num_predictions).
        """
        hidden = self.pos_encoder(self.embedding(x))

        # The encoder expects True at positions to ignore, the opposite of `mask`.
        key_padding_mask = None if mask is None else ~mask
        encoded = self.transformer(hidden, src_key_padding_mask=key_padding_mask)

        per_head = [head(encoded) for head in self.prediction_heads]
        return torch.stack(per_head, dim=3)

## 2. Test Dataset

In [None]:
class TestRNADataset(Dataset):
    """Dataset over ``test_sequences.csv`` yielding fixed-length encoded sequences.

    Each item is a tuple ``(input_ids, mask, target_id, orig_len, seq_str)``:
    ``input_ids`` is a long tensor of length ``max_len`` (truncated or padded
    with id 4), ``mask`` is a bool tensor that is True on the first
    ``orig_len`` positions, ``orig_len`` is the number of encoded residues
    kept, and ``seq_str`` is the untruncated sequence string.
    """

    def __init__(self, data_path, max_len=512):
        self.max_len = max_len
        # Characters outside this table are encoded as 4, the same id used
        # for 'N' and for padding.
        self.base2int = {'A': 0, 'C': 1, 'G': 2, 'U': 3, 'N': 4}

        # Load test sequences
        csv_path = os.path.join(data_path, 'test_sequences.csv')
        self.seq_df = pd.read_csv(csv_path)
        print(f"Loaded {len(self.seq_df)} test sequences")

    def __len__(self):
        return len(self.seq_df)

    def __getitem__(self, idx):
        record = self.seq_df.iloc[idx]
        target_id, seq_str = record['target_id'], record['sequence']

        # Encode, keeping at most max_len residues.
        encoded = [self.base2int.get(base, 4) for base in seq_str][:self.max_len]
        orig_len = len(encoded)

        # Start from an all-padding vector and overwrite the valid prefix.
        input_ids = torch.full((self.max_len,), 4, dtype=torch.long)
        input_ids[:orig_len] = torch.tensor(encoded, dtype=torch.long)

        # True on valid positions, False on padding.
        mask = torch.arange(self.max_len) < orig_len

        return input_ids, mask, target_id, orig_len, seq_str

## 3. Load Model and Generate Predictions

In [None]:
# Load model
print("Loading model...")
# NOTE(review): torch.load unpickles the file, so MODEL_PATH must point to a
# checkpoint you trust.
checkpoint = torch.load(MODEL_PATH, map_location=device)
# Hyperparameters saved alongside the weights; defaults below are used for
# any that are missing.
config = checkpoint.get('config', {})

model = RNAStructurePredictor(
    vocab_size=5,
    embed_dim=config.get('embed_dim', 256),
    nhead=config.get('nhead', 8),
    num_layers=config.get('num_layers', 6),
    num_predictions=NUM_PREDICTIONS,
    dropout=config.get('dropout', 0.1),
    max_len=MAX_LEN
).to(device)

# Accept both a training checkpoint ({'model_state_dict': ..., 'config': ...})
# and a bare state_dict saved with torch.save(model.state_dict(), path).
state_dict = checkpoint.get('model_state_dict', checkpoint)
model.load_state_dict(state_dict)
# Inference mode: disables dropout.
model.eval()
print("Model loaded successfully!")

In [None]:
# Create test dataset
test_dataset = TestRNADataset(DATA_DIR, max_len=MAX_LEN)

# Keep the file order (no shuffling) so predictions line up with the test set.
# Pinned host memory is only requested when a GPU is available.
loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
    pin_memory=torch.cuda.is_available(),
)
test_loader = DataLoader(test_dataset, **loader_kwargs)

print(f"Test samples: {len(test_dataset)}")
print(f"Test batches: {len(test_loader)}")

In [None]:
# Generate predictions
print("\nGenerating predictions...")
all_predictions = []

# Gradients are not needed for inference.
with torch.no_grad():
    for input_ids, mask, target_ids, orig_lens, sequences in tqdm(test_loader, desc="Inference"):
        # Model output: (batch, seq_len, 3, num_predictions)
        batch_coords = model(input_ids.to(device), mask.to(device)).cpu().numpy()

        # Walk the batch one target at a time.
        per_target = zip(target_ids, orig_lens.tolist(), sequences)
        for row, (target_id, valid_len, sequence) in enumerate(per_target):
            # Drop padded positions, then clip to the competition limits.
            valid_coords = batch_coords[row, :valid_len, :, :]
            clipped = np.clip(valid_coords, -999.999, 9999.999)

            # Store for submission creation
            all_predictions.append({
                'target_id': target_id,
                'sequence': sequence,
                'coords': clipped  # (valid_len, 3, num_predictions)
            })

print(f"Generated predictions for {len(all_predictions)} targets")

## 4. Create Submission File

In [None]:
# Load sample submission to get the correct format
sample_sub = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))
print(f"Sample submission shape: {sample_sub.shape}")
print(f"Sample submission columns: {sample_sub.columns.tolist()}")

# Coordinate columns in submission order: x_1, y_1, z_1, ..., x_N, y_N, z_N
coord_columns = [
    f'{axis}_{pred_idx + 1}'
    for pred_idx in range(NUM_PREDICTIONS)
    for axis in ('x', 'y', 'z')
]
n_coord_columns = len(coord_columns)

# Create submission dataframe: one frame per target, concatenated at the end.
target_frames = []
padded_targets = 0

for pred_dict in tqdm(all_predictions, desc="Creating submission"):
    target_id = pred_dict['target_id']
    sequence = pred_dict['sequence']
    coords = pred_dict['coords']  # (n_predicted, 3, num_predictions)

    n_residues = len(sequence)
    # The dataset truncates inputs to MAX_LEN, so `coords` can be shorter than
    # the full sequence. Every residue still needs a submission row, so the
    # residues without a prediction keep the 0.0 fill value.
    n_predicted = min(coords.shape[0], n_residues)
    if n_predicted < n_residues:
        padded_targets += 1

    flat = np.zeros((n_residues, n_coord_columns), dtype=coords.dtype)
    # (n, 3, P) -> (n, P, 3) -> (n, 3 * P): groups x, y, z of each prediction
    # together, matching `coord_columns`.
    flat[:n_predicted] = (
        coords[:n_predicted, :, :NUM_PREDICTIONS]
        .transpose(0, 2, 1)
        .reshape(n_predicted, n_coord_columns)
    )

    resids = list(range(1, n_residues + 1))  # 1-indexed
    meta = pd.DataFrame({
        'ID': [f"{target_id}_{resid}" for resid in resids],
        'resname': list(sequence),
        'resid': resids,
    })
    target_frames.append(
        pd.concat([meta, pd.DataFrame(flat, columns=coord_columns)], axis=1)
    )

if padded_targets:
    print(f"WARNING: {padded_targets} target(s) have more residues than predicted "
          "positions; the missing coordinates were filled with 0.0.")

if target_frames:
    submission_df = pd.concat(target_frames, ignore_index=True)
else:
    # No targets at all: keep the expected columns so later checks are meaningful.
    submission_df = pd.DataFrame(columns=['ID', 'resname', 'resid'] + coord_columns)

print(f"\nSubmission shape: {submission_df.shape}")
print(f"Submission columns: {submission_df.columns.tolist()}")
print("\nFirst few rows:")
print(submission_df.head())

In [None]:
# Verify submission format
required_cols = ['ID', 'resname', 'resid'] + [
    f'{axis}_{i}'
    for i in range(1, NUM_PREDICTIONS + 1)
    for axis in ('x', 'y', 'z')
]

print("\nVerifying submission format...")
# Columns must match the expected names and order exactly.
assert list(submission_df.columns) == required_cols, "Column mismatch!"
# One row per residue listed in the sample submission.
expected_rows, actual_rows = len(sample_sub), len(submission_df)
assert actual_rows == expected_rows, f"Row count mismatch! Expected {expected_rows}, got {actual_rows}"
print("✓ Format verification passed!")

# Check for NaN values
nan_count = submission_df.isna().sum().sum()
if nan_count > 0:
    print(f"WARNING: Found {nan_count} NaN values. Filling with 0.")
    submission_df = submission_df.fillna(0.0)

# Check coordinate ranges
coord_cols = [
    col for col in submission_df.columns
    if col.startswith(('x_', 'y_', 'z_'))
]
coord_values = submission_df[coord_cols]
coord_min = coord_values.min().min()
coord_max = coord_values.max().max()
print(f"\nCoordinate range: [{coord_min:.3f}, {coord_max:.3f}]")
assert -999.999 <= coord_min and coord_max <= 9999.999, "Coordinates out of range!"
print("✓ Coordinate range check passed!")

In [None]:
# Save submission
output_path = 'submission.csv'
# Written without the DataFrame index so the file holds only the submission columns.
submission_df.to_csv(output_path, index=False)

banner = "=" * 60
print("\n" + banner)
print("✓ Submission file created successfully!")
print(banner)
print(f"File: {output_path}")
print(f"Shape: {submission_df.shape}")
size_mb = os.path.getsize(output_path) / 1e6
print(f"Size: {size_mb:.2f} MB")