# Stanford RNA 3D Folding - Data Exploration

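This notebook explores the structure of the competition data: the sequence tables, the 3D coordinate labels, the sample submission format, and the MSA files.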

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Folder that every data path in this notebook is built from.
DATA_DIR = '../input/stanford-rna-3d-folding-2'
# Select seaborn's 'darkgrid' plotting style.
sns.set_style(style='darkgrid')

## 1. Load Data

In [None]:
# Load all available CSV files
def _load_csv(filename):
    """Read the CSV called *filename* from DATA_DIR into a DataFrame."""
    return pd.read_csv(os.path.join(DATA_DIR, filename))


train_seq = _load_csv('train_sequences.csv')
train_labels = _load_csv('train_labels.csv')
val_seq = _load_csv('validation_sequences.csv')
val_labels = _load_csv('validation_labels.csv')
sample_sub = _load_csv('sample_submission.csv')

# Report the row count of each table, in the order the tables were loaded.
for table_label, table in (
    ("Train sequences", train_seq),
    ("Train labels", train_labels),
    ("Validation sequences", val_seq),
    ("Validation labels", val_labels),
    ("Sample submission", sample_sub),
):
    print(f"{table_label}: {len(table)}")

## 2. Explore Sequences

In [None]:
# Overview of the training sequence table: first rows, column names,
# dtype/null summary, and a preview of the first sequence.
print("\n=== TRAIN SEQUENCES ===")
print(train_seq.head())
print("\nColumns:", train_seq.columns.tolist())
print("\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
train_seq.info()
print("\nSample sequence:")
# Show at most the first 100 characters of the first row's sequence.
print(train_seq['sequence'].iloc[0][:100])

In [None]:
# Sequence length distribution
for seq_table in (train_seq, val_seq):
    seq_table['seq_len'] = seq_table['sequence'].str.len()

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# One entry per panel: (axis, lengths, title, bar colour).
# A colour of None leaves matplotlib's default bar colour in place.
length_panels = (
    (axes[0], train_seq['seq_len'], 'Train Sequence Length Distribution', None),
    (axes[1], val_seq['seq_len'], 'Validation Sequence Length Distribution', 'orange'),
)
for panel, lengths, panel_title, bar_colour in length_panels:
    panel.hist(lengths, bins=50, edgecolor='black', alpha=0.7, color=bar_colour)
    panel.set_xlabel('Sequence Length')
    panel.set_ylabel('Frequency')
    panel.set_title(panel_title)
    # Mark the median with a dashed red line and show its value in the legend.
    median_len = lengths.median()
    panel.axvline(median_len, color='red', linestyle='--', label=f'Median: {median_len:.0f}')
    panel.legend()

plt.tight_layout()
plt.show()

# Min / max / mean summary line for each split.
for line_prefix, lengths in (("\nTrain", train_seq['seq_len']), ("Val", val_seq['seq_len'])):
    print(f"{line_prefix} sequence length - Min: {lengths.min()}, Max: {lengths.max()}, Mean: {lengths.mean():.1f}")

In [None]:
# Nucleotide composition
def get_nucleotide_counts(sequences):
    """Count how often each character occurs across all given sequences.

    Args:
        sequences: Iterable of strings; they are concatenated before counting.

    Returns:
        A ``Counter`` mapping each character to its total number of occurrences.
    """
    merged = ''.join(sequences)
    tally = Counter()
    tally.update(merged)
    return tally

# Character counts over every sequence in each split.
train_counts = get_nucleotide_counts(train_seq['sequence'])
val_counts = get_nucleotide_counts(val_seq['sequence'])

print("\n=== NUCLEOTIDE COMPOSITION ===")
print("Train:", train_counts)
print("Val:", val_counts)

# Plot
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
bases = ['A', 'C', 'G', 'U']
# A Counter yields 0 for a key it has never seen, so a base that is absent
# from the data is drawn as a zero-height bar.
train_vals = [train_counts[base] for base in bases]
val_vals = [val_counts[base] for base in bases]

# One bar colour per base, in the same order as `bases`.
base_palette = ['#e74c3c', '#3498db', '#2ecc71', '#f39c12']
composition_panels = zip(
    axes,
    (train_vals, val_vals),
    ('Train Nucleotide Distribution', 'Validation Nucleotide Distribution'),
)
for panel, heights, panel_title in composition_panels:
    panel.bar(bases, heights, color=base_palette)
    panel.set_title(panel_title)
    panel.set_ylabel('Count')

plt.tight_layout()
plt.show()

## 3. Explore Labels (3D Coordinates)

In [None]:
# Overview of the training label table: first rows, column names, and the
# dtype/null summary.
print("\n=== TRAIN LABELS ===")
print(train_labels.head())
print("\nColumns:", train_labels.columns.tolist())
print("\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
train_labels.info()

In [None]:
# Check how many structures per target
def _drop_last_id_field(label_id):
    """Return *label_id* without its final '_'-separated field."""
    return label_id.rsplit('_', 1)[0]


train_labels['target_id'] = train_labels['ID'].apply(_drop_last_id_field)

# Count number of structures (x_1, y_1, z_1 vs x_2, y_2, z_2 etc)
coord_cols = [name for name in train_labels.columns if name.startswith('x_')]
num_structures = len(coord_cols)
print(f"\nNumber of reference structures provided: {num_structures}")

# Check for missing values in coordinates
for structure_idx in range(1, num_structures + 1):
    xyz_cols = [f'{axis}_{structure_idx}' for axis in ('x', 'y', 'z')]
    # Total NaN cells across the three coordinate columns of this structure.
    missing = train_labels[xyz_cols].isna().sum().sum()
    print(f"Structure {structure_idx} - Missing coordinates: {missing}")

In [None]:
# Coordinate range analysis
print("\n=== COORDINATE RANGES ===")
structures_to_show = min(3, num_structures)  # Check first 3 structures
for structure_idx in range(1, structures_to_show + 1):
    print(f"\nStructure {structure_idx}:")
    # Report the min/max of each coordinate column in x, y, z order.
    for axis in ('x', 'y', 'z'):
        axis_values = train_labels[f'{axis}_{structure_idx}']
        print(f"  {axis.upper()} range: [{axis_values.min():.2f}, {axis_values.max():.2f}]")

In [None]:
# Residues per target
# Number of label rows that share each target_id.
residues_per_target = train_labels.groupby('target_id').size()
median_residues = residues_per_target.median()

print("\n=== RESIDUES PER TARGET ===")
print(f"Min: {residues_per_target.min()}")
print(f"Max: {residues_per_target.max()}")
print(f"Mean: {residues_per_target.mean():.1f}")
print(f"Median: {median_residues:.1f}")

# Histogram of rows-per-target with the median marked by a dashed red line.
residue_fig, residue_ax = plt.subplots(figsize=(10, 5))
residue_ax.hist(residues_per_target, bins=50, edgecolor='black', alpha=0.7)
residue_ax.set_xlabel('Number of Residues')
residue_ax.set_ylabel('Number of Targets')
residue_ax.set_title('Distribution of Residues per Target')
residue_ax.axvline(median_residues, color='red', linestyle='--', label=f'Median: {median_residues:.0f}')
residue_ax.legend()
plt.show()

## 4. Explore Sample Submission Format

In [None]:
# First rows, column names and shape of the sample submission table.
print("\n=== SAMPLE SUBMISSION ===")
submission_preview = sample_sub.head(10)
print(submission_preview)
print("\nColumns:", sample_sub.columns.tolist())
print("\nShape:", sample_sub.shape)

# Count number of predictions required
# Each column whose name starts with 'x_' is counted as one prediction slot.
coord_cols = [name for name in sample_sub.columns if name.startswith('x_')]
num_predictions = len(coord_cols)
print(f"\nNumber of predictions required per target: {num_predictions}")

# Extract target IDs from sample submission
# The target ID is the row ID without its final '_'-separated field.
submission_targets = sample_sub['ID'].apply(lambda row_id: row_id.rsplit('_', 1)[0])
sample_sub['target_id'] = submission_targets
unique_targets = sample_sub['target_id'].nunique()
print(f"Number of unique targets in test set: {unique_targets}")

## 5. Check MSA (Multiple Sequence Alignment) Data

In [None]:
# Check if MSA directory exists
from itertools import islice

msa_dir = os.path.join(DATA_DIR, 'MSA')
# isdir (rather than exists) so a stray file called 'MSA' is reported as
# missing instead of making os.listdir fail.
if os.path.isdir(msa_dir):
    # os.listdir returns entries in arbitrary order; sort so the listing and
    # the previewed file are the same on every run.
    msa_files = sorted(os.listdir(msa_dir))
    print("\n=== MSA FILES ===")
    print(f"Total MSA files: {len(msa_files)}")
    print(f"Sample MSA files: {msa_files[:5]}")

    # Read a sample MSA file
    if msa_files:
        sample_msa_file = os.path.join(msa_dir, msa_files[0])
        print(f"\nSample MSA file: {msa_files[0]}")
        with open(sample_msa_file, 'r') as f:
            # Pull only the first 20 lines; readlines() would load the whole
            # file into memory just to discard most of it.
            lines = list(islice(f, 20))
        print(''.join(lines))
else:
    print("\nMSA directory not found. You may need to download it separately.")

## 6. Key Insights Summary

In [None]:
# Final summary: a banner followed by numbered sections of bullet points.
# Each section is (heading, [bullet, ...]); values computed in earlier cells
# are interpolated here, the remaining bullets are fixed text.
banner_rule = "=" * 60
train_lengths = train_seq['seq_len']

summary_sections = [
    ("1. DATA SIZE:", [
        f"Train targets: {len(train_seq)}",
        f"Validation targets: {len(val_seq)}",
        f"Test targets: {unique_targets}",
    ]),
    ("2. SEQUENCE CHARACTERISTICS:", [
        f"Length range: {train_lengths.min()} - {train_lengths.max()}",
        f"Average length: {train_lengths.mean():.1f}",
        "Nucleotides: A, C, G, U (standard RNA)",
    ]),
    ("3. PREDICTION TASK:", [
        f"Must predict {num_predictions} different 3D structures per RNA",
        "Each structure = (x, y, z) coordinates for each residue",
        "Coordinates are C1' atom positions in Angstroms",
        "Coordinates clipped to [-999.999, 9999.999]",
    ]),
    ("4. EVALUATION:", [
        "Metric: TM-score (Template Modeling score)",
        "Best of 5 predictions used for each target",
        "Final score = average of all target scores",
    ]),
    ("5. CONSTRAINTS:", [
        "Max runtime: 8 hours (GPU)",
        "No internet during inference",
        "Must output submission.csv",
    ]),
    ("6. ADDITIONAL DATA:", [
        "MSA files available for each target",
        "PDB structural database included",
        "Extra metadata in extra/ folder",
    ]),
]

print("\n" + banner_rule)
print("KEY INSIGHTS FOR MODEL DEVELOPMENT")
print(banner_rule)
for heading, bullets in summary_sections:
    # A blank line precedes every section heading.
    print(f"\n{heading}")
    for bullet in bullets:
        print(f"   - {bullet}")
print("\n" + banner_rule)