In [None]:
import pandas as pd
import numpy as np
from scipy.spatial.transform import Rotation as R
import random
from Bio.Align import PairwiseAligner
from Bio.Seq import Seq
import time
from scipy.spatial import distance_matrix
import warnings

# Silence every warning category for the rest of the run.
warnings.filterwarnings('ignore')

# Shared module-level aligner used by all alignment helpers below.
# Scoring: global mode, match +2, mismatch -1, gap open -8, gap extend -0.3.
# Keyword arguments are applied in the order given, i.e. the same order in
# which the attributes would be assigned one by one.
aligner = PairwiseAligner(
    mode='global',
    match_score=2.0,
    mismatch_score=-1.0,
    open_gap_score=-8.0,
    extend_gap_score=-0.3,
)

def get_aligned_sequences(alignment):
    """Render a pairwise alignment as two equal-length gapped strings.

    Biopython stores the first argument of ``PairwiseAligner.align(a, b)`` as
    ``alignment.target`` and the second as ``alignment.query``;
    ``alignment.aligned[0]`` holds the aligned index ranges of the target and
    ``alignment.aligned[1]`` those of the query.  Each sequence must therefore
    be indexed with its *own* ranges -- pairing ``aligned[0]`` with
    ``alignment.query`` (or vice versa) indexes one sequence with the other's
    coordinates and raises ``IndexError`` or emits wrong residues whenever the
    two sequences differ in length.

    Args:
        alignment: Object exposing ``target``, ``query`` and ``aligned``
            (a pair of sequences of ``(start, end)`` ranges).

    Returns:
        A tuple ``(gapped_first, gapped_second)`` where ``gapped_first`` is the
        first sequence passed to ``align`` (``alignment.target``) and
        ``gapped_second`` the second one (``alignment.query``), both padded
        with ``'-'`` so that they have the same length.
    """
    first_seq = str(alignment.target)
    second_seq = str(alignment.query)
    first_segments = alignment.aligned[0]
    second_segments = alignment.aligned[1]

    gapped_first = []
    gapped_second = []
    first_pos = 0
    second_pos = 0

    for seg_first, seg_second in zip(first_segments, second_segments):
        first_start, first_end = int(seg_first[0]), int(seg_first[1])
        second_start = int(seg_second[0])
        second_end = int(seg_second[1])

        # Residues of the first sequence skipped before this segment are
        # paired with gaps in the second sequence.
        skipped = first_seq[first_pos:first_start]
        gapped_first.append(skipped)
        gapped_second.append('-' * len(skipped))

        # Residues of the second sequence skipped before this segment are
        # paired with gaps in the first sequence.
        skipped = second_seq[second_pos:second_start]
        gapped_first.append('-' * len(skipped))
        gapped_second.append(skipped)

        # The aligned stretch itself: same length on both sides.
        seg_len = first_end - first_start
        gapped_first.append(first_seq[first_start:first_start + seg_len])
        gapped_second.append(second_seq[second_start:second_start + seg_len])

        first_pos = first_end
        second_pos = second_end

    # Whatever is left after the last aligned segment is unaligned tail.
    tail = first_seq[first_pos:]
    gapped_first.append(tail)
    gapped_second.append('-' * len(tail))

    tail = second_seq[second_pos:]
    gapped_first.append('-' * len(tail))
    gapped_second.append(tail)

    return ''.join(gapped_first), ''.join(gapped_second)

# === 1. LOADING ===
DATA_PATH = '/kaggle/input/stanford-rna-3d-folding-2/'

# Base tables: training sequences, test sequences and training labels.
train_seqs = pd.read_csv(DATA_PATH + 'train_sequences.csv')
test_seqs = pd.read_csv(DATA_PATH + 'test_sequences.csv')
train_labels = pd.read_csv(DATA_PATH + 'train_labels.csv')

# The validation split is optional: when both of its files can be read they
# are appended to the training tables, otherwise the training tables are
# used on their own.
try:
    validation_seqs = pd.read_csv(DATA_PATH + 'validation_sequences.csv')
    validation_labels = pd.read_csv(DATA_PATH + 'validation_labels.csv')
except FileNotFoundError:
    print("Validation data not found, using only train data.")
    combined_seqs = train_seqs
    combined_labels = train_labels
else:
    print("Validation data found and will be combined with train data.")
    # Row-wise concatenation with a fresh 0..n-1 index.
    combined_seqs = pd.concat([train_seqs, validation_seqs], ignore_index=True)
    combined_labels = pd.concat([train_labels, validation_labels], ignore_index=True)

def process_labels(labels_df):
    """Group per-residue label rows into one coordinate array per structure.

    The structure identifier is everything in the ``ID`` column before the
    last underscore (``"R1_A_12"`` -> ``"R1_A"``).  The prefix is computed with
    vectorized string operations and rows are grouped positionally, so the
    function does not depend on the DataFrame index (a duplicated or
    non-default index is fine) and avoids a Python-level lookup per row.

    Args:
        labels_df: DataFrame with columns ``ID``, ``resid``, ``x_1``, ``y_1``
            and ``z_1``.

    Returns:
        dict mapping each ID prefix to a 2-D array with one row per residue
        (columns ``x_1``, ``y_1``, ``z_1``), ordered by ascending ``resid``.
    """
    # Split once from the right so underscores inside the prefix survive.
    id_prefixes = labels_df['ID'].str.rsplit('_', n=1).str[0]

    coords_dict = {}
    # A plain array groups by row position, not by index label.
    for id_prefix, group in labels_df.groupby(id_prefixes.to_numpy()):
        ordered = group.sort_values('resid')
        coords_dict[id_prefix] = ordered[['x_1', 'y_1', 'z_1']].values
    return coords_dict

# Build the ID-prefix -> coordinate-array lookup from the combined label
# table (train plus validation when the validation files were found).
combined_coords_dict = process_labels(combined_labels)

# === 2. HEURISTICS (unchanged) ===

def find_similar_sequences(query_seq, train_seqs_df, train_coords_dict, top_n=5):
    """Rank candidate template sequences by normalized alignment score.

    A candidate is considered only if it has coordinates in
    ``train_coords_dict`` and its length differs from the query's by at most
    40% of the longer of the two.  The score is the module-level ``aligner``
    score divided by ``2 * min(len(query), len(candidate))``.

    Only the optimal score is requested from the aligner (``aligner.score``);
    materializing the alignment list would enumerate every co-optimal
    alignment, which can be combinatorially many for repetitive sequences.

    Args:
        query_seq: Query sequence string.
        train_seqs_df: DataFrame with ``target_id`` and ``sequence`` columns.
        train_coords_dict: Mapping from ``target_id`` to coordinates.
        top_n: Maximum number of candidates to return.

    Returns:
        List of ``(target_id, sequence, score, coords)`` tuples sorted by
        descending score, at most ``top_n`` long.  Empty when the query is
        empty or no candidate passes the filters.
    """
    query_len = len(query_seq)
    if query_len == 0:
        # Nothing can be aligned, and the length ratio below would divide by zero.
        return []

    query_seq_obj = Seq(query_seq)
    similar_seqs = []

    for target_id, train_seq in zip(train_seqs_df['target_id'], train_seqs_df['sequence']):
        if target_id not in train_coords_dict:
            continue

        train_len = len(train_seq)
        if train_len == 0:
            continue
        if abs(train_len - query_len) / max(train_len, query_len) > 0.4:
            continue

        # Score only; no traceback and no enumeration of alignments.
        raw_score = aligner.score(query_seq_obj, train_seq)
        score = raw_score / (2 * min(query_len, train_len))
        similar_seqs.append((target_id, train_seq, score, train_coords_dict[target_id]))

    similar_seqs.sort(key=lambda x: x[2], reverse=True)
    return similar_seqs[:top_n]

def adaptive_rna_constraints(coordinates, sequence, confidence=1.0):
    """Nudge consecutive points toward a fixed spacing, scaled by confidence.

    Works on a copy of ``coordinates``.  Walking the chain front to back, any
    consecutive pair whose distance is below 5.8 or above 6.1 has its second
    point moved along the connecting direction toward a spacing of 5.95.
    The step is a fraction ``0.7 * (1 - min(confidence, 0.95))`` of the
    distance error, so higher confidence means a smaller correction.  Each
    move affects the distance seen by the next pair.

    Args:
        coordinates: Array of points, one row per residue.
        sequence: Sequence whose length sets the number of pairs examined.
        confidence: Template confidence; values above 0.95 are capped.

    Returns:
        A new array with the adjusted coordinates; the input is not modified.
    """
    lower_bound, upper_bound = 5.8, 6.1
    ideal_spacing = 5.95
    pull = 0.7 * (1.0 - min(confidence, 0.95))

    adjusted = coordinates.copy()
    for nxt in range(1, len(sequence)):
        bond = adjusted[nxt] - adjusted[nxt - 1]
        gap = np.linalg.norm(bond)
        out_of_range = gap < lower_bound or gap > upper_bound
        if not out_of_range:
            continue
        # The small epsilon keeps the division finite for coincident points.
        unit = bond / (gap + 1e-10)
        shift = (ideal_spacing - gap) * pull
        adjusted[nxt] = adjusted[nxt] + unit * shift

    return adjusted

def adapt_template_to_query(query_seq, template_seq, template_coords):
    """Transfer template coordinates onto the query through a pairwise alignment.

    The query and template are aligned with the module-level ``aligner``.
    Every alignment column holding a residue in both sequences copies the
    template residue's coordinates to the query residue.  Query residues left
    without coordinates (aligned to a gap, or whose first coordinate is NaN)
    are then filled front to back:

    * between two known positions: linear interpolation;
    * only an earlier known position: that position shifted by ``[3, 0, 0]``;
    * only a later known position: that position shifted by ``[3, 0, 0]``;
    * no known position at all: ``[i * 3, 0, 0]``.

    Only the first alignment is pulled from the aligner's lazy result; turning
    the result into a list would enumerate every co-optimal alignment, which
    can be combinatorially many.

    Args:
        query_seq: Query sequence string.
        template_seq: Template sequence string.
        template_coords: Per-residue template coordinates, indexable by
            residue position (presumably shape ``(len(template_seq), 3)`` as
            built by ``process_labels`` -- confirm against caller).

    Returns:
        Array of shape ``(len(query_seq), 3)``; remaining NaNs are replaced by
        zeros.  All zeros if the aligner yields no alignment.
    """
    # Take just the first alignment without materializing the rest.
    alignment = next(iter(aligner.align(Seq(query_seq), Seq(template_seq))), None)
    if alignment is None:
        return np.zeros((len(query_seq), 3))

    a_q, a_t = get_aligned_sequences(alignment)
    n_query = len(query_seq)
    new_coords = np.full((n_query, 3), np.nan)
    q_idx, t_idx = 0, 0
    for char_q, char_t in zip(a_q, a_t):
        if char_q != '-' and char_t != '-':
            # Guard both indices so a malformed gapped pair cannot overrun.
            if q_idx < n_query and t_idx < len(template_coords):
                new_coords[q_idx] = template_coords[t_idx]
            q_idx += 1
            t_idx += 1
        elif char_q != '-':
            q_idx += 1
        elif char_t != '-':
            t_idx += 1

    # Fill NaN values.  Filling is in place, so for i > 0 the nearest earlier
    # known position is usually the one filled in the previous iteration.
    for i in range(len(new_coords)):
        if np.isnan(new_coords[i, 0]):
            prev_v = next((j for j in range(i-1, -1, -1) if not np.isnan(new_coords[j, 0])), -1)
            next_v = next((j for j in range(i+1, len(new_coords)) if not np.isnan(new_coords[j, 0])), -1)
            if prev_v >= 0 and next_v >= 0:
                w = (i - prev_v) / (next_v - prev_v)
                new_coords[i] = (1 - w) * new_coords[prev_v] + w * new_coords[next_v]
            elif prev_v >= 0:
                new_coords[i] = new_coords[prev_v] + [3, 0, 0]
            elif next_v >= 0:
                new_coords[i] = new_coords[next_v] + [3, 0, 0]
            else:
                new_coords[i] = [i * 3, 0, 0]
    # A row may still carry NaN in y/z when only its x was valid.
    return np.nan_to_num(new_coords)

def generate_rna_structure(sequence, seed=None):
    """Build a fallback straight-line chain along the x axis.

    The first point is the origin; each following point is offset from the
    previous one by a step drawn uniformly from [3.8, 4.2] in x, with y and z
    left at zero.

    The steps are drawn from a generator that is actually controlled by
    ``seed``: a private ``random.Random(seed)`` when a seed is given
    (including ``seed=0``), otherwise the shared ``random`` module state.
    Global numpy random state is left untouched.

    Args:
        sequence: Sequence whose length sets the number of points.
        seed: Optional seed making the result reproducible.

    Returns:
        Array of shape ``(len(sequence), 3)``.
    """
    # `is None` rather than truthiness, so that seed=0 is honoured.
    rng = random if seed is None else random.Random(seed)
    n = len(sequence)
    coords = np.zeros((n, 3))
    for i in range(1, n):
        coords[i, 0] = coords[i - 1, 0] + rng.uniform(3.8, 4.2)
    return coords

# === 3. PREDICT (unchanged, but uses combined data) ===

def predict_rna_structures(sequence, target_id, train_seqs_df, train_coords_dict, n_predictions=5):
    """Produce ``n_predictions`` coordinate sets for one sequence.

    Up to ``n_predictions`` templates are taken from
    ``find_similar_sequences``.  Each template is mapped onto the query,
    passed through ``adaptive_rna_constraints`` with the template's similarity
    as confidence, and perturbed with Gaussian noise whose standard deviation
    is ``max(0.01, (0.4 - similarity) * 0.1)``.  Missing slots are filled with
    ``generate_rna_structure`` outputs seeded by their position in the list.

    Args:
        sequence: Query sequence string.
        target_id: Identifier of the query; not used by this function.
        train_seqs_df: Candidate template sequences.
        train_coords_dict: Mapping from template id to coordinates.
        n_predictions: Number of structures to return.

    Returns:
        List of ``n_predictions`` coordinate arrays.
    """
    templates = find_similar_sequences(
        sequence, train_seqs_df, train_coords_dict, top_n=n_predictions
    )

    predictions = []
    for _template_id, template_seq, similarity, template_coords in templates:
        mapped = adapt_template_to_query(sequence, template_seq, template_coords)
        candidate = adaptive_rna_constraints(mapped, sequence, confidence=similarity)

        # Noise floor of 0.01; larger only when similarity drops below 0.3.
        noise_sigma = max(0.01, (0.4 - similarity) * 0.1)
        candidate += np.random.normal(0, noise_sigma, candidate.shape)
        predictions.append(candidate)

    # Pad with fallback chains; the seed is the slot index being filled.
    for slot in range(len(predictions), n_predictions):
        predictions.append(generate_rna_structure(sequence, seed=slot))

    return predictions[:n_predictions]

# === 4. LOOP & SAVE (unchanged) ===
# One output row per residue of every test sequence, carrying the x/y/z
# coordinates of the five predicted structures.
all_predictions = []
for idx, row in test_seqs.iterrows():
    target_id = row['target_id']
    sequence = row['sequence']

    # Progress message on every fifth index value.
    if idx % 5 == 0:
        print(f"Processing {idx+1}/{len(test_seqs)}")

    # Templates are searched in the combined tables built above.
    preds = predict_rna_structures(sequence, target_id, combined_seqs, combined_coords_dict)

    for j, resname in enumerate(sequence):
        res = {'ID': f"{target_id}_{j+1}", 'resname': resname, 'resid': j+1}
        for i in range(5):
            x_val, y_val, z_val = preds[i][j]
            res[f'x_{i+1}'] = x_val
            res[f'y_{i+1}'] = y_val
            res[f'z_{i+1}'] = z_val
        all_predictions.append(res)

submission_df = pd.DataFrame(all_predictions)

# Column order: identifiers first, then x, y, z for structures 1 through 5.
cols = ['ID', 'resname', 'resid']
cols = cols + [f'{c}_{i}' for i in range(1,6) for c in ['x','y','z']]
submission_df[cols].to_csv('submission.csv', index=False)
print("Submission.csv generated!")


In [None]:
import pandas as pd
import numpy as np
from scipy.spatial.transform import Rotation as R
import random
from Bio.Align import PairwiseAligner
from Bio.Seq import Seq
import time
from scipy.spatial import distance_matrix
import warnings

# Silence every warning category for the rest of the run.
warnings.filterwarnings('ignore')

# Shared module-level aligner used by all alignment helpers below.
# Scoring: global mode, match +2, mismatch -1, gap open -8, gap extend -0.3.
# Keyword arguments are applied in the order given, i.e. the same order in
# which the attributes would be assigned one by one.
aligner = PairwiseAligner(
    mode='global',
    match_score=2.0,
    mismatch_score=-1.0,
    open_gap_score=-8.0,
    extend_gap_score=-0.3,
)

def get_aligned_sequences(alignment):
    """Render a pairwise alignment as two equal-length gapped strings.

    Biopython stores the first argument of ``PairwiseAligner.align(a, b)`` as
    ``alignment.target`` and the second as ``alignment.query``;
    ``alignment.aligned[0]`` holds the aligned index ranges of the target and
    ``alignment.aligned[1]`` those of the query.  Each sequence must therefore
    be indexed with its *own* ranges -- pairing ``aligned[0]`` with
    ``alignment.query`` (or vice versa) indexes one sequence with the other's
    coordinates and raises ``IndexError`` or emits wrong residues whenever the
    two sequences differ in length.

    Args:
        alignment: Object exposing ``target``, ``query`` and ``aligned``
            (a pair of sequences of ``(start, end)`` ranges).

    Returns:
        A tuple ``(gapped_first, gapped_second)`` where ``gapped_first`` is the
        first sequence passed to ``align`` (``alignment.target``) and
        ``gapped_second`` the second one (``alignment.query``), both padded
        with ``'-'`` so that they have the same length.
    """
    first_seq = str(alignment.target)
    second_seq = str(alignment.query)
    first_segments = alignment.aligned[0]
    second_segments = alignment.aligned[1]

    gapped_first = []
    gapped_second = []
    first_pos = 0
    second_pos = 0

    for seg_first, seg_second in zip(first_segments, second_segments):
        first_start, first_end = int(seg_first[0]), int(seg_first[1])
        second_start = int(seg_second[0])
        second_end = int(seg_second[1])

        # Residues of the first sequence skipped before this segment are
        # paired with gaps in the second sequence.
        skipped = first_seq[first_pos:first_start]
        gapped_first.append(skipped)
        gapped_second.append('-' * len(skipped))

        # Residues of the second sequence skipped before this segment are
        # paired with gaps in the first sequence.
        skipped = second_seq[second_pos:second_start]
        gapped_first.append('-' * len(skipped))
        gapped_second.append(skipped)

        # The aligned stretch itself: same length on both sides.
        seg_len = first_end - first_start
        gapped_first.append(first_seq[first_start:first_start + seg_len])
        gapped_second.append(second_seq[second_start:second_start + seg_len])

        first_pos = first_end
        second_pos = second_end

    # Whatever is left after the last aligned segment is unaligned tail.
    tail = first_seq[first_pos:]
    gapped_first.append(tail)
    gapped_second.append('-' * len(tail))

    tail = second_seq[second_pos:]
    gapped_first.append('-' * len(tail))
    gapped_second.append(tail)

    return ''.join(gapped_first), ''.join(gapped_second)

# === 1. LOADING ===
DATA_PATH = '/kaggle/input/stanford-rna-3d-folding-2/'

# Base tables: training sequences, test sequences and training labels.
train_seqs = pd.read_csv(DATA_PATH + 'train_sequences.csv')
test_seqs = pd.read_csv(DATA_PATH + 'test_sequences.csv')
train_labels = pd.read_csv(DATA_PATH + 'train_labels.csv')

# The validation split is optional: when both of its files can be read they
# are appended to the training tables, otherwise the training tables are
# used on their own.
try:
    validation_seqs = pd.read_csv(DATA_PATH + 'validation_sequences.csv')
    validation_labels = pd.read_csv(DATA_PATH + 'validation_labels.csv')
except FileNotFoundError:
    print("Validation data not found, using only train data.")
    combined_seqs = train_seqs
    combined_labels = train_labels
else:
    print("Validation data found and will be combined with train data.")
    # Row-wise concatenation with a fresh 0..n-1 index.
    combined_seqs = pd.concat([train_seqs, validation_seqs], ignore_index=True)
    combined_labels = pd.concat([train_labels, validation_labels], ignore_index=True)

def process_labels(labels_df):
    """Group per-residue label rows into one coordinate array per structure.

    The structure identifier is everything in the ``ID`` column before the
    last underscore (``"R1_A_12"`` -> ``"R1_A"``).  The prefix is computed with
    vectorized string operations and rows are grouped positionally, so the
    function does not depend on the DataFrame index (a duplicated or
    non-default index is fine) and avoids a Python-level lookup per row.

    Args:
        labels_df: DataFrame with columns ``ID``, ``resid``, ``x_1``, ``y_1``
            and ``z_1``.

    Returns:
        dict mapping each ID prefix to a 2-D array with one row per residue
        (columns ``x_1``, ``y_1``, ``z_1``), ordered by ascending ``resid``.
    """
    # Split once from the right so underscores inside the prefix survive.
    id_prefixes = labels_df['ID'].str.rsplit('_', n=1).str[0]

    coords_dict = {}
    # A plain array groups by row position, not by index label.
    for id_prefix, group in labels_df.groupby(id_prefixes.to_numpy()):
        ordered = group.sort_values('resid')
        coords_dict[id_prefix] = ordered[['x_1', 'y_1', 'z_1']].values
    return coords_dict

# Build the ID-prefix -> coordinate-array lookup from the combined label
# table (train plus validation when the validation files were found).
combined_coords_dict = process_labels(combined_labels)

# === 2. HEURISTICS (unchanged) ===

def find_similar_sequences(query_seq, train_seqs_df, train_coords_dict, top_n=5):
    """Rank candidate template sequences by normalized alignment score.

    A candidate is considered only if it has coordinates in
    ``train_coords_dict`` and its length differs from the query's by at most
    40% of the longer of the two.  The score is the module-level ``aligner``
    score divided by ``2 * min(len(query), len(candidate))``.

    Only the optimal score is requested from the aligner (``aligner.score``);
    materializing the alignment list would enumerate every co-optimal
    alignment, which can be combinatorially many for repetitive sequences.

    Args:
        query_seq: Query sequence string.
        train_seqs_df: DataFrame with ``target_id`` and ``sequence`` columns.
        train_coords_dict: Mapping from ``target_id`` to coordinates.
        top_n: Maximum number of candidates to return.

    Returns:
        List of ``(target_id, sequence, score, coords)`` tuples sorted by
        descending score, at most ``top_n`` long.  Empty when the query is
        empty or no candidate passes the filters.
    """
    query_len = len(query_seq)
    if query_len == 0:
        # Nothing can be aligned, and the length ratio below would divide by zero.
        return []

    query_seq_obj = Seq(query_seq)
    similar_seqs = []

    for target_id, train_seq in zip(train_seqs_df['target_id'], train_seqs_df['sequence']):
        if target_id not in train_coords_dict:
            continue

        train_len = len(train_seq)
        if train_len == 0:
            continue
        if abs(train_len - query_len) / max(train_len, query_len) > 0.4:
            continue

        # Score only; no traceback and no enumeration of alignments.
        raw_score = aligner.score(query_seq_obj, train_seq)
        score = raw_score / (2 * min(query_len, train_len))
        similar_seqs.append((target_id, train_seq, score, train_coords_dict[target_id]))

    similar_seqs.sort(key=lambda x: x[2], reverse=True)
    return similar_seqs[:top_n]

def adaptive_rna_constraints(coordinates, sequence, confidence=1.0):
    """Nudge consecutive points toward a fixed spacing, scaled by confidence.

    Works on a copy of ``coordinates``.  Walking the chain front to back, any
    consecutive pair whose distance is below 5.8 or above 6.1 has its second
    point moved along the connecting direction toward a spacing of 5.95.
    The step is a fraction ``0.7 * (1 - min(confidence, 0.95))`` of the
    distance error, so higher confidence means a smaller correction.  Each
    move affects the distance seen by the next pair.

    Args:
        coordinates: Array of points, one row per residue.
        sequence: Sequence whose length sets the number of pairs examined.
        confidence: Template confidence; values above 0.95 are capped.

    Returns:
        A new array with the adjusted coordinates; the input is not modified.
    """
    lower_bound, upper_bound = 5.8, 6.1
    ideal_spacing = 5.95
    pull = 0.7 * (1.0 - min(confidence, 0.95))

    adjusted = coordinates.copy()
    for nxt in range(1, len(sequence)):
        bond = adjusted[nxt] - adjusted[nxt - 1]
        gap = np.linalg.norm(bond)
        out_of_range = gap < lower_bound or gap > upper_bound
        if not out_of_range:
            continue
        # The small epsilon keeps the division finite for coincident points.
        unit = bond / (gap + 1e-10)
        shift = (ideal_spacing - gap) * pull
        adjusted[nxt] = adjusted[nxt] + unit * shift

    return adjusted

def adapt_template_to_query(query_seq, template_seq, template_coords):
    """Transfer template coordinates onto the query through a pairwise alignment.

    The query and template are aligned with the module-level ``aligner``.
    Every alignment column holding a residue in both sequences copies the
    template residue's coordinates to the query residue.  Query residues left
    without coordinates (aligned to a gap, or whose first coordinate is NaN)
    are then filled front to back:

    * between two known positions: linear interpolation;
    * only an earlier known position: that position shifted by ``[3, 0, 0]``;
    * only a later known position: that position shifted by ``[3, 0, 0]``;
    * no known position at all: ``[i * 3, 0, 0]``.

    Only the first alignment is pulled from the aligner's lazy result; turning
    the result into a list would enumerate every co-optimal alignment, which
    can be combinatorially many.

    Args:
        query_seq: Query sequence string.
        template_seq: Template sequence string.
        template_coords: Per-residue template coordinates, indexable by
            residue position (presumably shape ``(len(template_seq), 3)`` as
            built by ``process_labels`` -- confirm against caller).

    Returns:
        Array of shape ``(len(query_seq), 3)``; remaining NaNs are replaced by
        zeros.  All zeros if the aligner yields no alignment.
    """
    # Take just the first alignment without materializing the rest.
    alignment = next(iter(aligner.align(Seq(query_seq), Seq(template_seq))), None)
    if alignment is None:
        return np.zeros((len(query_seq), 3))

    a_q, a_t = get_aligned_sequences(alignment)
    n_query = len(query_seq)
    new_coords = np.full((n_query, 3), np.nan)
    q_idx, t_idx = 0, 0
    for char_q, char_t in zip(a_q, a_t):
        if char_q != '-' and char_t != '-':
            # Guard both indices so a malformed gapped pair cannot overrun.
            if q_idx < n_query and t_idx < len(template_coords):
                new_coords[q_idx] = template_coords[t_idx]
            q_idx += 1
            t_idx += 1
        elif char_q != '-':
            q_idx += 1
        elif char_t != '-':
            t_idx += 1

    # Fill NaN values.  Filling is in place, so for i > 0 the nearest earlier
    # known position is usually the one filled in the previous iteration.
    for i in range(len(new_coords)):
        if np.isnan(new_coords[i, 0]):
            prev_v = next((j for j in range(i-1, -1, -1) if not np.isnan(new_coords[j, 0])), -1)
            next_v = next((j for j in range(i+1, len(new_coords)) if not np.isnan(new_coords[j, 0])), -1)
            if prev_v >= 0 and next_v >= 0:
                w = (i - prev_v) / (next_v - prev_v)
                new_coords[i] = (1 - w) * new_coords[prev_v] + w * new_coords[next_v]
            elif prev_v >= 0:
                new_coords[i] = new_coords[prev_v] + [3, 0, 0]
            elif next_v >= 0:
                new_coords[i] = new_coords[next_v] + [3, 0, 0]
            else:
                new_coords[i] = [i * 3, 0, 0]
    # A row may still carry NaN in y/z when only its x was valid.
    return np.nan_to_num(new_coords)

def generate_rna_structure(sequence, seed=None):
    """Build a fallback straight-line chain along the x axis.

    The first point is the origin; each following point is offset from the
    previous one by a step drawn uniformly from [3.8, 4.2] in x, with y and z
    left at zero.

    The steps are drawn from a generator that is actually controlled by
    ``seed``: a private ``random.Random(seed)`` when a seed is given
    (including ``seed=0``), otherwise the shared ``random`` module state.
    Global numpy random state is left untouched.

    Args:
        sequence: Sequence whose length sets the number of points.
        seed: Optional seed making the result reproducible.

    Returns:
        Array of shape ``(len(sequence), 3)``.
    """
    # `is None` rather than truthiness, so that seed=0 is honoured.
    rng = random if seed is None else random.Random(seed)
    n = len(sequence)
    coords = np.zeros((n, 3))
    for i in range(1, n):
        coords[i, 0] = coords[i - 1, 0] + rng.uniform(3.8, 4.2)
    return coords

# === 3. PREDICT (unchanged, but uses combined data) ===

def predict_rna_structures(sequence, target_id, train_seqs_df, train_coords_dict, n_predictions=5):
    """Produce ``n_predictions`` coordinate sets for one sequence.

    Up to ``n_predictions`` templates are taken from
    ``find_similar_sequences``.  Each template is mapped onto the query,
    passed through ``adaptive_rna_constraints`` with the template's similarity
    as confidence, and perturbed with Gaussian noise whose standard deviation
    is ``max(0.01, (0.4 - similarity) * 0.1)``.  Missing slots are filled with
    ``generate_rna_structure`` outputs seeded by their position in the list.

    Args:
        sequence: Query sequence string.
        target_id: Identifier of the query; not used by this function.
        train_seqs_df: Candidate template sequences.
        train_coords_dict: Mapping from template id to coordinates.
        n_predictions: Number of structures to return.

    Returns:
        List of ``n_predictions`` coordinate arrays.
    """
    templates = find_similar_sequences(
        sequence, train_seqs_df, train_coords_dict, top_n=n_predictions
    )

    predictions = []
    for _template_id, template_seq, similarity, template_coords in templates:
        mapped = adapt_template_to_query(sequence, template_seq, template_coords)
        candidate = adaptive_rna_constraints(mapped, sequence, confidence=similarity)

        # Noise floor of 0.01; larger only when similarity drops below 0.3.
        noise_sigma = max(0.01, (0.4 - similarity) * 0.1)
        candidate += np.random.normal(0, noise_sigma, candidate.shape)
        predictions.append(candidate)

    # Pad with fallback chains; the seed is the slot index being filled.
    for slot in range(len(predictions), n_predictions):
        predictions.append(generate_rna_structure(sequence, seed=slot))

    return predictions[:n_predictions]

# === 4. LOOP & SAVE (unchanged) ===
# One output row per residue of every test sequence, carrying the x/y/z
# coordinates of the five predicted structures.
all_predictions = []
for idx, row in test_seqs.iterrows():
    target_id = row['target_id']
    sequence = row['sequence']

    # Progress message on every fifth index value.
    if idx % 5 == 0:
        print(f"Processing {idx+1}/{len(test_seqs)}")

    # Templates are searched in the combined tables built above.
    preds = predict_rna_structures(sequence, target_id, combined_seqs, combined_coords_dict)

    for j, resname in enumerate(sequence):
        res = {'ID': f"{target_id}_{j+1}", 'resname': resname, 'resid': j+1}
        for i in range(5):
            x_val, y_val, z_val = preds[i][j]
            res[f'x_{i+1}'] = x_val
            res[f'y_{i+1}'] = y_val
            res[f'z_{i+1}'] = z_val
        all_predictions.append(res)

submission_df = pd.DataFrame(all_predictions)

# Column order: identifiers first, then x, y, z for structures 1 through 5.
cols = ['ID', 'resname', 'resid']
cols = cols + [f'{c}_{i}' for i in range(1,6) for c in ['x','y','z']]
submission_df[cols].to_csv('submission.csv', index=False)
print("Submission.csv generated!")
