In [None]:
import pandas as pd
import numpy as np
from scipy.spatial.transform import Rotation as R
import random
from Bio import pairwise2
from Bio.Seq import Seq
import time
from scipy.spatial import distance_matrix
import warnings

# Silence every warning for the rest of the run.
warnings.filterwarnings('ignore')

DATA_PATH = '/kaggle/input/stanford-rna-3d-folding-2/'

# Sequence tables for the train and test targets, plus the train label rows.
train_seqs = pd.read_csv(DATA_PATH + 'train_sequences.csv')
test_seqs = pd.read_csv(DATA_PATH + 'test_sequences.csv')
train_labels = pd.read_csv(DATA_PATH + 'train_labels.csv')

# The validation split is optional: when both of its files can be read they
# are appended to the train tables, otherwise the train tables are used alone.
try:
    validation_seqs = pd.read_csv(DATA_PATH + 'validation_sequences.csv')
    validation_labels = pd.read_csv(DATA_PATH + 'validation_labels.csv')
except FileNotFoundError:
    print("Validation data not found, using only train data.")
    combined_seqs = train_seqs
    combined_labels = train_labels
else:
    print("Validation data found and will be combined with train data.")
    # ignore_index=True renumbers the rows 0..n-1 in the merged tables.
    combined_seqs = pd.concat([train_seqs, validation_seqs], ignore_index=True)
    combined_labels = pd.concat([train_labels, validation_labels], ignore_index=True)

def process_labels(labels_df):
    """Group per-residue label rows into one coordinate array per target.

    Every value of the ``ID`` column is split at its last underscore and the
    part before it becomes the target key (``"R1_A_12"`` -> ``"R1_A"``).

    Args:
        labels_df: DataFrame with at least the columns ``ID``, ``resid``,
            ``x_1``, ``y_1`` and ``z_1``. ``ID`` values must be strings.

    Returns:
        dict mapping each target key to an ``(n_rows, 3)`` NumPy array with the
        ``x_1``/``y_1``/``z_1`` values of that target's rows, ordered by
        ``resid``.
    """
    # Compute the keys positionally, once, instead of looking each row up by
    # index label inside a groupby callback: this is much cheaper on large
    # label tables and also works when the index contains duplicate labels.
    target_keys = np.array(
        [label_id.rsplit('_', 1)[0] for label_id in labels_df['ID']],
        dtype=object,
    )

    coords_dict = {}
    for id_prefix, group in labels_df.groupby(target_keys):
        ordered = group.sort_values('resid')
        coords_dict[id_prefix] = ordered[['x_1', 'y_1', 'z_1']].values
    return coords_dict

# Build the target-key -> coordinate-array lookup from the combined label
# rows; it is later handed to the prediction code as the template pool.
combined_coords_dict = process_labels(combined_labels)


def find_similar_sequences(query_seq, train_seqs_df, train_coords_dict, top_n=5):
    """Rank training sequences by global-alignment similarity to a query.

    Candidates are skipped when they have no entry in ``train_coords_dict`` or
    when their length differs from the query's by more than 40% of the longer
    of the two. The remaining ones are globally aligned to the query with
    ``pairwise2.align.globalms`` (match 2, mismatch -1, gap open -8, gap
    extend -0.3).

    Args:
        query_seq: Query sequence string.
        train_seqs_df: DataFrame with ``target_id`` and ``sequence`` columns.
        train_coords_dict: Mapping from target id to its coordinate array.
        top_n: Maximum number of hits to return.

    Returns:
        List of ``(target_id, sequence, score, coords)`` tuples sorted by
        descending score, at most ``top_n`` long. ``score`` is the alignment
        score divided by twice the shorter sequence length, i.e. by what that
        sequence would score if every residue were a match. An empty query
        yields an empty list.
    """
    query_len = len(query_seq)
    if query_len == 0:
        # Nothing can be aligned to an empty query; returning early also
        # avoids a 0/0 in the length filter when a training sequence is
        # empty as well.
        return []

    query_seq_obj = Seq(query_seq)
    similar_seqs = []

    # Walk the two needed columns directly rather than materialising a Series
    # per row with iterrows().
    for target_id, train_seq in zip(train_seqs_df['target_id'], train_seqs_df['sequence']):
        if target_id not in train_coords_dict:
            continue

        train_len = len(train_seq)
        if abs(train_len - query_len) / max(train_len, query_len) > 0.4:
            continue

        alignments = pairwise2.align.globalms(
            query_seq_obj, train_seq, 2, -1, -8, -0.3, one_alignment_only=True
        )
        if alignments:
            score = alignments[0].score / (2 * min(query_len, train_len))
            similar_seqs.append((target_id, train_seq, score, train_coords_dict[target_id]))

    # list.sort is stable, so equally scored hits keep their table order.
    similar_seqs.sort(key=lambda hit: hit[2], reverse=True)
    return similar_seqs[:top_n]

def adaptive_rna_constraints(coordinates, sequence, confidence=1.0):
    """Nudge consecutive residues toward a fixed neighbour distance.

    Walks the chain from the first residue; whenever the distance between
    residue ``i`` and ``i + 1`` lies outside ``[5.8, 6.1]``, residue ``i + 1``
    is moved along the line joining the pair so that the distance shifts
    toward ``5.95`` by a fraction ``constraint_strength`` of the error. Later
    pairs see the already-moved positions.

    Args:
        coordinates: Array-like of shape ``(n, 3)``; it is not modified.
        sequence: Sequence whose length gives the number of residues to walk
            (``coordinates`` needs at least that many rows).
        confidence: Trust in the input structure. It is clamped to
            ``[0, 0.95]``, so the correction fraction stays within
            ``[0.025, 0.5]``.

    Returns:
        A new float array with the adjusted coordinates.
    """
    # Always work on a float copy: an integer input would otherwise silently
    # truncate every fractional adjustment on assignment.
    refined_coords = np.array(coordinates, dtype=float)
    n_residues = len(sequence)

    # Negative similarity scores are possible upstream; without the lower
    # clamp they would push the strength above 0.5 (and beyond 1.0 for very
    # poor alignments), overshooting the target distance.
    bounded_confidence = min(max(confidence, 0.0), 0.95)
    constraint_strength = 0.5 * (1.0 - bounded_confidence)

    seq_min_dist, seq_max_dist = 5.8, 6.1
    target_dist = 5.95

    for i in range(n_residues - 1):
        bond = refined_coords[i + 1] - refined_coords[i]
        dist = np.linalg.norm(bond)
        if dist < seq_min_dist or dist > seq_max_dist:
            # The epsilon keeps coincident points from dividing by zero; such
            # pairs get a zero direction and are left where they are.
            direction = bond / (dist + 1e-10)
            adjustment = (target_dist - dist) * constraint_strength
            refined_coords[i + 1] = refined_coords[i + 1] + direction * adjustment

    return refined_coords

def _fill_missing_coords(coords, step=(3.0, 0.0, 0.0)):
    """Fill, in place, every row of ``coords`` whose x value is NaN.

    Rows between two known rows are linearly interpolated; rows before the
    first or after the last known row extend the chain outward by ``step`` per
    residue. With no known row at all, residue ``i`` is placed at ``i * step``.
    """
    step = np.asarray(step, dtype=float)
    n_rows = len(coords)
    known = np.flatnonzero(~np.isnan(coords[:, 0]))

    if known.size == 0:
        coords[:] = np.outer(np.arange(n_rows), step)
        return coords

    first, last = known[0], known[-1]

    # Leading and trailing overhangs: build each residue from its already
    # placed neighbour so both ends keep a constant spacing.
    for i in range(first - 1, -1, -1):
        coords[i] = coords[i + 1] + step
    for i in range(last + 1, n_rows):
        coords[i] = coords[i - 1] + step

    # Interior gaps: straight line between the two flanking known rows.
    for left, right in zip(known[:-1], known[1:]):
        span = right - left
        for i in range(left + 1, right):
            w = (i - left) / span
            coords[i] = (1 - w) * coords[left] + w * coords[right]

    return coords


def adapt_template_to_query(query_seq, template_seq, template_coords):
    """Transfer template coordinates onto a query via a global alignment.

    The query and template are aligned with ``pairwise2.align.globalms``
    (match 2, mismatch -1, gap open -8, gap extend -0.3). Each aligned
    (non-gap) column copies the template row onto the query residue; query
    residues without a template partner, or whose copied x value is NaN, are
    filled afterwards: interior gaps by linear interpolation, overhangs at
    either end by extending the chain 3 units along x per residue.

    Previously, leading unaligned residues were all squeezed between the
    first aligned residue and a point 3 units away from it, while trailing
    ones were spaced 3 units apart; both ends now use the same spacing.

    Args:
        query_seq: Query sequence string.
        template_seq: Template sequence string.
        template_coords: Array of template coordinates, one row per template
            residue; rows past its end are treated as missing.

    Returns:
        ``(len(query_seq), 3)`` float array. All zeros when the aligner
        returns no alignment; any NaN left in a filled row is replaced by 0.
    """
    alignments = pairwise2.align.globalms(
        Seq(query_seq), Seq(template_seq), 2, -1, -8, -0.3, one_alignment_only=True
    )
    if not alignments:
        return np.zeros((len(query_seq), 3))

    aligned_query, aligned_template = alignments[0].seqA, alignments[0].seqB
    new_coords = np.full((len(query_seq), 3), np.nan)
    n_template = len(template_coords)

    q_idx, t_idx = 0, 0
    for char_q, char_t in zip(aligned_query, aligned_template):
        query_has_residue = char_q != '-'
        template_has_residue = char_t != '-'
        if query_has_residue and template_has_residue and t_idx < n_template:
            new_coords[q_idx] = template_coords[t_idx]
        # Each cursor advances only when its own sequence has a residue in
        # this alignment column.
        if query_has_residue:
            q_idx += 1
        if template_has_residue:
            t_idx += 1

    _fill_missing_coords(new_coords)
    return np.nan_to_num(new_coords)

def generate_rna_structure(sequence, seed=None):
    """Generate a fallback straight-chain structure along the x axis.

    The first residue sits at the origin and every following residue is
    placed a random 3.8-4.2 further along x; y and z stay 0.

    The step sizes are drawn from NumPy's global random state, which is the
    generator this function seeds. (They used to come from the stdlib
    ``random`` module, so the seed had no effect on the result.)

    Args:
        sequence: Sequence whose length sets the number of residues.
        seed: Optional seed. When not ``None`` (``0`` included) NumPy's global
            random state is re-seeded with it, making the output reproducible.

    Returns:
        ``(len(sequence), 3)`` float array of coordinates.
    """
    if seed is not None:
        np.random.seed(seed)

    n = len(sequence)
    coords = np.zeros((n, 3))
    if n > 1:
        steps = np.random.uniform(3.8, 4.2, size=n - 1)
        coords[1:, 0] = np.cumsum(steps)
    return coords


def predict_rna_structures(sequence, target_id, train_seqs_df, train_coords_dict, n_predictions=5):
    """Produce ``n_predictions`` candidate coordinate sets for one sequence.

    Up to ``n_predictions`` templates are fetched with
    ``find_similar_sequences``. Each is mapped onto the query, passed through
    ``adaptive_rna_constraints`` with the template's similarity as confidence,
    and perturbed with Gaussian noise whose standard deviation is
    ``max(0.02, (0.5 - similarity) * 0.1)``. If fewer templates are found, the
    list is padded with ``generate_rna_structure`` outputs seeded with the
    current list length.

    Args:
        sequence: Query sequence string.
        target_id: Identifier of the query; not used by this function.
        train_seqs_df: Table of candidate template sequences.
        train_coords_dict: Mapping from template id to coordinates.
        n_predictions: Number of structures to return.

    Returns:
        List of ``n_predictions`` coordinate arrays.
    """
    templates = find_similar_sequences(
        sequence, train_seqs_df, train_coords_dict, top_n=n_predictions
    )

    predictions = []
    for _template_id, template_seq, similarity, template_coords in templates:
        mapped = adapt_template_to_query(sequence, template_seq, template_coords)
        structure = adaptive_rna_constraints(mapped, sequence, confidence=similarity)

        # Less similar templates get more noise, with a floor of 0.02.
        noise_sigma = max(0.02, (0.5 - similarity) * 0.1)
        structure += np.random.normal(0, noise_sigma, structure.shape)
        predictions.append(structure)

    # Pad with fallback chains until the requested count is reached.
    shortfall = n_predictions - len(predictions)
    for _ in range(shortfall):
        predictions.append(generate_rna_structure(sequence, seed=len(predictions)))

    return predictions[:n_predictions]

# Predict every test target and collect one output row per residue, each row
# carrying the x/y/z values of all five predicted structures.
all_predictions = []
for idx, row in test_seqs.iterrows():
    target_id = row['target_id']
    sequence = row['sequence']
    if idx % 5 == 0:
        print(f"Processing {idx+1}/{len(test_seqs)}")

    preds = predict_rna_structures(sequence, target_id, combined_seqs, combined_coords_dict)

    for j, resname in enumerate(sequence):
        # Residue numbers in the output are 1-based.
        res = {'ID': f"{target_id}_{j+1}", 'resname': resname, 'resid': j+1}
        for i in range(5):
            x, y, z = preds[i][j]
            res[f'x_{i+1}'] = x
            res[f'y_{i+1}'] = y
            res[f'z_{i+1}'] = z
        all_predictions.append(res)

submission_df = pd.DataFrame(all_predictions)

# Fix the column order: identifiers first, then x/y/z for structures 1..5.
coord_cols = [f'{c}_{i}' for i in range(1,6) for c in ['x','y','z']]
cols = ['ID', 'resname', 'resid'] + coord_cols
submission_df[cols].to_csv('submission.csv', index=False)
print("Submission.csv generated!")