In [None]:
import pandas as pd
import numpy as np
import random
import time
import warnings
from Bio import pairwise2
from Bio.Seq import Seq

# Silence every warning category for the rest of the run to keep the output
# readable. NOTE(review): this also hides deprecation notices raised by the
# imported libraries — confirm that is intended.
warnings.filterwarnings('ignore')


# Lookup table that collapses residue codes onto the four letters A, U, G, C.
NUCLEOTIDE_MAPPING = {
    # Canonical letters map to themselves.
    'A': 'A',
    'U': 'U',
    'G': 'G',
    'C': 'C',
    # Other codes are replaced by a canonical letter.
    'I': 'A',
    '1MA': 'A',
    'PSU': 'U',
    'M2G': 'G',
    '5MC': 'C',
    'T': 'U',
}


def clean_sequence(seq):
    """Return ``seq`` rewritten using only the letters A, U, G and C.

    Every element produced by iterating ``seq`` is looked up in
    ``NUCLEOTIDE_MAPPING``; elements without an entry become ``'A'``.
    Iterating a plain string yields single characters, so the
    multi-character keys of the table can only match when ``seq`` is a
    sequence of residue names (for example a list of strings).
    """
    canonical = (NUCLEOTIDE_MAPPING.get(residue, 'A') for residue in seq)
    return "".join(canonical)


#====Phase 1====
# Read the three input tables (training sequences, test sequences, training
# labels) from the dataset directory, in that order.
DATA_PATH = '/kaggle/input/stanford-rna-3d-folding-2/'
train_seqs, test_seqs, train_labels = (
    pd.read_csv(DATA_PATH + csv_name)
    for csv_name in ('train_sequences.csv', 'test_sequences.csv', 'train_labels.csv')
)

def process_labels(labels_df):
    """Group label rows by target and return their first-model coordinates.

    The target key of a row is its ``ID`` with the last ``_``-separated
    field removed (``'R1_A_12'`` -> ``'R1_A'``).

    Args:
        labels_df: DataFrame with the columns ``ID``, ``resid``, ``x_1``,
            ``y_1`` and ``z_1``. Its index may be arbitrary (including
            duplicate labels); rows are grouped by position.

    Returns:
        dict mapping each target key to an array with one ``(x_1, y_1, z_1)``
        row per label row of that target, ordered by ``resid``.
    """
    # Derive the keys with one vectorised string operation instead of a
    # Python-level label lookup per row. Passing a plain array makes groupby
    # match keys to rows positionally, so the frame's index is irrelevant.
    id_prefixes = labels_df['ID'].str.rsplit('_', n=1).str[0].to_numpy()

    coords_dict = {}
    for id_prefix, group in labels_df.groupby(id_prefixes):
        ordered = group.sort_values('resid')
        coords_dict[id_prefix] = ordered[['x_1', 'y_1', 'z_1']].values
    return coords_dict

# Lookup from target key (the label ID without its trailing "_<n>" field) to
# that target's coordinate array; later code checks training target_ids
# against these keys.
train_coords_dict = process_labels(train_labels)



#====Phase 2: distance-constraint refinement====
def adaptive_rna_constraints(coordinates, sequence, confidence=1.0):
    """Nudge a chain of points toward fixed neighbour spacings.

    Two sweeps are made over the chain. For each index ``i`` the point
    ``i + 1`` is moved along the ``i -> i + 1`` direction toward a distance
    of 5.95, then the point ``i + 2`` is moved along the ``i -> i + 2``
    direction toward a distance of 10.2. Coincident points (distance 0) and
    NaN distances are left untouched. Updates are applied sequentially, so
    later steps see the already-moved points.

    Args:
        coordinates: array-like with one coordinate row per residue; needs at
            least ``len(sequence)`` rows. Integer arrays and nested lists
            are accepted and promoted to float; the input is never modified.
        sequence: only its length is used (number of residues to process).
        confidence: larger values give weaker corrections. It is capped at
            0.96, so some correction is always applied.

    Returns:
        A new array holding the adjusted coordinates. Floating-point inputs
        keep their dtype.
    """
    # np.array always copies here, so the caller's data stays intact.
    refined_coords = np.array(coordinates)
    if not np.issubdtype(refined_coords.dtype, np.floating):
        # In-place float updates on an integer array would raise a casting error.
        refined_coords = refined_coords.astype(float)

    n = len(sequence)
    strength = 0.68 * (1.0 - min(confidence, 0.96))

    for _ in range(2):
        for i in range(n - 1):
            # p1/p2 are views into refined_coords; the right-hand side below
            # is evaluated completely before the in-place addition happens.
            p1, p2 = refined_coords[i], refined_coords[i + 1]
            dist = np.linalg.norm(p2 - p1)
            if dist > 0:
                adj = (5.95 - dist) * strength * 0.45
                refined_coords[i + 1] += (p2 - p1) / dist * adj

            if i < n - 2:
                p3 = refined_coords[i + 2]
                dist2 = np.linalg.norm(p3 - p1)
                if dist2 > 0:
                    adj2 = (10.2 - dist2) * strength * 0.25
                    refined_coords[i + 2] += (p3 - p1) / dist2 * adj2
    return refined_coords


#====Phase 3: template-to-query coordinate transfer====
def _fill_missing_coords(coords, step=3.5):
    """Fill, in place, every row of ``coords`` whose x value is NaN.

    Rows between two known rows are linearly interpolated. Rows after the
    last known row continue from their predecessor by ``step`` along x. Rows
    before the first known row are placed ``step`` apart along x, ending one
    step away from that first known row. With no known row at all the result
    is a straight line along x starting at the origin.
    """
    n = len(coords)
    offset = np.array([step, 0.0, 0.0])

    # next_valid[i]: index of the nearest row after i that had a value before
    # filling started, or -1. One right-to-left pass replaces a forward scan
    # per missing row.
    next_valid = [-1] * n
    nearest = -1
    for i in range(n - 1, -1, -1):
        next_valid[i] = nearest
        if not np.isnan(coords[i, 0]):
            nearest = i

    for i in range(n):
        if not np.isnan(coords[i, 0]):
            continue
        nxt = next_valid[i]
        if i > 0:
            # Rows are filled left to right, so row i - 1 always holds a value.
            if nxt >= 0:
                w = 1 / (nxt - i + 1)
                coords[i] = (1 - w) * coords[i - 1] + w * coords[nxt]
            else:
                coords[i] = coords[i - 1] + offset
        elif nxt >= 0:
            # Leading gap: put row 0 a full `nxt` steps away so that the
            # interpolation of rows 1..nxt-1 yields `step` spacing rather
            # than squeezing the whole gap into a single step.
            coords[i] = coords[nxt] + nxt * offset
        else:
            coords[i] = [0.0, 0.0, 0.0]


def adapt_template_to_query(query_seq, template_seq, template_coords):
    """Transfer template coordinates onto the query via a global alignment.

    Both sequences are normalised with ``clean_sequence`` and aligned with
    ``pairwise2.align.globalms`` (match 2, mismatch -1, gap open -7, gap
    extend -0.25). Each query residue aligned to a template residue receives
    that residue's coordinates; the remaining residues are filled in by
    ``_fill_missing_coords``. Template rows whose x value is NaN are treated
    like unaligned residues.

    Args:
        query_seq: sequence to build coordinates for.
        template_seq: sequence of the template structure.
        template_coords: coordinate rows of the template, indexed like
            ``template_seq``; it may be shorter than the sequence.

    Returns:
        Array of shape ``(len(query_seq), 3)`` without NaN values. If no
        alignment is produced the result is a straight chain along x, the
        same fallback used when no residue could be placed.
    """
    q_c = clean_sequence(query_seq)
    t_c = clean_sequence(template_seq)
    n_query = len(query_seq)

    # NaN marks "not placed yet".
    new_coords = np.full((n_query, 3), np.nan)

    alignments = pairwise2.align.globalms(
        Seq(q_c), Seq(t_c), 2, -1, -7, -0.25, one_alignment_only=True
    )
    if alignments:
        a_q, a_t = alignments[0].seqA, alignments[0].seqB
        q_idx, t_idx = 0, 0
        # clean_sequence never emits '-', so '-' in the aligned strings is
        # always an alignment gap.
        for cq, ct in zip(a_q, a_t):
            if cq != '-' and ct != '-':
                if t_idx < len(template_coords):
                    new_coords[q_idx] = template_coords[t_idx]
                q_idx += 1
                t_idx += 1
            elif cq != '-':
                q_idx += 1
            elif ct != '-':
                t_idx += 1

    _fill_missing_coords(new_coords)
    # Rows with a valid x but NaN in y or z are zeroed in those components.
    return np.nan_to_num(new_coords)

#====Phase 4: template search and prediction====
def find_similar_sequences(query_seq, train_seqs_df, train_coords_dict, top_n=5):
    """Rank training sequences by normalised global-alignment score.

    A training row is considered only if its ``target_id`` has an entry in
    ``train_coords_dict``, its sequence is a non-empty string, and its length
    differs from the query length by at most 40% of the longer of the two.
    The alignment score (match 2, mismatch -1, gap open -7, gap extend -0.25)
    is divided by ``2 * min(len(query), len(template))``, i.e. by the score
    of a perfect match over the shorter sequence.

    Args:
        query_seq: sequence to find templates for.
        train_seqs_df: DataFrame with ``target_id`` and ``sequence`` columns.
        train_coords_dict: mapping from target id to its coordinate array.
        top_n: maximum number of hits to return.

    Returns:
        Up to ``top_n`` tuples ``(target_id, sequence, score, coords)``,
        best score first. Empty when ``query_seq`` is empty.
    """
    if not query_seq:
        # Nothing to align; this also avoids dividing by a zero length below.
        return []

    similar = []
    q_len = len(query_seq)
    q_aligned = Seq(clean_sequence(query_seq))  # loop-invariant

    for t_id, t_seq in zip(train_seqs_df['target_id'], train_seqs_df['sequence']):
        if t_id not in train_coords_dict:
            continue
        if not isinstance(t_seq, str) or not t_seq:
            # Missing values read from CSV are NaN floats; skip them and
            # empty strings.
            continue
        t_len = len(t_seq)
        if abs(t_len - q_len) / max(t_len, q_len) > 0.4:
            continue

        # Only the score is needed for ranking, so skip the traceback:
        # score_only=True makes globalms return the best score directly.
        raw_score = pairwise2.align.globalms(
            q_aligned, Seq(clean_sequence(t_seq)), 2, -1, -7, -0.25, score_only=True
        )
        score = raw_score / (2 * min(q_len, t_len))
        similar.append((t_id, t_seq, score, train_coords_dict[t_id]))

    similar.sort(key=lambda x: x[2], reverse=True)
    return similar[:top_n]

def predict_rna_structures(sequence, target_id, train_seqs_df, train_coords_dict, n_predictions=5):
    """Build ``n_predictions`` coordinate arrays for one sequence.

    Templates come from ``find_similar_sequences``. Slot ``k`` uses the k-th
    best template: its coordinates are mapped onto the sequence with
    ``adapt_template_to_query`` and refined with ``adaptive_rna_constraints``
    using the template's similarity as confidence. Slots without a template
    hold a straight line along x with a spacing of 4.0. ``target_id`` is
    accepted but not used.

    Returns:
        List of ``n_predictions`` arrays of shape ``(len(sequence), 3)``.
    """
    templates = find_similar_sequences(
        sequence, train_seqs_df, train_coords_dict, top_n=n_predictions
    )
    seq_len = len(sequence)
    predictions = []

    for slot in range(n_predictions):
        if slot >= len(templates):
            # No template left for this slot: evenly spaced straight line.
            straight = np.zeros((seq_len, 3))
            straight[:, 0] = 4.0 * np.arange(seq_len)
            predictions.append(straight)
            continue

        _, template_seq, similarity, template_coords = templates[slot]
        structure = adaptive_rna_constraints(
            adapt_template_to_query(sequence, template_seq, template_coords),
            sequence,
            confidence=similarity,
        )

        # NOISE: slot 0 stays clean, every other slot gets micro-noise.
        # The standard deviation is never below 0.006.
        if slot > 0:
            sigma = max(0.006, (0.38 - similarity) * 0.07)
            structure += np.random.normal(0, sigma, structure.shape)
        predictions.append(structure)

    return predictions

#====Phase 5====
# Predict every test target and collect one output row per residue.
all_predictions = []
start_time = time.time()
for idx, row in test_seqs.iterrows():
    # Progress line for every tenth index label.
    if idx % 10 == 0:
        print(f"Processing {idx} | {time.time()-start_time:.1f}s")
    tid = row['target_id']
    seq = row['sequence']
    preds = predict_rna_structures(seq, tid, train_seqs, train_coords_dict)
    for j, resname in enumerate(seq):
        # Residue numbering in the output is 1-based.
        res = {'ID': f"{tid}_{j+1}", 'resname': resname, 'resid': j+1}
        for i in range(5):
            x, y, z = preds[i][j]
            res[f'x_{i+1}'] = x
            res[f'y_{i+1}'] = y
            res[f'z_{i+1}'] = z
        all_predictions.append(res)

# Write the table with a fixed column order: the three identifier columns,
# then x/y/z for each of the five predictions.
sub = pd.DataFrame(all_predictions)
cols = ['ID', 'resname', 'resid']
cols = cols + [f'{c}_{i}' for i in range(1,6) for c in ['x','y','z']]
sub[cols].to_csv('submission.csv', index=False)
print("ALL COMPLETED. SUBMIT!!")