In [1]:
import os
from collections import defaultdict
import random
import csv

In [2]:
def generate_negative_samples(positive_pairs, protein_sequences, shuffle_prob=0.5):
    """Build one negative (decoy) pair for every positive pair.

    For each positive pair, one of the two partners is replaced by a shuffled
    copy of itself, registered under the id ``"<original id>_shuffled"``.

    Args:
        positive_pairs: Iterable of ``(prot_a, prot_b)`` id pairs.
        protein_sequences: Mapping from protein id to sequence string. Both
            ids of every pair must be present.
        shuffle_prob: Probability that the first partner (``prot_a``) is the
            one replaced; otherwise the second partner is replaced.

    Returns:
        A tuple ``(negative_pairs, shuffled_seqs)`` where ``negative_pairs``
        is a list of id pairs in the same order as ``positive_pairs`` and
        ``shuffled_seqs`` maps each ``"<id>_shuffled"`` id to its sequence.

    Raises:
        KeyError: If either id of a pair is missing from ``protein_sequences``.
    """
    negative_pairs = []
    shuffled_seqs = {}

    for prot_a, prot_b in positive_pairs:
        shuffle_first = random.random() < shuffle_prob

        # Look up both partners even though only one is shuffled, so a pair
        # that references an unknown protein fails here with a KeyError.
        seq_a = protein_sequences[prot_a]
        seq_b = protein_sequences[prot_b]

        if shuffle_first:
            decoy_id, source_seq = prot_a + "_shuffled", seq_a
            negative_pairs.append((decoy_id, prot_b))
        else:
            decoy_id, source_seq = prot_b + "_shuffled", seq_b
            negative_pairs.append((prot_a, decoy_id))

        # A protein keeps the first shuffled sequence generated for it; only
        # shuffle on a cache miss instead of computing and discarding a new
        # shuffle every time the same protein shows up again.
        if decoy_id not in shuffled_seqs:
            shuffled_seqs[decoy_id] = shuffle_sequence_with_2let(source_seq)

    return negative_pairs, shuffled_seqs


def shuffle_sequence_with_2let(sequence, rng=None):
    """Generate a randomized sequence guided by adjacent-residue statistics.

    The first residue is kept in place. A successor table is built from the
    adjacent pairs of the input, starting at the second residue (the pair
    formed by the first and second residue is not recorded). The output is
    then grown one residue at a time:

    * the second residue is drawn uniformly from the input residues at
      positions 1..end, excluding position 1 itself;
    * each later residue is drawn from the recorded successors of the residue
      emitted just before it;
    * if that residue has no recorded successors, the draw falls back to the
      input residues at positions 1..end, excluding the current position.

    Draws are made with replacement, so the output has the same length as the
    input but is not guaranteed to have the same residue composition.

    Args:
        sequence: Sequence string to randomize.
        rng: Optional object providing ``choice(seq)`` (for example a
            ``random.Random`` instance) for reproducible output. Defaults to
            the global ``random`` module.

    Returns:
        The randomized sequence as a string. Inputs of length 2 or less are
        returned unchanged.
    """
    if len(sequence) <= 2:
        return sequence

    chooser = random if rng is None else rng
    residues = list(sequence)

    # successors[x] lists every residue that directly follows an occurrence of
    # x at input positions 1..len-2.
    successors = defaultdict(list)
    for current, following in zip(residues[1:-1], residues[2:]):
        successors[current].append(following)

    shuffled = [residues[0]]
    for position in range(1, len(residues)):
        # .get() avoids inserting empty entries into the table on a miss.
        candidates = None if position == 1 else successors.get(shuffled[-1])
        if not candidates:
            # Fallback pool: every residue after the first, except the one at
            # the current position. With len(sequence) >= 3 this always holds
            # at least one residue, so no further fallback is needed.
            candidates = residues[1:position] + residues[position + 1:]
        shuffled.append(chooser.choice(candidates))

    return "".join(shuffled)

In [3]:
# Directory that holds the input TSV files and receives the output TSV files.
spe = "rice"

# Inputs: positive pairs and the sequences of the proteins they reference.
positive_action_file = os.path.join(spe, "positive_action.tsv")
positive_seq_file = os.path.join(spe, "positive_seq.tsv")

# Outputs: labelled pairs (positive + negative) and all sequences.
action_file = os.path.join(spe, "action.tsv")
seq_file = os.path.join(spe, "seq.tsv")

# Protein id -> sequence, taken from the first two columns of each row.
# Rows with fewer than two fields are skipped.
with open(positive_seq_file, "r") as f:
    protein_sequences = {
        row[0]: row[1]
        for row in csv.reader(f, delimiter="\t")
        if len(row) >= 2
    }

# Positive pairs, taken from the first two columns of each row.
with open(positive_action_file, "r") as f:
    positive_pairs = [
        (row[0], row[1])
        for row in csv.reader(f, delimiter="\t")
        if len(row) >= 2
    ]

negative_pairs, shuffled_seqs = generate_negative_samples(positive_pairs, protein_sequences)

# Pairs file: positives first with label 1, then negatives with label 0.
with open(action_file, "w", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerows([first, second, 1] for first, second in positive_pairs)
    writer.writerows([first, second, 0] for first, second in negative_pairs)

# Sequence file: original sequences followed by the shuffled ones.
with open(seq_file, "w", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerows(protein_sequences.items())
    writer.writerows(shuffled_seqs.items())
