In [1]:
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def load_sequences(pretrain_file, validation_file):
    """Load the two reference sequence collections used for novelty checks.

    Args:
        pretrain_file: Path to a header-less CSV; the first column is read
            as the pre-training sequences.
        validation_file: Path to a FASTA file holding the external
            validation sequences.

    Returns:
        A ``(pretrain_set, validation_set)`` tuple of sets, so later
        membership tests are constant-time.
    """
    # Pre-training set: first column of the header-less CSV.
    pretrain_frame = pd.read_csv(pretrain_file, header=None)
    pretrain_set = set(pretrain_frame[0].tolist())

    # External validation set: one plain string per FASTA record.
    validation_set = {
        str(entry.seq) for entry in SeqIO.parse(validation_file, "fasta")
    }

    return pretrain_set, validation_set

def check_uniqueness(generated_sequences):
    """Measure how many of the generated sequences are distinct.

    Args:
        generated_sequences: Sized collection of hashable sequences
            (duplicates allowed).

    Returns:
        A ``(unique_sequences, unique_count, uniqueness_ratio)`` tuple where
        ``unique_sequences`` is the set of distinct sequences,
        ``unique_count`` is its size, and ``uniqueness_ratio`` is the
        percentage ``unique_count / len(generated_sequences) * 100``.
        For empty input the ratio is ``0.0`` rather than a division error.
    """
    unique_sequences = set(generated_sequences)
    unique_count = len(unique_sequences)
    total_count = len(generated_sequences)
    # Guard the empty case: nothing generated means a 0% ratio, not a crash.
    uniqueness_ratio = (unique_count / total_count) * 100 if total_count else 0.0
    return unique_sequences, unique_count, uniqueness_ratio

def check_novelty(generated_sequences, pretrain_sequences, validation_sequences):
    """Split generated sequences into novel ones and reference-set repeats.

    A sequence is novel when it appears in neither reference collection.
    Repeats are counted per occurrence in ``generated_sequences`` (duplicates
    are counted each time), and a sequence found in both collections
    increments both counters.

    Args:
        generated_sequences: Sized collection of generated sequences.
        pretrain_sequences: Container supporting ``in`` with the
            pre-training sequences.
        validation_sequences: Container supporting ``in`` with the
            external validation sequences.

    Returns:
        A ``(new_sequences, novel_count, novelty_ratio, pretrain_repeats,
        validation_repeats)`` tuple. ``new_sequences`` keeps input order and
        duplicates; ``novelty_ratio`` is a percentage of all generated
        sequences and is ``0.0`` for empty input.
    """
    new_sequences = []
    pretrain_repeats = 0
    validation_repeats = 0

    for seq in generated_sequences:
        # Evaluate each membership test once and reuse the result.
        in_pretrain = seq in pretrain_sequences
        in_validation = seq in validation_sequences

        if in_pretrain:
            pretrain_repeats += 1
        if in_validation:
            validation_repeats += 1
        if not (in_pretrain or in_validation):
            new_sequences.append(seq)

    novel_count = len(new_sequences)
    total_count = len(generated_sequences)
    # Guard the empty case: nothing generated means a 0% ratio, not a crash.
    novelty_ratio = (novel_count / total_count) * 100 if total_count else 0.0
    return new_sequences, novel_count, novelty_ratio, pretrain_repeats, validation_repeats

def filter_sequences(generated_file, pretrain_file, validation_file):
    """Keep generated sequences that are unique, novel and free of 'B'.

    Args:
        generated_file: Path to a header-less CSV; each row's first column
            is read as one generated sequence.
        pretrain_file: Path to the pre-training CSV (see ``load_sequences``).
        validation_file: Path to the external validation FASTA file.

    Returns:
        A ``(filtered_sequences, filtered_count, uniqueness_ratio,
        novelty_ratio, filtered_ratio, pretrain_repeats, validation_repeats)``
        tuple. ``filtered_sequences`` is a de-duplicated list in order of
        first appearance in the generated file; all ratios are percentages
        of the total number of generated sequences.
    """
    # Each row of the generated file is taken as one sequence.
    generated_sequences = pd.read_csv(generated_file, header=None)[0].tolist()
    pretrain_sequences, validation_sequences = load_sequences(pretrain_file, validation_file)

    unique_sequences, _unique_count, uniqueness_ratio = check_uniqueness(generated_sequences)
    (
        novel_sequences,
        _novel_count,
        novelty_ratio,
        pretrain_repeats,
        validation_repeats,
    ) = check_novelty(generated_sequences, pretrain_sequences, validation_sequences)

    # De-duplicate with dict.fromkeys instead of a set intersection so the
    # result order is deterministic (first occurrence wins) and the saved
    # files are reproducible across runs. Membership in unique_sequences is
    # kept to mirror the original "unique AND novel" condition.
    # NOTE(review): 'B' is presumably the ambiguous residue code being
    # excluded - confirm the intent with the author.
    filtered_sequences = [
        seq
        for seq in dict.fromkeys(novel_sequences)
        if seq in unique_sequences and 'B' not in seq
    ]
    filtered_count = len(filtered_sequences)
    total_count = len(generated_sequences)
    # Guard against a zero denominator.
    filtered_ratio = (filtered_count / total_count) * 100 if total_count else 0.0

    return filtered_sequences, filtered_count, uniqueness_ratio, novelty_ratio, filtered_ratio, pretrain_repeats, validation_repeats

def save_filtered_sequences(filtered_sequences, output_file_csv, output_file_fasta):
    """Write the filtered sequences under ``data/`` as CSV and as FASTA.

    Args:
        filtered_sequences: List of sequence strings to save.
        output_file_csv: CSV file name; ``"data/"`` is prepended to it.
        output_file_fasta: FASTA file name; ``"data/"`` is prepended to it.

    The CSV has a single ``sequence`` column and no index column. Each FASTA
    record uses the sequence's position in the list as its id and has an
    empty description.
    """
    # CSV output.
    csv_path = "data/" + output_file_csv
    table = pd.DataFrame(filtered_sequences, columns=["sequence"])
    table.to_csv(csv_path, index=False)

    # FASTA output.
    fasta_path = "data/" + output_file_fasta
    records = []
    for position, sequence in enumerate(filtered_sequences):
        records.append(SeqRecord(Seq(sequence), id=str(position), description=""))
    SeqIO.write(records, fasta_path, "fasta")

# Example usage: filter one sampled-sequence file, report the statistics,
# and save the surviving sequences.
generated_file = 'NS3_finetune_model/sampled_sequences_temp1.25.csv'
pretrain_file = 'data/Antiviral_pretrain_dataset.csv'
validation_file = 'data/Peptipedia_all_peptides.fasta'
output_file_csv = 'Filtered_Unique_Novel.csv'
output_file_fasta = 'Filtered_Unique_Novel.fasta'

(
    filtered_sequences,
    filtered_count,
    uniqueness_ratio,
    novelty_ratio,
    filtered_ratio,
    pretrain_repeats,
    validation_repeats,
) = filter_sequences(generated_file, pretrain_file, validation_file)

# One print call with a newline separator produces the same six lines.
print(
    f"Filtered sequences count: {filtered_count}",
    f"Uniqueness ratio: {uniqueness_ratio:.2f}%",
    f"Novelty ratio: {novelty_ratio:.2f}%",
    f"Filtered ratio after uniqueness and novelty: {filtered_ratio:.2f}%",
    f"Sequences repeated in pretrain set: {pretrain_repeats}",
    f"Sequences repeated in validation set: {validation_repeats}",
    sep="\n",
)

save_filtered_sequences(filtered_sequences, output_file_csv, output_file_fasta)


Filtered sequences count: 337
Uniqueness ratio: 83.90%
Novelty ratio: 72.23%
Filtered ratio after uniqueness and novelty: 67.81%
Sequences repeated in pretrain set: 135
Sequences repeated in validation set: 138
