# Gather and Preprocess Valid/Test Sets for Evaluation

## Imports

In [None]:
import ast
import glob
import gzip
import os
import shutil
import sys
import tempfile

import biotite.structure.io
import numpy as np
import pandas as pd

from na_eval_utils import (
    read_cluster_ids_text_file
)

## Helper Functions

In [None]:
def get_exclusive_cluster_subset(dataset_df,
                                 cluster_ids_column_name,
                                 dataset_cluster_ids):
    """
    Keep only the rows whose cluster ids all belong to the given split.

    Args:
        dataset_df: dataframe with one row per entry.
        cluster_ids_column_name: name of the column whose cells hold the
            string form of a Python literal collection of cluster ids
            (each cell is parsed with ast.literal_eval).
        dataset_cluster_ids: container of the cluster ids assigned to the
            split; only membership tests (`in`) are performed on it.

    Returns:
        A copy of the rows for which every parsed cluster id is contained
        in dataset_cluster_ids. A row with an empty collection is kept,
        because there is no id that could fall outside the split.
    """
    def belongs_exclusively_to_split(cluster_ids_str):
        # One foreign cluster id is enough to reject the row.
        for cluster_id in ast.literal_eval(cluster_ids_str):
            if cluster_id not in dataset_cluster_ids:
                return False
        return True

    row_mask = dataset_df[cluster_ids_column_name].apply(
        belongs_exclusively_to_split
    )

    return dataset_df[row_mask].copy()

def get_length_subset(dataset_df,
                      max_length,
                      min_length):
    """
    Keep only the entries that have at least one assembly whose total
    macromolecule length lies within [min_length, max_length].

    Args:
        dataset_df: dataframe with an "id" column and an
            "asmb_lengths_path" column. Each path is loaded with np.load
            and unwrapped with .item(); the result is iterated as a
            mapping from assembly id to a 4-tuple
            (macromolecule, protein, DNA, RNA) of lengths.
        max_length: inclusive upper bound on the macromolecule length.
        min_length: inclusive lower bound on the macromolecule length.

    Returns:
        A copy of the rows whose "id" had a qualifying assembly.
    """
    def has_assembly_in_range(asmb_lengths_path):
        # The lengths are stored as a pickled Python object, hence
        # allow_pickle and the .item() unwrapping.
        lengths_by_assembly = np.load(asmb_lengths_path, allow_pickle = True).item()

        for assembly_lengths in lengths_by_assembly.values():
            # Only the total macromolecule length is used for filtering.
            total_length, _protein_length, _dna_length, _rna_length = assembly_lengths
            if min_length <= total_length <= max_length:
                return True
        return False

    ids_within_range = {
        entry_id
        for entry_id, asmb_lengths_path in zip(
            dataset_df["id"], dataset_df["asmb_lengths_path"]
        )
        if has_assembly_in_range(asmb_lengths_path)
    }

    keep_mask = dataset_df["id"].apply(
        lambda entry_id: entry_id in ids_within_range
    )

    return dataset_df[keep_mask].copy()

def get_rna_monomer_subset(dataset_df):
    """
    Keep only the entries that consist of a single RNA chain.

    For every row the CSV at "sequences_path" is read and its "chain_type"
    column inspected. An entry qualifies when it has exactly one
    "polyribonucleotide" chain and no protein ("polypeptide(L)"), DNA
    ("polydeoxyribonucleotide") or DNA/RNA hybrid chain. Chain types other
    than these four do not affect the decision.

    Args:
        dataset_df: dataframe with "id" and "sequences_path" columns.

    Returns:
        A copy of the rows whose "id" was classified as an RNA monomer.
    """
    disqualifying_chain_types = (
        "polypeptide(L)",
        "polydeoxyribonucleotide",
        "polydeoxyribonucleotide/polyribonucleotide hybrid",
    )

    def is_rna_monomer(sequences_path):
        chain_types = list(pd.read_csv(sequences_path)["chain_type"])

        if chain_types.count("polyribonucleotide") != 1:
            return False

        return not any(
            chain_type in disqualifying_chain_types
            for chain_type in chain_types
        )

    monomer_ids = {
        entry_id
        for entry_id, sequences_path in zip(
            dataset_df["id"], dataset_df["sequences_path"]
        )
        if is_rna_monomer(sequences_path)
    }

    keep_mask = dataset_df["id"].apply(
        lambda entry_id: entry_id in monomer_ids
    )

    return dataset_df[keep_mask].copy()

def get_ppm_subset(dataset_df):
    """
    Keep only the entries that list at least one PPM path.

    Args:
        dataset_df: dataframe with a "ppm_paths" column whose cells hold
            the string form of a Python literal collection (parsed with
            ast.literal_eval).

    Returns:
        A copy of the rows whose parsed "ppm_paths" collection is
        non-empty.
    """
    def lists_any_ppm(ppm_paths_str):
        return len(ast.literal_eval(ppm_paths_str)) > 0

    keep_mask = dataset_df["ppm_paths"].apply(lists_any_ppm)

    return dataset_df[keep_mask].copy()


def get_entries_in_same_clusters_as_specified_entries(
        dataset_df,
        entry_ids,
        cluster_ids_column_name,
):
    """
    Keep the entries whose clusters are all covered by the clusters of the
    specified reference entries.

    The cluster ids of every reference entry are pooled into one set; a row
    is kept when all of its own cluster ids are members of that set.

    Args:
        dataset_df: dataframe with an "id" column and the cluster id
            column.
        entry_ids: ids of the reference entries. Each one must occur in
            dataset_df["id"]; a missing id raises IndexError. When an id
            occurs more than once, the first matching row is used.
        cluster_ids_column_name: name of the column whose cells hold the
            string form of a Python literal collection of cluster ids.

    Returns:
        A copy of the qualifying rows (this includes the reference entries
        themselves).
    """
    id_column = dataset_df["id"]
    cluster_column = dataset_df[cluster_ids_column_name]

    # Pool the cluster ids of all reference entries.
    reference_cluster_ids = set()
    for entry_id in entry_ids:
        first_match = cluster_column[id_column == entry_id].iloc[0]
        reference_cluster_ids.update(ast.literal_eval(first_match))

    def is_covered_by_reference_clusters(cluster_ids_str):
        row_cluster_ids = ast.literal_eval(cluster_ids_str)
        return all(
            cluster_id in reference_cluster_ids
            for cluster_id in row_cluster_ids
        )

    keep_mask = cluster_column.apply(is_covered_by_reference_clusters)

    return dataset_df[keep_mask].copy()

def get_polymer_type_statistics(dataset_df):
    """
    Print how many entries fall into each polymer composition category.

    The categories are:
    - DNA
    - RNA
    - DNA/RNA hybrid
    - Protein/DNA
    - Protein/RNA
    - Protein/DNA/RNA hybrid

    An entry is only counted when exactly one of the three nucleic acid
    chain types is present; entries with none or with several of them are
    included in the total but in no category. Nothing is returned.

    Args:
        dataset_df: dataframe with "id",
            "nucleic_acid_chain_cluster_ids_chain_types" and
            "protein_chain_cluster_ids_chain_types" columns. The latter two
            hold string forms of Python literals (presumably collections of
            chain type strings - TODO confirm against the dataset builder).
    """
    dna_type = "polydeoxyribonucleotide"
    rna_type = "polyribonucleotide"
    hybrid_type = "polydeoxyribonucleotide/polyribonucleotide hybrid"

    # One id set per (has protein, nucleic acid type) combination.
    ids_by_category = {
        (with_protein, nucleic_acid_type): set()
        for with_protein in (False, True)
        for nucleic_acid_type in (dna_type, rna_type, hybrid_type)
    }

    rows = zip(
        dataset_df["id"],
        dataset_df["nucleic_acid_chain_cluster_ids_chain_types"],
        dataset_df["protein_chain_cluster_ids_chain_types"],
    )
    for entry_id, nucleic_acid_types_str, protein_types_str in rows:
        nucleic_acid_chain_types = ast.literal_eval(nucleic_acid_types_str)
        protein_chain_types = ast.literal_eval(protein_types_str)

        present_nucleic_acid_types = [
            nucleic_acid_type
            for nucleic_acid_type in (dna_type, rna_type, hybrid_type)
            if nucleic_acid_type in nucleic_acid_chain_types
        ]
        # Mixed or nucleic-acid-free entries belong to no category.
        if len(present_nucleic_acid_types) != 1:
            continue

        with_protein = len(protein_chain_types) > 0
        ids_by_category[(with_protein, present_nucleic_acid_types[0])].add(entry_id)

    print("Number of total entries:", len(dataset_df))
    print("Number of DNA entries:", len(ids_by_category[(False, dna_type)]))
    print("Number of RNA entries:", len(ids_by_category[(False, rna_type)]))
    print("Number of DNA/RNA hybrid entries:", len(ids_by_category[(False, hybrid_type)]))
    print("Number of protein/DNA entries:", len(ids_by_category[(True, dna_type)]))
    print("Number of protein/RNA entries:", len(ids_by_category[(True, rna_type)]))
    print("Number of protein/DNA/RNA hybrid entries:", len(ids_by_category[(True, hybrid_type)]))

def get_ppm_statistics(dataset_df):
    """
    Print how many entries carry PPMs and where those entries come from.

    The printed counts are:
    - PPM: entries whose "ppm_paths" collection is non-empty
    - PPM from crystal: PPM entries with dataset_name "rcsb_cif_na"
    - PPM from distillation: PPM entries with dataset_name
      "rf2na_distillation_cis_bp" or "rf2na_distillation_transfac"

    PPM entries from any other dataset only contribute to the PPM count.
    Nothing is returned.

    Args:
        dataset_df: dataframe with "id", "ppm_paths" (string form of a
            Python literal collection) and "dataset_name" columns.
    """
    crystal_dataset_name = "rcsb_cif_na"
    distillation_dataset_names = (
        "rf2na_distillation_cis_bp",
        "rf2na_distillation_transfac",
    )

    ids_with_ppm = set()
    crystal_ids_with_ppm = set()
    distillation_ids_with_ppm = set()

    rows = zip(
        dataset_df["id"],
        dataset_df["ppm_paths"],
        dataset_df["dataset_name"],
    )
    for entry_id, ppm_paths_str, dataset_name in rows:
        # Entries without any PPM are only part of the total.
        if len(ast.literal_eval(ppm_paths_str)) == 0:
            continue

        ids_with_ppm.add(entry_id)
        if dataset_name == crystal_dataset_name:
            crystal_ids_with_ppm.add(entry_id)
        elif dataset_name in distillation_dataset_names:
            distillation_ids_with_ppm.add(entry_id)

    print("Number of total entries:", len(dataset_df))
    print("Number of PPM entries:", len(ids_with_ppm))
    print("Number of PPM from crystal entries:", len(crystal_ids_with_ppm))
    print("Number of PPM from distillation entries:", len(distillation_ids_with_ppm))

def _pdb_id_from_rna_solo_basename(basename):
    """
    Extract the lowercase PDB id from an RNA-Solo file name.
    """
    extended_id_prefix = "PDB_0000"
    if basename.startswith(extended_id_prefix):
        # Names such as "PDB_00001ABC_..." carry the id right after the
        # prefix.
        basename = basename[len(extended_id_prefix):]
    return basename.split("_")[0].lower()


def _collect_rna_solo_paths_by_pdb_id(pdb_directory):
    """
    Map each PDB id to the sorted list of "<pdb_directory>/*/*.pdb" paths
    that belong to it.
    """
    pattern = os.path.join(pdb_directory, "*", "*.pdb")

    pdb_id_to_paths = dict()
    # glob.glob returns matches in arbitrary order; sort so that the order
    # of the paths (and therefore the "first" path picked by callers) is
    # reproducible.
    for pdb_path in sorted(glob.glob(pattern)):
        pdb_id = _pdb_id_from_rna_solo_basename(os.path.basename(pdb_path))
        pdb_id_to_paths.setdefault(pdb_id, []).append(pdb_path)

    return pdb_id_to_paths


def load_rna_solo_paths(rfam_pdb_directory,
                        bgsu_pdb_directory,
                        bgsu_first_pdb_ids = ("1vc5", "4znp")):
    """
    Load the RNA-Solo paths from the RFAM and BGSU directories.

    Both directories are searched for files matching "*/*.pdb". The PDB id
    of a file is the lowercased part of its name before the first "_"
    (after stripping a leading "PDB_0000", if present).

    For every PDB id the RFAM paths are listed first, followed by the BGSU
    paths. The order is reversed (BGSU first) for the ids in
    bgsu_first_pdb_ids. Within each source the paths are sorted, so the
    result does not depend on the file system's listing order.

    Args:
        rfam_pdb_directory: root directory of the RFAM RNA-Solo PDB files.
        bgsu_pdb_directory: root directory of the BGSU RNA-Solo PDB files.
        bgsu_first_pdb_ids: lowercase PDB ids for which the BGSU paths are
            listed before the RFAM paths. The defaults exist for the sake
            of the pseudoknot set, due to issues with the RFAM pdbs of
            those two entries.

    Returns:
        A dict mapping each PDB id found in either directory to its list
        of paths.
    """
    pdb_id_to_rfam_paths = _collect_rna_solo_paths_by_pdb_id(rfam_pdb_directory)
    pdb_id_to_bgsu_paths = _collect_rna_solo_paths_by_pdb_id(bgsu_pdb_directory)
    bgsu_first_pdb_ids = set(bgsu_first_pdb_ids)

    pdb_id_to_rna_solo_paths = dict()
    for pdb_id in sorted(set(pdb_id_to_rfam_paths) | set(pdb_id_to_bgsu_paths)):
        rfam_paths = pdb_id_to_rfam_paths.get(pdb_id, [])
        bgsu_paths = pdb_id_to_bgsu_paths.get(pdb_id, [])

        if pdb_id in bgsu_first_pdb_ids:
            pdb_id_to_rna_solo_paths[pdb_id] = bgsu_paths + rfam_paths
        else:
            pdb_id_to_rna_solo_paths[pdb_id] = rfam_paths + bgsu_paths

    return pdb_id_to_rna_solo_paths

def _load_structure_allowing_gzip(structure_path):
    """
    Load a structure with biotite, decompressing ".gz" files first.
    """
    if not structure_path.endswith(".gz"):
        return biotite.structure.io.load_structure(structure_path)

    # The decompressed copy keeps the inner extension (e.g. ".cif" for
    # "x.cif.gz"); presumably biotite chooses its parser from the file
    # extension - see the biotite load_structure documentation.
    structure_ext = os.path.splitext(structure_path[:-3])[1]

    with gzip.open(structure_path, "rb") as f_in:
        with tempfile.NamedTemporaryFile(suffix = structure_ext) as tmp:
            shutil.copyfileobj(f_in, tmp)
            # Make sure everything is on disk before biotite reopens the
            # file by name.
            tmp.flush()
            return biotite.structure.io.load_structure(tmp.name)


def convert_cif_to_pdb(dataset_df, 
                       pdb_output_directory,  
                       use_rna_solo = False,
                       pdb_id_to_rna_solo_paths = None):
    """
    Write one PDB file per dataset entry and return the converted entries.

    For every row a source structure is chosen: the row's
    "structure_path" (optionally gzipped), or, when use_rna_solo is set,
    the first RNA-Solo path registered for the row's "id". The structure
    is loaded with biotite and saved as
    "<pdb_output_directory>/<id>.pdb". Rows without an RNA-Solo path and
    rows whose structure cannot be loaded or written are dropped; a
    summary of the latter is printed to stderr.

    Args:
        dataset_df: dataframe with string "id" and "structure_path"
            columns. It is not modified.
        pdb_output_directory: directory for the PDB files; created if it
            does not exist yet.
        use_rna_solo: whether to take the structures from RNA-Solo instead
            of "structure_path".
        pdb_id_to_rna_solo_paths: dict mapping ids to lists of RNA-Solo
            paths; required when use_rna_solo is set.

    Returns:
        A copy of the successfully converted rows in which
        "structure_path" points to the written PDB file,
        "original_structure_path" holds the previous "structure_path" and
        "copied_structure_path" holds the file that was actually loaded.

    Raises:
        ValueError: if use_rna_solo is set without
            pdb_id_to_rna_solo_paths.
    """
    if use_rna_solo and pdb_id_to_rna_solo_paths is None:
        raise ValueError(
            "pdb_id_to_rna_solo_paths must be provided when use_rna_solo is set."
        )

    os.makedirs(pdb_output_directory, exist_ok = True)

    # id -> (destination path, original path, path that was loaded)
    converted_paths = dict()
    failed_conversions = []
    for entry_id, original_structure_path in zip(
        dataset_df["id"], dataset_df["structure_path"]
    ):
        destination_structure_path = os.path.join(
            pdb_output_directory,
            entry_id + ".pdb"
        )

        if use_rna_solo:
            # Use the first RNA-Solo path; skip entries that have none.
            rna_solo_paths = pdb_id_to_rna_solo_paths.get(entry_id, [])
            if len(rna_solo_paths) == 0:
                continue
            structure_path_to_copy = rna_solo_paths[0]
        else:
            structure_path_to_copy = original_structure_path

        # Conversion is best-effort: a structure that cannot be read or
        # written drops the entry instead of aborting the whole run.
        # Exception (not a bare except) keeps Ctrl-C working.
        try:
            atom_array = _load_structure_allowing_gzip(structure_path_to_copy)
            biotite.structure.io.save_structure(destination_structure_path, atom_array)
        except Exception as error:
            failed_conversions.append((entry_id, error))
            continue

        converted_paths[entry_id] = (
            destination_structure_path,
            original_structure_path,
            structure_path_to_copy,
        )

    if failed_conversions:
        first_failed_id, first_error = failed_conversions[0]
        print(
            f"convert_cif_to_pdb: dropped {len(failed_conversions)} entries "
            f"that could not be converted (first: {first_failed_id}: {first_error!r})",
            file = sys.stderr
        )

    # Keep the successfully converted rows only. isin always yields a
    # boolean mask, also for an empty dataframe.
    dataset_subset_df = dataset_df[
        dataset_df["id"].isin(list(converted_paths))
    ].copy()

    # Point the structure path at the written PDB file and record where
    # the structure came from.
    subset_ids = list(dataset_subset_df["id"])
    dataset_subset_df["structure_path"] = [
        converted_paths[entry_id][0] for entry_id in subset_ids
    ]
    dataset_subset_df["original_structure_path"] = [
        converted_paths[entry_id][1] for entry_id in subset_ids
    ]
    dataset_subset_df["copied_structure_path"] = [
        converted_paths[entry_id][2] for entry_id in subset_ids
    ]

    return dataset_subset_df

## Load the RNASolo Paths

In [None]:
# Index the RNA-Solo PDB files of both sources by PDB id. The mapping is
# passed to convert_cif_to_pdb for the RNA monomer and pseudoknot test
# sets below.
pdb_id_to_rna_solo_paths = load_rna_solo_paths(
    rfam_pdb_directory = "/home/akubaney/projects/data/rfam_rnasolo_2025_04_07/pdb",
    bgsu_pdb_directory = "/home/akubaney/projects/data/bgsu_rnasolo_2025_05_01/pdb"
)

## Create the PDB Output Directory and the CSV Output Directory

In [None]:
# Both output directories live next to the notebook; the paths are made
# absolute so that they stay valid if the working directory changes.
pdb_output_directory = os.path.abspath(os.path.join(os.curdir, "evaluation_pdbs"))
csv_output_directory = os.path.abspath(os.path.join(os.curdir, "evaluation_csvs"))

In [None]:
# Start from empty output directories. WARNING: anything left over from a
# previous run is deleted.
for output_directory in (pdb_output_directory, csv_output_directory):
    if os.path.exists(output_directory):
        shutil.rmtree(output_directory)
    os.makedirs(output_directory)

In [None]:
# Inputs of the design dataset: one CSV and one list of nucleic acid chain
# cluster ids per split.
design_dataset_directory = "/home/akubaney/projects/na_mpnn/data/datasets/design_dataset_v2"

design_valid_csv_path = os.path.join(design_dataset_directory, "valid.csv")
design_valid_cluster_ids_path = os.path.join(
    design_dataset_directory,
    "valid_nucleic_acid_chain_cluster_ids.txt"
)

design_test_csv_path = os.path.join(design_dataset_directory, "test.csv")
design_test_cluster_ids_path = os.path.join(
    design_dataset_directory,
    "test_nucleic_acid_chain_cluster_ids.txt"
)

### Validation Set

In [None]:
# Where the converted PDB files and the filtered CSV of this split go.
design_valid_csv_output_path = os.path.join(csv_output_directory, "design_valid.csv")
design_valid_pdbs_output_directory = os.path.join(pdb_output_directory, "design_valid")

# Load the split together with the cluster ids assigned to it.
design_valid_df = pd.read_csv(design_valid_csv_path)
design_valid_cluster_ids = read_cluster_ids_text_file(design_valid_cluster_ids_path)
print(f"Original valid dataset size: {len(design_valid_df)}")

# Drop entries that contain nucleic acid clusters from outside the split.
design_valid_df = get_exclusive_cluster_subset(
    dataset_df = design_valid_df,
    cluster_ids_column_name = "nucleic_acid_chain_cluster_ids",
    dataset_cluster_ids = design_valid_cluster_ids
)
print(f"Valid dataset size after exclusive cluster subset: {len(design_valid_df)}")

# Keep entries with an assembly of 20 to 1000 residues.
design_valid_df = get_length_subset(
    dataset_df = design_valid_df,
    max_length = 1000,
    min_length = 20
)
print(f"Valid dataset size after length subset: {len(design_valid_df)}")

# Write the PDB files; entries that fail to convert are dropped.
design_valid_df = convert_cif_to_pdb(
    dataset_df = design_valid_df,
    pdb_output_directory = design_valid_pdbs_output_directory,
    use_rna_solo = False,
    pdb_id_to_rna_solo_paths = None
)
print(f"Valid dataset size after CIF to PDB conversion: {len(design_valid_df)}")

get_polymer_type_statistics(dataset_df = design_valid_df)

# Save the final split and display it as the cell output.
design_valid_df.to_csv(design_valid_csv_output_path, index = False)
design_valid_df

### Test Set

In [None]:
# Where the converted PDB files and the filtered CSV of this split go.
design_test_csv_output_path = os.path.join(csv_output_directory, "design_test.csv")
design_test_pdbs_output_directory = os.path.join(pdb_output_directory, "design_test")

# Load the split together with the cluster ids assigned to it.
design_test_df = pd.read_csv(design_test_csv_path)
design_test_cluster_ids = read_cluster_ids_text_file(design_test_cluster_ids_path)
print(f"Original test dataset size: {len(design_test_df)}")

# Drop entries that contain nucleic acid clusters from outside the split.
design_test_df = get_exclusive_cluster_subset(
    dataset_df = design_test_df,
    cluster_ids_column_name = "nucleic_acid_chain_cluster_ids",
    dataset_cluster_ids = design_test_cluster_ids
)
print(f"Test dataset size after exclusive cluster subset: {len(design_test_df)}")

# Keep entries with an assembly of 20 to 1000 residues.
design_test_df = get_length_subset(
    dataset_df = design_test_df,
    max_length = 1000,
    min_length = 20
)
print(f"Test dataset size after length subset: {len(design_test_df)}")

# Write the PDB files; entries that fail to convert are dropped.
design_test_df = convert_cif_to_pdb(
    dataset_df = design_test_df,
    pdb_output_directory = design_test_pdbs_output_directory,
    use_rna_solo = False,
    pdb_id_to_rna_solo_paths = None
)
print(f"Test dataset size after CIF to PDB conversion: {len(design_test_df)}")

get_polymer_type_statistics(dataset_df = design_test_df)

# Save the final split and display it as the cell output.
design_test_df.to_csv(design_test_csv_output_path, index = False)
design_test_df

### RNA Monomer Test Set

In [None]:
# Where the converted PDB files and the filtered CSV of this set go.
design_rna_monomer_test_csv_output_path = os.path.join(csv_output_directory, "design_rna_monomer_test.csv")
design_rna_monomer_test_pdbs_output_directory = os.path.join(pdb_output_directory, "design_rna_monomer_test")

# Start again from the raw test split.
# NOTE(review): this rebinds design_test_df, so after this cell the name
# refers to the cluster-filtered (not length-filtered, not converted) test
# split rather than to the final test set written to design_test.csv.
design_test_df = pd.read_csv(design_test_csv_path)
design_test_cluster_ids = read_cluster_ids_text_file(design_test_cluster_ids_path)
print(f"Original test dataset size: {len(design_test_df)}")

# Drop entries that contain nucleic acid clusters from outside the split.
design_test_df = get_exclusive_cluster_subset(
    dataset_df = design_test_df,
    cluster_ids_column_name = "nucleic_acid_chain_cluster_ids",
    dataset_cluster_ids = design_test_cluster_ids
)
print(f"Test dataset size after exclusive cluster subset: {len(design_test_df)}")

# Restrict to entries made of exactly one RNA chain.
design_rna_monomer_test_df = get_rna_monomer_subset(dataset_df = design_test_df)
print(f"Test dataset size after RNA monomer subset: {len(design_rna_monomer_test_df)}")

# Keep entries with an assembly of 20 to 1000 residues.
design_rna_monomer_test_df = get_length_subset(
    dataset_df = design_rna_monomer_test_df,
    max_length = 1000,
    min_length = 20
)
print(f"Test dataset size after length subset: {len(design_rna_monomer_test_df)}")

# Take the structures from RNA-Solo; entries without an RNA-Solo file or
# with a failing conversion are dropped.
design_rna_monomer_test_df = convert_cif_to_pdb(
    dataset_df = design_rna_monomer_test_df,
    pdb_output_directory = design_rna_monomer_test_pdbs_output_directory,
    use_rna_solo = True,
    pdb_id_to_rna_solo_paths = pdb_id_to_rna_solo_paths
)
print(f"Test dataset size after CIF to PDB conversion: {len(design_rna_monomer_test_df)}")

get_polymer_type_statistics(dataset_df = design_rna_monomer_test_df)

# Save the final set and display it as the cell output.
design_rna_monomer_test_df.to_csv(design_rna_monomer_test_csv_output_path, index = False)
design_rna_monomer_test_df

### Pseudoknot Test Set

In [None]:
# Reference entries that define the pseudoknot clusters.
pseudoknot_pdb_ids = [
    "7kd1", "3q3z", "4plx", "2m8k", "4oqu",
    "7kga", "1drz", "7qr4", "2miy", "4znp",
]

# Where the converted PDB files and the filtered CSV of this set go.
design_pseudoknot_test_csv_output_path = os.path.join(csv_output_directory, "design_pseudoknot_test.csv")
design_pseudoknot_test_pdbs_output_directory = os.path.join(pdb_output_directory, "design_pseudoknot_test")

# Start again from the raw test split.
# NOTE(review): this rebinds design_test_df, so after this cell the name
# refers to the cluster-filtered (not length-filtered, not converted) test
# split rather than to the final test set written to design_test.csv.
design_test_df = pd.read_csv(design_test_csv_path)
design_test_cluster_ids = read_cluster_ids_text_file(design_test_cluster_ids_path)
print(f"Original test dataset size: {len(design_test_df)}")

# Drop entries that contain nucleic acid clusters from outside the split.
design_test_df = get_exclusive_cluster_subset(
    dataset_df = design_test_df,
    cluster_ids_column_name = "nucleic_acid_chain_cluster_ids",
    dataset_cluster_ids = design_test_cluster_ids
)
print(f"Test dataset size after exclusive cluster subset: {len(design_test_df)}")

# Keep the entries whose clusters are all covered by the clusters of the
# reference entries. Every reference id has to be present in the filtered
# test split, otherwise this raises an IndexError.
design_pseudoknot_test_df = get_entries_in_same_clusters_as_specified_entries(
    dataset_df = design_test_df,
    entry_ids = pseudoknot_pdb_ids,
    cluster_ids_column_name = "nucleic_acid_chain_cluster_ids"
)
print(f"Test dataset size after pseudoknot subset: {len(design_pseudoknot_test_df)}")

# Keep entries with an assembly of 20 to 1000 residues.
design_pseudoknot_test_df = get_length_subset(
    dataset_df = design_pseudoknot_test_df,
    max_length = 1000,
    min_length = 20
)
print(f"Test dataset size after length subset: {len(design_pseudoknot_test_df)}")

# Take the structures from RNA-Solo; entries without an RNA-Solo file or
# with a failing conversion are dropped.
design_pseudoknot_test_df = convert_cif_to_pdb(
    dataset_df = design_pseudoknot_test_df,
    pdb_output_directory = design_pseudoknot_test_pdbs_output_directory,
    use_rna_solo = True,
    pdb_id_to_rna_solo_paths = pdb_id_to_rna_solo_paths
)
print(f"Test dataset size after CIF to PDB conversion: {len(design_pseudoknot_test_df)}")

get_polymer_type_statistics(dataset_df = design_pseudoknot_test_df)

# Save the final set and display it as the cell output.
design_pseudoknot_test_df.to_csv(design_pseudoknot_test_csv_output_path, index = False)
design_pseudoknot_test_df

## Specificity Dataset

In [None]:
# Inputs of the specificity dataset: one CSV and one list of protein chain
# cluster ids per split.
specificity_dataset_directory = "/home/akubaney/projects/na_mpnn/data/datasets/specificity_dataset_v2"

specificity_valid_csv_path = os.path.join(specificity_dataset_directory, "valid.csv")
specificity_valid_cluster_ids_path = os.path.join(
    specificity_dataset_directory,
    "valid_protein_chain_cluster_ids.txt"
)

specificity_test_csv_path = os.path.join(specificity_dataset_directory, "test.csv")
specificity_test_cluster_ids_path = os.path.join(
    specificity_dataset_directory,
    "test_protein_chain_cluster_ids.txt"
)

### Valid Set

In [None]:
# Where the converted PDB files and the filtered CSV of this split go.
specificity_valid_csv_output_path = os.path.join(csv_output_directory, "specificity_valid.csv")
specificity_valid_pdbs_output_directory = os.path.join(pdb_output_directory, "specificity_valid")

# Load the split together with the cluster ids assigned to it.
specificity_valid_df = pd.read_csv(specificity_valid_csv_path)
specificity_valid_cluster_ids = read_cluster_ids_text_file(specificity_valid_cluster_ids_path)
print(f"Original valid dataset size: {len(specificity_valid_df)}")

# Drop entries that contain protein clusters from outside the split.
specificity_valid_df = get_exclusive_cluster_subset(
    dataset_df = specificity_valid_df,
    cluster_ids_column_name = "protein_chain_cluster_ids",
    dataset_cluster_ids = specificity_valid_cluster_ids
)
print(f"Valid dataset size after exclusive cluster subset: {len(specificity_valid_df)}")

# Only entries with at least one PPM can be evaluated for specificity.
specificity_valid_df = get_ppm_subset(dataset_df = specificity_valid_df)
print(f"Valid dataset size after PPM subset: {len(specificity_valid_df)}")

# Keep entries with an assembly of 20 to 1000 residues.
specificity_valid_df = get_length_subset(
    dataset_df = specificity_valid_df,
    max_length = 1000,
    min_length = 20
)
print(f"Valid dataset size after length subset: {len(specificity_valid_df)}")

# Write the PDB files; entries that fail to convert are dropped.
specificity_valid_df = convert_cif_to_pdb(
    dataset_df = specificity_valid_df,
    pdb_output_directory = specificity_valid_pdbs_output_directory,
    use_rna_solo = False,
    pdb_id_to_rna_solo_paths = None
)
print(f"Valid dataset size after CIF to PDB conversion: {len(specificity_valid_df)}")

get_ppm_statistics(dataset_df = specificity_valid_df)

# Save the final split and display it as the cell output.
specificity_valid_df.to_csv(specificity_valid_csv_output_path, index = False)
specificity_valid_df

### Test Set

In [None]:
# Where the converted PDB files and the filtered CSV of this split go.
specificity_test_csv_output_path = os.path.join(csv_output_directory, "specificity_test.csv")
specificity_test_pdbs_output_directory = os.path.join(pdb_output_directory, "specificity_test")

# Load the split together with the cluster ids assigned to it.
specificity_test_df = pd.read_csv(specificity_test_csv_path)
specificity_test_cluster_ids = read_cluster_ids_text_file(specificity_test_cluster_ids_path)
print(f"Original test dataset size: {len(specificity_test_df)}")

# Drop entries that contain protein clusters from outside the split.
specificity_test_df = get_exclusive_cluster_subset(
    dataset_df = specificity_test_df,
    cluster_ids_column_name = "protein_chain_cluster_ids",
    dataset_cluster_ids = specificity_test_cluster_ids
)
print(f"Test dataset size after exclusive cluster subset: {len(specificity_test_df)}")

# Only entries with at least one PPM can be evaluated for specificity.
specificity_test_df = get_ppm_subset(dataset_df = specificity_test_df)
print(f"Test dataset size after PPM subset: {len(specificity_test_df)}")

# Keep entries with an assembly of 20 to 1000 residues.
specificity_test_df = get_length_subset(
    dataset_df = specificity_test_df,
    max_length = 1000,
    min_length = 20
)
print(f"Test dataset size after length subset: {len(specificity_test_df)}")

# Write the PDB files; entries that fail to convert are dropped.
specificity_test_df = convert_cif_to_pdb(
    dataset_df = specificity_test_df,
    pdb_output_directory = specificity_test_pdbs_output_directory,
    use_rna_solo = False,
    pdb_id_to_rna_solo_paths = None
)
print(f"Test dataset size after CIF to PDB conversion: {len(specificity_test_df)}")

get_ppm_statistics(dataset_df = specificity_test_df)

# Save the final split and display it as the cell output.
specificity_test_df.to_csv(specificity_test_csv_output_path, index = False)
specificity_test_df