# RCSB CIF Nucleic Acid Dataset

Andrew Kubaney (akubaney)

This notebook can be used to process the nucleic acid-containing entries in the RCSB dataset.

## Setup

In [None]:
import os
import shutil
import io
import collections

import numpy as np
import pandas as pd

## Paths and Directories

In [None]:
# Preprocessed csv describing the RCSB CIF dataset entries.
initial_csv_path = "/home/akubaney/projects/na_mpnn/data/datasets/rcsb_cif/pdb_21Jan2025.csv"

# Root directory of the CIF structure files.
structure_directory = "/databases/rcsb/cif"

# DeepPBS mapping from PDB+chain ID to PCM ID.
pdb_chain_id_and_pcm_id_path = "./jaspar_h11mo_cluster_wise_dna_containing_dataset.npy"

# Raw PCM directories, one per source.
raw_pcm_directory_jaspar = "/home/akubaney/projects/data/jaspar_2025_02_05/pcms"
raw_pcm_directory_hocomoco_v11_human = "/home/akubaney/projects/data/hocomoco_v11_2025_02_05/human/pcms"
raw_pcm_directory_hocomoco_v11_mouse = "/home/akubaney/projects/data/hocomoco_v11_2025_02_05/mouse/pcms"

# Output directories for preprocessed data, stored as absolute paths
# (resolved against the current working directory).
preprocessed_data_directory = os.path.abspath("./preprocessed_data")
preprocessed_ppms_directory = os.path.abspath("./preprocessed_ppms")

## I/O Functions

In [None]:
def read_text_file(path):
    """
    Read a text file and return its entire contents.

    Arguments:
        path (str): the path of the file to read.

    Returns:
        contents (str): the full text of the file.
    """
    with open(path, "rt") as text_file:
        contents = text_file.read()
    return contents

## Load Initial CSV

This notebook starts from the preprocessed CSV of the RCSB CIF dataset.

In [None]:
df = pd.read_csv(initial_csv_path)


def tolist(list_text):
    """
    Parse the string form of a list of strings back into a list.

    The first and last characters (the brackets) are dropped, single quotes
    are removed, and the remainder is split on ", ".

    Arguments:
        list_text (str): the stringified list, e.g. "['a', 'b']".

    Returns:
        items (list of str): the parsed items. An empty list ("[]") is parsed
            as [] rather than as a list holding one empty string.
    """
    inner_text = list_text[1:-1]
    if not inner_text:
        return []
    return inner_text.replace("'", "").split(", ")


# Turn the stringified list columns back into lists.
for key in ('poly', 'poly_type', 'nonpoly', 'poly_sequence'):
    df[key] = df[key].apply(tolist)

df

## Filter based on Number of Heavy Atoms, Coverage, Number of Unknown Residues, Resolution, and Presence of Nucleic Acid

In [None]:
# Keep entries with at least 100 heavy atoms and a coverage of at least 0.9.
has_enough_heavy_atoms = df["num_heavy"] >= 100
has_enough_coverage = df["coverage"] >= 0.9
sel = has_enough_heavy_atoms & has_enough_coverage

# Filter sequences with too many unknown residues.
def seq_filter(seqs, maxX=20):
    """
    Decide whether the sequences of an entry pass the unknown residue filter.

    Arguments:
        seqs (list of str): the sequences of one entry.
        maxX (int): the threshold used both for the length of the longest
            sequence and for the number of 'X' characters.

    Returns:
        keep (bool): False only when the longest sequence is longer than
            maxX and 'X' is the most common character of all sequences
            joined together, occurring more than maxX times; True otherwise
            (including when seqs is empty).
    """
    # Entries whose longest sequence is short always pass.
    longest_length = max((len(seq) for seq in seqs), default=0)
    if longest_length <= maxX:
        return True

    # The joined text is non-empty here, so most_common(1) has one entry.
    top_char, top_count = collections.Counter("".join(seqs)).most_common(1)[0]
    return not (top_char == 'X' and top_count > maxX)

passes_unknown_residue_filter = df["poly_sequence"].apply(seq_filter)
sel = sel & passes_unknown_residue_filter

# Keep entries whose resolution value is at most 3.5.
# Entries with nan resolution are kept as well, since this captures NMR structures.
resolution_is_acceptable = (df["resolution"] <= 3.5) | df["resolution"].isna()
sel = sel & resolution_is_acceptable

# Filter entries with no nucleic acid.
def chains_contain_nucleic_acid(chain_types):
    """
    Check whether any nucleic acid chain type appears in chain_types.

    Arguments:
        chain_types (list of str): the chain types of one entry.

    Returns:
        contains_nucleic_acid (bool): True if chain_types contains the
            DNA/RNA hybrid, DNA, or RNA chain type.
    """
    nucleic_acid_chain_types = (
        "polydeoxyribonucleotide/polyribonucleotide hybrid",
        "polydeoxyribonucleotide",
        "polyribonucleotide",
    )
    return any(chain_type in chain_types for chain_type in nucleic_acid_chain_types)

has_nucleic_acid = df["poly_type"].apply(chains_contain_nucleic_acid)
sel = sel & has_nucleic_acid

# Apply the combined selection. reset_index keeps the previous index as a
# column and renumbers the rows.
df = df[sel].copy().reset_index()

In [None]:
df

## Compute the Structure Path and Drop Unnecessary Columns

In [None]:
# Use the entry label as the example id.
df["id"] = df["label"]

# Each structure lives in a subdirectory named after characters 1-2 of its id.
subdirectory_names = df["id"].str[1:3]
df["structure_path"] = structure_directory + os.sep + subdirectory_names + os.sep + df["id"] + ".cif.gz"

df["dataset_name"] = "rcsb_cif_na"

# Keep only the columns needed from here on.
kept_columns = ["id", "structure_path", "date", "dataset_name"]
df = df.loc[:, kept_columns].copy()

In [None]:
df

## Preprocess the Structure Files into Interface Masks, Base Pair Masks, Sequence, etc.

In [None]:
# Save a temporary dataframe, to be used to do preprocessing.
# The csv is written relative to the current working directory, without the
# index column.
df.to_csv("./preprocessing_input.csv", index = False)

In [None]:
# Start from an empty output directory. WARNING: anything already present at
# this path is deleted before the directory is recreated.
if os.path.exists(preprocessed_data_directory):
    shutil.rmtree(preprocessed_data_directory)
os.makedirs(preprocessed_data_directory)

Run the following from a shell, starting from the directory that this script lives in. Wait for every task of the job array to finish before running the cells below.

```
cd /home/akubaney/projects/na_mpnn/data

dataset_directory="./datasets/rcsb_cif_na"

input_csv_path=$dataset_directory"/preprocessing_input.csv"
output_directory=$dataset_directory"/preprocessed_data"
preprocessing_tmp_path=$dataset_directory"/preprocessing_tmp.out"

rm $preprocessing_tmp_path

sbatch --output=$preprocessing_tmp_path --array=0-499 ./preprocess_dataset.sh $input_csv_path $output_directory
```

In [None]:
# Read the preprocessing input dataframe.
# NOTE(review): presumably this lets the notebook be resumed from this cell
# once the external preprocessing has finished — confirm.
df = pd.read_csv("./preprocessing_input.csv")

In [None]:
# Remove examples that failed preprocessing. Every file in the "bad"
# directory is named after a failed id (extension removed); its text is
# tallied as the reason for failure.
failed_directory = os.path.join(preprocessed_data_directory, "bad")
failed_preprocessing_ids = []
reasons_for_failure_count = dict()
for file_name in os.listdir(failed_directory):
    # Named failed_id rather than id, to avoid shadowing the builtin.
    failed_id = os.path.splitext(file_name)[0]
    failed_preprocessing_ids.append(failed_id)

    reason_for_failure = read_text_file(os.path.join(failed_directory, file_name))
    reasons_for_failure_count[reason_for_failure] = reasons_for_failure_count.get(reason_for_failure, 0) + 1

print(failed_preprocessing_ids)
print(len(failed_preprocessing_ids))
print(reasons_for_failure_count)

# Keep only the rows whose id is not among the failed ids.
df = df[~df["id"].isin(failed_preprocessing_ids)].copy()

In [None]:
df

In [None]:
# Load preprocessed file paths. Every entry of the preprocessed data
# directory other than "bad" is treated as one attribute directory. The names
# are sorted so that the order of the added columns is reproducible.
preprocessed_attribute_names = [
    name for name in sorted(os.listdir(preprocessed_data_directory)) if name != "bad"
]
for attribute_name in preprocessed_attribute_names:
    # Sequences are stored as csv files; every other attribute as npy files.
    extension = ".csv" if attribute_name == "sequences" else ".npy"

    attribute_path_dict = dict()
    for example_id in df.id:
        attribute_path = os.path.join(preprocessed_data_directory, attribute_name, example_id + extension)
        # Raise explicitly rather than assert, so the check also runs under -O.
        if not os.path.exists(attribute_path):
            raise FileNotFoundError(f"Missing preprocessed {attribute_name} file: {attribute_path}")
        attribute_path_dict[example_id] = attribute_path
    df[attribute_name + "_path"] = df.id.map(attribute_path_dict)

In [None]:
df

## Preprocess PCMs into PPMs

In [None]:
def load_ppm_from_raw_pcm(raw_pcm_path, pcm_format):
    """
    Given a path to a raw pcm, return an Lx4 dataframe of the ppm.

    Arguments:
        raw_pcm_path (str): the path to the raw pcm.
        pcm_format (str): the format of the raw pcm file, either "jaspar" or
            "hocomoco".

    Returns:
        ppm_df (pd.DataFrame): an Lx4 dataframe of the ppm, where the
            columns are A, C, G, T and every row is the corresponding pcm
            row divided by its sum.

    Raises:
        ValueError: if pcm_format is not recognized, if a jaspar row has no
            '[' or holds a non-integer count, or if the jaspar rows are not
            exactly the bases A, C, G, T.
    """
    base_order = ["A", "C", "G", "T"]

    pcm_text = read_text_file(raw_pcm_path)
    pcm_text = pcm_text.strip()

    # Both formats start with one header line, which is excluded.
    pcm_lines = pcm_text.split("\n")[1:]

    # Jaspar format: one row per base, "<base> [ <count> <count> ... ]".
    if pcm_format == "jaspar":
        # Dictionary to create the dataframe.
        data_dict = dict()

        # Extract the counts for each base.
        for line in pcm_lines:
            line = line.strip()
            if not line:
                continue

            # Split the base name from the bracketed counts. Splitting the
            # counts on any whitespace tolerates tabs, repeated spaces, and
            # a closing bracket with or without a space before it.
            base, bracket, base_counts_str = line.partition("[")
            if not bracket:
                raise ValueError(f"Malformed jaspar row in {raw_pcm_path}: {line}")
            base_counts_str = base_counts_str.replace("]", " ")

            data_dict[base.strip()] = [int(count_str) for count_str in base_counts_str.split()]

        # Fix the column order explicitly instead of relying on the order of
        # the rows in the file.
        if sorted(data_dict) != base_order:
            raise ValueError(f"Expected rows for A, C, G, T in {raw_pcm_path}, found: {sorted(data_dict)}")

        # Create the pcm dataframe.
        pcm_df = pd.DataFrame(data_dict, columns = base_order)
    # Hocomoco format: tab-separated counts, one row per position.
    elif pcm_format == "hocomoco":
        # Read the pcm dataframe.
        pcm_df = pd.read_csv(io.StringIO("\n".join(pcm_lines)), sep = "\t", names = base_order)
    else:
        raise ValueError(f"Invalid pcm_format: {pcm_format}")

    # Turn the pcm into a ppm by normalizing every row by its sum.
    ppm_df = pcm_df.div(pcm_df.sum(axis = 1), axis = 0)

    return ppm_df

def preprocess_pcms_into_ppms(raw_pcm_directory, ppm_output_directory, pcm_format):
    """
    Convert every raw pcm in a directory into a ppm csv.

    Arguments:
        raw_pcm_directory (str): the directory containing the raw pcm files.
        ppm_output_directory (str): the directory to write the ppm csvs to;
            created if it does not exist.
        pcm_format (str): the format of the raw pcm files, passed on to
            load_ppm_from_raw_pcm.

    Raises:
        FileExistsError: if a ppm csv already exists at an output path.
    """
    os.makedirs(ppm_output_directory, exist_ok = True)

    # Sort the file names so that the processing order is reproducible.
    for raw_pcm_file_name in sorted(os.listdir(raw_pcm_directory)):
        raw_pcm_path = os.path.join(raw_pcm_directory, raw_pcm_file_name)

        # Determine the ppm name. Files ending in ".jaspar" keep their full
        # file name, extension included; for all other files only the final
        # extension is removed, so names that have '.' in them are preserved.
        # NOTE(review): presumably this matches the pcm ids that are later
        # looked up as "<pcm_id>.csv" — confirm.
        if raw_pcm_file_name.endswith(".jaspar"):
            ppm_name = raw_pcm_file_name
        else:
            ppm_name = os.path.splitext(raw_pcm_file_name)[0]

        # Load the raw pcm and preprocess it into a ppm.
        ppm_df = load_ppm_from_raw_pcm(raw_pcm_path, pcm_format)

        # Determine the ppm output path.
        ppm_output_path = os.path.join(ppm_output_directory, ppm_name + ".csv")

        # Refuse to overwrite an existing ppm. Raised explicitly rather than
        # asserted, so the check also runs under -O.
        if os.path.exists(ppm_output_path):
            raise FileExistsError(f"A ppm already exists at: {ppm_output_path}")

        # Save the ppm.
        ppm_df.to_csv(ppm_output_path, index = False)

In [None]:
# Start from an empty ppm directory. WARNING: anything already present at
# this path is deleted before the directory is recreated.
if os.path.exists(preprocessed_ppms_directory):
    shutil.rmtree(preprocessed_ppms_directory)
os.makedirs(preprocessed_ppms_directory)

In [None]:
# Convert each raw pcm source, in order, into ppms in the shared output directory.
pcm_sources = (
    (raw_pcm_directory_jaspar, "jaspar"),
    (raw_pcm_directory_hocomoco_v11_human, "hocomoco"),
    (raw_pcm_directory_hocomoco_v11_mouse, "hocomoco"),
)
for source_directory, source_format in pcm_sources:
    preprocess_pcms_into_ppms(source_directory, preprocessed_ppms_directory, source_format)

## Match PPMs with PDB IDs

In [None]:
# Load the DeepPBS mapping from PDB+chain ID to PCM label. The data arrives
# already clustered; it will be re-clustered later.
# NOTE(review): allow_pickle = True unpickles arbitrary objects, so this file
# must come from a trusted source.
pdb_chain_id_and_pcm_id = np.load(pdb_chain_id_and_pcm_id_path, allow_pickle = True)

# Maps a PDB ID to a list holding one tuple of existing ppm paths per chain.
pdb_id_to_ppm_paths = dict()
# PCM IDs for which no preprocessed ppm was found (repeats are kept).
missing_ppm_ids = []
for cluster_list in pdb_chain_id_and_pcm_id:
    for (pdb_chain_id, chain_pcm_ids) in cluster_list:
        # The combined ID must split into exactly a PDB ID and a chain ID.
        pdb_id, chain_id = pdb_chain_id.split("_")

        existing_ppm_paths = []
        for pcm_id in chain_pcm_ids:
            candidate_path = os.path.join(preprocessed_ppms_directory, pcm_id + ".csv")
            if not os.path.exists(candidate_path):
                missing_ppm_ids.append(pcm_id)
                continue
            existing_ppm_paths.append(candidate_path)

        # Chains without any existing ppm are not recorded.
        if existing_ppm_paths:
            pdb_id_to_ppm_paths.setdefault(pdb_id, []).append(tuple(existing_ppm_paths))

print(pdb_id_to_ppm_paths)
print(len(pdb_id_to_ppm_paths))
print(missing_ppm_ids)
print(len(missing_ppm_ids))

In [None]:
def _lookup_ppm_paths(entry_id):
    # Entries without any matched ppm get an empty list.
    return pdb_id_to_ppm_paths.get(entry_id, [])

df["ppm_paths"] = df["id"].apply(_lookup_ppm_paths)

In [None]:
df

## Save the Preprocessing Output Dataframe.

In [None]:
# Write the final dataframe as csv, relative to the current working
# directory and without the index column.
df.to_csv("./preprocessing_output.csv", index = False)