# RF2NA CisBP Distillation Set

This notebook can be used to process Lily McHugh's CisBP structure-predicted specificity dataset into a csv that can be utilized for MPNN training.

## Setup

In [None]:
import os
import shutil
import io

import numpy as np
import pandas as pd

## Paths and Directories

In [None]:
# CSV describing Lily's distillation set; the starting point of this notebook.
initial_csv_path = "/projects/ml/prot_dna/prot_na_distill.v3.csv"

# Root directory of the predicted PDB files.
structure_directory = "/projects/ml/prot_dna/distill_v2/filtered"

# Table holding the Gene ID -> PPM Code mapping.
tf_information_path = "/home/akubaney/projects/data/cisBP_2021_06_23/TF_Information.txt"

# Directory holding the raw ppm files.
raw_ppm_directory = "/home/akubaney/projects/data/cisBP_2021_06_23/ppms"

# Output directories for preprocessed data. They are given relative to the
# current working directory and resolved to absolute paths once, here.
preprocessed_ppms_directory = os.path.abspath("./preprocessed_ppms")
preprocessed_data_directory = os.path.abspath("./preprocessed_data")

## I/O Functions

In [None]:
def read_text_file(path):
    """
    Read a text file and return its entire contents.

    Arguments:
        path (str): the path to the text file.

    Returns:
        contents (str): everything in the file, as a single string.
    """
    with open(path, "r") as text_file:
        contents = text_file.read()
    return contents

## Load Initial CSV

This notebook starts from Lily's CSV for the distillation set.

In [None]:
# Load the distillation-set csv into a dataframe; every later step works on df.
df = pd.read_csv(initial_csv_path)

In [None]:
# Display the freshly loaded dataframe for inspection.
df

## Filter based on i_pae and plddt

In [None]:
# Thresholds for the filter: rows must have i_pae at or below the maximum and
# plddt at or above the minimum.
max_i_pae = 6
min_plddt = 0.85
passes_filter = (df["i_pae"] <= max_i_pae) & (df["plddt"] >= min_plddt)

# Keep the passing rows and renumber them from 0. drop = True discards the old
# index instead of inserting it into the dataframe as an extra "index" column.
df = df[passes_filter].reset_index(drop = True)

In [None]:
# Display the dataframe after the i_pae / plddt filter.
df

## Compute the ID, Structure Path, Date, and Dataset Name and Drop Unnecessary Columns.

In [None]:
# The example ID is the gene ID and the DNA sequence joined by an underscore.
df["id"] = df["gene_id"] + "_" + df["DNA sequence"]

# Structures are stored in sub-directories named after the first two
# characters of the gene ID, one <id>.pdb file per example.
structure_subdirectory = df["gene_id"].str[:2]
df["structure_path"] = structure_directory + os.sep + structure_subdirectory + os.sep + df["id"] + ".pdb"

# Constant columns: the same date and dataset name for every row.
df = df.assign(date = "1970-01-01", dataset_name = "rf2na_distillation_cis_bp")

# Keep only the columns needed from here on.
kept_columns = ["id", "structure_path", "date", "dataset_name", "gene_id"]
df = df.loc[:, kept_columns].copy()

In [None]:
# Display the dataframe with the computed columns.
df

## Preprocess the Structure Files into Interface Masks, Base Pair Masks, Sequence, etc.

In [None]:
# Save a temporary dataframe, to be used to do preprocessing. This is the
# input csv passed to the preprocessing job in the shell commands below; the
# dataframe index is not written.
df.to_csv("./preprocessing_input.csv", index = False)

In [None]:
# Start preprocessing from an empty output directory.
# NOTE: this deletes anything already stored in the preprocessed data
# directory, including the results of an earlier preprocessing run.
if os.path.exists(preprocessed_data_directory):
    shutil.rmtree(preprocessed_data_directory)
os.makedirs(preprocessed_data_directory, exist_ok = False)

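Run the following in a shell to launch the preprocessing job, starting from the directory that this script lives in. Wait for the job to finish before running the cells below.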

```
cd /home/akubaney/projects/na_mpnn/data

dataset_directory="./datasets/rf2na_distillation_cis_bp"

input_csv_path="${dataset_directory}/preprocessing_input.csv"
output_directory="${dataset_directory}/preprocessed_data"
preprocessing_tmp_path="${dataset_directory}/preprocessing_tmp.out"

# Remove the output file of any previous run; -f avoids an error when the
# file does not exist yet (e.g. on the first run).
rm -f "$preprocessing_tmp_path"

# Submit the preprocessing as an array job with task indices 0-499.
sbatch --output="$preprocessing_tmp_path" --array=0-499 ./preprocess_dataset.sh "$input_csv_path" "$output_directory"
```

In [None]:
# Read the preprocessing input dataframe back from the csv that was written
# before the preprocessing job.
df = pd.read_csv("./preprocessing_input.csv")

In [None]:
# Remove examples that failed preprocessing.
# Each file in the "bad" directory is named after a failed example's ID, and
# its contents are treated as the reason for the failure.
failed_directory = os.path.join(preprocessed_data_directory, "bad")
failed_preprocessing_ids = []
reasons_for_failure_count = dict()
for file_name in os.listdir(failed_directory):
    # The example ID is the file name without its extension. It is called
    # example_id rather than id so that the builtin id() is not shadowed.
    example_id = os.path.splitext(file_name)[0]
    failed_preprocessing_ids.append(example_id)

    # Tally how many examples failed with each distinct message.
    file_path = os.path.join(failed_directory, file_name)
    reason_for_failure = read_text_file(file_path)
    reasons_for_failure_count[reason_for_failure] = reasons_for_failure_count.get(reason_for_failure, 0) + 1

print(failed_preprocessing_ids)
print(len(failed_preprocessing_ids))
print(reasons_for_failure_count)

# Keep only the rows whose ID is not among the failures.
df = df[~df["id"].isin(failed_preprocessing_ids)].copy()

In [None]:
# Display the dataframe after removing the examples that failed preprocessing.
df

In [None]:
# Load preprocessed file paths.
# Every entry of the preprocessed data directory other than "bad" is treated
# as one attribute directory. The names are sorted so that the order of the
# added columns is deterministic (os.listdir returns entries in arbitrary
# order).
preprocessed_attribute_names = sorted(os.listdir(preprocessed_data_directory))
preprocessed_attribute_names.remove("bad")
for attribute_name in preprocessed_attribute_names:
    # The extension depends only on the attribute, so it is chosen once per
    # attribute: sequences are csv files, everything else is npy.
    extension = ".csv" if attribute_name == "sequences" else ".npy"

    attribute_path_dict = dict()
    # example_id (rather than id) avoids shadowing the builtin id().
    for example_id in df["id"]:
        attribute_path = os.path.join(preprocessed_data_directory, attribute_name, example_id + extension)

        # Every remaining example must have a file for every attribute. An
        # explicit raise names the missing file and stays active under
        # `python -O`, unlike a bare assert.
        if not os.path.exists(attribute_path):
            raise FileNotFoundError("Missing preprocessed file: " + attribute_path)
        attribute_path_dict[example_id] = attribute_path

    # One new column per attribute: <attribute name>_path.
    df[attribute_name + "_path"] = df["id"].map(attribute_path_dict)

In [None]:
# Display the dataframe with the preprocessed file path columns.
df

## Preprocess PPMs

In [None]:
def load_raw_ppm(raw_ppm_path):
    """
    Parse a raw ppm file into a dataframe with one row per position.

    Arguments:
        raw_ppm_path (str): the path to the raw ppm.

    Returns:
        ppm_df (pd.DataFrame): the ppm as a dataframe. Every column of the
            file's table except the position column is kept; these are
            expected to be A, C, G, T, giving an Lx4 table.
    """
    raw_text = read_text_file(raw_ppm_path)

    # Keep only what follows the last "Pos" token, and put a "position" header
    # in its place so the position column has a known name.
    _, _, table_body = raw_text.rpartition("Pos")
    table_text = "position" + table_body

    # The table is tab-separated. Parse it, then discard the position column.
    parsed_table = pd.read_csv(io.StringIO(table_text), sep = "\t")
    return parsed_table.drop(columns = ["position"])

def preprocess_ppms(raw_ppm_directory, ppm_output_directory):
    """
    Convert every raw ppm in a directory into a csv file.

    Each entry of raw_ppm_directory is parsed with load_raw_ppm and written to
    ppm_output_directory as <name>.csv, where <name> is the raw file name
    without its final extension.

    Arguments:
        raw_ppm_directory (str): the directory containing the raw ppms.
        ppm_output_directory (str): the directory to write the csv files to.
            It is created if it does not exist.

    Raises:
        FileExistsError: if an output csv already exists, e.g. because two raw
            files share the same name up to the extension, or because the
            output directory already held that file.
    """
    os.makedirs(ppm_output_directory, exist_ok = True)

    for raw_ppm_file_name in os.listdir(raw_ppm_directory):
        raw_ppm_path = os.path.join(raw_ppm_directory, raw_ppm_file_name)

        # Remove only the final extension from the file name. Note, this
        # allows for file names that have '.' in the name.
        ppm_name = os.path.splitext(raw_ppm_file_name)[0]
        ppm_output_path = os.path.join(ppm_output_directory, ppm_name + ".csv")

        # Never overwrite an existing output. An explicit raise (rather than
        # an assert) keeps this check active under `python -O` and reports
        # the offending path.
        if os.path.exists(ppm_output_path):
            raise FileExistsError("Refusing to overwrite existing ppm: " + ppm_output_path)

        # Load the ppm and save it as a csv without the dataframe index.
        ppm_df = load_raw_ppm(raw_ppm_path)
        ppm_df.to_csv(ppm_output_path, index = False)

In [None]:
# Start from an empty ppm output directory.
# NOTE: this deletes any ppm csv files written by an earlier run.
if os.path.exists(preprocessed_ppms_directory):
    shutil.rmtree(preprocessed_ppms_directory)
os.makedirs(preprocessed_ppms_directory, exist_ok = False)

In [None]:
# Convert every raw ppm into a csv file in the preprocessed ppms directory.
preprocess_ppms(raw_ppm_directory, preprocessed_ppms_directory)

## Match PPMs with PDB IDs

Use the TF_Information table to pair structures with PPMs.

In [None]:
# Read TF_Information.txt as raw text so it can be cleaned up before parsing.
tf_information_str = read_text_file(tf_information_path)

# Formatting corrections: remove the tab that follows a comma, and the
# trailing comma at the end of a line.
tf_information_str = tf_information_str.replace(",\t", ",")
tf_information_str = tf_information_str.replace(",\n", "\n")

# Parse the cleaned, tab-separated text into a dataframe.
tf_information_df = pd.read_csv(io.StringIO(tf_information_str), sep = "\t")

# Compute the ppm paths from the motif ids.
gene_id_to_ppm_paths = dict()
missing_ppm_ids = []
for gene_id, ppm_ids_str in zip(tf_information_df["Gene_ID"], tf_information_df["Motif_ID"]):
    # The motif ids of one row are comma-separated.
    ppm_ids = ppm_ids_str.split(",")

    # Keep the path of every motif that has a preprocessed ppm, and record
    # the ids of the motifs that do not.
    ppm_paths = []
    for ppm_id in ppm_ids:
        ppm_path = os.path.join(preprocessed_ppms_directory, ppm_id + ".csv")
        if not os.path.exists(ppm_path):
            missing_ppm_ids.append(ppm_id)
            continue
        ppm_paths.append(ppm_path)

    # Genes with no available ppm are left out of the mapping. The value is a
    # list holding a single tuple of paths - presumably the layout expected
    # downstream; TODO confirm.
    # NOTE(review): if a Gene_ID appears on several rows, the last row with
    # at least one available ppm wins - confirm that gene ids are unique.
    if ppm_paths:
        gene_id_to_ppm_paths[gene_id] = [tuple(ppm_paths)]

print(gene_id_to_ppm_paths)
print(len(gene_id_to_ppm_paths))
print(missing_ppm_ids)
print(len(missing_ppm_ids))

In [None]:
def _lookup_ppm_paths(gene_id):
    # Genes without an entry in the mapping get an empty list.
    return gene_id_to_ppm_paths.get(gene_id, [])

# Attach the ppm paths of each row's gene.
df["ppm_paths"] = df["gene_id"].apply(_lookup_ppm_paths)

# The gene ID was only needed for the lookup, so the column is dropped.
df = df.drop("gene_id", axis = 1)

In [None]:
# Display the final dataframe, now with the ppm_paths column.
df

## Save the Preprocessing Output Dataframe.

In [None]:
# Write the final dataframe to csv; the dataframe index is not written.
df.to_csv("./preprocessing_output.csv", index = False)