# RF2NA Transfac Distillation Set

This notebook can be used to process Lily McHugh's Transfac structure-predicted specificity dataset into a csv that can be utilized for MPNN training.

## Setup

In [None]:
import os
import shutil
import io

import numpy as np
import pandas as pd

## Paths and Directories

In [None]:
# Input: the csv from Lily's distillation set.
initial_csv_path = "/projects/ml/prot_dna/transfac/fasta_v2/rf2_scores.csv"

# Input: the directory containing the predicted PDBs.
structure_directory = "/projects/ml/prot_dna/transfac/fasta_v2"

# Input: the csv mapping each factor ID to its matrix (PCM/PPM) IDs.
factor_id_to_ppm_id_csv_path = "/home/akubaney/projects/data/transfac_2023_05_30/factor_df_seq_mat_best_out.csv"

# Input: the file containing the raw pcms.
raw_pcms_path = "/home/akubaney/projects/data/transfac_2023_05_30/matrix.dat"

# Outputs: directories for preprocessed data. They are stored as absolute
# paths, resolved against the working directory at the time this cell runs.
preprocessed_ppms_directory = os.path.abspath("./preprocessed_ppms")
preprocessed_data_directory = os.path.abspath("./preprocessed_data")

## I/O Functions

In [None]:
def read_text_file(path):
    """
    Read a whole text file into a single string.

    Arguments:
        path (str): the path of the file to read.

    Returns:
        contents (str): the full text of the file.
    """
    with open(path, mode="rt") as text_file:
        contents = text_file.read()
    return contents

## Load Initial CSV

This notebook starts from Lily's CSV for the distillation set.

In [None]:
# Load the distillation-set csv. The cells below read its tag, i_pae and
# plddt columns.
df = pd.read_csv(initial_csv_path)

In [None]:
# Display the dataframe as loaded from the initial csv.
df

## Filter based on i_pae and plddt

In [None]:
# Keep only the rows with i_pae at most 6 and plddt at least 0.85.
# NOTE(review): the cutoffs are carried over unchanged; confirm they are the
# intended thresholds.
max_i_pae = 6
min_plddt = 0.85
passes_filter = (df.i_pae <= max_i_pae) & (df.plddt >= min_plddt)

# drop=True discards the old index instead of inserting it as a stray "index"
# column, which also keeps this cell safe to run more than once.
df = df[passes_filter].reset_index(drop=True)

In [None]:
# Display the dataframe after the i_pae / plddt filter.
df

## Compute the ID, Structure Path, Date, and Dataset Name and Drop Unnecessary Columns.

In [None]:
# Columns kept for the rest of the notebook, in output order.
output_columns = ["id", "structure_path", "date", "dataset_name", "factor_id"]

df = (
    df
    # The example id is the tag plus "_pred"; the factor id is the first six
    # characters of the tag.
    .assign(
        id=lambda frame: frame["tag"] + "_pred",
        factor_id=lambda frame: frame["tag"].str.slice(0, 6),
    )
    # Structures live at <structure_directory>/<factor_id[1:3]>/<factor_id>/<id>.pdb.
    # The date and dataset name are the same constant for every row.
    .assign(
        structure_path=lambda frame: (
            structure_directory + os.sep
            + frame["factor_id"].str.slice(1, 3) + os.sep
            + frame["factor_id"] + os.sep
            + frame["id"] + ".pdb"
        ),
        date="1970-01-01",
        dataset_name="rf2na_distillation_transfac",
    )
    .loc[:, output_columns]
    .copy()
)

In [None]:
# Display the dataframe reduced to the id, structure path, date, dataset name
# and factor id columns.
df

## Preprocess the Structure Files into Interface Masks, Base Pair Masks, Sequence, etc.

In [None]:
# Save a temporary dataframe, to be used to do preprocessing.
df.to_csv("./preprocessing_input.csv", index = False)

In [None]:
# Start from an empty preprocessed data directory.
# WARNING: this deletes everything already stored in that directory.
preprocessed_data_directory_exists = os.path.exists(preprocessed_data_directory)
if preprocessed_data_directory_exists:
    shutil.rmtree(preprocessed_data_directory)
os.makedirs(preprocessed_data_directory)

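Run the following from a shell, starting from the directory that this script lives in. Wait for every job in the array to finish before running the cells below, since they read the preprocessing output.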

```
cd /home/akubaney/projects/na_mpnn/data

dataset_directory="./datasets/rf2na_distillation_transfac"

input_csv_path="$dataset_directory/preprocessing_input.csv"
output_directory="$dataset_directory/preprocessed_data"
preprocessing_tmp_path="$dataset_directory/preprocessing_tmp.out"

# -f: do not fail when there is no log file left over from a previous run.
rm -f "$preprocessing_tmp_path"

# Submit the preprocessing job array (task ids 0-999).
sbatch --output="$preprocessing_tmp_path" --array=0-999 ./preprocess_dataset.sh "$input_csv_path" "$output_directory"
```

In [None]:
# Read the preprocessing input dataframe.
df = pd.read_csv("./preprocessing_input.csv")

In [None]:
# Remove examples that failed preprocessing.
failed_directory = os.path.join(preprocessed_data_directory, "bad")
failed_preprocessing_ids = []
reasons_for_failure_count = dict()
for file_name in os.listdir(failed_directory):
    id = os.path.splitext(file_name)[0]
    failed_preprocessing_ids.append(id)

    file_path = os.path.join(failed_directory, file_name)
    reason_for_failure = read_text_file(file_path)
    reasons_for_failure_count[reason_for_failure] = reasons_for_failure_count.get(reason_for_failure, 0) + 1

print(failed_preprocessing_ids)
print(len(failed_preprocessing_ids))
print(reasons_for_failure_count)

df = df[np.logical_not(np.isin(df.id, failed_preprocessing_ids))].copy()

In [None]:
# Display the dataframe after removing the examples that failed preprocessing.
df

In [None]:
# Load preprocessed file paths
preprocessed_attribute_names = os.listdir(preprocessed_data_directory)
preprocessed_attribute_names.remove("bad")
for attribute_name in preprocessed_attribute_names:
    attribute_path_dict = dict()
    for id in df.id:
        if attribute_name == "sequences":
            extension = ".csv"
        else:
            extension = ".npy"
        attribute_path = os.path.join(preprocessed_data_directory, attribute_name, id + extension)
        assert(os.path.exists(attribute_path))
        attribute_path_dict[id] = attribute_path
    df[attribute_name + "_path"] = df.id.map(attribute_path_dict)

In [None]:
# Display the dataframe with one "<attribute>_path" column per preprocessed
# attribute.
df

## Preprocess PPMs

In [None]:
def _parse_pcm_entry(pcm_entry):
    """
    Extract the matrix ID and the count-matrix lines from one pcm entry.

    Arguments:
        pcm_entry (str): the text of a single entry of the pcms file.

    Returns:
        ppm_id (str or None): the second field of the entry's "AC" line, or
            None if the entry has no "AC" line with a value.
        pcm_lines (list of str): the "P0" header line followed by the lines up
            to (not including) the next "XX" line, with fields separated by
            single spaces; empty if the entry has no "P0" line.
    """
    ppm_id = None
    pcm_lines = []
    is_reading_pcm_lines = False
    for raw_line in pcm_entry.strip().split("\n"):
        # Normalize any run of whitespace (spaces or tabs) to a single space.
        fields = raw_line.split()
        line = " ".join(fields)

        # Record the PPM ID.
        if line.startswith("AC"):
            if len(fields) > 1:
                ppm_id = fields[1]
        # Record the PCM header and start reading pcm lines.
        elif line.startswith("P0"):
            # The matrix rows carry one more field than a five-field header
            # names, so add a temporary column name to catch it.
            if len(fields) == 5:
                line += " TEMP"
            pcm_lines.append(line)
            is_reading_pcm_lines = True
        # Record the PCM line; the matrix ends at the first "XX" line.
        elif is_reading_pcm_lines:
            if line.startswith("XX"):
                break
            pcm_lines.append(line)

    return ppm_id, pcm_lines


def _pcm_lines_to_ppm(pcm_lines):
    """
    Turn the space-separated pcm lines of one entry into a ppm dataframe.

    Arguments:
        pcm_lines (list of str): the header line and matrix rows, as returned
            by _parse_pcm_entry.

    Returns:
        ppm_df (pd.DataFrame): the counts with each row divided by its row sum.
    """
    # Create the pcm and remove the position and temporary columns.
    pcm_df = pd.read_csv(io.StringIO("\n".join(pcm_lines)), delimiter=" ")
    pcm_df = pcm_df.drop(columns=["P0", "TEMP"])

    # Turn the pcm into a ppm by normalizing each row to sum to one.
    # NOTE: a row whose counts are all zero produces NaN values.
    return pcm_df.div(pcm_df.sum(axis=1), axis=0)


def preprocess_ppms(all_pcms_path, ppm_output_directory):
    """
    Given a path to the file containing all the pcms and an output directory,
    preprocess the pcms into ppms and save them in the output directory.

    The file is split into entries on "//" separator lines; the text before
    the first separator is skipped. Each remaining entry is saved as
    "<matrix id>.csv".

    Arguments:
        all_pcms_path (str): the path to the text file containing all the pcms.
        ppm_output_directory (str): the directory to save the ppms in.

    Raises:
        ValueError: if an entry has no "AC" id or no "P0" matrix.
        AssertionError: if a ppm file already exists at an output path.

    Side Effects:
        Creates the output directory if needed and saves the ppms in it.
    """
    os.makedirs(ppm_output_directory, exist_ok=True)

    # Load the text of the pcms file and split it into entries.
    all_pcms_text = read_text_file(all_pcms_path)
    pcm_entries = all_pcms_text.strip().split("//\n")[1:]

    # Load each pcm, convert into ppm, and save.
    for pcm_entry in pcm_entries:
        ppm_id, pcm_lines = _parse_pcm_entry(pcm_entry)

        # Fail with a clear message instead of an obscure error further down.
        if ppm_id is None or len(pcm_lines) == 0:
            raise ValueError(
                "Could not find both an AC id and a P0 matrix in pcm entry: "
                + repr(pcm_entry[:80])
            )

        ppm_df = _pcm_lines_to_ppm(pcm_lines)

        # Determine the ppm output path.
        ppm_output_path = os.path.join(ppm_output_directory, ppm_id + ".csv")

        # Assert that nothing exists at the output path.
        assert not os.path.exists(ppm_output_path), "PPM already exists: " + ppm_output_path

        # Save the ppm.
        ppm_df.to_csv(ppm_output_path, index=False)

In [None]:
# Start from an empty ppm directory, so that no ppm file exists yet when the
# ppms are written.
# WARNING: this deletes everything already stored in that directory.
preprocessed_ppms_directory_exists = os.path.exists(preprocessed_ppms_directory)
if preprocessed_ppms_directory_exists:
    shutil.rmtree(preprocessed_ppms_directory)
os.makedirs(preprocessed_ppms_directory)

In [None]:
# Convert every pcm in the raw pcms file into a ppm csv inside the
# preprocessed ppms directory.
preprocess_ppms(raw_pcms_path, preprocessed_ppms_directory)

## Match PPMs with PDB IDs

Use the factor ID to matrix ID table to pair structures and PPMs.

In [None]:
# Load the dataframe containing the factor id and ppm ids.
factor_id_to_ppm_id_df = pd.read_csv(factor_id_to_ppm_id_csv_path)

# Compute the ppm paths from the matrix ids.
factor_id_to_ppm_paths = dict()
missing_ppm_ids = []
factor_id_with_no_ppm_ids = []
for factor_id, ppm_ids_str in zip(factor_id_to_ppm_id_df["factor_id"], factor_id_to_ppm_id_df["matrix_ids"]):
    # If the ppm ids are empty, continue.
    if pd.isna(ppm_ids_str):
        factor_id_with_no_ppm_ids.append(factor_id)
        continue

    # Split the string representation of the ppm ids.
    if ppm_ids_str.startswith(";"):
        ppm_ids_str = ppm_ids_str[1:]
    ppm_ids = ppm_ids_str.split(";")

    # Turn the ppm ids into ppm paths, and record them if they exist.
    ppm_paths = []
    for ppm_id in ppm_ids:
        ppm_path = os.path.join(preprocessed_ppms_directory, ppm_id + ".csv")
        if os.path.exists(ppm_path):
            ppm_paths.append(ppm_path)
        else:
            missing_ppm_ids.append(ppm_id)
    
    if len(ppm_paths) > 0:
        factor_id_to_ppm_paths[factor_id] = [tuple(ppm_paths)]

# print(factor_id_to_ppm_paths)
print(len(factor_id_to_ppm_paths))
print(missing_ppm_ids)
print(len(missing_ppm_ids))
print(factor_id_with_no_ppm_ids)
print(len(factor_id_with_no_ppm_ids))

In [None]:
# Look up the ppm paths of every row by its factor id; a factor id without
# any ppm paths gets an empty list.
ppm_paths_per_row = df["factor_id"].map(
    lambda factor_id: factor_id_to_ppm_paths.get(factor_id, [])
)
df["ppm_paths"] = ppm_paths_per_row

# The factor id column was only needed for the lookup above.
df = df.drop(columns="factor_id")

In [None]:
# Display the final dataframe, with the ppm_paths column added and the
# factor_id column removed.
df

## Save the Preprocessing Output Dataframe.

In [None]:
# Write the final dataframe to the current working directory, omitting the
# dataframe index.
df.to_csv("./preprocessing_output.csv", index = False)