# Preprocessing
Wang, D., Zhang, C., Wang, B., Li, B., Wang, Q., Liu, D., Wang, H., Zhou, Y., Shi, L., Lan, F., & Wang, Y. (2019). Optimized CRISPR guide RNA design for two high-fidelity Cas9 variants by deep learning. Nature Communications, 10(1), 4284.

This file was used to preprocess the data obtained from supplementary table 2.

NOTE: You will need the optional dependencies to run this file.\
Ensure that you have downloaded the human reference genome and built the bowtie2 index before running it.

In [None]:
# Information required to extend sequences: the external executables and the
# reference-genome resources used by the alignment/extraction cell below.
# Executable invoked to align the guide sequences.
bowtie2Bin = "bowtie2"
# Executable invoked (as `samtools faidx`) to pull sequences out of the reference.
samtoolsBin = "samtools"
# Reference FASTA handed to `samtools faidx`.
referenceHumanGenome = "human.fna"
# Index name handed to bowtie2 via `-x`.
bowtie2Index = "human_bt"

In [None]:
import pandas as pd
from Bio.SeqUtils import MeltingTemp as mt
from Bio.Seq import Seq
import numpy as np
import torch as t
from subprocess import run
import os

In [None]:
def encode(seq, z='ATCG'):
    """One-hot encode `seq` against the alphabet `z`.

    Returns a list with one entry per character of `seq`; each entry is a
    list of 0/1 ints, one per symbol of `z`, with a 1 where the character
    equals that symbol. Characters that are not in `z` yield an all-zero row.
    """
    return [[int(symbol == base) for symbol in z] for base in seq]


# Sanity check: with the default alphabet the encoding is the identity matrix.
assert encode('ATCG') == [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]

In [None]:
def rc(dna):
    """Return the reverse complement of `dna`, or None if it cannot be computed.

    Both cases are preserved and the IUPAC ambiguity codes R/Y, M/K, B/V and
    D/H are swapped; any character outside the table is left unchanged.
    Any failure (for example a non-string argument) is swallowed and
    reported as None rather than raised.
    """
    try:
        table = str.maketrans(
            'acgtrymkbdhvACGTRYMKBDHV',
            'tgcayrkmvhdbTGCAYRKMVHDB',
        )
        complemented = dna.translate(table)
        return complemented[::-1]
    except Exception:
        # Deliberate best-effort: callers get None instead of an exception.
        return None

In [None]:
# Load the spreadsheet (the first row is skipped so that the second row
# provides the column names) and normalise the column names used below.
wangData = pd.read_excel("wang_2019_table_s2.xlsx", skiprows=1)
wangData = wangData.rename(
    columns={"gRNA_Seq": "seq", "PAM": "pam", "Wt_Efficiency": "Indel freq"}
)

# Keep only the columns of interest and discard rows without a measured
# efficiency.
wangData = wangData[["seq", "pam", "Indel freq"]].dropna(subset=["Indel freq"])

# Keep rows whose PAM ends in "GG" (everything after the first character).
# .copy() makes the result an independent frame so the column assignment
# below does not write into a slice of the original.
wangData = wangData[wangData["pam"].str[1:] == "GG"].copy()

# From here on "seq" holds the guide followed by its PAM.
wangData["seq"] = wangData["seq"] + wangData["pam"]
wangData = wangData[["seq", "Indel freq"]]

# drop_duplicates returns a new frame; the result must be assigned back,
# otherwise the duplicates silently remain in wangData.
wangData = wangData.drop_duplicates(subset=["seq"])
wangData

In [None]:
# Dump every guide+PAM sequence to a plain-text file, one per line, so that
# bowtie2 can consume them as raw reads.
with open("input.txt", 'w') as outFile:
    outFile.write("".join(f"{x}\n" for x in wangData["seq"].to_list()))

# Align the sequences against the human index. Option meanings follow the
# bowtie2 manual.
command = [
    bowtie2Bin,
    "-x", bowtie2Index,   # index to align against
    "-p", "32",           # number of alignment threads
    "--reorder",          # keep SAM records in the same order as the input reads
    "--no-hd",            # suppress SAM header lines
    "-t",                 # print timing information
    "-r",                 # input is raw sequences, one per line
    "-U", "input.txt",    # unpaired reads file
    "-S", "output.txt",   # SAM output file
]
# check=True raises CalledProcessError if bowtie2 exits with a non-zero status.
run(command, check=True)

# Walk the headerless SAM output line by line. Reads that aligned with zero
# mismatches (XM:i:0) and that do not carry an XS:i:0 tag are kept and
# turned into a samtools region; every other read is removed from wangData.
forwarded = 0
dropped = 0
# One bool per kept read: True when the SAM FLAG is "16" (reverse strand).
# Deliberately not called `reversed`, which would shadow the builtin for the
# remainder of the session.
isReverseStrand = []
with open("output.txt", "r") as inFile, open("samtools-faidx-region-file.txt", "w") as outFile:
    for line in inFile:
        # Strip the newline first so the last field can be matched exactly too.
        entries = line.rstrip("\n").split("\t")
        if ("XM:i:0" in entries) and ("XS:i:0" not in entries):
            forwarded += 1
            # SAM columns: [1] FLAG, [2] reference name, [3] leftmost position.
            position = int(entries[3])
            # Region covering 4 bases before the alignment start up to 26
            # bases after it (31 bases in total).
            outFile.write(f"{entries[2]}:{position-4}-{position+26}\n")
            isReverseStrand.append(entries[1] == "16")
        else:
            dropped += 1
            length = wangData.shape[0]
            # SAM column [9] is the read sequence as reported by the aligner;
            # try it as-is first, then its reverse complement.
            wangData.drop(wangData[wangData["seq"] == entries[9]].index, inplace=True)
            if (length == wangData.shape[0]):
                wangData.drop(wangData[wangData["seq"] == rc(entries[9])].index, inplace=True)
            if (length == wangData.shape[0]):
                print(f"Couldn't remove {entries[9]}")

# Relies on the kept reads being in the same order as the remaining rows;
# insert() raises if the number of flags does not match the number of rows.
wangData.insert(0, "rc", isReverseStrand)

# Extract the genomic sequence of every region written above from the
# reference FASTA, using `samtools faidx` with a region file (-r) and an
# explicit output file (-o).
command = [
    samtoolsBin, 'faidx', referenceHumanGenome,
    '-r', 'samtools-faidx-region-file.txt',
    '-o', 'samtools-faidx-region-file-out.txt',
]
# check=True raises CalledProcessError if samtools exits with a non-zero status.
run(command, check=True)

# Read the extracted FASTA back in. Taking every second line starting at the
# second one assumes each record is exactly one header line followed by one
# sequence line.
with open("samtools-faidx-region-file-out.txt", "r") as inFile:
    fastaLines = [x.strip().upper() for x in inFile]
wangData["samtools_raw"] = fastaLines[1::2]

# Orient each extracted sequence like its guide (reverse-complement the
# reverse-strand hits) and drop the final base. A list comprehension is used
# instead of DataFrame.apply(axis=1) so an empty frame is handled as well.
wangData["seq"] = [
    (rc(raw) if isRc else raw)[:-1]
    for raw, isRc in zip(wangData["samtools_raw"], wangData["rc"])
]

# .copy() detaches the two-column selection so the assignment below does not
# write into a slice of the wider frame.
wangData = wangData[["seq", "Indel freq"]].copy()
wangData["Indel freq"] = wangData["Indel freq"] * 100

# Remove the intermediate files produced by this cell.
for tempFile in (
    "input.txt",
    "output.txt",
    "samtools-faidx-region-file.txt",
    "samtools-faidx-region-file-out.txt",
):
    os.unlink(tempFile)

In [None]:
# One-hot encode every sequence and transpose the result so that each
# encoding has one row per alphabet symbol and one column per position.
onehotEncoded = [
    np.array(encode(seq)).transpose().tolist()
    for seq in wangData["seq"]
]

# Store the encodings as the second column and preview the frame.
wangData.insert(1, "Onehot Encoding", onehotEncoded)
wangData.head()

In [None]:
# Melting temperature of every sequence, computed with Biopython's
# MeltingTemp.Tm_NN using its default parameters.
meltingTemp = [mt.Tm_NN(Seq(seq)) for seq in wangData["seq"]]

# Store the values as the third column and preview the frame.
wangData.insert(2, "Melting Point", meltingTemp)
wangData.head()

In [None]:
# Pull the three columns out of the frame as plain Python lists, in row order.
onehot = list(wangData["Onehot Encoding"])
response = [float(value) for value in wangData["Indel freq"]]
meltingpoint = [float(value) for value in wangData["Melting Point"]]

# Convert to float32 tensors: inputs, targets and the extra feature.
_onehot = t.tensor(onehot, dtype=t.float32)
_response = t.tensor(response, dtype=t.float32)
_meltingpoint = t.tensor(meltingpoint, dtype=t.float32)

# Persist each tensor to its own file.
for tensor, path in (
    (_onehot, 'wang_2019_X.pt'),
    (_response, 'wang_2019_Y.pt'),
    (_meltingpoint, 'wang_2019_Features.pt'),
):
    t.save(tensor, path)

# Report the shapes as a quick sanity check.
print(_onehot.shape, _response.shape, _meltingpoint.shape)

In [None]:
# Write the final frame to CSV, leaving out the DataFrame index column.
wangData.to_csv("wang_2019.csv", index=False)