# Preprocessing
Xiang, X., Corsi, G. I., Anthon, C., Qu, K., Pan, X., Liang, X., Han, P., Dong, Z., Liu, L., Zhong, J., Ma, T., Wang, J., Zhang, X., Jiang, H., Xu, F., Liu, X., Xu, X., Wang, J., Yang, H., Bolund, L., Church, G. M., Lin, L., Gorodkin, J., & Luo, Y. (2021). Enhancing CRISPR-Cas9 gRNA efficiency prediction by data integration and deep learning. Nature Communications, 12(1), 3238.

This file was used to preprocess the data obtained from Supplementary Data 1 of the publication cited above, sheets spCas9_eff_D8-dox and spCas9_eff_D10-dox.

NOTE: You will need the optional dependencies to run this file.\
Ensure you have downloaded the human reference genome and built the bowtie2 index.

In [None]:
# Information required to extend sequences
# NOTE: If you get errors about files not existing, try using absolute paths
bowtie2Bin = "bowtie2"  # bowtie2 executable launched for the alignment step
samtoolsBin = "samtools"  # samtools executable launched for `faidx`
referenceHumanGenome = "human.fna"  # reference FASTA handed to `samtools faidx`
bowtie2Index = "human_bt"  # index name handed to bowtie2 through `-x`

In [None]:
import pandas as pd
from Bio.SeqUtils import MeltingTemp as mt
from Bio.Seq import Seq
import numpy as np
import torch as t
from subprocess import run
import os

In [None]:
def encode(seq, z='ATCG'):
    """One-hot encode ``seq`` against the alphabet ``z``.

    Returns one inner list per element of ``seq``. Each inner list has one
    entry per symbol of ``z``: 1 where the symbol equals the element and 0
    everywhere else. An element that is not in ``z`` gives an all-zero row.
    """
    return [[int(symbol == base) for symbol in z] for base in seq]


# Sanity check: each base of the default alphabet maps to its own unit vector.
assert encode('ATCG') == [
    [1, 0, 0, 0],
    [0, 1, 0, 0],
    [0, 0, 1, 0],
    [0, 0, 0, 1],
]

In [None]:
def rc(dna):
    """Return the reverse complement of ``dna``, or ``None`` on failure.

    Upper- and lower-case A/C/G/T and the ambiguity codes R, Y, M, K, B, D,
    H and V are complemented with their case preserved. Any other character
    is left unchanged. If ``dna`` cannot be translated or sliced (for
    example, it is not a string), the error is swallowed and ``None`` is
    returned.
    """
    table = str.maketrans(
        'acgtrymkbdhvACGTRYMKBDHV',
        'tgcayrkmvhdbTGCAYRKMVHDB',
    )
    try:
        complemented = dna.translate(table)
        return complemented[::-1]
    except Exception:
        return None

In [None]:
def _load_indel_sheet(path):
    """Read one exported sheet and keep the guide with its indel efficiency.

    Returns a two-column frame: "seq" (renamed from "gRNA") and
    "Indel freq" (renamed from "total_indel_eff").
    """
    sheet = pd.read_excel(path)
    return sheet[["gRNA", "total_indel_eff"]].rename(
        columns={"gRNA": "seq", "total_indel_eff": "Indel freq"}
    )


xiangData_8 = _load_indel_sheet("xiang_2021_d1_spCas9_eff_D8-dox.xlsx")
xiangData_10 = _load_indel_sheet("xiang_2021_d1_spCas9_eff_D10-dox.xlsx")

# Keep only the guides present in both sheets. Both frames carry an
# "Indel freq" column, so the merge suffixes them "_x" (D8) and "_y" (D10).
xiangData = pd.merge(xiangData_8, xiangData_10, on="seq", how="inner")

# Average the two measurements, naming the numeric columns explicitly.
# A bare DataFrame.mean(axis=1) would also visit the string "seq" column;
# pandas >= 2.0 no longer skips it silently and raises a TypeError.
xiangData["Indel freq"] = xiangData[["Indel freq_x", "Indel freq_y"]].mean(axis=1)
xiangData = xiangData.drop_duplicates(subset=["seq"])
xiangData.head()

In [None]:
# Align every guide to the reference genome with bowtie2, keep only the hits
# that pass the mismatch/uniqueness filter below, then replace each guide with
# the surrounding genomic sequence fetched by `samtools faidx`.

# Scratch files exchanged with the external tools; removed in `finally` so a
# failing bowtie2/samtools run does not leave them behind.
readsFile = "input.txt"
alignmentFile = "output.txt"
regionFile = "samtools-faidx-region-file.txt"
regionFastaFile = "samtools-faidx-region-file-out.txt"

try:
    # One raw sequence per line, the layout bowtie2 expects with "-r".
    with open(readsFile, 'w') as outFile:
        outFile.writelines([f"{x}\n" for x in xiangData["seq"].to_list()])
    command = [
        bowtie2Bin,
        "-x",
        bowtie2Index,
        "-p",
        "32",
        # Keep SAM records in read order: the row bookkeeping below relies on
        # the kept alignments lining up with the remaining xiangData rows.
        "--reorder",
        "--no-hd",
        "-t",
        "-r",
        "-U",
        readsFile,
        "-S",
        alignmentFile,
    ]
    run(command, check=True)

    forwarded = 0
    dropped = 0
    # One flag per kept alignment: True when the SAM FLAG field is "16".
    # (Deliberately not called `reversed`, which would shadow the builtin.)
    isReversed = []
    with open(alignmentFile, "r") as inFile, open(regionFile, "w") as outFile:
        for line in inFile:
            # Strip the newline first so the last SAM field can also match the
            # tag membership tests below.
            entries = line.rstrip("\n").split("\t")
            # Keep reads tagged XM:i:0 that do not also carry XS:i:0.
            # NOTE(review): per the bowtie2 manual XM is the mismatch count and
            # XS the score of the second-best alignment - confirm.
            if ("XM:i:0" in entries) and ("XS:i:0" not in entries):
                forwarded += 1
                # Request a 31-base window around the alignment start
                # (POS-4 .. POS+26); one base is trimmed after orientation.
                outFile.write(f"{entries[2]}:{int(entries[3])-4}-{int(entries[3])+26}\n")
                isReversed.append(entries[1] == "16")
            else:
                dropped += 1
                length = xiangData.shape[0]
                # Remove the guide by sequence. If the sequence as written in
                # the SAM record matches no row, retry with its reverse
                # complement.
                xiangData.drop(xiangData[xiangData["seq"] == entries[9]].index, inplace=True)
                if (length == xiangData.shape[0]):
                    xiangData.drop(xiangData[xiangData["seq"] == rc(entries[9])].index, inplace=True)
                if (length == xiangData.shape[0]):
                    print(f"Couldn't remove {entries[9]}")

    # Raises ValueError if the kept alignments and remaining rows disagree in
    # number, which surfaces any bookkeeping mismatch immediately.
    xiangData.insert(0, "rc", isReversed)

    command = [
        samtoolsBin,
        'faidx',
        referenceHumanGenome,
        '-r',
        regionFile,
        '-o',
        regionFastaFile,
    ]
    run(command, check=True)

    with open(regionFastaFile, "r") as inFile:
        # Every second line, starting with the second, is taken as sequence.
        # This assumes each record is one header line plus one sequence line.
        xiangData["samtools_raw"] = [x.strip().upper() for x in inFile.readlines()][1::2]
    # Orient the window (reverse complement when the "rc" flag is set), then
    # drop its last base.
    xiangData["seq"] = xiangData.apply(lambda x: rc(x.samtools_raw)[:-1] if x.rc else x.samtools_raw[:-1], axis=1)
    xiangData = xiangData[["seq", "Indel freq"]]
finally:
    for scratchFile in (readsFile, alignmentFile, regionFile, regionFastaFile):
        if os.path.exists(scratchFile):
            os.unlink(scratchFile)

In [None]:
# One-hot encode every sequence. The transpose turns the per-position rows
# produced by `encode` into one row per alphabet symbol and one column per
# position.
onehotEncoded = [
    np.array(encode(guide)).transpose().tolist()
    for guide in xiangData["seq"]
]

xiangData.insert(1, "Onehot Encoding", onehotEncoded)
xiangData.head()

In [None]:
# Melting temperature of every sequence, as computed by Biopython's Tm_NN
# with its default parameters.
meltingTemp = [mt.Tm_NN(Seq(guide)) for guide in xiangData["seq"]]

xiangData.insert(2, "Melting Point", meltingTemp)
xiangData.head()

In [None]:
# Pull the model inputs and targets out of the table, column by column.
onehot = xiangData["Onehot Encoding"].tolist()
response = [float(value) for value in xiangData["Indel freq"]]
meltingpoint = [float(value) for value in xiangData["Melting Point"]]

# Convert each column to a float32 tensor.
_onehot = t.tensor(onehot, dtype=t.float32)
_response = t.tensor(response, dtype=t.float32)
_meltingpoint = t.tensor(meltingpoint, dtype=t.float32)

# Persist features (X), targets (Y) and the extra melting-point feature.
t.save(_onehot, 'xiang_2021_X.pt')
t.save(_response, 'xiang_2021_Y.pt')
t.save(_meltingpoint, 'xiang_2021_Features.pt')

print(_onehot.shape, _response.shape, _meltingpoint.shape)

In [None]:
# Write the final table (sequence, one-hot encoding, melting point, indel
# frequency) to CSV, leaving out the DataFrame index.
xiangData.to_csv("xiang_2021.csv", index=False)