# Preprocessing
Kim, N., Kim, H. K., Lee, S., Seo, J. H., Choi, J. W., Park, J., Min, S., Yoon, S., Cho, S.-R., & Kim, H. H. (2020). Prediction of the sequence-specific cleavage activity of Cas9 variants. Nature Biotechnology, 38(11), 1328-1336.

This notebook preprocesses the SpCas9 indel-frequency data from Supplementary Table 8 of the paper cited above.

In [None]:
import pandas as pd
from Bio.SeqUtils import MeltingTemp as mt
from Bio.Seq import Seq
import numpy as np
import torch as t

In [None]:
def encode(seq, z='ATCG'):
    """One-hot encode ``seq`` against the alphabet ``z``.

    Returns one list per character of ``seq``. Each list has one entry per
    symbol of ``z``: 1 where the symbol equals the character, 0 elsewhere.
    A character that does not occur in ``z`` yields an all-zero list.
    """
    return [[int(symbol == base) for symbol in z] for base in seq]


# Sanity check: the alphabet encoded against itself is the identity matrix.
assert encode('ATCG') == [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]

In [None]:
# Load the supplementary table and reduce it to the target sequence, its PAM
# and the SpCas9 indel frequency.
kimData = pd.read_excel("kim_2020_table_s8.xlsx")
kimData = kimData.rename(
    columns={
        "SpCas9": "Indel freq",
        "Input sequence = Target sequence + neighboring sequences = \n(4 nt left neighboring sequence + 20 nt protospacer (blue lowercase) + 3 nt PAM (RED BOLD UPPERCASE)+ 3nt right neighboring sequence)": "seq",
    }
)

# Explicit copy: the column selection would otherwise be treated as a slice of
# the full table and the assignment below would raise SettingWithCopyWarning.
kimData = kimData[["seq", "Indel freq"]].copy()
kimData["seq"] = kimData["seq"].str.upper()

# Per the column header, the input is 4 nt left context + 20 nt protospacer
# + 3 nt PAM + 3 nt right context, so the PAM sits at positions 24..26.
kimData.insert(1, "pam", kimData["seq"].str[24:27])

# Keep only rows with an NGG PAM and a reported indel frequency, then keep the
# first occurrence of every sequence. The original index labels are preserved.
hasNggPam = kimData["pam"].str[1:] == "GG"
hasMeasurement = kimData["Indel freq"] != "n.a."
kimData = kimData[hasNggPam & hasMeasurement].drop_duplicates(["seq"]).copy()

In [None]:
# One-hot encode every target sequence. encode() yields one row per
# nucleotide; the transpose stores each encoding with one row per alphabet
# symbol instead (symbols x positions).
onehotEncoded = [
    np.asarray(encode(target)).T.tolist() for target in kimData["seq"]
]

kimData.insert(1, "Onehot Encoding", onehotEncoded)
kimData.head()

In [None]:
# Melting temperature of each full input sequence, computed with Biopython's
# Tm_NN using its default parameters.
meltingTemp = [mt.Tm_NN(Seq(target)) for target in kimData["seq"]]

kimData.insert(2, "Melting Point", meltingTemp)
kimData.head()

In [None]:
# Collect the model inputs straight from the columns. Row-wise iteration with
# iterrows() builds a Series per row and is unnecessary for plain column reads.
onehot = kimData["Onehot Encoding"].tolist()
response = [float(freq) for freq in kimData["Indel freq"]]
meltingpoint = [float(tm) for tm in kimData["Melting Point"]]

# Convert everything to float32 tensors for training.
_onehot = t.tensor(onehot, dtype=t.float32)
_response = t.tensor(response, dtype=t.float32)
_meltingpoint = t.tensor(meltingpoint, dtype=t.float32)

# Persist features (X), targets (Y) and the extra melting-point feature.
t.save(_onehot, 'kim_2020_X.pt')
t.save(_response, 'kim_2020_Y.pt')
t.save(_meltingpoint, 'kim_2020_Features.pt')

print(_onehot.shape, _response.shape, _meltingpoint.shape)

In [None]:
kimData.to_csv("kim_2020.csv", index=False)