# Preprocessing
Kim, H. K., Kim, Y., Lee, S., Min, S., Bae, J. Y., Choi, J. W., Park, J., Jung, D., Yoon, S., & Kim, H. H. (2019). SpCas9 activity prediction by DeepSpCas9, a deep learning–based model with high generalization performance. Science Advances, 5(11), eaax9249. 

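This notebook was used to preprocess the data from Supplementary Tables 1 and 2 of the paper above: it one-hot encodes each target context sequence, computes a melting temperature for it, and saves the results as PyTorch tensors and a CSV file.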

In [None]:
import pandas as pd
from Bio.SeqUtils import MeltingTemp as mt
from Bio.Seq import Seq
import numpy as np
import torch as t

In [None]:
def encode(seq, z='ATCG'):
    """One-hot encode ``seq`` against the alphabet ``z``.

    Returns a list with one row per character of ``seq``. Each row has one
    entry per symbol of ``z``: 1 where the symbol equals the character and
    0 elsewhere. A character that does not occur in ``z`` yields an
    all-zero row.
    """
    return [[1 if symbol == base else 0 for symbol in z] for base in seq]


# Sanity check: each base maps to the unit vector at its position in 'ATCG'.
assert encode('ATCG') == [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]

In [None]:
# Column headers as they appear in the supplementary spreadsheets.
_SEQ_COL = "Target context sequence (4+20+3+3)"
_INDEL_COL = "Background subtracted indel (%)"

# Load both supplementary tables.
kimData_train = pd.read_excel("kim_2019_table_s1.xlsx")
kimData_test = pd.read_excel("kim_2019_table_s2.xlsx")

# The second table labels its response column differently; give it the
# same header as the first so the two frames can be stacked.
kimData_test = kimData_test.rename(
    columns={"Background subtracted indel frequencies\n(average, %)": _INDEL_COL}
)

# Keep only the sequence and response columns from each table.
kimData_train = kimData_train[[_SEQ_COL, _INDEL_COL]]
kimData_test = kimData_test[[_SEQ_COL, _INDEL_COL]]

# Stack the tables, keep the first occurrence of each sequence, and switch
# to short column names.
kimData = (
    pd.concat((kimData_train, kimData_test))
    .drop_duplicates(_SEQ_COL)
    .rename(columns={_SEQ_COL: "seq", _INDEL_COL: "Indel freq"})
)

In [None]:
# Encode every sequence and transpose each result, so that an encoding
# holds one row per alphabet symbol and one column per sequence position.
onehotEncoded = [
    np.asarray(encode(context)).T.tolist() for context in kimData["seq"]
]

# Store the encodings as a new column at position 1.
kimData.insert(1, "Onehot Encoding", onehotEncoded)
kimData.head()

In [None]:
# Melting temperature of every sequence in the "seq" column, computed with
# Biopython's Tm_NN using its default parameters.
meltingTemp = [mt.Tm_NN(Seq(context)) for context in kimData["seq"]]

# Store the values as a new column at position 2.
kimData.insert(2, "Melting Point", meltingTemp)
kimData.head()

In [None]:
# Pull each column out as a plain Python list. The columns are read
# directly rather than row by row: only three columns are needed, and
# iterating rows would build a temporary Series for every row.
onehot = list(kimData["Onehot Encoding"])
response = [float(value) for value in kimData["Indel freq"]]
meltingpoint = [float(value) for value in kimData["Melting Point"]]

# Convert to float32 tensors. t.tensor needs rectangular input, so this
# raises if the stored encodings do not all have the same dimensions.
_onehot = t.tensor(onehot, dtype=t.float32)
_response = t.tensor(response, dtype=t.float32)
_meltingpoint = t.tensor(meltingpoint, dtype=t.float32)

# Persist the encodings (X), the indel frequencies (Y) and the
# melting-point feature as separate tensor files.
t.save(_onehot, 'kim_2019_X.pt')
t.save(_response, 'kim_2019_Y.pt')
t.save(_meltingpoint, 'kim_2019_Features.pt')

# Quick shape check; the leading dimension of all three should match.
print(_onehot.shape, _response.shape, _meltingpoint.shape)

In [None]:
# Export the full table (sequence, one-hot encoding, melting point, indel
# frequency) as CSV, omitting the row index.
kimData.to_csv("kim_2019.csv", index=False)