In [None]:
import pandas as pd
from Bio.SeqUtils import MeltingTemp as mt
from Bio.Seq import Seq
import numpy as np
import torch as t

In [None]:
# Load the Wang 2019 table, label its indel-frequency column with the study
# name, and discard the stored "Onehot Encoding" / "Melting Point" columns
# (columns with those names are inserted again on the merged frame later on).
wang_2019 = (
    pd.read_csv("wang_2019/wang_2019.csv")
    .rename(columns={"Indel freq": "Indel freq Wang 2019"})
    .drop(columns=["Onehot Encoding", "Melting Point"])
)

In [None]:
# Load the Kim 2019 table, label its indel-frequency column with the study
# name, and discard the stored "Onehot Encoding" / "Melting Point" columns
# (columns with those names are inserted again on the merged frame later on).
kim_2019 = (
    pd.read_csv("kim_2019/kim_2019.csv")
    .rename(columns={"Indel freq": "Indel freq Kim 2019"})
    .drop(columns=["Onehot Encoding", "Melting Point"])
)

In [None]:
# Load the Kim 2020 table, label its indel-frequency column with the study
# name, and discard the stored "Onehot Encoding" / "Melting Point" columns
# (columns with those names are inserted again on the merged frame later on).
kim_2020 = (
    pd.read_csv("kim_2020/kim_2020.csv")
    .rename(columns={"Indel freq": "Indel freq Kim 2020"})
    .drop(columns=["Onehot Encoding", "Melting Point"])
)

In [None]:
# Load the Xiang 2021 table, label its indel-frequency column with the study
# name, and discard the stored "Onehot Encoding" / "Melting Point" columns
# (columns with those names are inserted again on the merged frame later on).
xiang_2021 = (
    pd.read_csv("xiang_2021/xiang_2021.csv")
    .rename(columns={"Indel freq": "Indel freq Xiang 2021"})
    .drop(columns=["Onehot Encoding", "Melting Point"])
)

In [None]:
# Outer-join the four study tables on 'seq' so that every sequence present in
# at least one study is kept; a study that lacks a sequence contributes NaN.
mergedData = pd.merge(wang_2019, kim_2019, on=['seq'], how='outer')
mergedData = pd.merge(mergedData, kim_2020, on=['seq'], how='outer')
mergedData = pd.merge(mergedData, xiang_2021, on=['seq'], how='outer')

# Average only the per-study indel-frequency columns. Taking the mean of the
# whole frame would also cover the string 'seq' column, which raises TypeError
# on pandas >= 2.0 (numeric_only now defaults to False) and, on older versions,
# would silently fold any other numeric column into the average.
# NaNs are skipped (default skipna=True), so each row is the mean over the
# studies that actually measured that sequence.
indelFreqColumns = [
    "Indel freq Wang 2019",
    "Indel freq Kim 2019",
    "Indel freq Kim 2020",
    "Indel freq Xiang 2021",
]
mergedData['Indel freq'] = mergedData[indelFreqColumns].mean(axis=1)
print(mergedData.shape)
mergedData.head()

In [None]:
def encode(seq, z='ATCG'):
    """One-hot encode ``seq`` against the alphabet ``z``.

    Args:
        seq: Iterable of symbols (typically a string) to encode.
        z: Alphabet that defines the slot order of each one-hot row.

    Returns:
        A list with one row per symbol of ``seq``. Each row has one entry per
        symbol of ``z``: 1 where the alphabet symbol equals the input symbol,
        0 elsewhere. A symbol that is not in ``z`` yields an all-zero row.
    """
    rows = []
    for base in seq:
        row = []
        for symbol in z:
            row.append(1 if symbol == base else 0)
        rows.append(row)
    return rows


# Sanity check: with the default alphabet, 'ATCG' encodes to the identity matrix.
assert encode('ATCG') == [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]

In [None]:
# One-hot encode every sequence of the merged frame. encode() already returns
# plain nested Python lists, so no NumPy round trip is needed.
onehotEncoded = [encode(seq) for seq in mergedData["seq"]]

# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail when run a second time. Drop any copy left by a previous run
# first (errors="ignore" makes the drop a no-op on the first run).
mergedData = mergedData.drop(columns=["Onehot Encoding"], errors="ignore")
mergedData.insert(1, "Onehot Encoding", onehotEncoded)
mergedData.head()

In [None]:
# Compute a melting temperature for every sequence with Biopython's
# MeltingTemp.Tm_NN, using its default parameters.
meltingTemp = [mt.Tm_NN(Seq(seq)) for seq in mergedData["seq"]]

# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail when run a second time. Drop any copy left by a previous run
# first (errors="ignore" makes the drop a no-op on the first run).
mergedData = mergedData.drop(columns=["Melting Point"], errors="ignore")
mergedData.insert(2, "Melting Point", meltingTemp)
mergedData.head()

In [None]:
# Pull the three model arrays straight from their columns instead of walking
# the frame with iterrows(), which builds a Series per row and is much slower.
# The astype(float) calls mirror the explicit float() casts of the row loop.
onehot = mergedData["Onehot Encoding"].tolist()
response = mergedData["Indel freq"].astype(float).tolist()
meltingpoint = mergedData["Melting Point"].astype(float).tolist()

# t.tensor needs a rectangular nested list, so this step requires every
# sequence to have the same length (it raises otherwise).
_onehot = t.tensor(onehot, dtype=t.float32)
_response = t.tensor(response, dtype=t.float32)
_meltingpoint = t.tensor(meltingpoint, dtype=t.float32)

# Persist the tensors next to the notebook: X = one-hot inputs, Y = averaged
# indel frequency, Features = melting point.
t.save(_onehot, 'merged_X.pt')
t.save(_response, 'merged_Y.pt')
t.save(_meltingpoint, 'merged_Features.pt')

print(_onehot.shape, _response.shape, _meltingpoint.shape)