In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import matplotlib

pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)

plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['figure.dpi'] = 200

sns.set(font='SimHei', style='whitegrid',\
        palette='muted',context= 'paper') 

warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
# Read the raw train and test tables.
train_data = pd.read_csv('../../data/protein/origin/1_train.csv')
test_data = pd.read_csv('../../data/protein/origin/1_test.csv')

# Hold out 30% of the training rows as a validation split; the fixed
# random_state keeps the split reproducible across runs.
train_data, val_data = train_test_split(
    train_data,
    test_size=0.3,
    random_state=2021,
)

# Later cells iterate over the three splits in this order.
concat_data = [train_data, val_data, test_data]

# Row count of each split (train, validation, test).
tuple(map(len, concat_data))

(54271, 23260, 19383)

In [3]:
def conver_text(text):
    """One-hot encode a nucleotide sequence.

    Each character of ``text`` becomes one row of the result, with a single 1
    in the column of its base, in the order A, C, G, T.

    Parameters
    ----------
    text : str
        Sequence made up only of the upper-case characters 'A', 'C', 'G', 'T'.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(text), 4)``. An empty ``text`` yields an
        array of shape ``(0, 4)``.

    Raises
    ------
    KeyError
        If ``text`` contains a character other than A, C, G or T.
    """
    base_column = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    try:
        columns = [base_column[base] for base in text]
    except KeyError as err:
        raise KeyError(
            f"unsupported base {err.args[0]!r} in sequence; expected only A, C, G, T"
        ) from None
    # Row k of the identity matrix is the one-hot vector for column k, so a
    # single fancy-index builds the whole encoding without repeated
    # concatenation. The explicit intp dtype keeps an empty index list valid.
    return np.eye(4, dtype=int)[np.asarray(columns, dtype=np.intp)]

In [4]:
# Add a 'one_hot' column to every split, holding the encoded form of each
# row's 'text' sequence.
for data in concat_data:
    # Renumber rows 0..n-1 in place so later cells can address rows by
    # position-like labels (e.g. data['one_hot'][0]).
    data.reset_index(drop=True, inplace=True)
    # Assign the whole column at once. The previous element-wise
    # data['one_hot'][j] = ... was chained assignment, which pandas does not
    # guarantee writes back to the frame.
    data['one_hot'] = data['text'].map(conver_text)

In [5]:
# Sanity check: shape of the encoded sequence stored in row 0 of the training split.
train_data.loc[0, 'one_hot'].shape

(101, 4)

In [6]:
# File-name stems for the saved arrays, in the same order as concat_data
# (train, validation, test).
name_list = ['train', 'valid', 'test']

In [7]:
# Stack each split's per-row encodings into one (n, 101, 4) array and its
# labels into an (n, 1) column, then save both as .npy files named after the split.
for i, data in enumerate(concat_data):
    # Build the batch in a single stack instead of growing it with
    # np.concatenate per row, which recopies the whole array every iteration.
    # The per-sample reshape still raises if an encoding cannot be viewed as (101, 4).
    npdata = np.stack([one_hot.reshape(101, 4) for one_hot in data['one_hot']], axis=0)
    labels = data['label'].to_numpy().reshape(-1, 1)
    np.save('../../data/protein/trans/'+name_list[i]+'.npy', npdata)
    np.save('../../data/protein/trans/'+name_list[i]+'_labels.npy', labels)

In [8]:
# Read the saved training array back from disk and display its shape.
a = np.load('../../data/protein/trans/train.npy')
a.shape

(54271, 101, 4)

In [9]:
# Show the full encoding of the sample at index 1 (all positions, all channels).
a[1]

array([[0, 1, 0, 0],
       [0, 1, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       [1, 0, 0, 0],
       [0, 0, 1, 0],
       [1, 0, 0, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [1, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 0, 1, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [1, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [1, 0, 0, 0],
       [0, 1, 0, 0],
       [1, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       [1, 0, 0, 0],
       [1, 0,

In [8]:
# Read the saved training labels back from disk and display their shape.
b = np.load('../../data/protein/trans/train_labels.npy')
b.shape

(54271, 1)