# Import

In [10]:
import pandas as pd
import os
import random
import numpy as np
import warnings
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.model_selection import train_test_split

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
warnings.filterwarnings(action='ignore') 

# Data preprocessing

In [3]:
df = pd.read_csv('train.csv')

df["frag_3"] = 0
df["frag_4"] = 0

for i in tqdm(range(len(df))):
    df["frag_3"][i] = df["frag_1"][i][::-1]
    df["frag_4"][i] = df["frag_2"][i][::-1]
    
df

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1699067/1699067 [00:17<00:00, 96960.54it/s]


Unnamed: 0,ID,frag_1,frag_2,h_bond_distance,frag_3,frag_4
0,TRAIN_0000000,TLGR,VALV,15.0,RGLT,VLAV
1,TRAIN_0000001,AILTCPF,GRIVPRF,20.0,FPCTLIA,FRPVIRG
2,TRAIN_0000002,LRLSCA,MQLYVT,17.0,ACSLRL,TVYLQM
3,TRAIN_0000003,TASVVCLLNN,SLTLTSSLSY,25.0,NNLLCVVSAT,YSLSSTLTLS
4,TRAIN_0000004,VKLYL,ILVSD,19.0,LYLKV,DSVLI
...,...,...,...,...,...,...
1699062,TRAIN_1699062,LMLVHYEGYL,IELLDINFIL,24.0,LYGEYHVLML,LIFNIDLLEI
1699063,TRAIN_1699063,MLIT,IFPV,16.0,TILM,VPFI
1699064,TRAIN_1699064,IYLGRYEGWYS,MYNEESVWTVV,24.0,SYWGEYRGLYI,VVTWVSEENYM
1699065,TRAIN_1699065,KFWLIYTD,CTTLYNHC,21.0,DTYILWFK,CHNYLTTC


In [4]:
df["check_cha"] = 0
df["check_num"] = 0

sosu = ["V", "L", "I", "A", "F", "Y", "M", "W", "G", "P"] # 소수성
kuk = ["T", "S", "N","Q","C"] # 극성
san = ["E","D"] # 산성
yum = ["H","R","K"] # 염기성

In [5]:
"""
4가지 경우의 수
frag_1 첫글자 - frag_2 첫글자
frag_1 첫글자 - frag_2 마지막글자
frag_1 마지막글자 - frag_2 첫글자
frag_1 마지막글자 - frag_2 마지막글자
"""

"""
그 안에서 
1. 산성 - 염기성인지 찾음
2. 염기성 - 산성인지 찾음
"""

for i in tqdm(range(len(df))):

    """
    frag_1 의 첫글자 - frag_2 첫글자
    """        
        
    if df["frag_1"][i][0] in san and df["frag_2"][i][0] in yum:
        df["check_cha"][i] = "1-sanyum"
    if df["frag_1"][i][0] in yum and df["frag_2"][i][0] in san:
        df["check_cha"][i] = "1-sanyum"    
    
    """
    frag_1 첫글자 - frag_2 마지막글자
    """

    if df["frag_1"][i][0] in san and df["frag_2"][i][-1] in yum:
        df["check_cha"][i] = "2-sanyum"
    if df["frag_1"][i][0] in yum and df["frag_2"][i][-1] in san:
        df["check_cha"][i] = "2-sanyum"
        
    """
    frag_1 마지막글자 - frag_2 첫글자
    """

    if df["frag_1"][i][-1] in san and df["frag_2"][i][0] in yum:
        df["check_cha"][i] = "3-sanyum"
    if df["frag_1"][i][-1] in yum and df["frag_2"][i][0] in san:
        df["check_cha"][i] = "3-sanyum"            
        
        
    """
    frag_1 마지막글자 - frag_2 마지막글자
    """

    if df["frag_1"][i][-1] in san and df["frag_2"][i][-1] in yum:
        df["check_cha"][i] = "4-sanyum"
    if df["frag_1"][i][-1] in yum and df["frag_2"][i][-1] in san:
        df["check_cha"][i] = "4-sanyum"    

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1699067/1699067 [00:33<00:00, 51022.16it/s]


In [7]:
df["comb"] = 0

"""
df["comb1"] = df["frag_1"] + df["frag_2"] 첨글자 + 첨글자
df["comb2"] = df["frag_1"] + df["frag_4"] 첨글자 + 마지막 글자
df["comb3"] = df["frag_2"] + df["frag_1"] 마지막 글자 + 첨글자
df["comb4"] = df["frag_2"] + df["frag_3"] 마지막 글자 + 마지막 글자
"""
#산성염기성 결합이면 combination 값을 ['comb']에 넣음 (두 단백질 붙인 sequence)
for i in tqdm(range(len(df))):
    if df["check_cha"][i] == "4-sanyum":
        df["comb"][i] = df["frag_2"][i] + df["frag_3"][i]
    if df["check_cha"][i] == "3-sanyum":
        df["comb"][i] = df["frag_2"][i] + df["frag_1"][i]
    if df["check_cha"][i] == "2-sanyum":
        df["comb"][i] = df["frag_1"][i] + df["frag_4"][i]
    if df["check_cha"][i] == "1-sanyum":
        df["comb"][i] = df["frag_1"][i] + df["frag_2"][i]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1699067/1699067 [00:15<00:00, 110416.78it/s]


In [8]:
# 소수성-소수성/ 극성-극성은 뺌
df = df[df["check_cha"] != 0]

df = df[["comb", "h_bond_distance"]]

df = df.reset_index(drop = True)


## 결합된 두 Protein의 길이 32로 제한

In [9]:
df["comb1"] = 0

for i in tqdm(range(len(df))):
    if len(df["comb"][i])<=32:
        df["comb1"][i] = df["comb"][i]
    else:
        df["comb1"][i] = df["comb"][i][(len(df["comb"][i])//2) -16:(len(df["comb"][i])//2) + 16]

df = df[["comb1", "h_bond_distance"]]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 157000/157000 [00:01<00:00, 136916.80it/s]


In [12]:
CFG = {
    'SEQ_MAX_LEN':32,
    'EPOCHS':6,
    'LEARNING_RATE':3e-4,
    'BATCH_SIZE':1024,
    'SEED':41
}

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

x_train, x_test, _, _ = train_test_split(df, df["h_bond_distance"], test_size=0.3, random_state=CFG['SEED'])

# Data preprocessing
## 알파벳 총 20개 사용

In [13]:
def get_preprocessing(df):
    # 그냥 알파벳 순서
    alpha_map = {
        '<PAD>': 0, 'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5,
        'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10,
        'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15,
        'S': 16, 'T': 17, 'V': 18, 'W': 19,
        'Y': 20
    }

    frag1_list = []
    
    for frag1 in tqdm(df['comb1']):
        frag1_pad = [alpha_map['<PAD>'] for _ in range(CFG['SEQ_MAX_LEN'])]
        frag1_seq = [alpha_map[x] for x in frag1]

        if CFG['SEQ_MAX_LEN']<len(frag1):
            frag1_pad[:len(frag1)] = frag1_seq[:CFG['SEQ_MAX_LEN']]
        else:
            frag1_pad[:len(frag1)] = frag1_seq[:]

        frag1_list.append(frag1_pad)
    print('Done.')
    return frag1_list

In [14]:
class CustomDataset(Dataset):
    def __init__(self, frag1_list, dist_list):
        self.frag1_list = frag1_list
        self.dist_list = dist_list
        
    def __getitem__(self, index):
        self.frag1 = self.frag1_list[index]
        
        if self.dist_list is not None:
            self.dist = self.dist_list[index]
            return torch.tensor(self.frag1), self.dist
        else:
            return torch.tensor(self.frag1)
    
    def __len__(self):
        return len(self.frag1_list)

In [15]:
train_frag1_list = get_preprocessing(x_train)
val_frag1_list = get_preprocessing(x_test)

train_dataset = CustomDataset(train_frag1_list, x_train['h_bond_distance'].values)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

val_dataset = CustomDataset(val_frag1_list, x_test['h_bond_distance'].values)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 109900/109900 [00:00<00:00, 404897.21it/s]


Done.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 47100/47100 [00:00<00:00, 507976.93it/s]

Done.





# Model (CNN + LSTM)

In [None]:
class BaseModel(nn.Module):
    def __init__(self,
                 embed_dim=140,
                 seq_dim=512,
                 lstm_bidirect=False
                 ):
        super(BaseModel, self).__init__()
        # Embedding Layer

        self.frag1_embed = nn.Embedding(num_embeddings=21,
                                        embedding_dim=embed_dim,
                                        padding_idx=0
                                        )

        self.c1 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=4)
        self.p1 = nn.MaxPool1d(4, stride=1)
        self.c2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3)
        self.p2 = nn.MaxPool1d(3, stride=1)
        self.c3 = nn.Conv1d(in_channels=128, out_channels=64, kernel_size=2)
        self.p3 = nn.MaxPool1d(2, stride=1)
        self.c4 = nn.Conv1d(in_channels=64, out_channels=32, kernel_size=1)
        self.p4 = nn.MaxPool1d(1, stride=1)

        # LSTM
        self.frag1_lstm = nn.LSTM(input_size=128,
                                  hidden_size=seq_dim,
                                  batch_first=True,
                                  bidirectional=lstm_bidirect
                                  )

        # Classifier
        in_channels = seq_dim
        self.regressor = nn.Sequential(
            nn.Linear(in_features=in_channels, out_features=512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Linear(in_features=512, out_features=256),
            nn.Linear(in_features=256, out_features=128),
            nn.Linear(in_features=128, out_features=1)
        )

    def forward(self, frag1):
        BATCH_SIZE = frag1.size(0)
        # Get Embedding Vector

        frag1 = self.frag1_embed(frag1)

        ###########################################################
        frag1 = self.c1(frag1)
        frag1 = self.p1(frag1)
        frag1 = F.relu(frag1)
        frag1 = self.c2(frag1)
        frag1 = self.p2(frag1)
        frag1 = F.relu(frag1)
        frag1 = self.c3(frag1)
        frag1 = self.p3(frag1)
        frag1 = F.relu(frag1)
        frag1 = self.c4(frag1)
        frag1 = self.p4(frag1)
        # frag1 = F.relu(frag1)
        ################################################################

        # LSTM
        frag1_hidden, _ = self.frag1_lstm(frag1)
        frag1_hidden = frag1_hidden[:, -1, :]

        # Feature Concat -> Binary Classifier
        x = self.regressor(frag1_hidden)
        return x


def train(model, optimizer, train_loader, test_loader, scheduler, device):
    model.to(device)
    criterion = nn.L1Loss().to(device)

    best_model = None
    best_score = 9999999

    for epoch in range(1, CFG['EPOCHS'] + 1):
        model.train()
        train_loss = []
        for frag1, dist in tqdm(iter(train_loader)):
            frag1 = frag1.to(device)
            dist = dist.float().to(device)

            optimizer.zero_grad()

            output = model(frag1)
            loss = criterion(output, dist)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        val_loss, val_score = validation(model, val_loader, criterion, device)
        print(
            f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}] Val RMSE : [{val_score:.5f}]')

        if scheduler is not None:
            scheduler.step(val_score)

        if best_score > val_score:
            best_score = val_score
            best_model = model

    return best_model


from sklearn.metrics import mean_squared_error


def validation(model, val_loader, criterion, device):
    model.eval()
    preds = []
    trues = []
    val_loss = []
    with torch.no_grad():
        for frag1, dist in tqdm(iter(val_loader)):
            frag1 = frag1.to(device)
            dist = dist.float().to(device)

            model_pred = model(frag1)

            loss = criterion(model_pred, dist)

            model_pred = model_pred.squeeze(1).to('cpu')
            preds += model_pred.tolist()
            trues += dist.tolist()

            val_loss.append(loss.item())

    val_score = mean_squared_error(trues, preds, squared=False)  # squared=False : RMSE
    return np.mean(val_loss), val_score


model = BaseModel()

model.eval()
optimizer = torch.optim.Adam(params=model.parameters(), lr=CFG["LEARNING_RATE"])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2,
                                                       threshold_mode='abs', min_lr=1e-8, verbose=True)

infer_model = train(model, optimizer, train_loader, val_loader, scheduler, device)

# test dataset

In [None]:
test_df = pd.read_csv('test.csv')

test_df["frag_3"] = 0
test_df["frag_4"] = 0

from tqdm import tqdm

for i in tqdm(range(len(test_df))):
    test_df["frag_3"][i] = test_df["frag_1"][i][::-1]
    test_df["frag_4"][i] = test_df["frag_2"][i][::-1]

test_df["check_cha"] = 0
test_df["check_num"] = 0

for i in tqdm(range(len(test_df))):

    if test_df["frag_1"][i][0] in san and test_df["frag_2"][i][0] in yum:
        test_df["check_cha"][i] = "1-sanyum"
    if test_df["frag_1"][i][0] in yum and test_df["frag_2"][i][0] in san:
        test_df["check_cha"][i] = "1-sanyum"

    if test_df["frag_1"][i][0] in san and test_df["frag_2"][i][-1] in yum:
        test_df["check_cha"][i] = "2-sanyum"
    if test_df["frag_1"][i][0] in yum and test_df["frag_2"][i][-1] in san:
        test_df["check_cha"][i] = "2-sanyum"

    if test_df["frag_1"][i][-1] in san and test_df["frag_2"][i][0] in yum:
        test_df["check_cha"][i] = "3-sanyum"
    if test_df["frag_1"][i][-1] in yum and test_df["frag_2"][i][0] in san:
        test_df["check_cha"][i] = "3-sanyum"

    if test_df["frag_1"][i][-1] in san and test_df["frag_2"][i][-1] in yum:
        test_df["check_cha"][i] = "4-sanyum"
    if test_df["frag_1"][i][-1] in yum and test_df["frag_2"][i][-1] in san:
        test_df["check_cha"][i] = "4-sanyum"

test_df["comb"] = 0


for i in tqdm(range(len(test_df))):

    if test_df["check_cha"][i] == "4-sanyum":
        test_df["comb"][i] = test_df["frag_2"][i] + test_df["frag_3"][i]
    if test_df["check_cha"][i] == "3-sanyum":
        test_df["comb"][i] = test_df["frag_2"][i] + test_df["frag_1"][i]
    if test_df["check_cha"][i] == "2-sanyum":
        test_df["comb"][i] = test_df["frag_1"][i] + test_df["frag_4"][i]
    if test_df["check_cha"][i] == "1-sanyum":
        test_df["comb"][i] = test_df["frag_1"][i] + test_df["frag_2"][i]

    else:
        test_df["comb"][i] = test_df["frag_1"][i] + test_df["frag_2"][i]

test_df = test_df.reset_index(drop=True)

test_df["comb1"] = 0

for i in tqdm(range(len(test_df))):
    if len(test_df["comb"][i]) <= 32:
        test_df["comb1"][i] = test_df["comb"][i]
    else:
        test_df["comb1"][i] = test_df["comb"][i][(len(test_df["comb"][i]) // 2) - 16:(len(test_df["comb"][i]) // 2) + 16]

test_df = test_df[["comb1", "comb"]]

test_frag1_list = get_preprocessing(test_df)

test_dataset = CustomDataset(test_frag1_list, None)
test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

def inference(model, test_loader, device):
    model.to(device)
    model.eval()

    preds = []
    with torch.no_grad():
        for frag1 in tqdm(iter(test_loader)):
            frag1 = frag1.to(device)

            model_pred = model(frag1)

            model_pred = model_pred.squeeze(1).to('cpu')
            preds += model_pred.tolist()
    return preds

preds = inference(infer_model, test_loader, device)

submit = pd.read_csv('sample_submission.csv')
submit['h_bond_distance'] = preds


submit.to_csv('submition_real1_C1.csv', index=False)