In [1]:
from torch import nn, optim
from torch.utils.data import DataLoader
import torch
import preprocessing
from sklearn.model_selection import train_test_split
import pandas as pd
import torch.nn.functional as F
from scipy.stats import pearsonr
from train import train_model, test_model
from model_BiLSTM_v1 import BiLSTM
from torch.utils.tensorboard import SummaryWriter

In [2]:
# MPS 장치가 사용 가능한지 확인
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

Using device: mps


In [3]:
new_deepmass_df = pd.read_csv("./data/renewal_deepmass.tsv", sep="\t", low_memory=False)
new_hela1_df = pd.read_csv("./data/renewal_hela1.tsv", sep="\t", low_memory=False)
new_hela2_df = pd.read_csv("./data/renewal_hela2.tsv", sep="\t", low_memory=False)

In [4]:
max_seq_len = 35
max_intens_len = 70

In [5]:
# model setting
input_size = 1
hidden_size = 128
num_layers = 2
output_size = max_intens_len  # 출력 크기는 인텐시티 길이

model = BiLSTM(input_size, hidden_size, num_layers, output_size).to(device)

In [6]:
class CosineSimilarityLoss(nn.Module):
    def __init__(self):
        super(CosineSimilarityLoss, self).__init__()
        self.cosine_similarity = nn.CosineSimilarity(dim=1, eps=1e-6)

    def forward(self, y_pred, y_true):
        return 1 - self.cosine_similarity(y_pred, y_true).mean()

# 손실 함수 및 옵티마이저
criterion = CosineSimilarityLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

### Trial #1: HeLa1 training:validation:test / 8:1:1

In [7]:
tb_writer = SummaryWriter("./tensorboard_logs/trial1")

In [8]:
train_df, val_test_df = train_test_split(new_hela1_df, test_size=0.2, random_state=44)
val_df, test_df = train_test_split(val_test_df, test_size=0.5, random_state=44)

print(f"total df: {len(new_hela1_df)}, train df: {len(train_df)}, val df: {len(val_df)}, test df: {len(test_df)}")

total df: 16762, train df: 13409, val df: 1676, test df: 1677


In [9]:
train_dataset = preprocessing.retrieve_dataset(train_df, max_seq_len, max_intens_len)
val_dataset = preprocessing.retrieve_dataset(val_df, max_seq_len, max_intens_len)
test_dataset = preprocessing.retrieve_dataset(test_df, max_seq_len, max_intens_len)

train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=True)

hela2_dataset = preprocessing.retrieve_dataset(new_hela2_df, max_seq_len, max_intens_len)
deepmass_dataset = preprocessing.retrieve_dataset(new_deepmass_df, max_seq_len, max_intens_len)
hela2_dataloader = DataLoader(hela2_dataset, batch_size=128, shuffle=True)
deepmass_dataloader = DataLoader(deepmass_dataset, batch_size=128, shuffle=True)

In [10]:
tb_writer.get_logdir()

'./tensorboard_logs/trial1'

In [11]:
epochs = 100
trained_model, train_res = train_model(model, criterion, optimizer, epochs, train_dataloader, val_dataloader, device, tb_writer, 20)

test_res = test_model(trained_model, criterion, test_dataloader, device, tb_writer, "hela1 10% test")
hela2_res = test_model(trained_model, criterion, hela2_dataloader, device, tb_writer, "hela2 test")
deepmass_res = test_model(trained_model, criterion, deepmass_dataloader, device, tb_writer, "deepmass test")
tb_writer.close()

Epoch [1/100], Train Loss: 0.4624, Validation Loss: 0.3050
Train Cosine Similarity: 0.5373, Train PCC: 0.4906
Validation Cosine Similarity: 0.6941, Validation PCC: 0.6568
Epoch [2/100], Train Loss: 0.3066, Validation Loss: 0.3031
Train Cosine Similarity: 0.6934, Train PCC: 0.6557
Validation Cosine Similarity: 0.6969, Validation PCC: 0.6596
Epoch [3/100], Train Loss: 0.2867, Validation Loss: 0.2718
Train Cosine Similarity: 0.7132, Train PCC: 0.6763
Validation Cosine Similarity: 0.7297, Validation PCC: 0.6951
Epoch [4/100], Train Loss: 0.2681, Validation Loss: 0.2682
Train Cosine Similarity: 0.7319, Train PCC: 0.6974
Validation Cosine Similarity: 0.7340, Validation PCC: 0.7000
Epoch [5/100], Train Loss: 0.2638, Validation Loss: 0.2653
Train Cosine Similarity: 0.7363, Train PCC: 0.7023
Validation Cosine Similarity: 0.7371, Validation PCC: 0.7039
Epoch [6/100], Train Loss: 0.2589, Validation Loss: 0.2517
Train Cosine Similarity: 0.7411, Train PCC: 0.7079
Validation Cosine Similarity: 0.743

### Trial #2: DeepMass(20% sampling) training:validation:test / 8:1:1

In [10]:
# initialize model
model = BiLSTM(input_size, hidden_size, num_layers, output_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

tb_writer = SummaryWriter("./tensorboard_logs/trial2")
sampled_df = new_deepmass_df.sample(frac=0.2, random_state=44)
train_df, val_test_df = train_test_split(sampled_df, test_size=0.2, random_state=44)
val_df, test_df = train_test_split(val_test_df, test_size=0.5, random_state=44)

print(f"total df: {len(sampled_df)}, train df: {len(train_df)}, val df: {len(val_df)}, test df: {len(test_df)}")

train_dataset = preprocessing.retrieve_dataset(train_df, max_seq_len, max_intens_len)
val_dataset = preprocessing.retrieve_dataset(val_df, max_seq_len, max_intens_len)
test_dataset = preprocessing.retrieve_dataset(test_df, max_seq_len, max_intens_len)

train_dataloader = DataLoader(train_dataset, batch_size=2048, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=True)

hela2_dataset = preprocessing.retrieve_dataset(new_hela2_df, max_seq_len, max_intens_len)
hela1_dataset = preprocessing.retrieve_dataset(new_hela1_df, max_seq_len, max_intens_len)
hela2_dataloader = DataLoader(hela2_dataset, batch_size=128, shuffle=True)
hela1_dataloader = DataLoader(hela1_dataset, batch_size=128, shuffle=True)

epochs = 300
trained_model, train_res = train_model(model, criterion, optimizer, epochs, train_dataloader, val_dataloader, device, tb_writer, 20)

test_res = test_model(trained_model, criterion, test_dataloader, device, tb_writer, "deepmass(40%) 10% test")
hela2_res = test_model(trained_model, criterion, hela2_dataloader, device, tb_writer, "hela2 test")
hela1_res = test_model(trained_model, criterion, hela1_dataloader, device, tb_writer, "hela1 test")
tb_writer.close()

total df: 113092, train df: 90473, val df: 11309, test df: 11310
Epoch [1/300], Train Loss: 0.3757, Validation Loss: 0.2759
Train Cosine Similarity: 0.6227, Train PCC: 0.5625
Validation Cosine Similarity: 0.7240, Validation PCC: 0.6747
Epoch [2/300], Train Loss: 0.2579, Validation Loss: 0.2433
Train Cosine Similarity: 0.7417, Train PCC: 0.6944
Validation Cosine Similarity: 0.7566, Validation PCC: 0.7126
Epoch [3/300], Train Loss: 0.2375, Validation Loss: 0.2278
Train Cosine Similarity: 0.7625, Train PCC: 0.7189
Validation Cosine Similarity: 0.7723, Validation PCC: 0.7308
Epoch [4/300], Train Loss: 0.2253, Validation Loss: 0.2271
Train Cosine Similarity: 0.7746, Train PCC: 0.7329
Validation Cosine Similarity: 0.7730, Validation PCC: 0.7318
Epoch [5/300], Train Loss: 0.2208, Validation Loss: 0.2159
Train Cosine Similarity: 0.7792, Train PCC: 0.7382
Validation Cosine Similarity: 0.7841, Validation PCC: 0.7443
Epoch [6/300], Train Loss: 0.2162, Validation Loss: 0.2153
Train Cosine Similari

### Trial #3: DeepMass(20%, normalized Y) training:validation:test / 8:1:1

In [15]:
import importlib
import preprocessing

In [17]:
importlib.reload(preprocessing)

<module 'preprocessing' from '/Users/cellkey-ai/Workplace/onboarding_reproducing/preprocessing.py'>

In [18]:
# initialize model
model = BiLSTM(input_size, hidden_size, num_layers, output_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.005)

tb_writer = SummaryWriter("./tensorboard_logs/trial3")
sampled_df = new_deepmass_df.sample(frac=0.2, random_state=44)
train_df, val_test_df = train_test_split(sampled_df, test_size=0.2, random_state=44)
val_df, test_df = train_test_split(val_test_df, test_size=0.5, random_state=44)

print(f"total df: {len(sampled_df)}, train df: {len(train_df)}, val df: {len(val_df)}, test df: {len(test_df)}")

train_dataset = preprocessing.retrieve_dataset(train_df, max_seq_len, max_intens_len, normalize_intens=True)
val_dataset = preprocessing.retrieve_dataset(val_df, max_seq_len, max_intens_len, normalize_intens=True)
test_dataset = preprocessing.retrieve_dataset(test_df, max_seq_len, max_intens_len, normalize_intens=True)

train_dataloader = DataLoader(train_dataset, batch_size=2048, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=True)

hela2_dataset = preprocessing.retrieve_dataset(new_hela2_df, max_seq_len, max_intens_len, normalize_intens=True)
hela1_dataset = preprocessing.retrieve_dataset(new_hela1_df, max_seq_len, max_intens_len, normalize_intens=True)
hela2_dataloader = DataLoader(hela2_dataset, batch_size=128, shuffle=True)
hela1_dataloader = DataLoader(hela1_dataset, batch_size=128, shuffle=True)

epochs = 300
trained_model, train_res = train_model(model, criterion, optimizer, epochs, train_dataloader, val_dataloader, device, tb_writer, 20)

test_res = test_model(trained_model, criterion, test_dataloader, device, tb_writer, "deepmass(20%) 10% test")
hela2_res = test_model(trained_model, criterion, hela2_dataloader, device, tb_writer, "hela2 test")
hela1_res = test_model(trained_model, criterion, hela1_dataloader, device, tb_writer, "hela1 test")
tb_writer.close()

total df: 113092, train df: 90473, val df: 11309, test df: 11310
Epoch [1/300], Train Loss: 0.3323, Validation Loss: 0.2616
Train Cosine Similarity: 0.6665, Train PCC: 0.6109
Validation Cosine Similarity: 0.7383, Validation PCC: 0.6910
Epoch [2/300], Train Loss: 0.2534, Validation Loss: 0.2343
Train Cosine Similarity: 0.7463, Train PCC: 0.7006
Validation Cosine Similarity: 0.7655, Validation PCC: 0.7233
Epoch [3/300], Train Loss: 0.2280, Validation Loss: 0.2186
Train Cosine Similarity: 0.7720, Train PCC: 0.7302
Validation Cosine Similarity: 0.7814, Validation PCC: 0.7413
Epoch [4/300], Train Loss: 0.2204, Validation Loss: 0.2487
Train Cosine Similarity: 0.7800, Train PCC: 0.7393
Validation Cosine Similarity: 0.7514, Validation PCC: 0.7083
Epoch [5/300], Train Loss: 0.2217, Validation Loss: 0.2106
Train Cosine Similarity: 0.7781, Train PCC: 0.7373
Validation Cosine Similarity: 0.7895, Validation PCC: 0.7505
Epoch [6/300], Train Loss: 0.2098, Validation Loss: 0.2091
Train Cosine Similari

### Trial #4: HeLa1(normalized response) training:validation:test / 8:1:1

In [21]:
# initialize model
model = BiLSTM(input_size, hidden_size, num_layers, output_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

tb_writer = SummaryWriter("./tensorboard_logs/trial4")
train_df, val_test_df = train_test_split(new_hela1_df, test_size=0.2, random_state=44)
val_df, test_df = train_test_split(val_test_df, test_size=0.5, random_state=44)

print(f"total df: {len(sampled_df)}, train df: {len(train_df)}, val df: {len(val_df)}, test df: {len(test_df)}")

train_dataset = preprocessing.retrieve_dataset(train_df, max_seq_len, max_intens_len, normalize_intens=True)
val_dataset = preprocessing.retrieve_dataset(val_df, max_seq_len, max_intens_len, normalize_intens=True)
test_dataset = preprocessing.retrieve_dataset(test_df, max_seq_len, max_intens_len, normalize_intens=True)

train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=True)

hela2_dataset = preprocessing.retrieve_dataset(new_hela2_df, max_seq_len, max_intens_len, normalize_intens=True)
deepmass_dataset = preprocessing.retrieve_dataset(new_deepmass_df, max_seq_len, max_intens_len, normalize_intens=True)
hela2_dataloader = DataLoader(hela2_dataset, batch_size=128, shuffle=True)
deepmass_dataloader = DataLoader(deepmass_dataset, batch_size=128, shuffle=True)

epochs = 300
trained_model, train_res = train_model(model, criterion, optimizer, epochs, train_dataloader, val_dataloader, device, tb_writer, 20)

test_res = test_model(trained_model, criterion, test_dataloader, device, tb_writer, "hela1 10% test")
hela2_res = test_model(trained_model, criterion, hela2_dataloader, device, tb_writer, "hela2 test")
deepmass_res = test_model(trained_model, criterion, deepmass_dataloader, device, tb_writer, "deepmass test")
tb_writer.close()

total df: 113092, train df: 13409, val df: 1676, test df: 1677
Epoch [1/300], Train Loss: 0.4400, Validation Loss: 0.3024
Train Cosine Similarity: 0.5597, Train PCC: 0.5224
Validation Cosine Similarity: 0.6936, Validation PCC: 0.6565
Epoch [2/300], Train Loss: 0.3073, Validation Loss: 0.3022
Train Cosine Similarity: 0.6927, Train PCC: 0.6552
Validation Cosine Similarity: 0.6943, Validation PCC: 0.6570
Epoch [3/300], Train Loss: 0.2938, Validation Loss: 0.2708
Train Cosine Similarity: 0.7062, Train PCC: 0.6692
Validation Cosine Similarity: 0.7284, Validation PCC: 0.6936
Epoch [4/300], Train Loss: 0.2695, Validation Loss: 0.2725
Train Cosine Similarity: 0.7305, Train PCC: 0.6959
Validation Cosine Similarity: 0.7289, Validation PCC: 0.6946
Epoch [5/300], Train Loss: 0.2649, Validation Loss: 0.2589
Train Cosine Similarity: 0.7351, Train PCC: 0.7009
Validation Cosine Similarity: 0.7386, Validation PCC: 0.7052
Epoch [6/300], Train Loss: 0.2598, Validation Loss: 0.2613
Train Cosine Similarity