In [1]:
from torch import nn, optim
from torch.utils.data import DataLoader
import torch
import preprocessing
from sklearn.model_selection import train_test_split
import pandas as pd
import torch.nn.functional as F
from scipy.stats import pearsonr
from train import train_model, test_model

In [2]:
# Prefer Apple's Metal (MPS) backend when it is available; otherwise fall back to the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

print(f"Using device: {device}")

Using device: mps


In [3]:
class BiLSTM(nn.Module):
    """Bidirectional LSTM followed by a linear read-out.

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden units per direction.
        num_layers: Number of stacked LSTM layers.
        output_size: Width of the vector produced for each sequence.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # BiLSTM layer (batch-first, so inputs are (batch, seq_len, input_size)).
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)

        # Fully connected layer; the LSTM emits both directions concatenated,
        # hence hidden_size * 2 input features.
        self.fc = nn.Linear(hidden_size * 2, output_size)

    def forward(self, x):
        """Map a batch of sequences to one output vector per sequence.

        Args:
            x: Tensor of shape (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch, output_size).
        """
        # nn.LSTM uses zero initial hidden/cell states when none are passed, and
        # creates them on the input's device with the input's dtype. This replaces
        # the hand-built float32 zeros that were allocated on CPU and then moved.
        out, _ = self.lstm(x)

        # Decode the features at the last time step.
        # NOTE(review): for the reverse direction, index -1 is the first step it
        # processes, so that half of the features has only seen the final token.
        # Confirm this is intended (the alternative is to use the final hidden states).
        out = self.fc(out[:, -1, :])
        return out

In [4]:
# The three tables are tab-separated. low_memory=False makes pandas parse each
# file in a single pass instead of chunk-wise, which avoids mixed-dtype columns.
_TSV_READ_OPTIONS = dict(sep="\t", low_memory=False)

new_deepmass_df = pd.read_csv("./data/renewal_deepmass.tsv", **_TSV_READ_OPTIONS)
new_hela1_df = pd.read_csv("./data/renewal_hela1.tsv", **_TSV_READ_OPTIONS)
new_hela2_df = pd.read_csv("./data/renewal_hela2.tsv", **_TSV_READ_OPTIONS)


In [40]:
# Fixed lengths passed to preprocessing.retrieve_dataset for every split.
max_seq_len = 35
max_intens_len = 70

# Draw a reproducible 40% subsample of the DeepMass table, then hold out 30% of
# that subsample for validation (both steps are seeded with 44).
sampled_df = new_deepmass_df.sample(frac=0.4, random_state=44)
train_df, val_df = train_test_split(sampled_df, test_size=0.3, random_state=44)


def _to_dataset(frame):
    """Build a dataset from `frame` using the shared length settings."""
    return preprocessing.retrieve_dataset(frame, max_seq_len, max_intens_len)


train_dataset = _to_dataset(train_df)
val_dataset = _to_dataset(val_df)
hela1_dataset = _to_dataset(new_hela1_df)
hela2_dataset = _to_dataset(new_hela2_df)

In [41]:
# Only the training loader is shuffled. The evaluation loaders keep a fixed
# order: the evaluation loss is averaged per batch, so shuffling would change
# which samples land in the smaller final batch and make the reported loss vary
# from run to run without any benefit.
# NOTE(review): batch_size=258 is unusual; confirm it is not a typo for 256.
train_dataloader = DataLoader(train_dataset, batch_size=258, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=258, shuffle=False)
hela1_dataloader = DataLoader(hela1_dataset, batch_size=128, shuffle=False)
hela2_dataloader = DataLoader(hela2_dataset, batch_size=128, shuffle=False)

In [26]:
# Each sequence value is fed in one at a time, so the input width is 1.
input_size = 1
# Example settings for the recurrent stack.
hidden_size = 128
num_layers = 2
# The network emits one value per intensity slot.
output_size = max_intens_len

model = BiLSTM(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    output_size=output_size,
)
model = model.to(device)

In [27]:
class CosineSimilarityLoss(nn.Module):
    """Loss equal to one minus the batch-mean cosine similarity.

    The similarity is taken along dim=1, i.e. between each predicted row and
    the corresponding target row.
    """

    def __init__(self):
        super().__init__()
        self.cosine_similarity = nn.CosineSimilarity(dim=1, eps=1e-6)

    def forward(self, y_pred, y_true):
        """Return a scalar tensor: 1 - mean(cosine_similarity(y_pred, y_true))."""
        per_row_similarity = self.cosine_similarity(y_pred, y_true)
        mean_similarity = per_row_similarity.mean()
        return 1 - mean_similarity

# Loss function and optimizer
criterion = CosineSimilarityLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [2]:
# Train with the helper imported from the `train` module, then score the same
# model on both HeLa loaders with `test_model`.
# NOTE(review): this cell needs `model`, `criterion`, `optimizer`, `device` and
# the dataloaders from the earlier cells to exist in the current kernel.
# NOTE(review): the fourth positional argument (50) is presumably the number of
# epochs; confirm against train.train_model.
train_res = train_model(model, criterion, optimizer, 50, train_dataloader, val_dataloader, device)
hela1_test_res = test_model(model, criterion, hela1_dataloader, device)
hela2_test_res = test_model(model, criterion, hela2_dataloader, device)

NameError: name 'model' is not defined

In [32]:
def batch_pearsonr(x, y, eps=1e-8):
    """Compute the Pearson correlation coefficient of each row pair, vectorized.

    Args:
        x: Tensor of shape (batch, features).
        y: Tensor with the same shape as `x`.
        eps: Lower bound applied to the denominator. It only takes effect when a
            row of `x` or `y` has (near-)zero variance, where the plain formula
            would divide by zero and return NaN.

    Returns:
        Tensor of shape (batch,) holding one coefficient per row. Rows where
        either input is constant yield 0 instead of NaN.
    """
    # Center every row around its own mean.
    xm = x - torch.mean(x, dim=1, keepdim=True)
    ym = y - torch.mean(y, dim=1, keepdim=True)

    r_num = torch.sum(xm * ym, dim=1)
    r_den = torch.sqrt(torch.sum(xm ** 2, dim=1) * torch.sum(ym ** 2, dim=1))

    # Clamp the denominator so a constant row cannot poison batch averages with NaN.
    return r_num / torch.clamp(r_den, min=eps)

In [42]:
num_epochs = 30


def _run_epoch(model, dataloader, criterion, device, optimizer=None):
    """Run one pass over `dataloader` and return averaged metrics.

    When `optimizer` is given the model is put in training mode and updated
    after every batch; otherwise the model is put in evaluation mode and no
    gradients are tracked.

    Returns:
        Tuple `(mean_loss, mean_cosine_similarity, mean_pearson_coefficient)` of
        Python floats. The loss is the average of the per-batch losses; the two
        similarity metrics are averaged over all samples.
    """
    training = optimizer is not None
    model.train(training)

    total_loss = 0.0
    cosine_chunks = []
    pearson_chunks = []

    with torch.set_grad_enabled(training):
        for sequences, intensities in dataloader:
            sequences = sequences.float().unsqueeze(2).to(device)  # (batch_size, sequence_length, input_size)
            intensities = intensities.float().to(device)

            # Forward pass and loss
            outputs = model(sequences)
            loss = criterion(outputs, intensities)
            total_loss += loss.item()

            # Backward pass and optimizer step (training only)
            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Metrics are computed on detached predictions so they never extend
            # the autograd graph; one tensor per batch is kept instead of
            # thousands of individual scalars.
            predictions = outputs.detach()
            cosine_chunks.append(F.cosine_similarity(predictions, intensities, dim=1).cpu())
            pearson_chunks.append(batch_pearsonr(predictions, intensities).cpu())

    mean_loss = total_loss / len(dataloader)
    mean_cosine = torch.cat(cosine_chunks).mean().item()
    mean_pearson = torch.cat(pearson_chunks).mean().item()
    return mean_loss, mean_cosine, mean_pearson


for epoch in range(num_epochs):
    train_loss, mean_cosine_similarity, mean_pearson_coefficient = _run_epoch(
        model, train_dataloader, criterion, device, optimizer
    )
    val_loss, mean_cosine_similarity_val, mean_pearson_coefficient_val = _run_epoch(
        model, val_dataloader, criterion, device
    )

    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')
    print(f'Train Cosine Similarity: {mean_cosine_similarity:.4f}, Train PCC: {mean_pearson_coefficient:.4f}')
    print(f'Validation Cosine Similarity: {mean_cosine_similarity_val:.4f}, Validation PCC: {mean_pearson_coefficient_val:.4f}')

print("Training complete.")

Epoch [1/30], Train Loss: 0.0445, Validation Loss: 0.0443
Train Cosine Similarity: 0.9556, Train PCC: 0.9447
Validation Cosine Similarity: 0.9556, Validation PCC: 0.9448
Epoch [2/30], Train Loss: 0.0425, Validation Loss: 0.0423
Train Cosine Similarity: 0.9576, Train PCC: 0.9471
Validation Cosine Similarity: 0.9576, Validation PCC: 0.9472
Epoch [3/30], Train Loss: 0.0414, Validation Loss: 0.0423
Train Cosine Similarity: 0.9586, Train PCC: 0.9484
Validation Cosine Similarity: 0.9577, Validation PCC: 0.9474
Epoch [4/30], Train Loss: 0.0403, Validation Loss: 0.0406
Train Cosine Similarity: 0.9598, Train PCC: 0.9498
Validation Cosine Similarity: 0.9593, Validation PCC: 0.9493
Epoch [5/30], Train Loss: 0.0392, Validation Loss: 0.0408
Train Cosine Similarity: 0.9609, Train PCC: 0.9512
Validation Cosine Similarity: 0.9592, Validation PCC: 0.9492
Epoch [6/30], Train Loss: 0.0384, Validation Loss: 0.0398
Train Cosine Similarity: 0.9617, Train PCC: 0.9521
Validation Cosine Similarity: 0.9601, Val

In [43]:
# Evaluate the trained model on the HeLa1 test set.
model.eval()
# Dedicated accumulator: the test loss must not be added to the `val_loss`
# left over from the training cell.
hela1_loss = 0.0
cosine_similarities = []
pearson_coefficients = []

with torch.no_grad():
    for sequences, intensities in hela1_dataloader:
        sequences = sequences.float().unsqueeze(2).to(device)
        intensities = intensities.float().to(device)

        outputs = model(sequences)
        hela1_loss += criterion(outputs, intensities).item()

        # Per-sample cosine similarity (test set)
        cosine_similarities.append(F.cosine_similarity(outputs, intensities, dim=1).cpu())

        # Vectorized per-sample Pearson correlation coefficient (test set)
        pearson_coefficients.append(batch_pearsonr(outputs, intensities).cpu())

# Average the loss over batches and the similarity metrics over all samples.
hela1_loss /= len(hela1_dataloader)
mean_cosine_similarity = torch.cat(cosine_similarities).mean().item()
mean_pearson_coefficient = torch.cat(pearson_coefficients).mean().item()

print(f'Mean Cosine Similarity: {mean_cosine_similarity:.4f}')
print(f'Mean Pearson Correlation Coefficient: {mean_pearson_coefficient:.4f}')
print(f'Mean Loss: {hela1_loss:.4f}')

Mean Cosine Similarity: 0.6926
Mean Pearson Correlation Coefficient: 0.6615


In [44]:
# Evaluate the trained model on the HeLa2 test set.
model.eval()
# Dedicated accumulator: the test loss must not be added to the `val_loss`
# left over from the training cell.
hela2_loss = 0.0
cosine_similarities = []
pearson_coefficients = []

with torch.no_grad():
    for sequences, intensities in hela2_dataloader:
        sequences = sequences.float().unsqueeze(2).to(device)
        intensities = intensities.float().to(device)

        outputs = model(sequences)
        hela2_loss += criterion(outputs, intensities).item()

        # Per-sample cosine similarity (test set)
        cosine_similarities.append(F.cosine_similarity(outputs, intensities, dim=1).cpu())

        # Vectorized per-sample Pearson correlation coefficient (test set)
        pearson_coefficients.append(batch_pearsonr(outputs, intensities).cpu())

# Average the loss over batches and the similarity metrics over all samples.
hela2_loss /= len(hela2_dataloader)
mean_cosine_similarity = torch.cat(cosine_similarities).mean().item()
mean_pearson_coefficient = torch.cat(pearson_coefficients).mean().item()

print(f'Mean Cosine Similarity: {mean_cosine_similarity:.4f}')
print(f'Mean Pearson Correlation Coefficient: {mean_pearson_coefficient:.4f}')
print(f'Mean Loss: {hela2_loss:.4f}')

Mean Cosine Similarity: 0.6781
Mean Pearson Correlation Coefficient: 0.6445
