In [2]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns  # Add this import for seaborn

데이터 로드 및 확인

In [4]:
data = pd.read_csv('BP_data.csv')  # CSV 파일 경로
data

Unnamed: 0,Patient_Number,Blood_Pressure_Abnormality,Level_of_Hemoglobin,Genetic_Pedigree_Coefficient,Age,BMI,Sex,Pregnancy,Smoking,Physical_activity,salt_content_in_the_diet,alcohol_consumption_per_day,Level_of_Stress,Chronic_kidney_disease,Adrenal_and_thyroid_disorders
0,1,1,11.28,0.90,34,23,1,1.0,0,45961,48071,,2,1,1
1,2,0,9.75,0.23,54,33,1,,0,26106,25333,205.0,3,0,0
2,3,1,10.79,0.91,70,49,0,,0,9995,29465,67.0,2,1,0
3,4,0,11.00,0.43,71,50,0,,0,10635,7439,242.0,1,1,0
4,5,1,14.17,0.83,52,19,0,,0,15619,49644,397.0,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1996,1,10.14,0.02,69,26,1,,1,26118,47568,144.0,3,1,0
1996,1997,1,11.77,1.00,24,45,1,1.0,1,2572,8063,,3,1,1
1997,1998,1,16.91,0.22,18,42,0,,0,14933,24753,,2,1,1
1998,1999,0,11.15,0.72,46,45,1,,1,18157,15275,253.0,3,0,1


데이터 엔코딩

In [5]:
# 원-핫 인코딩을 적용할 범주형 변수 리스트
categorical_columns = [
    'Sex', 'Pregnancy', 'Smoking', 'Physical_activity', 
    'Chronic_kidney_disease', 'Adrenal_and_thyroid_disorders'
]

# NaN 값은 우선 0으로 채움 (또는 필요시 평균으로 채워도 됨)
data[categorical_columns] = data[categorical_columns].fillna(0)

# 원-핫 인코딩 적용 (drop_first=True로 다중공선성 방지)
data_encoded = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# 남은 NaN 처리 (연속형 변수들 포함)
data_encoded = data_encoded.fillna(0)

# 모든 컬럼을 float32로 변환 (PyTorch 호환용)
data_encoded = data_encoded.astype('float32')

# 결과 확인
print(data_encoded)

      Patient_Number  Blood_Pressure_Abnormality  Level_of_Hemoglobin  \
0                1.0                         1.0                11.28   
1                2.0                         0.0                 9.75   
2                3.0                         1.0                10.79   
3                4.0                         0.0                11.00   
4                5.0                         1.0                14.17   
...              ...                         ...                  ...   
1995          1996.0                         1.0                10.14   
1996          1997.0                         1.0                11.77   
1997          1998.0                         1.0                16.91   
1998          1999.0                         0.0                11.15   
1999          2000.0                         1.0                11.36   

      Genetic_Pedigree_Coefficient   Age   BMI  salt_content_in_the_diet  \
0                             0.90  34.0  23.0 

타겟 변수

In [6]:
# 타겟 변수 (Blood_Pressure_Abnormality) 설정
X = data_encoded.drop(['Blood_Pressure_Abnormality', 'Patient_Number'], axis=1)  # 타겟 제외한 입력 변수들
y = data_encoded['Blood_Pressure_Abnormality']  # 타겟 변수

데이터 분할

In [7]:
# 데이터 분할: 훈련 데이터와 테스트 데이터 80:20 비율로 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

데이터 정규화

In [8]:
# 데이터 정규화 (StandardScaler 사용)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # 훈련 데이터에 맞게 스케일링
X_test_scaled = scaler.transform(X_test)  # 테스트 데이터에 동일한 스케일링 적용

PyTorch 텐서로 데이터 변환

In [9]:
# PyTorch 텐서로 변환
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

데이터 로드

In [10]:
# Create DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)  # 학습 데이터를 TensorDataset 형태로 변환
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)  # 학습용 DataLoader 생성

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)  # 테스트 데이터를 TensorDataset 형태로 변환
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)  # 테스트용 DataLoader 생성

# 데이터 크기 확인
print(X_train_tensor.shape, X_test_tensor.shape, y_train_tensor.shape, y_test_tensor.shape)

torch.Size([1600, 1962]) torch.Size([400, 1962]) torch.Size([1600]) torch.Size([400])


모델 정의

In [31]:
# 모델 정의
class BloodPressureModel(nn.Module):
    def __init__(self, input_size=1962):
        super(BloodPressureModel, self).__init__()
        
        # First layer
        self.fc1 = nn.Linear(input_size, 64)
        self.bn1 = nn.BatchNorm1d(64)
        self.dropout1 = nn.Dropout(0.3)

        # Second layer
        self.fc2 = nn.Linear(64, 32)
        self.bn2 = nn.BatchNorm1d(32)
        self.dropout2 = nn.Dropout(0.3)

        # Output layer
        self.fc3 = nn.Linear(32, 2)

    def forward(self, x):
        x = torch.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)
        x = torch.relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)
        x = self.fc3(x)
        return x

# 모델 초기화
model = BloodPressureModel()

# 손실 함수, 옵티마이저 설정
criterion = nn.CrossEntropyLoss()  # CrossEntropyLoss expects class indices as labels
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)


모델 훈련

In [29]:
# Number of training epochs
num_epochs = 20  # <-- Define this

# Early stopping settings
best_val_loss = float('inf')
epochs_without_improvement = 0
patience = 5
min_delta = 1e-4

# Training loop
for epoch in range(num_epochs):
    model.train()
    train_loss_total = 0.0

    for inputs, labels in train_dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss_total += loss.item()

    avg_train_loss = train_loss_total / len(train_dataloader)
    print(f"[Epoch {epoch+1}/{num_epochs}]

SyntaxError: EOL while scanning string literal (561555831.py, line 24)

모델 평가

In [68]:
# Evaluation on test set
model.eval()
test_loss = 0.0
correct_predictions = 0
total_predictions = 0
y_true = []
y_pred = []

# Iterate through the test data
with torch.no_grad():
    for inputs, labels in test_dataloader:
        test_outputs = model(inputs)
        loss = criterion(test_outputs, labels)
        test_loss += loss.item()

        # Calculate accuracy
        _, predicted = torch.max(test_outputs, 1)
        correct_predictions += (predicted == labels).sum().item()
        total_predictions += labels.size(0)

        # Store true and predicted values for confusion matrix
        y_true.extend(labels.numpy())
        y_pred.extend(predicted.numpy())

# Calculate average test loss and accuracy
avg_test_loss = test_loss / len(test_dataloader)
accuracy = 100 * correct_predictions / total_predictions
print(f"Test Loss: {avg_test_loss:.4f}, Test Accuracy: {accuracy:.2f}%")

Test Loss: 0.6766, Test Accuracy: 70.50%


히트맵

In [3]:
# Example for getting the predicted labels from the model
y_true = []  # List to hold true labels
y_pred = []  # List to hold predicted labels

# Make sure you use the model to get predictions on the validation/test set
model.eval()
with torch.no_grad():
    for inputs, labels in val_dataloader:  # Use the correct data loader (e.g., validation data)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)  # Get the class with the highest probability (logits)
        
        # Append to y_true and y_pred
        y_true.extend(labels.cpu().numpy())  # Convert to numpy array
        y_pred.extend(predicted.cpu().numpy())  # Convert to numpy array

# Compute confusion matrix
cm = confusion_matrix(y_true, y_pred)

# Plot confusion matrix heatmap
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', xticklabels=['Normal', 'Abnormal'], yticklabels=['Normal', 'Abnormal'])
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix Heatmap')
plt.show()

NameError: name 'model' is not defined