In [1]:
#dataset
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score

# vvvvvvvvvvvvvvvvvv
ds_source = "Kaggle"
# ^^^^^^^^^^^^^^^^^^
if ds_source == "local":
  # Option1: # โหลด dataset จากลิงก์โดยล็อคอินด้วยบัญชีที่สมัครกับ Kaggle และนำเข้าจากอุปกรณ์ส่วนตัวแปะลงบน file folder หรือชี้ไปที่ directory ที่เก็บไว้
  dir = './diabetes-dataset.csv'
  df = pd.read_csv(dir)
elif ds_source == "Kaggle":
  # Option2: ดาวน์โหลดชุดข้อมูลจาก library
  !pip install opendatasets
  import opendatasets as od
  od.download('https://www.kaggle.com/datasets/akshaydattatraykhare/diabetes-dataset')
  df = pd.read_csv('diabetes-dataset/diabetes.csv')

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22
Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: thanachotvilai
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/akshaydattatraykhare/diabetes-dataset
Downloading diabetes-dataset.zip to ./diabetes-dataset


100%|██████████| 8.91k/8.91k [00:00<00:00, 4.39MB/s]







In [2]:
# Preview the first rows, then the column dtypes / non-null counts.
print('ดูฟีเจอร์ทั้งหมดที่มี : \n', df.head())
# DataFrame.info() writes its report straight to stdout and returns None, so
# passing it to print() emitted the report *before* the label and then printed
# a stray "None". Print the label first, then call info() on its own.
print('ดูประเภทข้อมูลทั้งหมดที่มี : \n', end='')
df.info()

ดูฟีเจอร์ทั้งหมดที่มี : 
    Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glu

In [3]:
# Sanity-check one feature (Glucose) and the target label (Outcome):
# distinct values, range, and how many rows fall into each class.
glucose_col = df['Glucose']
outcome_col = df['Outcome']

print('ค่าเอกลักของค่ากลูโคส: \n', glucose_col.unique(), '\n')
print('ค่ามากสุดของกลูโคส :', glucose_col.max())
print('ค่าน้อยสุดของกลูโคส :', glucose_col.min())
print('ค่าเอกลัก: ', outcome_col.unique())
print('จำนวนคลาสที่กำหนดใช้จริง: ', outcome_col.value_counts())

ค่าเอกลักของค่ากลูโคส: 
 [148  85 183  89 137 116  78 115 197 125 110 168 139 189 166 100 118 107
 103 126  99 196 119 143 147  97 145 117 109 158  88  92 122 138 102  90
 111 180 133 106 171 159 146  71 105 101 176 150  73 187  84  44 141 114
  95 129  79   0  62 131 112 113  74  83 136  80 123  81 134 142 144  93
 163 151  96 155  76 160 124 162 132 120 173 170 128 108 154  57 156 153
 188 152 104  87  75 179 130 194 181 135 184 140 177 164  91 165  86 193
 191 161 167  77 182 157 178  61  98 127  82  72 172  94 175 195  68 186
 198 121  67 174 199  56 169 149  65 190] 

ค่ามากสุดของกลูโคส : 199
ค่าน้อยสุดของกลูโคส : 0
ค่าเอกลัก:  [1 0]
จำนวนคลาสที่กำหนดใช้จริง:  Outcome
0    500
1    268
Name: count, dtype: int64


In [48]:
# Target: the `Outcome` column as an (N, 1) column vector.
# Features: every remaining column of the table.
y = df['Outcome'].values.reshape(-1, 1)
X = df.drop(columns=['Outcome']).values

# Hold out 20% of the rows; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The scaler's statistics are fitted on the training rows only and then
# reused unchanged for the held-out rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Convert all four arrays to float32 tensors in one pass.
X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor = (
    torch.tensor(split, dtype=torch.float32)
    for split in (X_train, X_test, y_train, y_test)
)

class OneLayerMLP(nn.Module):
    """A single affine map from `input_dim` features to `output_dim` outputs.

    No activation is applied, so `forward` returns the raw linear output.
    """

    def __init__(self, input_dim, output_dim):
        super(OneLayerMLP, self).__init__()
        # Attribute name `layers` is kept so state_dict keys stay stable.
        self.layers = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        logits = self.layers(x)
        return logits

class TwoLayerMLP(nn.Module):
    """Two linear layers with BatchNorm, ReLU and Dropout in between.

    Pipeline: Linear(input_dim -> hidden_dim) -> BatchNorm1d -> ReLU
    -> Dropout(dropout) -> Linear(hidden_dim -> output_dim).
    No activation is applied after the final linear layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.3):
        super(TwoLayerMLP, self).__init__()
        # Stages are listed in forward order; their positions become the
        # indices used in the Sequential container's parameter names.
        stages = [
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.layers = nn.Sequential(*stages)

    def forward(self, x):
        logits = self.layers(x)
        return logits

# -------------------
# Hyperparameters
# -------------------
seed = 42          # fixed seed so repeated runs give the same numbers
epochs = 1000
batch_size = 32
lr = 0.001
input_dim = X_train.shape[1]
output_dim = 1

# Seed PyTorch's global RNG *before* any model is built. Weight
# initialisation, the shuffling order of the training DataLoader and the
# dropout masks all draw from it; without this every run of the cell produced
# different losses and accuracies.
torch.manual_seed(seed)

# Candidate architectures; all of them emit one raw logit per sample.
# 1. One layer
model1 = OneLayerMLP(input_dim, output_dim)
# 2. Two layer hidden 16
model2 = TwoLayerMLP(input_dim, 16, output_dim)
# 3. Two layer hidden 64
model3 = TwoLayerMLP(input_dim, 64, output_dim)

# The model that is actually trained and evaluated below.
model_cls = model2

# Mini-batch loaders: the training batches are reshuffled every epoch, the
# held-out batches keep their original order.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# BCEWithLogitsLoss applies the sigmoid internally, so the models are fed to
# it without an output activation.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model_cls.parameters(), lr=lr)

for epoch in range(epochs):
    # ---- training pass ----
    model_cls.train()
    train_loss = 0
    train_correct = 0

    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model_cls(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight it by the batch size so the
        # epoch average stays exact even when the last batch is smaller.
        train_loss += loss.item() * X_batch.size(0)
        # Threshold the predicted probability at 0.5 to get a hard class label.
        preds = (torch.sigmoid(outputs) >= 0.5).float()
        train_correct += (preds == y_batch).sum().item()

    avg_train_loss = train_loss / len(train_dataset)
    train_acc = train_correct / len(train_dataset)

    # ---- evaluation pass ----
    # NOTE: the "Val" numbers are computed on the held-out test split
    # (test_loader); there is no separate validation set in this cell.
    model_cls.eval()
    val_loss = 0
    val_correct = 0
    with torch.no_grad():
        for X_val, y_val in test_loader:
            outputs = model_cls(X_val)
            loss = criterion(outputs, y_val)
            val_loss += loss.item() * X_val.size(0)
            preds = (torch.sigmoid(outputs) >= 0.5).float()
            val_correct += (preds == y_val).sum().item()

    avg_val_loss = val_loss / len(test_dataset)
    val_acc = val_correct / len(test_dataset)

    # Report the first epoch and then every 100th epoch.
    if (epoch+1) % 100 == 0 or epoch==0:
        print(f"Epoch {epoch+1}/{epochs}, "
              f"Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, "
              f"Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}")

# Final predictions of the trained model on the held-out split, written to CSV
# as 0/1 class labels (one row per held-out sample, no index column).
model_cls.eval()
with torch.no_grad():
    test_outputs = model_cls(X_test_tensor)
    test_preds = torch.sigmoid(test_outputs)
    test_preds_class = (test_preds >= 0.5).int()
df_pred = pd.DataFrame(test_preds_class.numpy(), columns=["Diagnosis results"])
df_pred.to_csv("diagnosis_for_diabetes_test-val-set.csv", index=False)

Epoch 1/1000, Train Loss: 0.6909, Val Loss: 0.6802, Train Acc: 0.5603, Val Acc: 0.5909
Epoch 100/1000, Train Loss: 0.4460, Val Loss: 0.5160, Train Acc: 0.7932, Val Acc: 0.7597
Epoch 200/1000, Train Loss: 0.4456, Val Loss: 0.5279, Train Acc: 0.7818, Val Acc: 0.7468
Epoch 300/1000, Train Loss: 0.4405, Val Loss: 0.5347, Train Acc: 0.7948, Val Acc: 0.7468
Epoch 400/1000, Train Loss: 0.4486, Val Loss: 0.5360, Train Acc: 0.7883, Val Acc: 0.7338
Epoch 500/1000, Train Loss: 0.4341, Val Loss: 0.5435, Train Acc: 0.7980, Val Acc: 0.7532
Epoch 600/1000, Train Loss: 0.4247, Val Loss: 0.5617, Train Acc: 0.7997, Val Acc: 0.7468
Epoch 700/1000, Train Loss: 0.4316, Val Loss: 0.5532, Train Acc: 0.7964, Val Acc: 0.7273
Epoch 800/1000, Train Loss: 0.4536, Val Loss: 0.5568, Train Acc: 0.7818, Val Acc: 0.7273
Epoch 900/1000, Train Loss: 0.4320, Val Loss: 0.5542, Train Acc: 0.7997, Val Acc: 0.7208
Epoch 1000/1000, Train Loss: 0.4197, Val Loss: 0.5545, Train Acc: 0.8029, Val Acc: 0.7273
