In [85]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn import metrics
from torch.utils.data import DataLoader, Dataset, random_split

In [86]:
class DiabetesDataset(Dataset):
    """In-memory dataset built from a DataFrame that has an ``Outcome`` column.

    Every column other than ``Outcome`` is used as a feature; ``Outcome``
    itself becomes the label.
    """

    def __init__(self, df):
        """Convert ``df`` into a float feature tensor and a long label tensor."""
        feature_names = df.columns.drop("Outcome")
        features = df.loc[:, feature_names].to_numpy()
        labels = df.loc[:, "Outcome"].to_numpy()
        self.x = torch.from_numpy(features).float()
        self.y = torch.from_numpy(labels).long()

    def __len__(self):
        """Return the number of rows (samples) held by the dataset."""
        return self.x.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        sample = self.x[idx]
        label = self.y[idx]
        return sample, label

In [87]:
# Load the data from a CSV file given by a relative path and preview the first rows.
df = pd.read_csv("diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Красивые картинки:

8 признаков, 2 класса, 768 примеров:

In [88]:
# Wrap the DataFrame in the Dataset and show the feature/label tensor sizes
# together with the number of distinct label values.
dataset = DiabetesDataset(df)
dataset.x.size(), dataset.y.size(), len(dataset.y.unique())

(torch.Size([768, 8]), torch.Size([768]), 2)

Разделю выборку на обучающую и тестовую части в соотношении 80% / 20% — обычно это хорошее разбиение:

In [89]:
# Hold out 20% of the samples for testing. A dedicated, seeded generator makes
# the split identical on every run without touching the global RNG state.
SPLIT_SEED = 42
train_size = round(0.8 * len(dataset))
train_dataset, test_dataset = random_split(
    dataset,
    (train_size, len(dataset) - train_size),
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
len(train_dataset), len(test_dataset)

(614, 154)

Два полносвязных слоя; после первого слоя — ReLU и Dropout половины нейронов (p=0.5):

In [90]:
class Classificator(nn.Module):
    """Feed-forward classifier with a single 64-unit hidden layer.

    Maps ``in_features`` inputs to ``out_features`` raw scores; ReLU and
    Dropout (default probability) are applied to the hidden activations.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        hidden_units = 64
        # Layers are instantiated in the order they are applied.
        layers = [
            nn.Linear(in_features, hidden_units),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(hidden_units, out_features),
        ]
        self.classifier = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return its output."""
        scores = self.classifier(x)
        return scores

In [91]:
# Size the network from the data: one input per feature column,
# one output per distinct label value.
net = Classificator(dataset.x.size(1), len(dataset.y.unique()))
net

Classificator(
  (classifier): Sequential(
    (0): Linear(in_features=8, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=64, out_features=2, bias=True)
  )
)

In [92]:
# Train with cross-entropy loss and Adam, reporting test-set accuracy after every epoch.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.002)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# A single batch holds the whole test set, so one iteration evaluates everything.
test_dataloader = DataLoader(test_dataset, batch_size=len(test_dataset))

# The test-set size does not change between epochs, so compute it once.
size = len(test_dataloader.dataset)

for epoch in range(20):
    net.train()  # enable Dropout for the optimisation steps
    for x, y in train_dataloader:
        pred = net(x)
        loss = loss_fn(pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    net.eval()  # disable Dropout
    correct = 0
    with torch.no_grad():  # skip gradient tracking to speed up evaluation
        for x, y in test_dataloader:
            pred = net(x)
            correct += (pred.argmax(1) == y).sum().item()

    accuracy = correct / size
    print(f"Epoch [{epoch + 1:>2}]: accuracy: {accuracy:>4f}")

Epoch [ 1]: accuracy: 0.681818
Epoch [ 2]: accuracy: 0.701299
Epoch [ 3]: accuracy: 0.707792
Epoch [ 4]: accuracy: 0.701299
Epoch [ 5]: accuracy: 0.668831
Epoch [ 6]: accuracy: 0.681818
Epoch [ 7]: accuracy: 0.733766
Epoch [ 8]: accuracy: 0.701299
Epoch [ 9]: accuracy: 0.740260
Epoch [10]: accuracy: 0.707792
Epoch [11]: accuracy: 0.720779
Epoch [12]: accuracy: 0.727273
Epoch [13]: accuracy: 0.727273
Epoch [14]: accuracy: 0.720779
Epoch [15]: accuracy: 0.740260
Epoch [16]: accuracy: 0.714286
Epoch [17]: accuracy: 0.714286
Epoch [18]: accuracy: 0.772727
Epoch [19]: accuracy: 0.720779
Epoch [20]: accuracy: 0.714286


In [93]:
net.eval()  # disable Dropout

# The test DataLoader is built with batch_size=len(test_dataset), so its first
# batch is the entire test set.
x_test, y_test = next(iter(test_dataloader))
with torch.no_grad():  # inference only: no autograd graph needed, as in the per-epoch evaluation
    y_pred = net(x_test).argmax(1)
# zero_division=1 is the documented spelling of the value previously passed as True.
print(metrics.classification_report(y_true=y_test, y_pred=y_pred, zero_division=1))

              precision    recall  f1-score   support

           0       0.74      0.92      0.82       109
           1       0.53      0.22      0.31        45

    accuracy                           0.71       154
   macro avg       0.63      0.57      0.57       154
weighted avg       0.68      0.71      0.67       154



Несбалансированность выборки не позволяет достаточно точно определять наличие заболевания, однако общая точность модели неплохая.