In [41]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn import metrics
from torch.utils.data import DataLoader, Dataset, random_split

In [42]:
class DiabetesDataset(Dataset):

    def __init__(self, df):
        x = df[df.columns.drop("Outcome")].to_numpy()
        y = df["Outcome"].to_numpy()
        # добавляю 1 искусственный канал для 1D свертки
        self.x = torch.from_numpy(x).to(torch.float).unsqueeze(1)
        self.y = torch.from_numpy(y).to(torch.long)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

In [43]:
df = pd.read_csv("diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


8 признаков, 2 класса, 768 примеров:

In [44]:
dataset = DiabetesDataset(df)
print(dataset.x.size(), dataset.y.size(), len(dataset.y.unique()))

train_size = round(0.8 * len(dataset))  # 80% - train, 20% - test
train_dataset, test_dataset = random_split(dataset, (train_size, len(dataset) - train_size))
print(len(train_dataset), len(test_dataset))

torch.Size([768, 1, 8]) torch.Size([768]) 2
614 154


Два полносвязных слоя + Dropout половины нейронов после каждого слоя:

In [45]:
class Classificator(nn.Module):

    def __init__(self, in_channels, out_features):
        super().__init__()
        out_channels = 32
        size = 6
        self.features = nn.Sequential(
            # свертка
            nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            # сжатие
            nn.MaxPool1d(2),
        )
        # подгон размеров
        self.avgpool = nn.AdaptiveAvgPool1d(size)
        self.classifier = nn.Sequential(
            nn.Linear(out_channels * size, 64),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(64, out_features),
        )

    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        return self.classifier(x)

In [46]:
net = Classificator(1, len(dataset.y.unique()))
net

Classificator(
  (features): Sequential(
    (0): Conv1d(1, 32, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool1d(output_size=6)
  (classifier): Sequential(
    (0): Linear(in_features=192, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=64, out_features=2, bias=True)
  )
)

Подбор гиперпараметров несколькими запусками:

In [47]:
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=len(test_dataset))

for epoch in range(20):
    net.train()
    for batch, (x, y) in enumerate(train_dataloader):
        pred = net(x)
        loss = loss_fn(pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    net.eval()  # отключаем Dropout
    size = len(test_dataloader.dataset)
    correct = 0
    with torch.no_grad():  # отключаем вычисление градиентов для ускорения оценки
        for x, y in test_dataloader:
            pred = net(x)
            correct += (pred.argmax(1) == y).sum().item()

    accuracy = correct / size
    print(f"Epoch [{epoch + 1:>2}]: {accuracy:4f}")

Epoch [ 1]: 0.655844
Epoch [ 2]: 0.701299
Epoch [ 3]: 0.740260
Epoch [ 4]: 0.720779
Epoch [ 5]: 0.727273
Epoch [ 6]: 0.714286
Epoch [ 7]: 0.733766
Epoch [ 8]: 0.655844
Epoch [ 9]: 0.785714
Epoch [10]: 0.759740
Epoch [11]: 0.772727
Epoch [12]: 0.733766
Epoch [13]: 0.720779
Epoch [14]: 0.759740
Epoch [15]: 0.707792
Epoch [16]: 0.753247
Epoch [17]: 0.792208
Epoch [18]: 0.785714
Epoch [19]: 0.720779
Epoch [20]: 0.785714


In [48]:
net.eval()  # отключаем Dropout

x_test, y_test = next(iter(test_dataloader))
y_pred = net(x_test).argmax(1)
print(metrics.classification_report(y_true=y_test, y_pred=y_pred, zero_division=True))

              precision    recall  f1-score   support

           0       0.81      0.87      0.84       101
           1       0.72      0.62      0.67        53

    accuracy                           0.79       154
   macro avg       0.77      0.75      0.75       154
weighted avg       0.78      0.79      0.78       154



Неплохая точность модели ~0.8, но recall для больных говорит, о неприменимости модели.