# Импорты

In [221]:
import os
import random
import numpy as np
import pandas as pd
from collections import Counter

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset, Subset, random_split, TensorDataset
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

from tqdm.notebook import tqdm

import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10

import pytorch_lightning as pl
from pytorch_lightning.callbacks import TQDMProgressBar, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
from pytorch_lightning.callbacks import Callback

from torchmetrics import Accuracy, F1Score, MetricCollection, AUROC

Зафиксируем random_seed

In [2]:
def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

set_seed(42)

Random seed set as 42


# Знакомство с данными

Загрузим данные и посмотрим общую информациб о них

In [3]:
df = pd.read_csv(filepath_or_buffer="data_input/train.csv")
df.head()

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,0,35.0,175.0,75.0,86.5,1.2,1.2,1.0,1.0,127.0,...,58.0,108.0,15.6,1.0,0.9,17.0,14.0,21.0,0.0,0.0
1,1,45.0,155.0,60.0,82.0,1.2,1.0,1.0,1.0,129.0,...,50.0,110.0,14.0,1.0,0.7,22.0,18.0,14.0,0.0,0.0
2,2,35.0,175.0,60.0,74.0,1.2,1.2,1.0,1.0,100.0,...,58.0,116.0,14.8,1.0,0.9,20.0,15.0,16.0,0.0,1.0
3,3,60.0,160.0,55.0,74.0,1.2,1.5,1.0,1.0,139.0,...,73.0,95.0,15.1,1.0,0.7,47.0,31.0,15.0,0.0,0.0
4,4,40.0,160.0,55.0,71.0,0.9,1.2,1.0,1.0,100.0,...,66.0,103.0,13.1,1.0,0.6,24.0,21.0,13.0,0.0,0.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   15000 non-null  int64  
 1   age                  15000 non-null  float64
 2   height(cm)           15000 non-null  float64
 3   weight(kg)           15000 non-null  float64
 4   waist(cm)            15000 non-null  float64
 5   eyesight(left)       15000 non-null  float64
 6   eyesight(right)      15000 non-null  float64
 7   hearing(left)        15000 non-null  float64
 8   hearing(right)       15000 non-null  float64
 9   systolic             15000 non-null  float64
 10  relaxation           15000 non-null  float64
 11  fasting blood sugar  15000 non-null  float64
 12  Cholesterol          15000 non-null  float64
 13  triglyceride         15000 non-null  float64
 14  HDL                  15000 non-null  float64
 15  LDL                  15000 non-null 

In [5]:
df_pred = pd.read_csv(filepath_or_buffer="data_input/test.csv")
df_pred.head()

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,triglyceride,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries
0,15000,25.0,165.0,65.0,85.0,1.2,1.2,1.0,1.0,128.0,...,92.0,41.0,132.0,15.0,1.0,1.1,34.0,23.0,14.0,0.0
1,15001,45.0,165.0,60.0,74.0,1.5,1.0,1.0,1.0,104.0,...,124.0,54.0,129.0,11.3,1.0,0.7,20.0,17.0,11.0,0.0
2,15002,65.0,155.0,55.0,72.0,0.8,0.6,1.0,1.0,130.0,...,103.0,76.0,128.0,14.4,1.0,0.8,38.0,18.0,24.0,1.0
3,15003,30.0,170.0,85.0,88.0,0.7,0.9,1.0,1.0,119.0,...,212.0,44.0,117.0,14.8,1.0,1.1,26.0,38.0,19.0,0.0
4,15004,40.0,155.0,50.0,70.0,0.9,0.8,1.0,1.0,102.0,...,87.0,68.0,130.0,13.3,1.0,0.9,18.0,12.0,14.0,0.0


In [6]:
df_pred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   10000 non-null  int64  
 1   age                  10000 non-null  float64
 2   height(cm)           10000 non-null  float64
 3   weight(kg)           10000 non-null  float64
 4   waist(cm)            10000 non-null  float64
 5   eyesight(left)       10000 non-null  float64
 6   eyesight(right)      10000 non-null  float64
 7   hearing(left)        10000 non-null  float64
 8   hearing(right)       10000 non-null  float64
 9   systolic             10000 non-null  float64
 10  relaxation           10000 non-null  float64
 11  fasting blood sugar  10000 non-null  float64
 12  Cholesterol          10000 non-null  float64
 13  triglyceride         10000 non-null  float64
 14  HDL                  10000 non-null  float64
 15  LDL                  10000 non-null  

# DataModule

Создадим DataModule для работы Pytroch Ligthing с данными

In [7]:
class SmokingDataset(Dataset):
    def __init__(self, dataframe):
        self.features = dataframe.drop(columns=['id', 'smoking']).values
        self.targets = dataframe['smoking'].values.astype(float)

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        return torch.tensor(self.features[idx], dtype=torch.float32), torch.tensor(self.targets[idx], dtype=torch.float32)

In [8]:
class SmokingDataModule(pl.LightningDataModule):
    def __init__(self, df: pd.DataFrame, batch_size=32):
        super().__init__()
        self.df = df
        self.batch_size = batch_size

    def setup(self, stage=None):
        # Разделение на train/val/test
        train_size = int(0.7 * len(self.df))
        val_size = int(0.15 * len(self.df))
        test_size = len(self.df) - train_size - val_size

        df_train, df_val, df_test = torch.utils.data.random_split(
            self.df, [train_size, val_size, test_size]
        )

        # Преобразуем обратно в DataFrame
        self.train_df = self.df.iloc[df_train.indices].reset_index(drop=True)
        self.val_df = self.df.iloc[df_val.indices].reset_index(drop=True)
        self.test_df = self.df.iloc[df_test.indices].reset_index(drop=True)

    def train_dataloader(self):
        return DataLoader(SmokingDataset(self.train_df), batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(SmokingDataset(self.val_df), batch_size=self.batch_size)

    def test_dataloader(self):
        return DataLoader(SmokingDataset(self.test_df), batch_size=self.batch_size)

In [9]:
data_module = SmokingDataModule(df)

# Custom Test Callback

Создадим callback, чтобы при обучении каждую n-эпоху проводился тест

In [90]:
class TestEveryNepochs(Callback):
    def __init__(self, every_n_epochs):
        self.every_n_epochs = every_n_epochs

    def on_epoch_end(self, trainer, pl_module):
        # Если текущая эпоха делится на n, запускаем тест
        if trainer.current_epoch % self.every_n_epochs == 0:
            print(f"Running test at epoch {trainer.current_epoch}")
            trainer.test(model=pl_module)

test_callback = TestEveryNepochs(every_n_epochs=10)

Далее были создадны 7 моделей. Каждый раз пробовал разную архитектуру. Лучшей моделью оказалась модель V7

# Модель V1

In [10]:
class SmokingClassifierV1(pl.LightningModule):
    def __init__(self, input_size):
        super(SmokingClassifierV1, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

        self.metrics = MetricCollection({
            'auroc': AUROC(task='binary'),
        })

        self.val_metrics = self.metrics.clone(prefix='val_')
        self.test_metrics = self.metrics.clone(prefix='test_')

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = F.binary_cross_entropy(y_hat, y.float())
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = F.binary_cross_entropy(y_hat, y.float())
        
        # Обновляем метрики
        self.val_metrics.update(y_hat, y.int())

        acc = ((y_hat > 0.5).int() == y.int()).float().mean()
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)

    def on_validation_epoch_end(self):
        val_metrics = self.val_metrics.compute()
        self.log_dict(val_metrics, prog_bar=True, on_epoch=True)
        self.val_metrics.reset()

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = F.binary_cross_entropy(y_hat, y.float())
        
        # Обновляем метрики
        self.test_metrics.update(y_hat, y.int())

        acc = ((y_hat > 0.5).int() == y.int()).float().mean()
        self.log('test_loss', loss, prog_bar=True)
        self.log('test_acc', acc, prog_bar=True)

    def on_test_epoch_end(self):
        test_metrics = self.test_metrics.compute()
        self.log_dict(test_metrics, prog_bar=True, on_epoch=True)
        self.test_metrics.reset()

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)


In [11]:
input_size = df.shape[1] - 2  # Исключаем 'id' и 'smoking'
model = SmokingClassifierV1(input_size)

# Модель V2

In [51]:
class SmokingClassifierV2(pl.LightningModule):
    def __init__(self, input_size):
        super(SmokingClassifierV2, self).__init__()
        
        self.model = nn.Sequential(
            # Первый слой: Linear -> BatchNorm -> ReLU
            nn.Linear(input_size, 128),
            nn.BatchNorm1d(128),  # Нормализация для улучшения сходимости
            nn.ReLU(),
            nn.Dropout(0.2),  # Dropout для предотвращения переобучения

            # Второй слой: Linear -> BatchNorm -> ReLU
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.3),

            # Третий слой: Linear -> BatchNorm -> ReLU
            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Dropout(0.3),
            
            # Четвертый слой: Linear
            nn.Linear(32, 16),
            nn.ReLU(),

            # Выходной слой для бинарной классификации
            nn.Linear(16, 1),
            nn.Sigmoid()
        )

        self.metrics = MetricCollection({
            'auroc': AUROC(task='binary'),
        })

        self.val_metrics = self.metrics.clone(prefix='val_')
        self.test_metrics = self.metrics.clone(prefix='test_')

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = F.binary_cross_entropy(y_hat, y.float())
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = F.binary_cross_entropy(y_hat, y.float())
        
        # Обновляем метрики
        self.val_metrics.update(y_hat, y.int())

        acc = ((y_hat > 0.5).int() == y.int()).float().mean()
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)

    def on_validation_epoch_end(self):
        val_metrics = self.val_metrics.compute()
        self.log_dict(val_metrics, prog_bar=True, on_epoch=True)
        self.val_metrics.reset()

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = F.binary_cross_entropy(y_hat, y.float())
        
        # Обновляем метрики
        self.test_metrics.update(y_hat, y.int())

        acc = ((y_hat > 0.5).int() == y.int()).float().mean()
        self.log('test_loss', loss, prog_bar=True)
        self.log('test_acc', acc, prog_bar=True)

    def on_test_epoch_end(self):
        test_metrics = self.test_metrics.compute()
        self.log_dict(test_metrics, prog_bar=True, on_epoch=True)
        self.test_metrics.reset()

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)

In [52]:
input_size = df.shape[1] - 2  # Исключаем 'id' и 'smoking'
model = SmokingClassifierV2(input_size)

# Модель V3

In [82]:
class SmokingClassifierV3(pl.LightningModule):
    def __init__(self, input_size):
        super(SmokingClassifierV3, self).__init__()
        
        self.model = nn.Sequential(
            # Первый слой: Linear -> ReLU
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            # Второй слой: Linear -> ReLU
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            # Третий слой: Linear -> ReLU
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            # Четвёртый слой: Linear -> ReLU
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            # Пятый слой: Linear -> ReLU
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            # Шестой слой: Linear -> ReLU
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            # Седьмой слой: Linear -> ReLU
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            # Восьмой слой: Linear -> ReLU
            nn.Linear(64, 16),
            nn.ReLU(),
            
            # Выходной слой: Linear -> Sigmoid
            nn.Linear(16, 1),
            nn.Sigmoid()
        )

        self.metrics = MetricCollection({
            'auroc': AUROC(task='binary'),
        })

        self.val_metrics = self.metrics.clone(prefix='val_')
        self.test_metrics = self.metrics.clone(prefix='test_')

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = F.binary_cross_entropy(y_hat, y.float())
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = F.binary_cross_entropy(y_hat, y.float())
        
        # Обновляем метрики
        self.val_metrics.update(y_hat, y.int())

        acc = ((y_hat > 0.5).int() == y.int()).float().mean()
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)

    def on_validation_epoch_end(self):
        val_metrics = self.val_metrics.compute()
        self.log_dict(val_metrics, prog_bar=True, on_epoch=True)
        self.val_metrics.reset()

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = F.binary_cross_entropy(y_hat, y.float())
        
        # Обновляем метрики
        self.test_metrics.update(y_hat, y.int())

        acc = ((y_hat > 0.5).int() == y.int()).float().mean()
        self.log('test_loss', loss, prog_bar=True)
        self.log('test_acc', acc, prog_bar=True)

    def on_test_epoch_end(self):
        test_metrics = self.test_metrics.compute()
        self.log_dict(test_metrics, prog_bar=True, on_epoch=True)
        self.test_metrics.reset()

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)

In [83]:
input_size = df.shape[1] - 2  # Исключаем 'id' и 'smoking'
model = SmokingClassifierV3(input_size)

# Модель V4

In [99]:
class SmokingClassifierV4(pl.LightningModule):
    def __init__(self, input_size):
        super(SmokingClassifierV4, self).__init__()
        
        self.embedding = nn.Sequential(
            nn.Linear(input_size, 256),         # Большая входная размерность
            nn.ReLU(),
            nn.BatchNorm1d(256),                # Нормализация для улучшения сходимости
            nn.Dropout(0.3)                     # Dropout для предотвращения переобучения
        )
        
        # Сетку для обработки нелинейных зависимостей
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        
        # Добавление слоёв внимания (Attention) для выделения важных признаков
        self.attention = nn.MultiheadAttention(embed_dim=32, num_heads=4, dropout=0.1)
        
        # Слой для вывода на одну категорию (курит/не курит)
        self.fc_out = nn.Linear(32, 1)
        
        self.sigmoid = nn.Sigmoid()  # Для бинарной классификации

        self.metrics = MetricCollection({
            'auroc': AUROC(task='binary'),
        })

        self.val_metrics = self.metrics.clone(prefix='val_')
        self.test_metrics = self.metrics.clone(prefix='test_')
    
    def forward(self, x):
        # Применяем embedding
        x = self.embedding(x)
        
        # Применяем полносвязные слои
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        
        # Применяем внимание для выделения наиболее важных признаков
        x = x.unsqueeze(0)  # Добавляем размерность для применения внимания
        x, _ = self.attention(x, x, x)
        x = x.squeeze(0)  # Убираем лишнюю размерность
        
        # Вывод
        x = self.fc_out(x)
        x = self.sigmoid(x)  # Получаем вероятность принадлежности к классу "курит"
        
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = F.binary_cross_entropy(y_hat, y.float())
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = F.binary_cross_entropy(y_hat, y.float())
        
        # Обновляем метрики
        self.val_metrics.update(y_hat, y.int())

        acc = ((y_hat > 0.5).int() == y.int()).float().mean()
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)

    def on_validation_epoch_end(self):
        val_metrics = self.val_metrics.compute()
        self.log_dict(val_metrics, prog_bar=True, on_epoch=True)
        self.val_metrics.reset()

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = F.binary_cross_entropy(y_hat, y.float())
        
        # Обновляем метрики
        self.test_metrics.update(y_hat, y.int())

        acc = ((y_hat > 0.5).int() == y.int()).float().mean()
        self.log('test_loss', loss, prog_bar=True)
        self.log('test_acc', acc, prog_bar=True)

    def on_test_epoch_end(self):
        test_metrics = self.test_metrics.compute()
        self.log_dict(test_metrics, prog_bar=True, on_epoch=True)
        self.test_metrics.reset()

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)

In [100]:
input_size = df.shape[1] - 2  # Исключаем 'id' и 'smoking'
model = SmokingClassifierV4(input_size)

# Модель V5

In [134]:
class SmokingClassifierV5(pl.LightningModule):
    def __init__(self, input_size):
        super(SmokingClassifierV5, self).__init__()
        
        self.model = nn.Sequential(
            # Первый слой: Linear -> ReLU
            nn.Linear(input_size, 1024),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            # Второй слой: Linear -> ReLU
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            # Третий слой: Linear -> ReLU
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            # Четвёртый слой: Linear -> ReLU
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            # Седьмой слой: Linear -> ReLU
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            # Седьмой слой: Linear -> ReLU
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            # Восьмой слой: Linear -> ReLU
            nn.Linear(32, 16),
            nn.ReLU(),
            
            # Выходной слой: Linear -> Sigmoid
            nn.Linear(16, 1),
            nn.Sigmoid()
        )

        self.metrics = MetricCollection({
            'auroc': AUROC(task='binary'),
        })

        self.val_metrics = self.metrics.clone(prefix='val_')
        self.test_metrics = self.metrics.clone(prefix='test_')

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = F.binary_cross_entropy(y_hat, y.float())
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = F.binary_cross_entropy(y_hat, y.float())
        
        # Обновляем метрики
        self.val_metrics.update(y_hat, y.int())

        acc = ((y_hat > 0.5).int() == y.int()).float().mean()
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)

    def on_validation_epoch_end(self):
        val_metrics = self.val_metrics.compute()
        self.log_dict(val_metrics, prog_bar=True, on_epoch=True)
        self.val_metrics.reset()

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = F.binary_cross_entropy(y_hat, y.float())
        
        # Обновляем метрики
        self.test_metrics.update(y_hat, y.int())

        acc = ((y_hat > 0.5).int() == y.int()).float().mean()
        self.log('test_loss', loss, prog_bar=True)
        self.log('test_acc', acc, prog_bar=True)

    def on_test_epoch_end(self):
        test_metrics = self.test_metrics.compute()
        self.log_dict(test_metrics, prog_bar=True, on_epoch=True)
        self.test_metrics.reset()

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)

In [135]:
input_size = df.shape[1] - 2  # Исключаем 'id' и 'smoking'
model = SmokingClassifierV5(input_size)

# Модель V6

In [166]:
class SmokingClassifierV6(pl.LightningModule):
    def __init__(self, input_size):
        super(SmokingClassifierV6, self).__init__()

        # Модель с дополнительными слоями
        self.model = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.BatchNorm1d(128),  # Нормализация для улучшения сходимости
            nn.ReLU(),
            nn.Dropout(0.2),  # Dropout для предотвращения переобучения

            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Dropout(0.3),
            
            nn.Linear(32, 16),
            nn.ReLU(),

            # Выходной слой для бинарной классификации
            nn.Linear(16, 1),
            nn.Sigmoid()  # Для бинарной классификации
        )

        # Метрики
        self.metrics = MetricCollection({
            'auroc': AUROC(task='binary'),
        })
        self.val_metrics = self.metrics.clone(prefix='val_')
        self.test_metrics = self.metrics.clone(prefix='test_')

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = F.binary_cross_entropy(y_hat, y.float())
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = F.binary_cross_entropy(y_hat, y.float())
        
        # Обновляем метрики на вероятностях (не бинарных значениях)
        self.val_metrics.update(y_hat, y.int())

        acc = ((y_hat > 0.5).int() == y.int()).float().mean()
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)

    def on_validation_epoch_end(self):
        val_metrics = self.val_metrics.compute()
        self.log_dict(val_metrics, prog_bar=True, on_epoch=True)
        self.val_metrics.reset()

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = F.binary_cross_entropy(y_hat, y.float())
        
        # Обновляем метрики
        self.test_metrics.update(y_hat, y.int())

        acc = ((y_hat > 0.5).int() == y.int()).float().mean()
        self.log('test_loss', loss, prog_bar=True)
        self.log('test_acc', acc, prog_bar=True)

    def on_test_epoch_end(self):
        test_metrics = self.test_metrics.compute()
        self.log_dict(test_metrics, prog_bar=True, on_epoch=True)
        self.test_metrics.reset()

    def configure_optimizers(self):
        # Добавление weight decay для L2 регуляризации
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3, weight_decay=1e-5)
        return optimizer

In [180]:
input_size = df.shape[1] - 2  # Исключаем 'id' и 'smoking'
model = SmokingClassifierV6(input_size)

# Модель V7

In [228]:
class SmokingClassifierV7(pl.LightningModule):
    def __init__(self, input_size):
        super(SmokingClassifierV7, self).__init__()

        self.model = nn.Sequential(
            # Первый слой: Linear -> BatchNorm -> LeakyReLU
            nn.Linear(input_size, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(0.1),  # Используем LeakyReLU
            nn.Dropout(0.2),

            # Остаточная связь (Residual connection)
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.3),

            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.3),

            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.4),

            nn.Linear(32, 16),
            nn.LeakyReLU(0.1),

            # Выходной слой
            nn.Linear(16, 1),
            nn.Sigmoid()  # Для бинарной классификации
        )

        # Метрики
        self.metrics = MetricCollection({
            'auroc': AUROC(task='binary'),
        })
        self.val_metrics = self.metrics.clone(prefix='val_')
        self.test_metrics = self.metrics.clone(prefix='test_')

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = F.binary_cross_entropy(y_hat, y.float())
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = F.binary_cross_entropy(y_hat, y.float())
        
        # Обновляем метрики на вероятностях (не бинарных значениях)
        self.val_metrics.update(y_hat, y.int())

        acc = ((y_hat > 0.5).int() == y.int()).float().mean()
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)

    def on_validation_epoch_end(self):
        val_metrics = self.val_metrics.compute()
        self.log_dict(val_metrics, prog_bar=True, on_epoch=True)
        self.val_metrics.reset()

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = F.binary_cross_entropy(y_hat, y.float())
        
        # Обновляем метрики
        self.test_metrics.update(y_hat, y.int())

        acc = ((y_hat > 0.5).int() == y.int()).float().mean()
        self.log('test_loss', loss, prog_bar=True)
        self.log('test_acc', acc, prog_bar=True)

    def on_test_epoch_end(self):
        test_metrics = self.test_metrics.compute()
        self.log_dict(test_metrics, prog_bar=True, on_epoch=True)
        self.test_metrics.reset()

    def configure_optimizers(self):
        # Оптимизатор
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3, weight_decay=1e-5)
        
        # Scheduler для уменьшения learning rate
        scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=2, verbose=True)

        # Возвращаем оптимизатор и scheduler как список
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'monitor': 'val_loss',  # Метрика, которую нужно отслеживать
            }
        }

In [229]:
input_size = df.shape[1] - 2  # Исключаем 'id' и 'smoking'
model = SmokingClassifierV7(input_size)

# Обучение модели

Обучим модель в течении 10 эпох

In [230]:
trainer = pl.Trainer(
    max_epochs=11, 
    callbacks=[TQDMProgressBar(refresh_rate=10), test_callback],
    logger=TensorBoardLogger(save_dir='lightning_logs', name='smoking_classifier', version='v7')
)

trainer.fit(model, data_module)
trainer.test(model, datamodule=data_module)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name         | Type             | Params | Mode 
----------------------------------------------------------
0 | model        | Sequential       | 50.6 K | train
1 | metrics      | MetricCollection | 0      | train
2 | val_metrics  | MetricCollection | 0      | train
3 | test_metrics | MetricCollection | 0      | train
----------------------------------------------------------
50.6 K    Trainable params
0         Non-trainable params
50.6 K    Total params
0.202     Total estimated model params size (MB)
27        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

d:\Documents\ITMO\Semestr_1\dl_course_2024\.venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
d:\Documents\ITMO\Semestr_1\dl_course_2024\.venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=11` reached.
d:\Documents\ITMO\Semestr_1\dl_course_2024\.venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc            0.8111110925674438
       test_auroc           0.8923875093460083
        test_loss           0.39220476150512695
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.39220476150512695,
  'test_acc': 0.8111110925674438,
  'test_auroc': 0.8923875093460083}]

# Дообучение модели

В этом пункте я постоянно подгружал чекпоинты для дообучения моделей.

Укажем путь к чекпоинту

In [236]:
checkpoint_path = './lightning_logs/smoking_classifier/v7/checkpoints/epoch=100-step=33229.ckpt'

In [237]:
trainer = pl.Trainer(
    max_epochs=201,  # Увеличь количество эпох
    callbacks=[TQDMProgressBar(refresh_rate=10), test_callback],
    logger=TensorBoardLogger(save_dir='lightning_logs', name='smoking_classifier', version='v7')
)

# Продолжаем обучение с чекпойнта
trainer.fit(model, data_module, ckpt_path=checkpoint_path)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
d:\Documents\ITMO\Semestr_1\dl_course_2024\.venv\lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:654: Checkpoint directory lightning_logs\smoking_classifier\v7\checkpoints exists and is not empty.
Restoring states from the checkpoint path at ./lightning_logs/smoking_classifier/v7/checkpoints/epoch=100-step=33229.ckpt

  | Name         | Type             | Params | Mode 
----------------------------------------------------------
0 | model        | Sequential       | 50.6 K | train
1 | metrics      | MetricCollection | 0      | train
2 | val_metrics  | MetricCollection | 0      | train
3 | test_metrics | MetricCollection | 0      | train
----------------------------------------------------------
50.6 K    Trainable params
0         Non-trainable params
50.6 K    Total params
0.202     Total estimated model params size (MB)
27        Modules in train mode
0     

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

d:\Documents\ITMO\Semestr_1\dl_course_2024\.venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
d:\Documents\ITMO\Semestr_1\dl_course_2024\.venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=201` reached.


Проверка на тесте

In [238]:
trainer.test(model, datamodule=data_module)

d:\Documents\ITMO\Semestr_1\dl_course_2024\.venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc            0.7933333516120911
       test_auroc           0.8753673434257507
        test_loss           0.4135120213031769
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.4135120213031769,
  'test_acc': 0.7933333516120911,
  'test_auroc': 0.8753673434257507}]

# Предсказания

Подготовим данные для предсказания

In [240]:
ids = df_pred['id']
X_pred = df_pred.drop(columns=['id']).values

Загрузим нужную версию модели для предсказания

In [241]:
model = SmokingClassifierV7(input_size=X_pred.shape[1])
checkpoint = torch.load('./lightning_logs/smoking_classifier/v7/checkpoints/epoch=10-step=3619.ckpt')
model.load_state_dict(checkpoint['state_dict'])
model.eval()

  checkpoint = torch.load('./lightning_logs/smoking_classifier/v7/checkpoints/epoch=10-step=3619.ckpt')


SmokingClassifierV7(
  (model): Sequential(
    (0): Linear(in_features=22, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): LeakyReLU(negative_slope=0.1)
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): LeakyReLU(negative_slope=0.1)
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=128, out_features=64, bias=True)
    (9): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): LeakyReLU(negative_slope=0.1)
    (11): Dropout(p=0.3, inplace=False)
    (12): Linear(in_features=64, out_features=32, bias=True)
    (13): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (14): LeakyReLU(negative_slope=0.1)
    (15): Dropout(p=0.4, inplace=False)
    (16): Linear(in_features=32, out

Предскажем

In [242]:
X_tensor = torch.tensor(X_pred, dtype=torch.float32)
with torch.no_grad():
    predictions = model(X_tensor).squeeze().numpy()
# predictions = (predictions > 0.5).astype(int).astype(float)

Сохраним результаты предсказания

In [243]:
output_df = pd.DataFrame({'id': ids, 'smoking': predictions})
output_df.to_csv('./data_output/predictions15.csv', index=False)

# Анализ

Здесь проводились сравнения версий моделей

In [239]:
%reload_ext tensorboard
%tensorboard --logdir lightning_logs/

Reusing TensorBoard on port 6006 (pid 14064), started 2:18:20 ago. (Use '!kill 14064' to kill it.)