## Import

In [53]:
import random

import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import StratifiedKFold
from torch import Tensor
from torch.utils.data import DataLoader, Dataset, Subset
from tqdm.notebook import tqdm


## Functions

In [54]:
# 乱数シード固定（再現性の担保）
def fix_seed(seed) -> None:
    # random
    random.seed(seed)
    # numpy
    np.random.seed(seed)
    # pytorch
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [55]:
# データローダーのサブプロセスの乱数seedが固定
def worker_init_fn(worker_id) -> None:
    np.random.seed(np.random.get_state()[1][0] + worker_id)

In [63]:
class Mydataset(Dataset):
    def __init__(self, df, x_columns, y_columns) -> None:
        self._df = df
        self.x_columns = x_columns
        self.y_columns = y_columns

    def __len__(self) -> int:
        return self._df.select(pl.len()).to_numpy()[0][0]

    def __getitem__(self, idx) -> tuple[Tensor, Tensor]:
        features = np.array(self._df.select(self.x_columns).row(idx)[:-1])
        target = np.array(self._df.select(self.y_columns).row(idx)[-1])
        return torch.tensor(features, dtype=torch.float32), torch.tensor(
            target, dtype=torch.float32
        )
    @property
    def y(self) -> Tensor:
        return self._df.select(self.y_columns).to_torch()

In [57]:
# モデルの定義
class Mymodel(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 16, 3, 2, 1), nn.BatchNorm2d(16), nn.ReLU()
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 64, 3, 2, 1), nn.BatchNorm2d(64), nn.ReLU()
        )

        self.fc1 = nn.Linear(2 * 2 * 64, 100)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = torch.nn.Linear(100, 2)

    def forward(self, x) -> Tensor:
        x = self.conv1(x)
        x = self.conv2(x)
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x


## Settings

In [58]:
# リソースの指定（CPU/GPU）
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
seed = 42
fix_seed(seed)

## Process

### Read Data

In [66]:
df = pl.read_csv("/workspace/data/origin/criteo.csv")
y_column = "visit"
X_columns = df.columns.remove(y_column)
dataset = Mydataset(df, X_columns, y_column)

In [69]:
df.to_torch()

tensor([[12.6164, 10.0597,  8.9764,  ...,  0.0000,  0.0000,  0.0000],
        [12.6164, 10.0597,  9.0027,  ...,  0.0000,  0.0000,  0.0000],
        [12.6164, 10.0597,  8.9648,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [12.6164, 10.0597,  8.9422,  ...,  0.0000,  0.0000,  0.0000],
        [24.5558, 10.0597,  8.2144,  ...,  0.0000,  0.0000,  0.0000],
        [26.6716, 10.0597,  8.2144,  ...,  0.0000,  0.0000,  0.0000]],
       dtype=torch.float64)

In [68]:
dataset.y()

AttributeError: 'DataFrame' object has no attribute 'to_tensor'

In [65]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
for _fold, (train_index, valid_index) in enumerate(skf.split(range(len(dataset)))):
    train_dataset = Subset(dataset, train_index)
    valid_dataset = Subset(dataset, valid_index)

    # データローダーの作成
    train_loader = DataLoader(
        train_dataset,
        batch_size=16,  # バッチサイズ
        shuffle=True,  # データシャッフル
        num_workers=2,  # 高速化
        pin_memory=True,  # 高速化
        worker_init_fn=worker_init_fn,
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=16,
        shuffle=False,
        num_workers=2,
        pin_memory=True,
        worker_init_fn=worker_init_fn,
    )

AttributeError: 'LazyFrame' object has no attribute 'to_numpy'

In [None]:
# モデル・損失関数・最適化アルゴリスムの設定
model = Mymodel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)


# モデル訓練関数
def train_model(model, train_loader, test_loader):
    # Train loop ----------------------------
    model.train()  # 学習モードをオン
    train_batch_loss = []
    for data, label in train_loader:
        # GPUへの転送
        data, label = data.to(device), label.to(device)
        # 1. 勾配リセット
        optimizer.zero_grad()
        # 2. 推論
        output = model(data)
        # 3. 誤差計算
        loss = criterion(output, label)
        # 4. 誤差逆伝播
        loss.backward()
        # 5. パラメータ更新
        optimizer.step()
        # train_lossの取得
        train_batch_loss.append(loss.item())

    # Test(val) loop ----------------------------
    model.eval()  # 学習モードをオフ
    test_batch_loss = []
    with torch.no_grad():  # 勾配を計算なし
        for data, label in test_loader:
            data, label = data.to(device), label.to(device)
            output = model(data)
            loss = criterion(output, label)
            test_batch_loss.append(loss.item())

    return model, np.mean(train_batch_loss), np.mean(test_batch_loss)

In [None]:
# 訓練の実行
epoch = 100
train_loss = []
test_loss = []

for epoch in tqdm(range(epoch)):
    model, train_l, test_l = train_model(model)
    train_loss.append(train_l)
    test_loss.append(test_loss)
    # 10エポックごとにロスを表示
    if epoch % 10 == 0:
        print(
            "Train loss: {a:.3f}, Test loss: {b:.3f}".format(
                a=train_loss[-1], b=test_loss[-1]
            )
        )

# 学習状況（ロス）の確認
plt.plot(train_loss, label="train_loss")
plt.plot(test_loss, label="test_loss")
plt.legend()