In [1]:
import time
import pandas as pd

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
BATCH_SIZE = 20000
NUM_EPOCHS = 1000

In [3]:
class CustomDataset(Dataset):
    def __init__(self, data, targets):
        self.data = torch.tensor(data, dtype=torch.float32).cuda()
        self.targets = torch.tensor(targets, dtype=torch.float32).cuda()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.targets[idx]

In [4]:
class RegressionModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RegressionModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [5]:
def data_reader():
    all = pd.read_csv('../../data/raw/all.csv')
    all = all.drop(columns=['IMR', 'ID'])

    all['MRG'] = all['MRG'].apply(lambda x: x if x in [91, 92, 93, 94, 95, 96, 97] else 90)

    for k in all['REL'].value_counts().keys()[-1:]:
        all = all[all['REL'] != k]

    for k in all['WORKPLACE'].value_counts().keys()[-2:]:
        all = all[all['WORKPLACE'] != k]

    ITM40_mean, ITM40_std = all['ITM40'].mean(), all['ITM40'].std()
    all['ITM40'] = (all['ITM40'] - ITM40_mean) / ITM40_std

    X, Y = all.iloc[:, :-1], all.iloc[:, -1]

    df_encoded = pd.get_dummies(X, columns=list(X.columns), prefix='Prefix').values
    return df_encoded, Y.values

In [6]:
def train(model, dataloader, criterion, optimizer):
    model.train()
    total_loss = 0.0
    mae_loss = 0.0

    for inputs, targets in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets.view(-1, 1))
        loss.backward()
        optimizer.step()
        mae = torch.mean(torch.abs(outputs - targets.view(-1, 1)))
        mae_loss += mae.item()
        total_loss += loss.item()

    average_loss = total_loss / len(dataloader)
    mae_loss = mae_loss / len(dataloader)
    return average_loss, mae_loss

In [7]:
def test(model, dataloader, criterion):
    model.eval()
    mae_loss = 0.0
    total_loss = 0.0

    with torch.no_grad():
        for inputs, targets in dataloader:
            outputs = model(inputs)
            loss = criterion(outputs, targets.view(-1, 1))
            total_loss += loss.cpu().item()
            mae = torch.mean(torch.abs(outputs - targets.view(-1, 1)))
            mae_loss += mae.item()

    average_loss = total_loss / len(dataloader)
    mae_loss = mae_loss / len(dataloader)
    return average_loss, mae_loss

In [8]:
X, y = data_reader()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [9]:
train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=False)
test_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

model = RegressionModel(244, 128, 1).cuda()
criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.003)

In [10]:
epoch_list = []
train_loss_list = []
test_loss_list = []
mae_train_loss_list = []
mae_test_loss_list = []
for epoch in range(NUM_EPOCHS):
    time_start = time.time()
    train_loss, mae_train_loss = train(model, train_dataloader, criterion, optimizer)
    test_loss, mae_test_loss = test(model, test_dataloader, criterion)

    epoch_list.append(epoch)
    train_loss_list.append(train_loss)
    test_loss_list.append(test_loss)
    mae_train_loss_list.append(mae_train_loss)
    mae_test_loss_list.append(mae_test_loss)

    pd.DataFrame(
        {'epoch_list': epoch_list, 'train_loss_list': train_loss_list, 'test_loss_list': test_loss_list, 'mae_train_loss_list': mae_train_loss_list, 'mae_test_loss_list': mae_test_loss_list}).to_csv('nn_log.csv', index=False)

    print(
        f'Cost: {time.time() - time_start}, Epoch [{epoch + 1}/{NUM_EPOCHS}], Train Loss Mse: {train_loss}, Test Loss Mse: {test_loss}, Train Loss Mae: {mae_train_loss}, Test Loss Mae: {mae_test_loss}')

Cost: 4.957527160644531, Epoch [1/1000], Train Loss Mse: 0.7912675539652506, Test Loss Mse: 0.6546467592318853, Train Loss Mae: 0.48101823776960373, Test Loss Mae: 0.43906354159116745
Cost: 3.168949604034424, Epoch [2/1000], Train Loss Mse: 0.6193532943725586, Test Loss Mse: 0.5832663749655088, Train Loss Mae: 0.4124184673031171, Test Loss Mae: 0.3714194546143214
Cost: 3.2317755222320557, Epoch [3/1000], Train Loss Mse: 0.5695970207452774, Test Loss Mse: 0.5570312639077505, Train Loss Mae: 0.3693689952294032, Test Loss Mae: 0.3673633262515068
Cost: 3.076153039932251, Epoch [4/1000], Train Loss Mse: 0.5605082760254542, Test Loss Mse: 0.5476653675238291, Train Loss Mae: 0.3654800703128179, Test Loss Mae: 0.3614436164498329
Cost: 3.1902503967285156, Epoch [5/1000], Train Loss Mse: 0.5491095383961996, Test Loss Mse: 0.541009247303009, Train Loss Mae: 0.3599667400121689, Test Loss Mae: 0.360446664194266
Cost: 3.060882806777954, Epoch [6/1000], Train Loss Mse: 0.5408298621575037, Test Loss M