In [1]:
import time
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
BATCH_SIZE = 2000
NUM_EPOCHS = 1000

In [3]:
class CustomDataset(Dataset):
    def __init__(self, data, targets):
        self.data = torch.tensor(data, dtype=torch.long).cuda()
        self.targets = torch.tensor(targets, dtype=torch.long).cuda()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.targets[idx]

In [4]:
class RegressionModel(nn.Module):
    def __init__(self, embedding_sizes, hidden_size=128, output_size=1):
        super(RegressionModel, self).__init__()

        self.embeddings = nn.ModuleList([nn.Embedding(num_embeddings=num, embedding_dim=embedding_dim)
                                         for num, embedding_dim in embedding_sizes])
        
        self.fc1 = nn.Linear(sum([embedding_dim for _, embedding_dim in embedding_sizes]), out_features=hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, categorical_inputs):
        # print(categorical_inputs.shape, categorical_inputs.dtype)
        embedding_outputs = [embedding(categorical_inputs[:, i]) for i, embedding in enumerate(self.embeddings)]
        embedding_outputs = torch.cat(embedding_outputs, dim=1)
        # print(embedding_outputs.shape, embedding_outputs.dtype)
        
        x = self.relu(self.fc1(embedding_outputs))
        x = self.fc2(x)

        return x

In [5]:
def data_reader():
    all = pd.read_csv('../../data/raw/all.csv')
    all = all.drop(columns=['IMR', 'ID'])

    all['MRG'] = all['MRG'].apply(lambda x: x if x in [91, 92, 93, 94, 95, 96, 97] else 90)

    for k in all['REL'].value_counts().keys()[-1:]:
        all = all[all['REL'] != k]

    for k in all['WORKPLACE'].value_counts().keys()[-2:]:
        all = all[all['WORKPLACE'] != k]

    ITM40_mean, ITM40_std = all['ITM40'].mean(), all['ITM40'].std()
    all['ITM40'] = (all['ITM40'] - ITM40_mean) / ITM40_std

    X, Y = all.iloc[:, :-1], all.iloc[:, -1]
    X = OrdinalEncoder().fit_transform(X)
    return X, Y.values

In [6]:
def train(model, dataloader, criterion, optimizer):
    model.train()
    total_loss = 0.0
    mae_loss = 0.0

    for inputs, targets in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets.float().view(-1, 1))
        loss.backward()
        optimizer.step()
        mae = torch.mean(torch.abs(outputs - targets.float().view(-1, 1)))
        mae_loss += mae.item()
        total_loss += loss.item()

    average_loss = total_loss / len(dataloader)
    mae_loss = mae_loss / len(dataloader)
    return average_loss, mae_loss

In [7]:
def test(model, dataloader, criterion):
    model.eval()
    mae_loss = 0.0
    total_loss = 0.0

    with torch.no_grad():
        for inputs, targets in dataloader:
            outputs = model(inputs)
            loss = criterion(outputs, targets.float().view(-1, 1))
            total_loss += loss.cpu().item()
            mae = torch.mean(torch.abs(outputs - targets.float().view(-1, 1)))
            mae_loss += mae.item()

    average_loss = total_loss / len(dataloader)
    mae_loss = mae_loss / len(dataloader)
    return average_loss, mae_loss

In [8]:
X, y = data_reader()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [9]:
train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=False)
test_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

embedding_sizes = [(11+1, 1), (26+1, 1), (13+1, 1), (2+1, 1), (101+1, 1), (10+1, 1), (22+1, 1), (13+1, 1), (8+1, 1), (2+1, 1), (26+1, 1),
                   (8+1, 1), (2+1, 1)]
model = RegressionModel(embedding_sizes, 128, 1).cuda()
criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.003)

In [10]:
epoch_list = []
train_loss_list = []
test_loss_list = []
mae_train_loss_list = []
mae_test_loss_list = []
for epoch in range(NUM_EPOCHS):
    time_start = time.time()
    train_loss, mae_train_loss = train(model, train_dataloader, criterion, optimizer)
    test_loss, mae_test_loss = test(model, test_dataloader, criterion)

    epoch_list.append(epoch)
    train_loss_list.append(train_loss)
    test_loss_list.append(test_loss)
    mae_train_loss_list.append(mae_train_loss)
    mae_test_loss_list.append(mae_test_loss)

    pd.DataFrame(
        {'epoch_list': epoch_list, 'train_loss_list': train_loss_list, 'test_loss_list': test_loss_list,
         'mae_train_loss_list': mae_train_loss_list, 'mae_test_loss_list': mae_test_loss_list}).to_csv(
        'embedding_nn_log.csv', index=False)

    print(
        f'Cost: {time.time() - time_start}, Epoch [{epoch + 1}/{NUM_EPOCHS}], Train Loss Mse: {train_loss}, Test Loss Mse: {test_loss}, Train Loss Mae: {mae_train_loss}, Test Loss Mae: {mae_test_loss}')

Cost: 4.904807090759277, Epoch [1/1000], Train Loss Mse: 0.48124754721792334, Test Loss Mse: 0.4549781166844898, Train Loss Mae: 0.26667934872655785, Test Loss Mae: 0.24200432104432684
Cost: 2.708998680114746, Epoch [2/1000], Train Loss Mse: 0.44278900338034344, Test Loss Mse: 0.4320857789781358, Train Loss Mae: 0.2344879015133931, Test Loss Mae: 0.2163223313470172
Cost: 2.8661837577819824, Epoch [3/1000], Train Loss Mse: 0.42554825136804175, Test Loss Mse: 0.41955015356214637, Train Loss Mae: 0.2261809080074995, Test Loss Mae: 0.2493305025447128
Cost: 2.8656115531921387, Epoch [4/1000], Train Loss Mse: 0.41013421207411677, Test Loss Mse: 0.40617867399994123, Train Loss Mae: 0.2192781889795238, Test Loss Mae: 0.21134484412833157
Cost: 2.922576904296875, Epoch [5/1000], Train Loss Mse: 0.4038781929984052, Test Loss Mse: 0.398827905074144, Train Loss Mae: 0.2185612303069514, Test Loss Mae: 0.23267989624769259
Cost: 2.8026657104492188, Epoch [6/1000], Train Loss Mse: 0.410116898198413, Te