In [1]:
import time
import pandas as pd

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
BATCH_SIZE = 2000
NUM_EPOCHS = 1000

In [3]:
class CustomDataset(Dataset):
    def __init__(self, data, targets):
        self.data = torch.tensor(data, dtype=torch.float32).cuda()
        self.targets = torch.tensor(targets, dtype=torch.float32).cuda()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.targets[idx]

In [4]:
class RegressionModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RegressionModel, self).__init__()
        self.Embedding1 = nn.Embedding(num_embeddings=11, embedding_dim=1)
        self.Embedding2 = nn.Embedding(num_embeddings=26, embedding_dim=1)
        self.Embedding3 = nn.Embedding(num_embeddings=13, embedding_dim=1)
        self.Embedding4 = nn.Embedding(num_embeddings=2, embedding_dim=1)
        self.Embedding5 = nn.Embedding(num_embeddings=101, embedding_dim=1)
        self.Embedding6 = nn.Embedding(num_embeddings=10, embedding_dim=1)
        self.Embedding7 = nn.Embedding(num_embeddings=22, embedding_dim=1)
        self.Embedding8 = nn.Embedding(num_embeddings=13, embedding_dim=1)
        self.Embedding9 = nn.Embedding(num_embeddings=8, embedding_dim=1)
        self.Embedding10 = nn.Embedding(num_embeddings=2, embedding_dim=1)
        self.Embedding11 = nn.Embedding(num_embeddings=26, embedding_dim=1)
        self.Embedding12 = nn.Embedding(num_embeddings=8, embedding_dim=1)
        self.Embedding13 = nn.Embedding(num_embeddings=2, embedding_dim=1)
        
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # print(x.shape)
        # 
        # x13 = self.Embedding13(x[:, 0])
        # x1 = self.Embedding1(x[:, 1])
        # x2 = self.Embedding2(x[:, 2])
        # x3 = self.Embedding3(x[:, 3])
        # x4 = self.Embedding4(x[:, 4])
        # x5 = self.Embedding5(x[:, 5])
        # x6 = self.Embedding6(x[:, 6])
        # x7 = self.Embedding7(x[:, 7])
        # x8 = self.Embedding8(x[:, 8])
        # x9 = self.Embedding9(x[:, 9])
        # x10 = self.Embedding10(x[:, 10])
        # x11 = self.Embedding11(x[:, 11])
        # x12 = self.Embedding12(x[:, 12])
        # x = torch.cat([x13, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12], dim=1)
        # print(x.shape)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [5]:
def data_reader():
    all = pd.read_csv('../../data/raw/all.csv')
    all = all.drop(columns=['IMR', 'ID'])

    all['MRG'] = all['MRG'].apply(lambda x: x if x in [91, 92, 93, 94, 95, 96, 97] else 90)

    for k in all['REL'].value_counts().keys()[-1:]:
        all = all[all['REL'] != k]

    for k in all['WORKPLACE'].value_counts().keys()[-2:]:
        all = all[all['WORKPLACE'] != k]

    ITM40_mean, ITM40_std = all['ITM40'].mean(), all['ITM40'].std()
    all['ITM40'] = (all['ITM40'] - ITM40_mean) / ITM40_std

    X, Y = all.iloc[:, :-1], all.iloc[:, -1]
    return X.values, Y.values

In [6]:
def train(model, dataloader, criterion, optimizer):
    model.train()
    total_loss = 0.0
    mae_loss = 0.0

    for inputs, targets in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets.view(-1, 1))
        loss.backward()
        optimizer.step()
        mae = torch.mean(torch.abs(outputs - targets.view(-1, 1)))
        mae_loss += mae.item()
        total_loss += loss.item()

    average_loss = total_loss / len(dataloader)
    mae_loss = mae_loss / len(dataloader)
    return average_loss, mae_loss

In [7]:
def test(model, dataloader, criterion):
    model.eval()
    mae_loss = 0.0
    total_loss = 0.0

    with torch.no_grad():
        for inputs, targets in dataloader:
            outputs = model(inputs)
            loss = criterion(outputs, targets.view(-1, 1))
            total_loss += loss.cpu().item()
            mae = torch.mean(torch.abs(outputs - targets.view(-1, 1)))
            mae_loss += mae.item()

    average_loss = total_loss / len(dataloader)
    mae_loss = mae_loss / len(dataloader)
    return average_loss, mae_loss

In [8]:
X, y = data_reader()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [9]:
train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=False)
test_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

model = RegressionModel(13, 128, 1).cuda()
criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.003)

In [10]:
epoch_list = []
train_loss_list = []
test_loss_list = []
mae_train_loss_list = []
mae_test_loss_list = []
for epoch in range(NUM_EPOCHS):
    time_start = time.time()
    train_loss, mae_train_loss = train(model, train_dataloader, criterion, optimizer)
    test_loss, mae_test_loss = test(model, test_dataloader, criterion)

    epoch_list.append(epoch)
    train_loss_list.append(train_loss)
    test_loss_list.append(test_loss)
    mae_train_loss_list.append(mae_train_loss)
    mae_test_loss_list.append(mae_test_loss)

    pd.DataFrame(
        {'epoch_list': epoch_list, 'train_loss_list': train_loss_list, 'test_loss_list': test_loss_list, 'mae_train_loss_list': mae_train_loss_list, 'mae_test_loss_list': mae_test_loss_list}).to_csv('without_onehot_nn_log.csv', index=False)

    print(
        f'Cost: {time.time() - time_start}, Epoch [{epoch + 1}/{NUM_EPOCHS}], Train Loss Mse: {train_loss}, Test Loss Mse: {test_loss}, Train Loss Mae: {mae_train_loss}, Test Loss Mae: {mae_test_loss}')

Cost: 4.298880100250244, Epoch [1/1000], Train Loss Mse: 327.1321972812343, Test Loss Mse: 2.1693482633329864, Train Loss Mae: 9.201171758847359, Test Loss Mae: 1.1281211253924248
Cost: 2.554266929626465, Epoch [2/1000], Train Loss Mse: 1.3283288621494913, Test Loss Mse: 0.9575436313947042, Train Loss Mae: 0.7795184494083763, Test Loss Mae: 0.5768194157853086
Cost: 2.7900190353393555, Epoch [3/1000], Train Loss Mse: 0.9100576985595573, Test Loss Mse: 0.8822427570310414, Train Loss Mae: 0.5533119967350593, Test Loss Mae: 0.5512405598265493
Cost: 2.5259811878204346, Epoch [4/1000], Train Loss Mse: 0.8736788579541394, Test Loss Mse: 0.8659546517918253, Train Loss Mae: 0.5354689204285288, Test Loss Mae: 0.5395563741525015
Cost: 2.6042659282684326, Epoch [5/1000], Train Loss Mse: 0.8593088729768736, Test Loss Mse: 0.8569078272224492, Train Loss Mae: 0.5289554565380781, Test Loss Mae: 0.5448648114489694
Cost: 2.5467095375061035, Epoch [6/1000], Train Loss Mse: 0.8476138774655823, Test Loss M