In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import copy
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import time
from tqdm.auto import tqdm
from IPython.display import clear_output

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
start_time = time.time()

## Hyperparameters and Model

In [2]:
train_val_ratio = 0.9999

epoch = 500
batch_size = 16
save_best = False

In [3]:
class Model(nn.Module):
    def __init__(self, input_shape):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_shape, 32),
            nn.ELU(),
            nn.Linear(32, 64),
            nn.ELU(),
            nn.Linear(64, 1),
        )

    def forward(self, x):
        return self.layers(x)

## PATH
change path to yours

In [4]:
MODEL_PATH = './models/model.pt'
TRAIN_PATH = './train.csv'
TEST_PATH = './test.csv'

## Load Data
load data and filled missing value with mean / drop [`id`] [`product_code`]

In [5]:
train_df = pd.read_csv(f'{TRAIN_PATH}')
test_df = pd.read_csv(f'{TEST_PATH}')

In [6]:
def clean(data):
    le = LabelEncoder()

    # Replace str by int using LabelEncoder
    data_le = copy.deepcopy(data)

    cols = ['attribute_0', 'attribute_1']
    for col in cols:
        data_le[col] = le.fit_transform(data_le[col])

    data_le = data_le.drop(['id', 'product_code'], axis=1)

    # filled missing values
    imputer = SimpleImputer(strategy='mean')
    final_data = pd.DataFrame(imputer.fit_transform(data_le))

    final_data.columns = data_le.columns

    return final_data

In [7]:
train_df = clean(train_df)
test_df_clean = clean(test_df)

train_df.head(5)

Unnamed: 0,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,80.1,1.0,2.0,9.0,5.0,7.0,8.0,4.0,18.04,12.518,...,10.672,15.859,17.594,15.193,15.029,16.048444,13.034,14.684,764.1,0.0
1,84.89,1.0,2.0,9.0,5.0,14.0,3.0,3.0,18.213,11.54,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0.0
2,82.43,1.0,2.0,9.0,5.0,12.0,1.0,5.0,18.057,11.652,...,12.715,15.607,19.172085,13.798,16.711,18.631,14.094,17.946,663.376,0.0
3,101.07,1.0,2.0,9.0,5.0,13.0,2.0,6.0,17.295,11.188,...,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0.0
4,188.06,1.0,2.0,9.0,5.0,9.0,2.0,8.0,19.346,12.95,...,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0.0


## Prepare train data
split train data to train and val
use dataloader to load data

In [8]:
class TaskDataset(Dataset):
    def __init__(self, data, return_y=True):
        self.data = data
        self.return_y = return_y

    def __getitem__(self, index):
        if self.return_y == True:
            x = self.data[index][:-1]
            y = self.data[index][-1]
            return torch.FloatTensor(x), torch.FloatTensor(torch.from_numpy(np.array(y, dtype=np.float32)))
        else:
            x = self.data[index]
            return torch.FloatTensor(x)

    def __len__(self):
        return len(self.data)

In [9]:
data_ds = {}
train_np = train_df.to_numpy()

train_data, val_data = train_test_split(train_df, train_size=train_val_ratio)

data_ds['train'] = TaskDataset(np.array(train_data))
data_ds['val'] = TaskDataset(np.array(val_data))

dataloaders = {
    x: torch.utils.data.DataLoader(
        data_ds[x],
        batch_size=batch_size,
        shuffle=True) for x in [
        'train',
        'val']}

In [10]:
dataset_sizes = {x: len(data_ds[x]) for x in ['train', 'val']}
print(dataset_sizes)

{'train': 26567, 'val': 3}


## Train Model
**Model**: input_shape -> 32 -> 64 -> 1
**optimizer**: Adam( lr=0.001, betas=( 0.9, 0.999), eps=1e-08 )

In [11]:
train_accuracy = []
train_loss = []
val_accuracy = []
val_loss = []

In [12]:
def calculate_acc(y_pred, y_test):
    y_pred = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred == y_test).sum().float()
    acc = correct_results_sum / y_test.shape[0]
    acc = torch.round(acc * 100)

    return acc

In [13]:
def train_model(
        model,
        criterion,
        dataloaders,
        optimizer,
        num_epochs=25,
        enable_tqdm=False):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    best_epoch = 0
    min_loss = float('inf')

    for epoch in range(num_epochs):
        # print(f'Epoch {epoch}/{num_epochs - 1}')
        # print('-' * 10)
        epoch_since = time.time()
        loss_history = []
        acc_history = []
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            # Iterate over data.
            if enable_tqdm:
                t = tqdm(enumerate(dataloaders[phase]), total=len(dataloaders[phase]))
            else:
                t = enumerate(dataloaders[phase])
            for i, (x_train, y_train) in t:
                x_train = x_train.to(device)
                y_train = y_train.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    y_pred = model(x_train)
                    y_train = y_train.unsqueeze(-1)
                    loss = criterion(y_pred, y_train)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_train_acc = calculate_acc(y_pred, y_train).item()
                acc_history.append(running_train_acc)
                loss_history.append(loss.item())
                # tqdm settings
                if enable_tqdm:
                    # epoch_loss = torch.mean(torch.Tensor(loss_history)).item()
                    # epoch_acc = torch.mean(torch.Tensor(acc_history)).item()
                    t.set_description(f'epoch_{epoch}/{num_epochs} {phase}\t')
                    # t.set_description(f'epoch_{epoch} {phase}  \t**Acc={epoch_acc/100:.4f}**  Loss={epoch_loss:.4f}')

            epoch_loss = torch.mean(torch.Tensor(loss_history)).item()
            epoch_acc = torch.mean(torch.Tensor(acc_history)).item() / 100
            if epoch % 10 == 0 and enable_tqdm == False:
                print(f'epoch_{epoch} {phase} Loss: {epoch_loss:.7f} Acc: {epoch_acc:.4f}')
            elif enable_tqdm:
                clear_output(wait=True)

            # loop.set_description(f'Epoch [{epoch}/{num_epoch}]')
            # loop.set_postfix(loss=loss.item(), acc=running_train_acc)
            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                min_loss = epoch_loss
                best_epoch = epoch
                best_model_wts = copy.deepcopy(model.state_dict())

            # record loss and accuracy
            if phase == 'train':
                train_accuracy.append(float(epoch_acc))
                train_loss.append(float(epoch_loss))
            elif phase == 'val':
                val_accuracy.append(float(epoch_acc))
                val_loss.append(float(epoch_loss))

        epoch_time_elapsed = time.time() - epoch_since
        # tqdm.write(
        #     f'Time elapsed {epoch_time_elapsed // 60:.0f}m {epoch_time_elapsed % 60:.0f}s\n')

    time_elapsed = time.time() - since
    tqdm.write(
        f'\nTraining complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    tqdm.write(f'Best val Acc: {best_acc:4f} Best epoch: {best_epoch}')

    # load best model weights
    if save_best:
        model.load_state_dict(best_model_wts)
    torch.save(model, f'{MODEL_PATH}')
    return model, best_epoch

In [14]:
model = Model(input_shape=train_np.shape[1] - 1)
model = model.to(device)
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.001,
    betas=(
        0.9,
        0.999),
    eps=1e-08)
criterion = nn.BCEWithLogitsLoss()
dict(model.named_modules())

{'': Model(
   (layers): Sequential(
     (0): Linear(in_features=23, out_features=32, bias=True)
     (1): ELU(alpha=1.0)
     (2): Linear(in_features=32, out_features=64, bias=True)
     (3): ELU(alpha=1.0)
     (4): Linear(in_features=64, out_features=1, bias=True)
   )
 ),
 'layers': Sequential(
   (0): Linear(in_features=23, out_features=32, bias=True)
   (1): ELU(alpha=1.0)
   (2): Linear(in_features=32, out_features=64, bias=True)
   (3): ELU(alpha=1.0)
   (4): Linear(in_features=64, out_features=1, bias=True)
 ),
 'layers.0': Linear(in_features=23, out_features=32, bias=True),
 'layers.1': ELU(alpha=1.0),
 'layers.2': Linear(in_features=32, out_features=64, bias=True),
 'layers.3': ELU(alpha=1.0),
 'layers.4': Linear(in_features=64, out_features=1, bias=True)}

In [None]:
model, best_epoch = train_model(
    model,
    criterion,
    dataloaders,
    optimizer,
    num_epochs=epoch,
    enable_tqdm=False
)

epoch_0 train Loss: 0.5575756 Acc: 0.7799
epoch_0 val Loss: 0.5573665 Acc: 0.7800
epoch_10 train Loss: 0.5107872 Acc: 0.7876
epoch_10 val Loss: 0.5106018 Acc: 0.7877
epoch_20 train Loss: 0.5099689 Acc: 0.7879
epoch_20 val Loss: 0.5098037 Acc: 0.7880
epoch_30 train Loss: 0.5099548 Acc: 0.7880
epoch_30 val Loss: 0.5097943 Acc: 0.7881
epoch_40 train Loss: 0.5104989 Acc: 0.7875
epoch_40 val Loss: 0.5103312 Acc: 0.7876
epoch_50 train Loss: 0.5099977 Acc: 0.7879
epoch_50 val Loss: 0.5098285 Acc: 0.7880
epoch_60 train Loss: 0.5098612 Acc: 0.7875
epoch_60 val Loss: 0.5096841 Acc: 0.7876
epoch_70 train Loss: 0.5100456 Acc: 0.7877
epoch_70 val Loss: 0.5098632 Acc: 0.7879
epoch_80 train Loss: 0.5096288 Acc: 0.7880
epoch_80 val Loss: 0.5094877 Acc: 0.7881
epoch_90 train Loss: 0.5098630 Acc: 0.7876
epoch_90 val Loss: 0.5096930 Acc: 0.7877
epoch_100 train Loss: 0.5098044 Acc: 0.7877
epoch_100 val Loss: 0.5096220 Acc: 0.7878
epoch_110 train Loss: 0.5097315 Acc: 0.7878
epoch_110 val Loss: 0.5095559 Ac

## Plot result

In [None]:
#printing the loss
plt.plot(val_loss)
plt.title('Loss')
plt.xlabel('Epochs')
plt.ylabel('loss')

In [None]:
#printing the accuracy
plt.plot(val_accuracy)
plt.title('Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')

## Prediction

In [None]:
test_data = test_df_clean.to_numpy()
test_ds = TaskDataset(test_data, return_y=False)
print("test num: ", test_ds.__len__())
test_dl = DataLoader(
    test_ds,
    batch_size=10,
    num_workers=0,
    drop_last=False,
    shuffle=False)

model.eval()
pred = []
for x in tqdm(test_dl):
    x = x.to(device)
    y_pred = model(x)
    output = torch.sigmoid(y_pred)
    output = output.cpu().detach().numpy()
    pred.append(output[:][0])
    # for i in range(len(output)):
    #     pred.append(output[i][0])
result = pd.DataFrame({'id': test_df['id'], 'failure': pred})
result.to_csv('submission.csv', index=False)
result


In [None]:
process_time = time.time() - start_time
print(
    f'\n###############################\n'
    f'Process complete in {process_time // 60:.0f}m {process_time % 60:.0f}s')