In [957]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import copy
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import time
from tqdm import tqdm

## Hyperarameters

In [958]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
train_val_ratio = 0.8

epoch = 500
batch_size = 8
save_best = True

start_time = time.time()

In [959]:
class Model(nn.Module):
    def __init__(self, input_shape):
        super(Model, self).__init__()
        self.fc1 = nn.Linear(input_shape, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

## PATH

In [960]:
MODEL_PATH = './models'

## Load Data
load data and filled missing value with mean / drop [`id`] [`product_code`]

In [961]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [None]:
def clean(data):
    le = LabelEncoder()

    # Replace str by int using LabelEncoder
    data_le = copy.deepcopy(data)
    cols = ['attribute_0', 'attribute_1', 'product_code']
    for col in cols:
        data_le[col] = le.fit_transform(data[col])

    data_le = data_le.drop(['id', 'product_code'], axis=1)

    # filled missing values
    imputer = SimpleImputer(strategy='mean')
    final_data = pd.DataFrame(imputer.fit_transform(data_le))

    final_data.columns = data_le.columns

    return final_data

In [962]:
train_df = clean(train_df)
test_df_clean = clean(test_df)

train_df.head(5)

Unnamed: 0,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,80.1,1.0,2.0,9.0,5.0,7.0,8.0,4.0,18.04,12.518,...,10.672,15.859,17.594,15.193,15.029,16.048444,13.034,14.684,764.1,0.0
1,84.89,1.0,2.0,9.0,5.0,14.0,3.0,3.0,18.213,11.54,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0.0
2,82.43,1.0,2.0,9.0,5.0,12.0,1.0,5.0,18.057,11.652,...,12.715,15.607,19.172085,13.798,16.711,18.631,14.094,17.946,663.376,0.0
3,101.07,1.0,2.0,9.0,5.0,13.0,2.0,6.0,17.295,11.188,...,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0.0
4,188.06,1.0,2.0,9.0,5.0,9.0,2.0,8.0,19.346,12.95,...,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0.0


## Prepare train data
split train data to train and val
use dataloader to load data

In [963]:
class TaskDataset(Dataset):
    def __init__(self, data, return_y=True):
        self.data = data
        self.return_y = return_y

    def __getitem__(self, index):
        if self.return_y == True:
            x = self.data[index][:-1]
            y = self.data[index][-1]
            return torch.FloatTensor(x), torch.FloatTensor(torch.from_numpy(np.array(y, dtype=np.float32)))
        else:
            x = self.data[index]
            return torch.FloatTensor(x)

    def __len__(self):
        return len(self.data)

In [964]:
train_data = []
val_data = []
data_ds = {}
dataloaders = {}
train_np = train_df.to_numpy()

for row in train_np:
    if np.random.random() < train_val_ratio:
        train_data.append(row)
    else:
        val_data.append(row)

data_ds['train'] = TaskDataset(train_data)
data_ds['val'] = TaskDataset(val_data)

dataloaders = {
    x: torch.utils.data.DataLoader(
        data_ds[x],
        batch_size=batch_size,
        shuffle=True) for x in [
        'train',
        'val']}

In [965]:
dataset_sizes = {x: len(data_ds[x]) for x in ['train', 'val']}
print(dataset_sizes)

{'train': 21262, 'val': 5308}


## Train Model
**Model**: input_shape -> 32 -> 64 -> 1
**optimizer**: Adam( lr=0.001, betas=( 0.9, 0.999), eps=1e-08 )

In [966]:
train_accuracy = []
train_loss = []
val_accuracy = []
val_loss = []

In [967]:
def calculate_acc(y_pred, y_test):
    y_pred = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred == y_test).sum().float()
    acc = correct_results_sum / y_test.shape[0]
    acc = torch.round(acc * 100)

    return acc

In [968]:
def train_model(
        model,
        criterion,
        dataloaders,
        optimizer,
        num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    best_epoch = 0
    min_loss = float('inf')

    for epoch in range(num_epochs):
        # print(f'Epoch {epoch}/{num_epochs - 1}')
        # print('-' * 10)
        epoch_since = time.time()
        loss_history = []
        acc_history = []
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            # Iterate over data.
            t = tqdm(enumerate(dataloaders[phase]), total=len(dataloaders[phase]))
            for i, (x_train, y_train) in t:
                x_train = x_train.to(device)
                y_train = y_train.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    y_pred = model(x_train)
                    y_train = y_train.unsqueeze(-1)
                    loss = criterion(y_pred, y_train)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_train_acc = calculate_acc(y_pred, y_train).item()
                acc_history.append(running_train_acc)
                loss_history.append(loss.item())
                # tqdm settings
                epoch_loss = torch.mean(torch.Tensor(loss_history)).item()
                epoch_acc = torch.mean(torch.Tensor(acc_history)).item()
                #t.set_description(f'epoch_{epoch} {phase} \t')
                t.set_description(f'epoch_{epoch} {phase}  \t**Acc={epoch_acc/100:.4f}**  Loss={epoch_loss:.4f}')



            epoch_loss = torch.mean(torch.Tensor(loss_history)).item()
            epoch_acc = torch.mean(torch.Tensor(acc_history)).item()
            print(f'{phase} Loss: {epoch_loss:.7f} Acc: {epoch_acc:.4f}')


            # loop.set_description(f'Epoch [{epoch}/{num_epoch}]')
            # loop.set_postfix(loss=loss.item(), acc=running_train_acc)
            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                min_loss = epoch_loss
                best_epoch = epoch
                best_model_wts = copy.deepcopy(model.state_dict())

            # record loss and accuracy
            if phase == 'train':
                train_accuracy.append(float(epoch_acc))
                train_loss.append(float(epoch_loss))
            elif phase == 'val':
                val_accuracy.append(float(epoch_acc))
                val_loss.append(float(epoch_loss))


        epoch_time_elapsed = time.time() - epoch_since
        # tqdm.write(
        #     f'Time elapsed {epoch_time_elapsed // 60:.0f}m {epoch_time_elapsed % 60:.0f}s\n')


    time_elapsed = time.time() - since
    tqdm.write(
        f'\nTraining complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    tqdm.write(f'Best val Acc: {best_acc:4f} Best epoch: {best_epoch}')

    # load best model weights
    if save_best:
        model.load_state_dict(best_model_wts)
    torch.save(model, f'{MODEL_PATH}/model.pt')
    return model

In [969]:
model = Model(input_shape=train_np.shape[1] - 1)
model = model.to(device)
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.001,
    betas=(
        0.9,
        0.999),
    eps=1e-08)
criterion = nn.BCEWithLogitsLoss()

epoch_114 train 	**Acc=0.7905**  Loss=0.5069:  49%|████▉     | 1315/2658 [00:07<00:07, 181.85it/s]

In [None]:
model_ft1 = train_model(
    model,
    criterion,
    dataloaders,
    optimizer,
    num_epochs=epoch)

epoch_0 train 	**Acc=0.7765**  Loss=0.5844: 100%|██████████| 2658/2658 [00:15<00:00, 176.19it/s]
epoch_0 val 	**Acc=0.7798**  Loss=0.5693: 100%|██████████| 664/664 [00:02<00:00, 255.02it/s]
epoch_1 train 	**Acc=0.7861**  Loss=0.5214: 100%|██████████| 2658/2658 [00:14<00:00, 181.98it/s]
epoch_1 val 	**Acc=0.7874**  Loss=0.5182: 100%|██████████| 664/664 [00:02<00:00, 274.15it/s]
epoch_2 train 	**Acc=0.7872**  Loss=0.5185: 100%|██████████| 2658/2658 [00:14<00:00, 185.91it/s]
epoch_2 val 	**Acc=0.7883**  Loss=0.5155: 100%|██████████| 664/664 [00:02<00:00, 255.73it/s]
epoch_3 train 	**Acc=0.7870**  Loss=0.5164: 100%|██████████| 2658/2658 [00:14<00:00, 183.73it/s]
epoch_3 val 	**Acc=0.7881**  Loss=0.5139: 100%|██████████| 664/664 [00:02<00:00, 252.75it/s]
epoch_4 train 	**Acc=0.7871**  Loss=0.5162: 100%|██████████| 2658/2658 [00:14<00:00, 178.37it/s]
epoch_4 val 	**Acc=0.7882**  Loss=0.5161: 100%|██████████| 664/664 [00:02<00:00, 249.37it/s]
epoch_5 train 	**Acc=0.7871**  Loss=0.5145: 100%|█

## Plot result

In [None]:
#printing the loss
plt.plot(val_loss)
plt.title('Loss')
plt.xlabel('Epochs')
plt.ylabel('loss')

In [None]:
#printing the accuracy
plt.plot(val_accuracy)
plt.title('Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')

## Prediction

In [None]:
test_data = test_df_clean.to_numpy()
test_ds = TaskDataset(test_data, return_y=False)
print("test num: ", test_ds.__len__())
test_dl = DataLoader(
    test_ds,
    batch_size=1,
    num_workers=0,
    drop_last=False,
    shuffle=False)

model.eval()
pred = []
for x in tqdm(test_dl):
    x = x.to(device)
    y_pred = model(x)
    output = torch.sigmoid(y_pred)
    output = output.cpu().detach().numpy()
    for i in range(len(output)):
        pred.append(output[i][0])
result = pd.DataFrame({'id': test_df['id'], 'failure': pred})
result.to_csv('submission.csv', index=0)
result


In [None]:
process_time = time.time() - start_time
print(
    f'\n###############################\n'
    f'Process complete in {process_time // 60:.0f}m {process_time % 60:.0f}s')