In [33]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import torch.nn as nn
import copy
import torch
from torch.utils.data import Dataset, DataLoader
import time
from tqdm import tqdm
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
start_time = time.time()

## PATH
change path to yours

In [34]:
MODEL_PATH = './models/model.pt'
TEST_PATH = './test.csv'

In [35]:
class Model(nn.Module):
    def __init__(self, input_shape):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_shape, 32),
            nn.ELU(),
            nn.Linear(32, 64),
            nn.ELU(),
            nn.Linear(64, 1),
        )

    def forward(self, x):
        return self.layers(x)

In [36]:
model = torch.load(f'{MODEL_PATH}')

In [37]:
test_df = pd.read_csv(f'{TEST_PATH}')

In [38]:
def clean(data):
    le = LabelEncoder()

    # Replace str by int using LabelEncoder
    data_le = copy.deepcopy(data)
    cols = ['attribute_0', 'attribute_1', 'product_code']
    for col in cols:
        data_le[col] = le.fit_transform(data[col])

    data_le = data_le.drop(['id', 'product_code'], axis=1)

    # filled missing values
    imputer = SimpleImputer(strategy='mean')
    final_data = pd.DataFrame(imputer.fit_transform(data_le))

    final_data.columns = data_le.columns

    return final_data

In [39]:
test_df_clean = clean(test_df)

test_df_clean.head(5)

Unnamed: 0,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,...,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
0,119.57,0.0,1.0,6.0,4.0,6.0,9.0,6.0,19.305,10.178,...,18.654,10.802,15.909,18.07,13.772,13.659,16.825,13.742,17.71,634.612
1,113.51,0.0,1.0,6.0,4.0,11.0,8.0,0.0,17.883,11.927,...,19.368,12.032,13.998,18.846571,12.473,17.468,16.708,14.776,14.102,537.037
2,112.16,0.0,1.0,6.0,4.0,8.0,12.0,4.0,18.475,10.481,...,17.774,11.743,17.046,18.086,10.907,13.363,15.737,17.065,16.021,658.995
3,112.72,0.0,1.0,6.0,4.0,8.0,11.0,10.0,16.518,10.888,...,18.948,11.79,18.165,16.163,10.933,15.501,15.667,12.62,16.111,594.301
4,208.0,0.0,1.0,6.0,4.0,14.0,16.0,8.0,17.808,12.693,...,19.141,12.37,14.578,17.849,11.941,16.07,16.183,13.324,17.15,801.044


In [40]:
class TaskDataset(Dataset):
    def __init__(self, data, return_y=True):
        self.data = data
        self.return_y = return_y

    def __getitem__(self, index):
        if self.return_y == True:
            x = self.data[index][:-1]
            y = self.data[index][-1]
            return torch.FloatTensor(x), torch.FloatTensor(torch.from_numpy(np.array(y, dtype=np.float32)))
        else:
            x = self.data[index]
            return torch.FloatTensor(x)

    def __len__(self):
        return len(self.data)

## Predict

In [41]:
test_data = test_df_clean.to_numpy()
test_ds = TaskDataset(test_data, return_y=False)
print("test num: ", test_ds.__len__())
test_dl = DataLoader(
    test_ds,
    batch_size=1,
    num_workers=0,
    drop_last=False,
    shuffle=False)

model.eval()
pred = []
for x in tqdm(test_dl):
    x = x.to(device)
    y_pred = model(x)
    output = torch.sigmoid(y_pred)
    output = output.cpu().detach().numpy()
    for i in range(len(output)):
        pred.append(output[i][0])
result = pd.DataFrame({'id': test_df['id'], 'failure': pred})
result.to_csv('submission_p.csv', index=0)
result

test num:  20775


100%|██████████| 20775/20775 [00:06<00:00, 3371.47it/s]


Unnamed: 0,id,failure
0,26570,0.193677
1,26571,0.116830
2,26572,0.179400
3,26573,0.195811
4,26574,0.308730
...,...,...
20770,47340,0.226949
20771,47341,0.130701
20772,47342,0.099486
20773,47343,0.205068
