In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

import torch
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader

%matplotlib inline

In [None]:
# Set DEBUG to True to train on a single year of data for quick iteration.
DEBUG = False

# One seed for every RNG the notebook relies on: NumPy drives the oversampling
# and shuffling inside the dataset class, PyTorch drives weight initialisation
# and DataLoader shuffling.
SEED = 17
np.random.seed(SEED)
torch.manual_seed(SEED)

TRAIN_YEARS = [2015] if DEBUG else [2015, 2016, 2017]
TRAIN_DATASETS = ['model_{}_ST4000DM000.csv'.format(year) for year in TRAIN_YEARS]
TEST_DATASET = 'model_2018_ST4000DM000.csv'

# SMART columns fed to the models, followed by the label column.
attributes = ['smart_{}_raw'.format(idx) for idx in [188, 197, 240]] + \
             ['smart_{}_normalized'.format(idx) for idx in [1, 187]]
attributes_and_target = attributes + ['failure']


In [None]:
def get_delta_days(ss):
    """Return, for each date in ``ss``, the number of days since the earliest one.

    Args:
        ss: pandas Series of date strings formatted as ``YYYY-MM-DD``.

    Returns:
        Series with the same index and name as ``ss`` holding whole-day offsets
        from the minimum date in ``ss`` (the earliest entry maps to 0).
    """
    # Parse the whole column once instead of calling strptime twice per row.
    dates = pd.to_datetime(ss, format="%Y-%m-%d")
    return (dates - dates.min()).dt.days

def prepare_data(X):
    """Split a per-day frame into one feature matrix and one label per drive.

    Rows are grouped by ``serial_number``. For every group the columns listed
    in the module-level ``attributes`` become one sample, and the ``failure``
    value of the group's first row becomes its label.

    NOTE(review): the label comes from the first row only, which presumably
    relies on ``failure`` being constant within a drive - confirm against the
    input files.

    Returns:
        Tuple ``(samples, labels)`` of NumPy arrays, ordered by group key.
    """
    samples, labels = [], []
    for _, drive_rows in X.groupby('serial_number'):
        samples.append(drive_rows[attributes].values)
        labels.append(drive_rows.iloc[0].failure)
    return np.array(samples), np.array(labels)

def convert_df(df, history_len=21):
    """Turn a per-day SMART log into fixed-length per-drive samples.

    Args:
        df: DataFrame of daily records, or a path to a CSV file holding one.
            It must contain ``serial_number``, ``date`` and every column listed
            in ``attributes_and_target``.
        history_len: number of most recent daily records kept per drive; drives
            with fewer records are dropped.

    Returns:
        The ``(X, y)`` pair produced by ``prepare_data``.
    """
    if isinstance(df, str):
        df = pd.read_csv(df)
    # Drop drives whose history is shorter than the required window.
    data = df.groupby('serial_number').filter(lambda x: len(x) >= history_len)
    # Sort chronologically so that tail() keeps the latest records of each drive.
    data = data.sort_values(by=['date'])
    data = data.groupby('serial_number').tail(history_len)

    # Explicit copy: the column assignment below must not write into a slice of `data`.
    X = data[attributes_and_target + ['date', 'serial_number']].copy()
    # group_keys=False keeps the original row index on the result so that it
    # aligns with X on assignment (newer pandas otherwise prepends the group key).
    X['time_delta'] = X.groupby('serial_number', group_keys=False)['date'].apply(get_delta_days)
    return prepare_data(X)

def convert_dfs(dfs):
    """Convert several logs with ``convert_df`` and stack the results.

    Args:
        dfs: non-empty sequence of DataFrames and/or CSV paths; each item is
            handed to ``convert_df`` unchanged.

    Returns:
        ``(X, y)`` - the per-dataset outputs of ``convert_df`` concatenated
        along the first axis, in input order.

    Raises:
        ValueError: if ``dfs`` is empty.
    """
    if len(dfs) == 0:
        raise ValueError('convert_dfs needs at least one dataset')
    # convert_df reads CSV paths itself, so each file is loaded and reduced in
    # turn instead of holding every raw frame in memory at the same time.
    converted = [convert_df(item) for item in dfs]
    X = np.concatenate([samples for samples, _ in converted])
    y = np.concatenate([labels for _, labels in converted])
    return X, y

In [None]:
# Training data: every file listed in TRAIN_DATASETS, converted and stacked.
X_train, y_train = convert_dfs(TRAIN_DATASETS)
# Held-out data: the single file named by TEST_DATASET.
X_test, y_test = convert_df(TEST_DATASET)

In [None]:
def not_nan_smart_attributes(data):
    """List the ``smart_*`` columns whose value in the first row is not NaN.

    Only row 0 of ``data`` is inspected; names are returned in column order.
    """
    first_row = data.iloc[0]
    return [
        name for name in data.columns
        if name.startswith('smart_') and not np.isnan(first_row[name])
    ]

# not_nan_smart_attributes(data)

In [None]:
from scipy.stats import bernoulli

In [None]:
class SMARTAttributesDataset(Dataset):
    """Dataset of SMART feature arrays with optional oversampling of failures.

    Each item is a dict ``{'x': features, 'y': label}``; features are stored
    as float32 and labels as int64.

    Args:
        X: array of samples; the first axis indexes samples.
        y: array of labels aligned with ``X``; the value 1 marks the positive class.
        train: oversampling is only applied when this is True.
        k: target ratio n_positive / n_neutral after oversampling; a falsy
            value (``None`` or 0) disables oversampling.
        noise: if True, every element of each duplicated sample is shifted by
            +1 or -1, chosen with equal probability.
    """

    def __init__(self, X, y, train=False, k=1., noise=False):
        assert len(X) == len(y)
        super().__init__()
        self._k = k  # ratio n_positive / n_neutral
        self._noise = noise
        self.train = train
        # Work on copies so the caller's arrays are never modified.
        self._X, self._y = X.copy(), y.copy()
        if self.train and self._k:
            self._increase_samples_in_positive_class()
        # The np.int alias no longer exists in current NumPy; int64 is explicit,
        # platform independent, and becomes the long tensor that
        # nn.CrossEntropyLoss requires for class-index targets.
        self._X, self._y = self._X.astype(np.float32), self._y.astype(np.int64)

    def _increase_samples_in_positive_class(self):
        """Append random duplicates of positive samples, then shuffle everything.

        The number of duplicates is ``int(k * n_neutral - n_positive)``.

        Raises:
            ValueError: if there is no sample labelled 1 to duplicate.
            AssertionError: if no additional samples are needed for ratio ``k``.
        """
        positive_x = self._X[self._y == 1]
        n_positive = len(positive_x)
        if n_positive == 0:
            raise ValueError('cannot oversample: no samples with label 1')
        n_neutral = len(self._X) - n_positive
        n_addition = int(self._k * n_neutral - n_positive)
        assert n_addition > 0, 'positive class already meets the requested ratio'
        positive_idxs = np.random.choice(n_positive, size=n_addition)
        # Fancy indexing returns a copy, so the noise below never touches self._X.
        additional_samples = positive_x[positive_idxs]
        if self._noise:
            shape = additional_samples.shape
            # bernoulli gives 0/1; "* 2 - 1" maps that to -1/+1.
            additional_samples += bernoulli.rvs(0.5, size=shape) * 2 - 1
        self._y = np.concatenate((self._y, np.ones(n_addition, dtype=self._y.dtype)))
        self._X = np.concatenate((self._X, additional_samples))
        # Shuffle so the duplicates are not all sitting at the end.
        idxs = np.arange(len(self._X))
        np.random.shuffle(idxs)
        self._X, self._y = self._X[idxs], self._y[idxs]

    def __len__(self):
        return len(self._X)

    def __getitem__(self, idx):
        sample = {'x': self._X[idx], 'y': self._y[idx]}
        return sample

In [None]:
batch_size = 128

# Sequence variant: datasets built from the full arrays returned by
# convert_dfs/convert_df, with oversampling disabled (k=None).
# NOTE(review): the following cell rebinds train_dataset, test_dataset and both
# loaders, so when cells run top to bottom these objects are replaced before
# training starts.
train_dataset = SMARTAttributesDataset(X_train, y_train, train=True, k=None, noise=False)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Evaluation data is left in its original order (no shuffle).
test_dataset = SMARTAttributesDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
# Keep only the last entry along axis 1 of every sample, giving one feature
# vector per sample (presumably the most recent day of each drive's window -
# confirm against convert_df's ordering).
X_train2d, X_test2d = X_train[:,-1,:], X_test[:,-1,:]

# Oversample the positive class up to ratio k=1. and perturb the duplicates by +/-1.
train_dataset = SMARTAttributesDataset(X_train2d, y_train, train=True, k=1., noise=True)
test_dataset = SMARTAttributesDataset(X_test2d, y_test)

# These rebind the loader names; the training cell uses whichever were defined last.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
class NNet(nn.Module):
    """LSTM encoder followed by a linear layer that produces two class logits.

    Args:
        input_size: number of features per time step.
        nhidden: size of the LSTM hidden state.
        nlayers: number of stacked LSTM layers.
        dropout: dropout rate between stacked LSTM layers; ignored when
            ``nlayers`` is 1 because there is no inter-layer connection.
    """

    def __init__(self, input_size, nhidden=4, nlayers=1, dropout=0.1):
        super().__init__()
        # nn.LSTM applies dropout after every layer except the last and warns
        # when a non-zero rate is combined with a single layer, so zero it there.
        lstm_dropout = dropout if nlayers > 1 else 0.
        self.rnn = nn.LSTM(input_size=input_size, hidden_size=nhidden, num_layers=nlayers,
                           dropout=lstm_dropout, batch_first=True)
        self.linear = nn.Linear(nhidden, 2)

    def forward(self, inp):
        """Map a ``(batch, seq, input_size)`` tensor to ``(batch, 2)`` logits."""
        _, (h_n, _) = self.rnn(inp)
        # h_n holds the final hidden state of every layer; use the top layer's.
        repr_ = h_n[-1]
        return self.linear(repr_)


class DenseNet(nn.Module):
    """Fully connected Tanh network ending in a linear layer with two logits.

    Args:
        input_size: number of input features.
        hidden_sizes: widths of the hidden layers, in order. Any number of
            layers is accepted; the default builds two hidden layers of 8 units.
    """

    def __init__(self, input_size, hidden_sizes=(8, 8)):
        super().__init__()
        modules = []
        width = input_size
        # Linear -> Tanh for every hidden layer, then the output projection.
        for hidden in hidden_sizes:
            modules += [nn.Linear(width, hidden), nn.Tanh()]
            width = hidden
        modules.append(nn.Linear(width, 2))
        self.layers = nn.Sequential(*modules)

    def forward(self, inp):
        """Return the two-logit output for ``inp`` (last axis = ``input_size``)."""
        return self.layers(inp)

In [None]:
from torch.nn.functional import softmax
from utils import FAR, FDR

In [None]:
## Train
# NNet (the LSTM model) is the alternative; DenseNet is the model trained below.
# model = NNet(input_size=len(attributes))
model = DenseNet(input_size=len(attributes))

# Both classes carry the same weight (1, 1) in the loss.
criterion = nn.CrossEntropyLoss(weight=torch.FloatTensor([1, 1]))
# Adam with its default hyper-parameters.
optimizer = optim.Adam(model.parameters())

def train_epoch(model, dataloader, optimizer, criterion):
    """Run one optimisation pass over ``dataloader``.

    Every batch is a dict with the inputs under ``'x'`` and the targets under
    ``'y'``. The model is switched to training mode first.

    Returns:
        Mean of the per-batch loss values.
    """
    model.train()
    batch_losses = []
    for batch in dataloader:
        optimizer.zero_grad()
        loss = criterion(model(batch['x']), batch['y'])
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.detach().numpy())
    return np.mean(batch_losses)


def evaluate(model, dataloader, criterion):
    """Score ``model`` on ``dataloader`` without updating its parameters.

    Every batch is a dict with the inputs under ``'x'`` and the targets under
    ``'y'``. The model is switched to eval mode and gradients are disabled.

    Returns:
        dict with keys ``'FAR'`` and ``'FDR'`` (the imported ``FAR``/``FDR``
        helpers applied to the labels and the class-1 probabilities) and
        ``'loss'`` (NaN-ignoring mean of the per-batch losses).
    """
    probs = []
    labels = []
    losses = []
    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            x, y = batch['x'], batch['y']
            outputs = model(x)  # size: [B, 2]
            loss = criterion(outputs, y)
            # Normalise over the class axis explicitly; omitting dim is
            # deprecated and emits a warning on every call.
            probs.append(softmax(outputs, dim=1).numpy())
            labels.append(y.numpy())
            losses.append(loss.detach().numpy())
    # Column 1 is the probability assigned to class 1.
    probs = np.concatenate(probs, axis=0)[:, 1]
    labels = np.concatenate(labels, axis=0)
    metrics = {
        'FAR': FAR(labels, probs),
        'FDR': FDR(labels, probs),
        'loss': np.nanmean(losses)
    }
    return metrics


# Per-epoch history. 'train' collects the mean training loss of each epoch;
# 'test' collects the full metrics dict returned by evaluate() (keys 'FAR',
# 'FDR', 'loss'), not a bare loss value, despite the names used here.
losses = {'train': [], 'test': []}
for epoch_idx in range(120):
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    test_loss = evaluate(model, test_loader, criterion)
    losses['train'].append(train_loss)
    losses['test'].append(test_loss)
    # Printed columns: epoch, train loss, test loss, FAR, FDR.
    print(epoch_idx, train_loss, test_loss['loss'], test_loss['FAR'], test_loss['FDR'])