# **Anomaly Detection**

# Load and preprocess data

In [None]:
# Download the training and test arrays from Google Drive with the gdown CLI
# (requires network access); they are written to train.npy / test.npy.
!gdown --id '1_zT3JOpvXFGr7mkxs3XJDeGxTn_8pItq' --output train.npy 
!gdown --id '11Y_6JDjlhIY-M5-jW1rLRshDMqeKi9Kr' --output test.npy 

import numpy as np

# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, which can execute code. Only keep it for files from a trusted source.
train = np.load('train.npy', allow_pickle=True)
test = np.load('test.npy', allow_pickle=True)

Downloading...
From: https://drive.google.com/uc?id=1_zT3JOpvXFGr7mkxs3XJDeGxTn_8pItq
To: /content/train.npy
983MB [00:05, 172MB/s]
Downloading...
From: https://drive.google.com/uc?id=11Y_6JDjlhIY-M5-jW1rLRshDMqeKi9Kr
To: /content/test.npy
246MB [00:05, 47.0MB/s]


# Task

In [None]:
# Section selector: later cells are guarded by `if task == ...` checks and only
# do work when their name matches this value.
task = 'knn'

# KNN

In [None]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import f1_score, pairwise_distances, roc_auc_score
from scipy.cluster.vq import vq, kmeans


if task == 'knn':
    # Flatten every image into a single feature vector.
    x = train.reshape(len(train), -1) # 40000 * 32 * 32 * 3 => 40000 * 3072
    y = test.reshape(len(test), -1) # 10000 * 32 * 32 * 3 => 10000 * 3072
    scores = list()
    # The upper bound was originally 10: there are 10 kinds of digits in total,
    # and at most 9 of them appear in the training data.
    for n in range(1, 9):
        # Run k-means first, trying the effect of different cluster counts n.
        # random_state is pinned so the clustering (and therefore the anomaly
        # scores) is the same on every Restart & Run All.
        kmeans_x = MiniBatchKMeans(n_clusters=n, batch_size=100, random_state=0).fit(x)
        print(kmeans_x)
        # Then assign each test sample to its nearest cluster centre.
        y_cluster = kmeans_x.predict(y)
        print(y_cluster)
        # Anomaly score: squared Euclidean distance to the assigned centre.
        y_dist = np.sum(np.square(kmeans_x.cluster_centers_[y_cluster] - y), axis=1)
        # Overwritten on every iteration, so after the loop it holds the
        # scores for the last n only.
        y_pred = y_dist
    # Evaluation template (disabled): it needs ground-truth labels `y_label`,
    # which this notebook does not define.
    #   score = f1_score(y_label, y_pred, average='micro')
    #   score = roc_auc_score(y_label, y_pred, average='micro')
    #   scores.append(score)
    # print(np.max(scores), np.argmax(scores))
    # print(scores)
    # print('auc score: {}'.format(np.max(scores)))


MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                init_size=None, max_iter=100, max_no_improvement=10,
                n_clusters=1, n_init=3, random_state=None,
                reassignment_ratio=0.01, tol=0.0, verbose=0)
[0 0 0 ... 0 0 0]
MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                init_size=None, max_iter=100, max_no_improvement=10,
                n_clusters=2, n_init=3, random_state=None,
                reassignment_ratio=0.01, tol=0.0, verbose=0)
[0 1 1 ... 0 0 0]
MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                init_size=None, max_iter=100, max_no_improvement=10,
                n_clusters=3, n_init=3, random_state=None,
                reassignment_ratio=0.01, tol=0.0, verbose=0)
[2 1 0 ... 2 2 0]
MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                init_size=None, max_iter=100, max_no_improvement=10,
                n_clusters

# PCA

In [None]:
from sklearn.decomposition import PCA

if task == 'pca':
    # Flatten every image into a single feature vector.
    x = train.reshape(len(train), -1)
    y = test.reshape(len(test), -1)
    # random_state is pinned because, for large inputs, scikit-learn's default
    # solver choice may be a randomized SVD; without a seed the components
    # (and the anomaly scores) could differ between runs.
    pca = PCA(n_components=2, random_state=0).fit(x)

    # Project the test data onto the 2 principal components and map it back.
    y_projected = pca.transform(y)
    y_reconstructed = pca.inverse_transform(y_projected)
    # Anomaly score: Euclidean norm of the reconstruction error per sample.
    # Both arrays are already 2-D here, so no extra reshape is needed.
    dist = np.sqrt(np.sum(np.square(y_reconstructed - y), axis=1))

    y_pred = dist
    # Evaluation template (disabled): it needs ground-truth labels `y_label`,
    # which this notebook does not define.
    # score = roc_auc_score(y_label, y_pred, average='micro')
    # score = f1_score(y_label, y_pred, average='micro')
    # print('auc score: {}'.format(score))

# Autoencoder

# Models & loss

In [None]:
import torch
from torch import nn
import torch.nn.functional as F


class fcn_autoencoder(nn.Module):
    """Fully connected autoencoder for flattened 32x32x3 inputs.

    The encoder narrows 3072 -> 128 -> 64 -> 12 -> 3 features and the decoder
    mirrors it back to 3072, finishing with Tanh so outputs lie in [-1, 1].
    """

    def __init__(self):
        super().__init__()
        flat_dim = 32 * 32 * 3

        # ReLU(True) runs in place on the preceding layer's output.
        self.encoder = nn.Sequential(
            nn.Linear(flat_dim, 128),
            nn.ReLU(True),
            nn.Linear(128, 64),
            nn.ReLU(True),
            nn.Linear(64, 12),
            nn.ReLU(True),
            nn.Linear(12, 3),
        )
        self.decoder = nn.Sequential(
            nn.Linear(3, 12),
            nn.ReLU(True),
            nn.Linear(12, 64),
            nn.ReLU(True),
            nn.Linear(64, 128),
            nn.ReLU(True),
            nn.Linear(128, flat_dim),
            nn.Tanh(),
        )

    def forward(self, x):
        """Encode `x` to the 3-feature code and return its reconstruction."""
        return self.decoder(self.encoder(x))


class conv_autoencoder(nn.Module):
    """Convolutional autoencoder for [batch, 3, 32, 32] inputs.

    Three stride-2 convolutions halve the spatial size each time
    (32 -> 16 -> 8 -> 4); three transposed convolutions undo that, and a final
    Tanh keeps the output in [-1, 1].
    """

    def __init__(self):
        super().__init__()

        # Kernel 4, stride 2 (move two pixels at a time), padding 1 (one
        # zero on each border).
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 12, 4, stride=2, padding=1),    # [batch, 12, 16, 16]
            nn.ReLU(),
            nn.Conv2d(12, 24, 4, stride=2, padding=1),   # [batch, 24, 8, 8]
            nn.ReLU(),
            nn.Conv2d(24, 48, 4, stride=2, padding=1),   # [batch, 48, 4, 4]
            nn.ReLU(),
            # Disabled fourth stage:
            # nn.Conv2d(48, 96, 4, stride=2, padding=1), # [batch, 96, 2, 2]
            # nn.ReLU(),
        )

        # output_size = strides * (input_size-1) + kernel_size - 2*padding
        self.decoder = nn.Sequential(
            # Disabled counterpart of the fourth encoder stage:
            # nn.ConvTranspose2d(96, 48, 4, stride=2, padding=1),  # [batch, 48, 4, 4]
            # nn.ReLU(),
            nn.ConvTranspose2d(48, 24, 4, stride=2, padding=1),    # [batch, 24, 8, 8]
            nn.ReLU(),
            nn.ConvTranspose2d(24, 12, 4, stride=2, padding=1),    # [batch, 12, 16, 16]
            nn.ReLU(),
            nn.ConvTranspose2d(12, 3, 4, stride=2, padding=1),     # [batch, 3, 32, 32]
            nn.Tanh(),
        )

    def forward(self, x):
        """Encode `x` to the feature map and return its reconstruction."""
        return self.decoder(self.encoder(x))

# vae: https://blog.csdn.net/a312863063/article/details/87953517
class VAE(nn.Module):
    """Fully connected variational autoencoder for flattened 32x32x3 inputs.

    The encoder maps 3072 -> 400 and then emits two 20-dimensional vectors
    (latent mean and latent log-variance). The decoder maps a 20-dimensional
    latent sample back through 400 units to 3072, ending in a sigmoid so the
    outputs lie in [0, 1].
    """

    def __init__(self):
        super(VAE, self).__init__()

        self.fc1 = nn.Linear(32*32*3, 400)
        self.fc21 = nn.Linear(400, 20)   # latent mean head
        self.fc22 = nn.Linear(400, 20)   # latent log-variance head
        self.fc3 = nn.Linear(20, 400)
        self.fc4 = nn.Linear(400, 32*32*3)

    def encode(self, x):
        """Return `(mu, logvar)` for the batch `x`."""
        h1 = F.relu(self.fc1(x))
        return self.fc21(h1), self.fc22(h1)

    def reparametrize(self, mu, logvar):
        """Draw `z = mu + eps * std` with `eps ~ N(0, I)`.

        The noise is created with `torch.randn_like`, so it always has the
        same device and dtype as `logvar`. This avoids a device mismatch when
        CUDA is installed but the model lives on the CPU, and removes the
        dependency on the legacy `Variable` wrapper.
        """
        std = torch.exp(0.5 * logvar)   # std = exp(logvar / 2)
        eps = torch.randn_like(std)
        return eps * std + mu

    def decode(self, z):
        """Map latent samples `z` back to the 3072-dimensional input space."""
        h3 = F.relu(self.fc3(z))
        # torch.sigmoid replaces the deprecated F.sigmoid alias.
        return torch.sigmoid(self.fc4(h3))

    def forward(self, x):
        """Return `(reconstruction, mu, logvar)` for the batch `x`."""
        mu, logvar = self.encode(x)
        z = self.reparametrize(mu, logvar)
        return self.decode(z), mu, logvar


def loss_vae(recon_x, x, mu, logvar, criterion):
    """Return the VAE loss: reconstruction term plus KL divergence.

    recon_x: generated (reconstructed) images
    x: original images
    mu: latent mean
    logvar: latent log variance
    criterion: callable giving the reconstruction loss of (recon_x, x)
    """
    reconstruction = criterion(recon_x, x)

    # Element-wise 1 + log(sigma^2) - mu^2 - sigma^2, which is what the KL
    # divergence reduces to once the two distributions are substituted in.
    kld_terms = -(mu.pow(2) + logvar.exp()) + 1 + logvar

    # KL divergence = -0.5 * sum of the terms above.
    kl_divergence = torch.sum(kld_terms) * -0.5

    return reconstruction + kl_divergence


In [None]:
# Switch the section selector to the autoencoder: the cells guarded by
# `if task == 'ae'` run after this assignment.
task = 'ae'

# Training

In [None]:
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.optim import Adam, AdamW
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)


if task == 'ae':
    num_epochs = 1000
    batch_size = 128
    learning_rate = 1e-3

    # One of {'fcn', 'cnn', 'vae'}.
    model_type = 'fcn'

    # Use the GPU when there is one, otherwise fall back to the CPU instead of
    # crashing on a hard-coded .cuda().
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    x = train
    if model_type == 'fcn' or model_type == 'vae':
        # The fully connected models take flattened images.
        x = x.reshape(len(x), -1)

    data = torch.tensor(x, dtype=torch.float)
    train_dataset = TensorDataset(data)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

    # Map names to classes (not instances) so only the selected model is built.
    model_classes = {'fcn': fcn_autoencoder, 'cnn': conv_autoencoder, 'vae': VAE}
    model = model_classes[model_type]().to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=learning_rate)

    best_loss = np.inf
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        seen = 0
        # `batch` (not `data`) so the dataset tensor above is not shadowed.
        for batch in train_dataloader:
            if model_type == 'cnn':
                # Move the channel axis to position 1 for Conv2d. transpose(3, 1)
                # also swaps the two spatial axes; the evaluation code applies
                # the same transpose to map outputs back.
                img = batch[0].transpose(3, 1).to(device)
            else:
                img = batch[0].to(device)
            # ===================forward=====================
            output = model(img)
            if model_type == 'vae':
                loss = loss_vae(output[0], img, output[1], output[2], criterion)
            else:
                loss = criterion(output, img)
            # ===================backward====================
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Accumulate a sample-weighted sum so the epoch mean is exact even
            # when the last batch is smaller.
            running_loss += loss.item() * len(img)
            seen += len(img)
        epoch_loss = running_loss / max(seen, 1)
        # ===================save====================
        # Checkpoint on the epoch-mean loss rather than on a single mini-batch:
        # one lucky batch no longer decides which weights count as "best", and
        # the model is written at most once per epoch.
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            torch.save(model, 'best_model_{}.pt'.format(model_type))
        # ===================log========================
        print('epoch [{}/{}], loss:{:.4f}'
              .format(epoch + 1, num_epochs, epoch_loss))
        




  "type " + obj.__name__ + ". It won't be checked "


epoch [1/1000], loss:0.1247
epoch [2/1000], loss:0.1289
epoch [3/1000], loss:0.1392
epoch [4/1000], loss:0.1319
epoch [5/1000], loss:0.1262
epoch [6/1000], loss:0.1251
epoch [7/1000], loss:0.1177
epoch [8/1000], loss:0.1162
epoch [9/1000], loss:0.1366
epoch [10/1000], loss:0.1329
epoch [11/1000], loss:0.1093
epoch [12/1000], loss:0.1468
epoch [13/1000], loss:0.1304
epoch [14/1000], loss:0.1120
epoch [15/1000], loss:0.1161
epoch [16/1000], loss:0.1224
epoch [17/1000], loss:0.1295
epoch [18/1000], loss:0.1213
epoch [19/1000], loss:0.1258
epoch [20/1000], loss:0.1411


# Evaluation

In [None]:
if task == 'ae':
    if model_type == 'fcn' or model_type == 'vae':
        # Flatten the test images for the fully connected models.
        # (The original used len(test_tmp), a name that is never defined.)
        y = test.reshape(len(test), -1)
    else:
        y = test

    # Use the GPU when there is one, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    data = torch.tensor(y, dtype=torch.float)
    test_dataset = TensorDataset(data)
    test_sampler = SequentialSampler(test_dataset)  # keep test order for the ids
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size)

    # NOTE(review): this unpickles a whole model object, so only load files you
    # created yourself. Recent PyTorch releases may require weights_only=False
    # for such checkpoints - confirm against the installed version.
    model = torch.load('best_model_{}.pt'.format(model_type), map_location=device)

    model.eval()
    reconstructed = list()
    # Inference only: no_grad avoids building an autograd graph for every batch.
    with torch.no_grad():
        for batch in test_dataloader:
            if model_type == 'cnn':
                # numpy and tensors store images with different axis orders.
                img = batch[0].transpose(3, 1).to(device)
            else:
                img = batch[0].to(device)
            output = model(img)
            if model_type == 'cnn':
                output = output.transpose(3, 1)  # back to the numpy layout
            elif model_type == 'vae':
                output = output[0]  # keep the reconstruction, drop mu / logvar
            reconstructed.append(output.cpu().numpy())

    reconstructed = np.concatenate(reconstructed, axis=0)
    # Anomaly score: Euclidean norm of the reconstruction error per sample.
    # The reshape flattens the image-shaped error of the 'cnn' model.
    anomality = np.sqrt(np.sum(np.square(reconstructed - y).reshape(len(y), -1), axis=1))
    y_pred = anomality
    with open('prediction.csv', 'w') as f:
        f.write('id,anomaly\n')
        for i in range(len(y_pred)):
            f.write('{},{}\n'.format(i+1, y_pred[i]))
    # Evaluation template (disabled): it needs ground-truth labels `y_label`,
    # which this notebook does not define.
    # score = roc_auc_score(y_label, y_pred, average='micro')
    # score = f1_score(y_label, y_pred, average='micro')
    # print('auc score: {}'.format(score))
