In [73]:
import torch
import torch.nn as nn

torch.manual_seed(77)

z = torch.randn(5) # z = f(x)
y_hat = torch.sigmoid(z) # y_hat = sigmoid(f(x))
y = torch.tensor([0.,1.,1.,0.,1.]) 

print(f"z = {z}\ny_hat = {y_hat}\ny = {y}\n{'':-^80}")

# Negative Log-likelihoods
loss_NLL_scratch = -(y * y_hat.log() + (1 - y) * (1 - y_hat).log())
print(f"Negative Log-likelihoods\n    {loss_NLL_scratch}")
print(f"Loss Summantion : {loss_NLL_scratch.sum()}")

z = tensor([-0.3568,  0.6007, -0.6968, -0.5242,  0.9087])
y_hat = tensor([0.4117, 0.6458, 0.3325, 0.3719, 0.7127])
y = tensor([0., 1., 1., 0., 1.])
--------------------------------------------------------------------------------
Negative Log-likelihoods
    tensor([0.5306, 0.4372, 1.1010, 0.4650, 0.3387])
Loss Summantion : 2.8724989891052246


In [74]:
import torch
import torch.nn as nn

torch.manual_seed(77)

### Multi class Setting ###

print(f"{'Setting up multiclass case':-^80}") 

z = torch.randn(5,3) # z = f(x)
y_hat = torch.softmax(z, dim=1) # y_hat = softmax(f(x))

y = torch.tensor([0,1,0,2,1]) 

print(f"z = {z}\ny_hat = {y_hat}\ny = {y}\n{'':-^80}")

# Negative Log-likelihoods 
loss_NLL_scratch = -y_hat.log()[torch.arange(5),y.long()] 
print(f"Negative Log-likelihoods\n    {loss_NLL_scratch}")
print(f"Loss Summantion : {loss_NLL_scratch.sum()}")

---------------------------Setting up multiclass case---------------------------
z = tensor([[-0.3568,  0.6007, -0.6968],
        [-0.5242,  0.9087, -1.6423],
        [ 0.4583, -0.1266,  0.2302],
        [ 0.0024, -0.8097,  1.3568],
        [-0.6798, -0.0881, -1.2044]])
y_hat = tensor([[0.2316, 0.6035, 0.1649],
        [0.1812, 0.7595, 0.0592],
        [0.4250, 0.2368, 0.3383],
        [0.1880, 0.0835, 0.7285],
        [0.2942, 0.5317, 0.1741]])
y = tensor([0, 1, 0, 2, 1])
--------------------------------------------------------------------------------
Negative Log-likelihoods
    tensor([1.4626, 0.2751, 0.8558, 0.3168, 0.6317])
Loss Summantion : 3.5419459342956543


# 오토인코더

In [75]:
import pandas as pd
from sklearn.metrics import f1_score
import numpy as np
import torch
import tqdm
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, Dataset
import torch.optim as optim
import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline
plt.rcParams['font.family'] = 'Malgun Gothic'

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


In [76]:
data = pd.read_csv('pulsar_stars.csv')
data.head()

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve,target_class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [77]:
y = data['target_class']
X = data.drop(['target_class'], axis=1)

In [78]:
from sklearn.model_selection import train_test_split
trainx, test_X, trainy, test_y = train_test_split(X, y, test_size=0.3, stratify=y)

mins = trainx.min()
maxs = trainx.max()

# # 데이터 스케일링
train_X = (trainx - mins) / (maxs - mins)
test_X = (test_X - mins) / (maxs - mins)

train_X, val_X, train_y, val_y = train_test_split(trainx, trainy, test_size=0.5, stratify=trainy)

# DataFrame -> numpy -> tensor
train_X = torch.from_numpy(train_X.to_numpy()).float().to(device)
train_y = torch.tensor(train_y.to_numpy(), dtype = torch.int64).to(device)
val_X = torch.from_numpy(val_X.to_numpy()).float().to(device)
val_y = torch.tensor(val_y.to_numpy(), dtype = torch.int64).to(device)
test_X = torch.from_numpy(test_X.to_numpy()).float().to(device)
test_y = torch.tensor(test_y.to_numpy(), dtype = torch.int64).to(device)

In [79]:
# 입력 데이터의 열 수
num_features = 8
threshold = 0.01  # 이상치 판단 기준값

class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        
        # 인코더
        self.encoder = nn.Sequential(
            nn.Linear(num_features, 4),
            nn.ReLU(),
            nn.Linear(4, 2)
        )
        
        # 디코더
        self.decoder = nn.Sequential(
            nn.Linear(2, 4),
            nn.ReLU(),
            nn.Linear(4, num_features),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded
    

In [80]:
num_epochs = 100
batch_size = 64

In [81]:
class Trainer():
    def __init__(self, model, optimizer, train_loader, val_loader, scheduler, device):
        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.scheduler = scheduler
        self.device = device
        # Loss Function
        self.criterion = nn.L1Loss().to(self.device)
        
    def fit(self, ):
        self.model.to(self.device)
        best_score = 0
        for epoch in range(num_epochs):
            self.model.train()
            train_loss = []
            for x in iter(self.train_loader):
                inputs, _ = x
                self.optimizer.zero_grad()

                _x = self.model(inputs)
                loss = self.criterion(inputs, _x)

                loss.backward()
                self.optimizer.step()

                train_loss.append(loss.item())

            score = self.validation(self.model, 0.95)
            print(f'Epoch : [{epoch}] Train loss : [{np.mean(train_loss)}] Val Score : [{score}])')

            if self.scheduler is not None:
                self.scheduler.step(score)

            if best_score < score:
                best_score = score
                torch.save(model.module.state_dict(), './best_model.pth', _use_new_zipfile_serialization=False)
    
    def validation(self, eval_model, thr):
        cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        eval_model.eval()
        pred = []
        true = []
        with torch.no_grad():
            for x, y in iter(self.val_loader):
            
                _x = self.model(x)
                diff = cos(x, _x).cpu().tolist()
                batch_pred = np.where(np.array(diff)<thr, 1,0).tolist()
                pred += batch_pred
                true += y.tolist()

        return f1_score(true, pred, average='macro')

In [82]:
train_dataset = TensorDataset(train_X, train_y)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=6)

val_dataset = TensorDataset(val_X, val_y)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=6)

In [83]:
model = nn.DataParallel(Autoencoder())
model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = 0.01)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=10, threshold_mode='abs', min_lr=1e-8, verbose=True)

trainer = Trainer(model, optimizer, train_loader, val_loader, scheduler, device)
trainer.fit()

Epoch : [0] Train loss : [36.17522296522345] Val Score : [0.07855251544571933])
Epoch : [1] Train loss : [36.07329540045894] Val Score : [0.07855251544571933])
Epoch : [2] Train loss : [36.03239570953408] Val Score : [0.07855251544571933])
Epoch : [3] Train loss : [36.07053356708921] Val Score : [0.07855251544571933])
Epoch : [4] Train loss : [36.06841712340484] Val Score : [0.07855251544571933])
Epoch : [5] Train loss : [36.036548828729906] Val Score : [0.07855251544571933])
Epoch : [6] Train loss : [36.08002402625351] Val Score : [0.07855251544571933])


RuntimeError: DataLoader worker (pid(s) 13212) exited unexpectedly

In [None]:
model = Autoencoder()
model.load_state_dict(torch.load('./best_model.pth'))
model = nn.DataParallel(model)
model.eval()