In [1]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Pin the random number generators of both PyTorch and NumPy to one shared
# seed so repeated runs start from the same random state.
SEED = 42
for _seed_rng in (torch.manual_seed, np.random.seed):
    # np.random.seed is called last and returns None, so the cell stays silent.
    _seed_rng(SEED)

In [3]:
# Read the pre-computed data splits from disk.
# NOTE(review): pickle can execute arbitrary code while loading -- only open
# files that were produced by this project.
with open('../data/splits.pkl', 'rb') as f:
    splits = pickle.load(f)

# Only the feature tables are needed here: the autoencoder reconstructs its
# own input, so no labels are read.
x_train = splits['X_train']
x_val = splits['X_val']

# float32 NumPy copies of both splits (presumably pandas DataFrames -- confirm
# against the notebook that writes splits.pkl).
x_train_np, x_val_np = (
    split.values.astype(np.float32) for split in (x_train, x_val)
)

In [4]:
# Wrap each split in a single-tensor dataset and a mini-batch loader.
# Every batch yielded by the loaders is a one-element sequence holding the
# feature tensor.
BATCH_SIZE = 64

# Training data is reshuffled on every pass over the loader.
train_dataset = TensorDataset(torch.tensor(x_train_np))
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Validation data keeps its original order.
val_dataset = TensorDataset(torch.tensor(x_val_np))
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [5]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder with a mirrored encoder and decoder.

    The encoder maps ``input_dim`` features through ``hidden_dims`` down to
    ``latent_dim``; the decoder applies the same widths in reverse order to
    get back to ``input_dim``. A ReLU follows every linear layer except the
    last one of each stack, so neither the latent code nor the
    reconstruction is clipped.

    With the default arguments the layers, their ``state_dict`` keys and the
    order in which they are created are the same as in the original fixed
    ``input_dim -> 64 -> 32 -> 16 -> 32 -> 64 -> input_dim`` network.

    Args:
        input_dim: Number of features per sample.
        hidden_dims: Widths of the encoder's hidden layers, outermost first.
            The decoder uses them reversed. May be empty.
        latent_dim: Width of the bottleneck layer.
    """

    def __init__(self, input_dim, hidden_dims=(64, 32), latent_dim=16):
        super().__init__()

        encoder_sizes = (input_dim, *hidden_dims, latent_dim)

        # The encoder is created before the decoder so that, for a given
        # seed, the layers are initialised in the same order as before.
        self.encoder = self._build_stack(encoder_sizes)
        self.decoder = self._build_stack(encoder_sizes[::-1])

    @staticmethod
    def _build_stack(sizes):
        """Return Linear layers for consecutive ``sizes`` with ReLU in between."""
        layers = []
        for in_features, out_features in zip(sizes[:-1], sizes[1:]):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
        # Drop the trailing ReLU: the last layer of a stack stays linear.
        return nn.Sequential(*layers[:-1])

    def forward(self, x):
        """Encode ``x`` and return its reconstruction (same last dimension)."""
        return self.decoder(self.encoder(x))

In [6]:
# Build the model, the reconstruction loss and the optimizer.
LEARNING_RATE = 1e-3

# The network's input width is the second dimension of the training split.
input_dim = x_train.shape[1]
model = Autoencoder(input_dim)

# Mean squared error between the reconstruction and the original input.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [7]:
# Train the autoencoder to reconstruct its input. After every epoch the
# per-sample average loss on both splits is appended to the history lists.
num_epochs = 10
train_losses, val_losses = [], []

# Number of samples in each split, used to average the summed losses.
n_train = len(train_loader.dataset)
n_val = len(val_loader.dataset)

for epoch in range(num_epochs):
    # ---- training pass: weights are updated after every batch ----
    model.train()
    train_loss = 0

    for batch in train_loader:
        x_batch = batch[0]

        # Forward pass: the input is also the reconstruction target.
        outputs = model(x_batch)
        loss = criterion(outputs, x_batch)

        # Drop stale gradients, backpropagate, then take one optimizer step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight the batch loss by the batch size so that a smaller final
        # batch does not skew the epoch average.
        train_loss += x_batch.shape[0] * loss.item()

    train_loss = train_loss / n_train
    train_losses.append(train_loss)

    # ---- validation pass: evaluation mode, no gradient tracking ----
    model.eval()
    val_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            x_batch = batch[0]
            outputs = model(x_batch)
            loss = criterion(outputs, x_batch)
            val_loss += x_batch.shape[0] * loss.item()

    val_loss = val_loss / n_val
    val_losses.append(val_loss)

    # Optional per-epoch progress log (disabled):
    #print(f'Epoch [{epoch+1}/{num_epochs}] Train Loss: {train_loss:.6f} Val Loss: {val_loss:.6f}')



In [8]:
%matplotlib inline
# Plot the per-epoch training and validation loss and write the figure to disk.
loss_curve_path = '../plots/loss_curve.png'

# savefig does not create missing directories; make sure the target folder
# exists so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(loss_curve_path), exist_ok=True)

fig, ax = plt.subplots()
ax.plot(train_losses, label='Train Loss')
ax.plot(val_losses, label='Val Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('MSE Loss')
ax.set_title('Training & Validation Loss')
ax.legend()
fig.savefig(loss_curve_path)

# Close every open figure: the plot is only saved, not rendered inline.
plt.close('all')


In [10]:
# Persist the trained parameters only (the state dict, not the module object);
# loading them later requires building an Autoencoder with matching layers first.
model_weights_path = '../data/autoencoder_model.pth'
torch.save(model.state_dict(), model_weights_path)