In [30]:
import numpy
import pickle
import time

from sklearn.preprocessing import OneHotEncoder

import torch
import torch.nn as nn

import pandas as pd

from torch.optim import SGD, Adam
from torch.utils.data import Dataset, DataLoader

import matplotlib
matplotlib.rcParams['figure.figsize'] = (9.0, 7.0)
from matplotlib import pyplot

In [13]:
# Load data from numpy file
# debinarizedData = numpy.load('debinarizedData.npy')
# numpy.shape(debinarizedData)

In [2]:
# Load data from pickle file

# infile = open('../Projet/trainDatasetPerdiem-20191028.pkl', 'rb')
# data, labels = pickle.load(infile)

# infile.close()

# data_numpy = data.to_numpy()
# print(labels)
# data.describe() # give column stats

# To one-hot-encode
# - Spécialité de médecins        | 33
# - Université de graduation      | 14
# - Plage horaire de facturation  | 3
# - Agence de représentation      | 845
# - Établissements                | 241

In [12]:
# Separate labeled and unlabeled data

# unlabeled = data_numpy[data_numpy[:, 1182] == -1][:, :-1]
# numpy.random.shuffle(unlabeled)
# numpy.save('unlabeled.npy', unlabeled)

# labeled = data_numpy[data_numpy[:, 1182] != -1]
# numpy.random.shuffle(labeled)
# numpy.save('labeled.npy', labeled)

In [31]:
# Work on a reduced sample of the unlabeled data while prototyping.
#
# One-off recipe that produced the sample file (kept for reference only):
# unlabeled = debinarizedData = numpy.load('unlabeled.npy')
# smaller_size = 5000
# smaller_unlabeled = unlabeled[:smaller_size]
# numpy.save('smaller_unlabeled.npy', smaller_unlabeled)

# Read the cached sample back from disk and show its dimensions.
smaller_unlabeled = numpy.load('smaller_unlabeled.npy')
print(smaller_unlabeled.shape)

(5000, 1182) (1000, 1182)


In [49]:
# Make dataset
class RAMQDatasetForAE(Dataset):
    """
    In-memory dataset for autoencoder training.

    Every row of ``data`` is converted to a float tensor and stored as an
    ``(input, target)`` pair in which both members are the same tensor,
    because an autoencoder is trained to reconstruct its own input.

    Args:
        data: iterable of rows; each row is either a ``numpy.ndarray`` or
            any sequence of numbers accepted by ``torch.Tensor``.

    Attributes:
        data: list of ``(tensor, tensor)`` pairs, one per input row.
        targets: the object passed as ``data``, kept as-is (not copied).
    """

    def __init__(self, data):
        super().__init__()
        self.data = []
        for elem in data:
            # Going through a Python list lets torch.Tensor build the
            # tensor whatever the numpy dtype of the row is.
            if isinstance(elem, numpy.ndarray):
                elem = elem.tolist()
            elem = torch.Tensor(elem)
            # AE targets are same as inputs -> (x, x)
            self.data.append((elem, elem))
        self.targets = data

    def __getitem__(self, index):
        """Return the ``(input, target)`` pair stored at ``index``."""
        return self.data[index]

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data)

In [58]:
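# Autoencoder used as the feature extractor for clustering.
# References on tied weights (NOT implemented below: the encoder and the
# decoder defined in this cell use independent nn.Linear layers):
# https://discuss.pytorch.org/t/how-to-create-and-train-a-tied-autoencoder/2585/13
# https://gist.github.com/InnovArul/500e0c57e88300651f8005f9bd0d12bc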

class Autoencoder(nn.Module):
    """
    Fully connected autoencoder.

    The encoder maps an input of width ``input_len`` through layers of
    width 800, 500, 200, 50, 25 and 10; the decoder mirrors those widths
    back up to ``input_len``. Every linear layer, including the last one of
    each half, is followed by an in-place ReLU.

    Args:
        input_len: number of features of one input sample.
    """

    def __init__(self, input_len):
        super().__init__()
        self.encode_only = False
        # Layer widths from the input down to the bottleneck; the decoder
        # walks the same sequence in reverse.
        widths = (input_len, 800, 500, 200, 50, 25, 10)
        self.encoder = self._dense_stack(widths)
        self.decoder = self._dense_stack(widths[::-1])

    @staticmethod
    def _dense_stack(widths):
        """Chain Linear + in-place ReLU pairs between consecutive widths."""
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.ReLU(True))
        return nn.Sequential(*layers)

    def set_encode_only(self, value):
        """Select whether ``forward`` stops after the encoder."""
        self.encode_only = value

    def forward(self, x):
        """
        Encode ``x`` and, unless ``encode_only`` is set, decode it again.

        Returns the encoder output when ``encode_only`` is truthy,
        otherwise the decoder output.
        """
        latent = self.encoder(x)
        if self.encode_only:
            return latent
        return self.decoder(latent)

In [67]:
# AE Training
# Pretrain autoencoder to minimise reconstruction loss 
# and keep model for clustering
def train(model, data=None, nb_epoch=5, learning_rate=0.01, momentum=0.9,
          batch_size=100):
    """
    Pretrain an autoencoder by minimising the MSE reconstruction loss.

    The model is moved to CUDA when available (CPU otherwise), switched to
    full encode/decode mode and optimised in place with SGD + momentum.
    One progress line is printed per epoch.

    Args:
        model: module exposing ``set_encode_only(bool)`` whose forward pass
            returns a reconstruction with the same shape as its input.
        data: rows to train on, passed to ``RAMQDatasetForAE``. When
            ``None``, the notebook-level ``smaller_unlabeled`` array is used.
        nb_epoch: number of passes over the data.
        learning_rate: SGD learning rate.
        momentum: SGD momentum.
        batch_size: number of samples per mini-batch.

    Returns:
        list with the mean training loss of each epoch, in order.
    """
    if data is None:
        # Fall back to the sample loaded earlier in the notebook.
        data = smaller_unlabeled

    # Setup cuda if available
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Wrap the data as (x, x) pairs and batch it
    train_set = RAMQDatasetForAE(data)
    train_loader = DataLoader(train_set, batch_size=batch_size)

    # Move the model to the chosen device and make sure it decodes too
    model.to(device)
    model.set_encode_only(False)

    # Reconstruction loss and optimizer
    criterion = nn.MSELoss()
    optimizer = SGD(model.parameters(), lr=learning_rate, momentum=momentum)
    # Alternative: Adam(model.parameters(), weight_decay=1e-5)

    model.train()
    losses = []
    for i_epoch in range(nb_epoch):

        start_time, train_losses = time.time(), []
        for x, y in train_loader:
            # Batches must live on the same device as the model,
            # otherwise the forward pass fails when CUDA is used.
            x, y = x.to(device), y.to(device)

            # Reset gradients to zero
            optimizer.zero_grad()

            # Predictions and loss
            x_pred = model(x)
            loss = criterion(x_pred, y)

            # Backpropagate and gradient descent
            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())

        mean_loss = numpy.mean(train_losses)
        losses.append(mean_loss)
        print(' [-] epoch {:4}/{:}, train loss {:.6f} in {:.2f}s'.format(
            i_epoch+1, nb_epoch, mean_loss, time.time()-start_time))

    return losses

In [68]:
# Input width = number of feature columns of the loaded sample. It has to
# be computed here: the `input_d` inside train() is local to that function,
# so this cell would otherwise fail with NameError on a fresh kernel.
input_d = numpy.shape(smaller_unlabeled)[1]
model = Autoencoder(input_d)
train(model)

 [-] epoch    1/5, train loss 0.030879 in 1.93s
 [-] epoch    2/5, train loss 0.030771 in 1.94s
 [-] epoch    3/5, train loss 0.030663 in 1.92s
 [-] epoch    4/5, train loss 0.030559 in 1.94s
 [-] epoch    5/5, train loss 0.030458 in 1.93s


In [None]:
# AE clustering optimisation
# Must retrain the autoencoder from pretrained weights
# now optimising weights to minimise clustering loss
# AND reconstruction loss
