# VAE with PyTorch for the Adult dataset

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics # plot_roc_curve.
from sklearn.model_selection import train_test_split # Train/test/validation split of data.
import sklearn.preprocessing as preprocessing
import patsy # Is not needed anylonger since sklearn did what I wanted.
import random 

# Pytorch imports
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader # Not sure what "Dataset" is for atm.

# Select the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using '{device}' device.")

# Report the working directory as a sanity check.
import os
print(f"The working directory is {os.getcwd()}")

# Seed every random number generator used here so that runs are repeatable.
seed = 1234
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

Using 'cuda' device.
The working directory is /home/ajo/gitRepos/master_thesis


In [12]:
# Read the Adult data from CSV; the first CSV column is used as the row index.
adult_data = pd.read_csv("adult_data_no_NA.csv", index_col=0)
print(adult_data.shape)  # Quick check of (rows, columns).

# Column names grouped by how they are pre-processed later on.
categorical_features = [
    "workclass",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native_country",
]
numerical_features = [
    "age",
    "fnlwgt",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]

(45222, 14)


In [83]:
#%%writefile Data.py
# Classes for data.

class Data():
    """Pre-process a tabular data set: one-hot encode, split and standardize.

    On construction the categorical features are one-hot encoded (the encoder
    is fitted on ALL rows of the data), the encoded data is split into
    train/test(/validation) sets and the numerical features are standardized
    with a scaler fitted on the TRAINING rows only.

    The response column of `data` must be named 'y'.

    Parameters
    ----------
    data : dataframe
        Pandas df with loaded data, including the response column 'y'.
    cat_features : list of strings.
        List of categorical features.
    num_features : list of string.
        List of numerical features.
    valid : Boolean
        True if validation data should be made, False if not.

    Methods
    -------
    get_training_data :
        Returns a tuple with training data (X,y).
    get_test_data :
        Returns a tuple with test data (X,y).
    get_validation_data :
        Returns a tuple with validation data (X,y) (if applicable).
    train_test_valid_split :
        Returns a tuple with (X_train, y_train, X_test, y_test) or
        (X_train, y_train, X_test, y_test, X_valid, y_valid).
    scale :
        Scale the numerical features with the scaler fitted on X_train.
    descale :
        Undo the scaling of the numerical features.
    fit_scaler :
        Fit sklearn StandardScaler to the numerical features of X_train.
    encode :
        One-hot encode the categorical features of the full data set.
    decode :
        Turn one-hot encoded columns back into the categorical features.
    fit_encoder :
        Fit sklearn OneHotEncoder to the categorical features of the full data set.

    """
    def __init__(self, data, cat_features, num_features, valid = False):
        self._data = data
        self.categorical_features = cat_features
        self.numerical_features = num_features
        self.valid = valid

        # Assume output always is called 'y'.
        self._X = data.loc[:, data.columns != "y"]
        self._y = data.loc[:, "y"]

        # Encode the categorical features (encoder is fitted on all rows).
        self.encoder = self.fit_encoder()
        self.X_encoded = self.encode()

        # Split into train/test/valid.
        if self.valid:
            (self.X_train, self.y_train, self.X_test, self.y_test,
             self.X_valid, self.y_valid) = self.train_test_valid_split(self.X_encoded, self._y)
        else:
            (self.X_train, self.y_train,
             self.X_test, self.y_test) = self.train_test_valid_split(self.X_encoded, self._y)

        # Scale the numerical features; the scaler only ever sees the training rows.
        self.scaler = self.fit_scaler()
        self.X_train = self.scale(self.X_train)
        self.X_test = self.scale(self.X_test)
        if self.valid:
            self.X_valid = self.scale(self.X_valid)

    def get_training_data(self):
        """Returns training data (X_train, y_train)."""
        return self.X_train, self.y_train

    def get_test_data(self):
        """Returns test data (X_test, y_test)."""
        return self.X_test, self.y_test

    def get_validation_data(self):
        """Returns validation data (X_valid, y_valid) if applicable.

        Raises
        ------
        ValueError
            If the object was instantiated with valid = False.
        """
        if self.valid:
            return self.X_valid, self.y_valid
        else:
            raise ValueError("You did not instantiate this object to contain validation data.")

    def train_test_valid_split(self, X, y):
        """Split data into training/testing/validation, where validation is optional at instantiation.

        One third of the rows is held out from training. With validation
        enabled, one third of that held-out part becomes the validation set
        and the remainder stays as the test set. Both splits use a fixed
        random_state, so they are reproducible.
        """
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=42)
        if self.valid:
            X_test, X_valid, y_test, y_valid = train_test_split(
                X_test, y_test, test_size=1/3, random_state=42)
            return (X_train, y_train, X_test, y_test, X_valid, y_valid)
        return (X_train, y_train, X_test, y_test)

    def scale(self, df):
        """Return a copy of df with the numerical features scaled by the scaler fitted on the TRAINING data."""
        output = df.copy()
        output[self.numerical_features] = self.scaler.transform(output[self.numerical_features])
        return output

    def descale(self, df):
        """Return a copy of df with the scaling of the numerical features undone."""
        output = df.copy()
        output[self.numerical_features] = self.scaler.inverse_transform(output[self.numerical_features])
        return output

    def fit_scaler(self):
        """Fit a StandardScaler to the numerical features of the TRAINING data."""
        return preprocessing.StandardScaler().fit(self.X_train[self.numerical_features])

    def _encoded_feature_names(self):
        """Return the one-hot column names as a list, on both old and new scikit-learn."""
        # `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2
        # in favour of `get_feature_names_out`; prefer the new name when it exists.
        if hasattr(self.encoder, "get_feature_names_out"):
            names = self.encoder.get_feature_names_out(self.categorical_features)
        else:
            names = self.encoder.get_feature_names(self.categorical_features)
        return list(names)

    def encode(self):
        """Encode the categorical data. Only supports OneHotEncoding.

        Returns a copy of the full X-data in which the categorical columns are
        replaced by their one-hot columns (appended after the remaining columns).
        """
        output = self._X.copy()
        encoded_features = self._encoded_feature_names()

        # Add the new columns to the new dataset (all the levels of the categorical features).
        output[encoded_features] = self.encoder.transform(output[self.categorical_features])

        # Remove the old columns (before one-hot encoding).
        output = output.drop(self.categorical_features, axis = 1)
        return output

    def decode(self, df):
        """Decode the categorical data. Only support OneHotEncoding.

        Returns a copy of df in which the one-hot columns are replaced by the
        original categorical columns (appended after the remaining columns).
        """
        output = df.copy()
        encoded_features = self._encoded_feature_names()

        if len(encoded_features) == 0:
            return output # Nothing to decode when there are no encoded columns.

        output[self.categorical_features] = self.encoder.inverse_transform(output[encoded_features])
        output = output.drop(encoded_features, axis=1)
        return output

    def fit_encoder(self):
        """Fit a dense OneHotEncoder to the categorical features of the full data set."""
        options = {"handle_unknown": "error", "drop": None}
        # The `sparse` argument was renamed to `sparse_output` in scikit-learn 1.2
        # and the old name removed in 1.4; fall back to it on older versions.
        try:
            encoder = preprocessing.OneHotEncoder(sparse_output = False, **options)
        except TypeError:
            encoder = preprocessing.OneHotEncoder(sparse = False, **options)
        return encoder.fit(self._X[self.categorical_features])
    

Writing Data.py


In [81]:
# Try the Data class on the Adult data, with a validation split.
Adult = Data(adult_data, categorical_features, numerical_features, valid=True)
X_train, y_train = Adult.get_training_data()
X_test, y_test = Adult.get_test_data()
X_valid, y_valid = Adult.get_validation_data()

# Undo each transformation on its own: first the scaling, then the one-hot encoding.
X_train_descaled = Adult.descale(X_train)
X_train_decoded = Adult.decode(X_train)

# Undo both; the summaries below should look like the original (training) data.
X_train_de_everything = Adult.decode(X_train_descaled)
categorical_summary = X_train_de_everything[categorical_features].describe()
numerical_summary = X_train_de_everything[numerical_features].describe()
print(categorical_summary)
print(numerical_summary)
print(X_train_de_everything.shape)
# Looks like it all works as I intended!

       workclass       marital_status       occupation relationship    race  \
count      30148                30148            30148        30148   30148   
unique         7                    7               14            6       5   
top      Private   Married-civ-spouse   Prof-specialty      Husband   White   
freq       22151                14080             4050        12504   25931   

          sex  native_country  
count   30148           30148  
unique      2              40  
top      Male   United-States  
freq    20386           27533  
               age        fnlwgt  education_num  capital_gain  capital_loss  \
count  30148.00000  3.014800e+04   30148.000000  30148.000000  30148.000000   
mean      38.54249  1.896485e+05      10.124453   1091.022788     87.993200   
std       13.24241  1.059980e+05       2.565913   7519.182124    403.737188   
min       17.00000  1.349200e+04       1.000000      0.000000      0.000000   
25%       28.00000  1.172680e+05       9.000000  

## The PyTorch Dataset class below wraps data prepared by the Data class

In [85]:
class CustomDataset(Dataset):
    """Map-style PyTorch dataset over a feature table X and a response y.

    Both inputs are stored as float32 NumPy arrays; y is stored as a column
    of shape (n_samples, 1). Indexing yields the pair (X[index], y[index]),
    passed through `transform` when one was given.
    """
    def __init__(self, X, y, transform = None):
        features = X.values
        targets = y.values

        self.X = features.astype(np.float32)
        # Keep the response as a column so each sample's label is a length-1 array.
        self.y = np.reshape(targets, (len(targets), 1)).astype(np.float32)

        self.transform = transform
        self.n_samples = self.X.shape[0]

    def __getitem__(self, index):
        pair = (self.X[index], self.y[index])
        return self.transform(pair) if self.transform else pair

    def __len__(self):
        return self.n_samples
    
class ToTensor:
    """Callable transform turning an (inputs, labels) pair of NumPy arrays into a pair of PyTorch tensors."""
    def __call__(self, sample):
        features, targets = sample
        return tuple(map(torch.from_numpy, (features, targets)))


# Wrap the pre-processed Adult training data in the PyTorch dataset type.
train_data = CustomDataset(
    X_train,
    y_train,
    transform=ToTensor(),
)

# Inspect the first sample: its values and the types of its two parts.
nex = train_data[0]
print(nex)
print(type(nex[0]), type(nex[1]))

(tensor([-1.4003, -0.2451, -0.0485, -0.1451, -0.2180, -1.9924,  0.0000,  0.0000,
         1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  1.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000]), tensor([0.]))
<class 'torch.Tensor'> <class 'torch.Tensor'>
