<a href="https://colab.research.google.com/github/cesar-yoab/Survey-GAINs/blob/main/SurveyGAINs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library import and workspace set-up

In [None]:
# In case you forgot to import stuff run this cell
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from scipy.special import expit

In [None]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda') if use_cuda else torch.device('cpu')
device

device(type='cuda')

# Survey Encoder
Implementation of a survey encoder based on a variation of weight of evidence (WOE).
We define the weight of a category $c$ as:
\begin{align*}
W(c) = \log\left(\frac{\text{# of non-}c}{\text{# of }c}\right)
\end{align*}

In [None]:
class SurveyEncoder(object):
    """Base class for encoders that turn categorical survey answers into floats.

    Subclasses implement `_fit`, which must fill two per-column dictionaries:
    `self.map[col]` (category -> float) and `self.reverse_map[col]`
    (float -> category). When `columns` is left at 'all', `_fit` is also
    expected to replace it with the concrete column names.
    """

    def __init__(self, columns='all'):
        self.columns = columns
        self.map = dict()
        self.reverse_map = dict()

    def _fit(self, data):
        """Learn the per-column mappings from `data`; provided by subclasses."""
        raise NotImplementedError
    
    def _transform(self, data):
        """Replace categorical values with the fitted continuous values.

        Works in place on `data`. Values that have no fitted mapping are left
        as they are.
        """
        for col in self.columns:
            # infer_objects() restores a numeric dtype on pandas versions in
            # which replace() no longer downcasts object columns on its own;
            # on versions that still downcast it is a no-op.
            data[col] = data[col].replace(self.map[col]).infer_objects()

    def fit_transform(self, data):
        """Fit a mapping for the data and return an encoded copy.

        `data` itself is not modified; the encoding is applied to a copy.
        """
        self._fit(data)
        encoded = data.copy()
        self._transform(encoded)
        return encoded

    def decode_imputed(self, imputed):
        """Decode encoded values back to categorical responses.

        `imputed` is a DataFrame whose entries are exactly the fitted encoded
        values. The frame is modified in place and also returned.
        """
        for col in self.columns:
            imputed[col] = imputed[col].replace(self.reverse_map[col])
        
        return imputed

    def smart_decode(self, imputed):
        """Decode model output (arbitrary floats) back to categories.

        Every entry is first snapped to the closest encoded value of its
        column (the first one on ties) and then mapped back through
        `reverse_map`. `imputed` is a 2-D array whose columns follow the
        fitted column order; it is left unmodified.

        Returns:
            A new DataFrame of decoded categories with `self.columns` as
            column labels.
        """
        keys = list(self.map.keys())
        values = np.asarray(imputed, dtype=float)
        # Separate float64 buffer: the caller's array stays untouched and the
        # snapped entries are bit-identical to the reverse_map keys, so the
        # exact-match lookup in decode_imputed does not depend on the input
        # dtype (the generator output is float32).
        snapped = np.empty(values.shape, dtype=float)

        for j in range(values.shape[1]):
            col_keys = np.array(list(self.reverse_map[keys[j]].keys()))
            # (rows x candidates) distance table; argmin picks the first
            # closest candidate for each row.
            dist = np.abs(values[:, j][:, None] - col_keys[None, :])
            snapped[:, j] = col_keys[dist.argmin(axis=1)]

        imputed_df = pd.DataFrame(snapped, columns=self.columns)
        return self.decode_imputed(imputed_df)

In [None]:
# This is the encoder we will ultimately be using
class WOEncoder(SurveyEncoder):
    """Encoder that maps each category c to W(c) = log(P(not c) / P(c)).

    The proportions are the relative frequencies of the non-null values of
    the column in the data passed to `_fit`.
    """

    def __init__(self, columns='all'):
        super().__init__(columns)

    def _fit(self, data):
        """Compute the forward and reverse WOE dictionaries for every column.

        With `columns='all'` every column of `data` is encoded. A category
        that makes up the whole column gets log(0) = -inf.
        """
        import warnings

        # Only compare against the sentinel when `columns` really is a string:
        # comparing a pandas Index (e.g. after a previous fit) with == yields
        # an array, and using that in `if` raises ValueError.
        if isinstance(self.columns, str) and self.columns == 'all':
            self.columns = data.columns

        for col in self.columns:
            prop_events = data[col].value_counts(normalize=True)

            col_map = dict()
            col_remap = dict()
            clashes = []
            for category, p_event in prop_events.items():
                woe = np.log((1 - p_event) / p_event)
                if woe in col_remap:
                    clashes.append((col_remap[woe], category))
                col_map[category] = woe
                col_remap[woe] = category

            # Categories with identical frequency share one WOE value, so the
            # reverse map can only return the last of them. Make that visible
            # instead of losing a category silently.
            if clashes:
                warnings.warn(
                    "Column {!r}: categories with equal frequency share a WOE "
                    "value and cannot be told apart when decoding: {}".format(
                        col, clashes))

            self.map[col] = col_map
            self.reverse_map[col] = col_remap

# GAIN
Implementation of our GAIN Networks

In [None]:
class GAIN(nn.Module):
    """Fully connected network used for both the generator and the discriminator.

    `weights` lists the layer widths: `weights[0]` is the input width and
    `weights[-1]` the output width. Consecutive Linear layers are separated
    by GELU activations; the final Linear layer has no activation after it.
    """

    def __init__(self, weights):
        super().__init__()

        # The first affine map has no activation in front of it ...
        stack = [nn.Linear(weights[0], weights[1])]
        # ... every following one is preceded by a GELU.
        for fan_in, fan_out in zip(weights[1:-1], weights[2:]):
            stack.append(nn.GELU())
            stack.append(nn.Linear(fan_in, fan_out))

        self.model = nn.Sequential(*stack)

    def forward(self, X, M):
        """Concatenate `X` and `M` along dim 1 and feed them through the stack."""
        return self.model(torch.cat([X, M], dim=1))

In [None]:
class GAINHyper(object):
    """Plain container for the training hyper-parameters of one experiment.

    Attributes mirror the constructor arguments: `batch_size`, `lr`
    (learning rate handed to the optimizers), `alpha`, `epochs` and `p_hint`
    (the last three are forwarded to `train_GAIN` by the experiment cells).
    """

    def __init__(self, batch_size=128, lr=1e-4, alpha=1000,
                 epochs=50, p_hint=.9):
        # Optimisation settings
        self.lr, self.batch_size, self.epochs = lr, batch_size, epochs
        # GAIN-specific settings
        self.alpha, self.p_hint = alpha, p_hint

In [None]:
# Helper functions
def mask_data(data, mask, tau=0):
    """Keep `data` where `mask` is 1 and put `tau` where `mask` is 0."""
    kept = mask * data
    filled = (1 - mask) * tau
    return kept + filled

def sample_Z(m, n):
    """Draw an (m, n) noise matrix, uniform on [0, 0.01), from numpy's global RNG."""
    shape = [m, n]
    return np.random.uniform(low=0., high=0.01, size=shape)

def sample_M(m, n, p):
    """Draw an (m, n) float matrix of 0/1 entries; each entry is 1 when a
    uniform [0, 1) draw exceeds `p`."""
    draws = np.random.uniform(low=0., high=1., size=[m, n])
    return 1. * (draws > p)

# Loss functions
def discriminator_loss(M, New_X, H, nets):
    """Cross-entropy of the discriminator output against the mask `M`.

    Args:
        M: 0/1 mask, 1 where the entry of `New_X` is observed.
        New_X: data matrix passed to the generator (in `train_GAIN` this is
            the data with noise in the unobserved positions).
        H: hint matrix handed to the discriminator.
        nets: sequence whose first two items are (Generator, Discriminator).

    Returns:
        Scalar tensor with the loss.

    NOTE(review): the discriminator output is used directly as a probability.
    The GAIN network in this notebook ends in a plain Linear layer, so values
    outside (0, 1) are possible and make the log terms NaN -- confirm whether
    a sigmoid is intended.
    """
    generator, discriminator = nets[0], nets[1]

    # Observed entries are kept; the rest come from the generator.
    proposal = generator(New_X, M)
    completed = New_X * M + proposal * (1-M)

    # Per-entry score, given the hint.
    d_out = discriminator(completed, H)

    # Observed entries should score high, generated ones low.
    log_observed = torch.log(d_out + 1e-8)
    log_generated = torch.log(1.-d_out + 1e-8)
    return -torch.mean(M * log_observed + (1-M) * log_generated)

def generator_loss(X, M, New_X, H, alpha, nets):
    """Generator objective: adversarial term plus `alpha` times a
    reconstruction error on the observed entries.

    Args:
        X: not used by the computation; part of the call signature.
        M: 0/1 mask, 1 where the entry of `New_X` is observed.
        New_X: data matrix passed to the generator.
        H: hint matrix handed to the discriminator.
        alpha: weight of the reconstruction term.
        nets: sequence whose first two items are (Generator, Discriminator).

    Returns:
        (total loss, reconstruction term) as scalar tensors. The
        reconstruction term is the mean squared error over all entries
        divided by mean(M), so it is NaN/inf when nothing is observed.

    NOTE(review): as in `discriminator_loss`, the discriminator output is
    logged without any squashing -- confirm it is meant to be a probability.
    """
    generator, discriminator = nets[0], nets[1]

    # Observed entries are kept; the rest come from the generator.
    proposal = generator(New_X, M)
    completed = New_X * M + proposal * (1-M)

    d_out = discriminator(completed, H)

    # Reward generated entries that the discriminator scores as observed.
    adversarial = -torch.mean((1-M) * torch.log(d_out + 1e-8))
    # The generator should also reproduce the entries that were observed.
    reconstruction = torch.mean((M * New_X - M * proposal)**2) / torch.mean(M)

    return adversarial + alpha * reconstruction, reconstruction

## Driver code to train the GAIN model

In [None]:
def train_GAIN(nets, dataloaders, optimizers,
               batch_size=128, alpha=100, 
               p_hint=.9, epochs=50, verbose=True):
    """Alternately train the discriminator and the generator of a GAIN model.

    Args:
        nets: (Generator, Discriminator) pair.
        dataloaders: (data_loader, mask_loader) yielding aligned batches of
            encoded rows and their 0/1 observation masks. The two loaders are
            zipped, so they must iterate in the same order.
        optimizers: (optimizer_G, optimizer_D).
        batch_size: unused; kept so existing calls keep working. The batch
            size actually used is whatever the loaders yield.
        alpha: weight of the reconstruction term in the generator loss.
        p_hint: hint rate; the hint is `sample_M(rows, cols, 1 - p_hint)`
            multiplied by the mask.
        epochs: number of passes over the loaders.
        verbose: print the epoch number and, every 100 iterations, the
            training RMSE of the current batch.

    The networks are updated in place; nothing is returned.
    """
    data_loader, mask_loader = dataloaders
    optimizer_G, optimizer_D = optimizers
    # Training the network
    it = 0
    for epoch in range(epochs):
        if verbose:
            print("Epoch: {}".format(epoch))
        # Batch training
        for X_mb, M_mb in zip(data_loader, mask_loader):
            # Shape and device come from the batch itself rather than from
            # notebook-level globals, so the function works for any data set
            # and for partial final batches.
            n_rows, n_cols = X_mb.shape[0], X_mb.shape[1]
            batch_device = X_mb.device

            # Noise matrix
            Z_mb = sample_Z(n_rows, n_cols)
            Z_mb = torch.tensor(Z_mb, device=batch_device).float()

            # Hint matrix
            H_mb = sample_M(n_rows, n_cols, 1-p_hint)
            H_mb = M_mb * torch.tensor(H_mb, device=batch_device).float()

            # Mask data: unobserved entries are replaced by noise
            X_mask = mask_data(X_mb, M_mb, Z_mb)

            # Discriminator
            optimizer_D.zero_grad()
            D_loss_curr = discriminator_loss(M=M_mb, New_X=X_mask, 
                                             H=H_mb, nets=nets)
            D_loss_curr.backward()
            optimizer_D.step()

            # Generator (zero_grad also discards the generator gradients
            # produced by the discriminator's backward pass above)
            optimizer_G.zero_grad()
            G_loss_curr, MSE_train_loss_curr = generator_loss(X=X_mb, M=M_mb, 
                                                              New_X=X_mask,
                                                              alpha=alpha, 
                                                              H=H_mb, nets=nets)
            G_loss_curr.backward()
            optimizer_G.step()

            if it % 100 == 0 and verbose:
                print("\tIter: {}".format(it))
                print("\tTrain RMSE: {:.4}".format(np.sqrt(MSE_train_loss_curr.item())))
                print()

            it += 1

In [None]:
def imputator(Generator, X, M, encoder, cuda=False):
    """Impute the masked entries of `X` with `Generator` and decode the result.

    Args:
        Generator: callable taking (data tensor, mask tensor); normally a
            trained GAIN module.
        X: 2-D array of encoded values.
        M: 0/1 array of the same shape, 1 where `X` is treated as observed.
        encoder: object providing `smart_decode(array)`.
        cuda: ignored; the result is always moved to host memory before it is
            converted to numpy. Kept so existing calls keep working.

    Returns:
        (decoded, rmse): the output of `encoder.smart_decode` on the imputed
        matrix, and the root-mean-square difference between `X` and the
        imputed matrix over all entries.
    """
    # Use the device the generator's weights live on; a generator without
    # parameters (e.g. a plain function) is run on the CPU.
    try:
        target = next(Generator.parameters()).device
    except (AttributeError, StopIteration):
        target = torch.device('cpu')

    # Convert data set to tensor
    Xtens = torch.tensor(X, device=target).float()
    Mtens = torch.tensor(M, device=target).float()

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        imputed = Mtens * Xtens + (1-Mtens) * Generator(Xtens, Mtens)
    # .cpu() is a no-op for CPU tensors, so this is safe on either device.
    imputed_data = imputed.detach().cpu().numpy()

    # Square the errors *before* averaging; averaging first lets positive and
    # negative errors cancel and yields |mean error| instead of the RMSE.
    rmse = np.sqrt(np.mean((np.asarray(X) - imputed_data)**2))

    return encoder.smart_decode(imputed_data), rmse

In [None]:
def imputed_scores(X, imputed):
    """Return a dict of error metrics between `X` and `imputed`.

    Currently holds one entry, 'RMSE': the root-mean-square difference over
    all entries of the two equally shaped arrays.
    """
    score = dict()
    err = np.asarray(X, dtype=float) - np.asarray(imputed, dtype=float)
    # Square before averaging so that errors of opposite sign do not cancel.
    score['RMSE'] = np.sqrt(np.mean(err**2))
    return score

In [None]:
def generate_trial_data(X, p_miss=.2, split=.8):
    """Draw a random missingness mask for `X` and split rows into train/test.

    Args:
        X: 2-D array of encoded values.
        p_miss: probability with which each entry is marked as missing.
        split: fraction of the rows that goes to the training set.

    Returns:
        ((trainX, testX), (trainM, testM), M), where M is the full 0/1 mask
        (1 = observed) aligned with `X`, and the train/test parts use the
        same row permutation for data and mask.
    """
    # Sizes come from the argument, not from a notebook-level `dims`, so the
    # function is correct for any matrix it is given.
    n_rows, n_cols = X.shape

    ### Missing data introduction
    M = np.zeros((n_rows, n_cols))
    # One draw per column, in column order, so a seeded run consumes the
    # global numpy RNG exactly as the experiment cells do.
    for j in range(n_cols):
        M[:, j] = 1. * (np.random.uniform(0., 1., size=[n_rows, ]) > p_miss)

    # Train / test division
    idx = np.random.permutation(n_rows)
    n_train = int(n_rows * split)
    train_rows, test_rows = idx[:n_train], idx[n_train:]

    # Train / test features
    trainX = X[train_rows, :]
    testX = X[test_rows, :]

    # Train / test masks
    trainM = M[train_rows, :]
    testM = M[test_rows, :]
    return (trainX, testX), (trainM, testM), M

# Kaggle importation and cleaning


In [None]:
# Unpack the Kaggle archive next to the notebook, then load the responses.
import zipfile

with zipfile.ZipFile('kaggle.zip', mode='r') as zipf:
    zipf.extractall('./')

survey_df = pd.read_csv('kaggle_survey_2020_responses.csv')

# The first row holds the question text shown in the survey, not an answer:
# keep it as a {column name: question text} lookup ...
questions = {column: text for column, text in zip(survey_df.columns, survey_df.iloc[0])}

# ... and drop it from the response table.
survey_df = survey_df.iloc[1:]
survey_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Time from Start to Finish (seconds),Q1,Q2,Q3,Q4,Q5,Q6,Q7_Part_1,Q7_Part_2,Q7_Part_3,Q7_Part_4,Q7_Part_5,Q7_Part_6,Q7_Part_7,Q7_Part_8,Q7_Part_9,Q7_Part_10,Q7_Part_11,Q7_Part_12,Q7_OTHER,Q8,Q9_Part_1,Q9_Part_2,Q9_Part_3,Q9_Part_4,Q9_Part_5,Q9_Part_6,Q9_Part_7,Q9_Part_8,Q9_Part_9,Q9_Part_10,Q9_Part_11,Q9_OTHER,Q10_Part_1,Q10_Part_2,Q10_Part_3,Q10_Part_4,Q10_Part_5,Q10_Part_6,Q10_Part_7,...,Q31_B_Part_7,Q31_B_Part_8,Q31_B_Part_9,Q31_B_Part_10,Q31_B_Part_11,Q31_B_Part_12,Q31_B_Part_13,Q31_B_Part_14,Q31_B_OTHER,Q33_B_Part_1,Q33_B_Part_2,Q33_B_Part_3,Q33_B_Part_4,Q33_B_Part_5,Q33_B_Part_6,Q33_B_Part_7,Q33_B_OTHER,Q34_B_Part_1,Q34_B_Part_2,Q34_B_Part_3,Q34_B_Part_4,Q34_B_Part_5,Q34_B_Part_6,Q34_B_Part_7,Q34_B_Part_8,Q34_B_Part_9,Q34_B_Part_10,Q34_B_Part_11,Q34_B_OTHER,Q35_B_Part_1,Q35_B_Part_2,Q35_B_Part_3,Q35_B_Part_4,Q35_B_Part_5,Q35_B_Part_6,Q35_B_Part_7,Q35_B_Part_8,Q35_B_Part_9,Q35_B_Part_10,Q35_B_OTHER
1,1838,35-39,Man,Colombia,Doctoral degree,Student,5-10 years,Python,R,SQL,C,,,Javascript,,,,MATLAB,,Other,Python,"Jupyter (JupyterLab, Jupyter Notebooks, etc)",,,Visual Studio Code (VSCode),,Spyder,,,,,,,Kaggle Notebooks,Colab Notebooks,,,,,,...,,,,,,,SAP Analytics Cloud,,,"Automated data augmentation (e.g. imgaug, albu...",,,,Automated hyperparameter tuning (e.g. hyperopt...,Automation of full ML pipelines (e.g. Google C...,,,Google Cloud AutoML,,Databricks AutoML,,,Auto-Keras,Auto-Sklearn,,,,,,,,,,TensorBoard,,,,,,
2,289287,30-34,Man,United States of America,Master’s degree,Data Engineer,5-10 years,Python,R,SQL,,,,,,,,,,,Python,,,Visual Studio,,PyCharm,,,Sublime Text,,,,,,Colab Notebooks,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,860,35-39,Man,Argentina,Bachelor’s degree,Software Engineer,10-20 years,,,,,,Java,Javascript,,,Bash,,,,R,,,,Visual Studio Code (VSCode),,,Notepad++,Sublime Text,Vim / Emacs,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,507,30-34,Man,United States of America,Master’s degree,Data Scientist,5-10 years,Python,,SQL,,,,,,,Bash,,,,Python,,,,,PyCharm,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,78,30-34,Man,Japan,Master’s degree,Software Engineer,3-5 years,Python,,,,,,,,,,,,,Python,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
### Preparing data (Selecting columns to use)
# Only respondents who answered every selected question are kept.
selected_questions = ['Q1', 'Q2', 'Q4', 'Q5', 'Q6', 'Q8', 'Q11', 'Q15']
test_df = survey_df[selected_questions].dropna()

# Independent copy that is handed to the encoder; test_df keeps the
# categorical answers used as ground truth.
subset_df = test_df.copy()
subset_df.head()

Unnamed: 0,Q1,Q2,Q4,Q5,Q6,Q8,Q11,Q15
1,35-39,Man,Doctoral degree,Student,5-10 years,Python,"A cloud computing platform (AWS, Azure, GCP, h...",1-2 years
2,30-34,Man,Master’s degree,Data Engineer,5-10 years,Python,A personal computer or laptop,1-2 years
3,35-39,Man,Bachelor’s degree,Software Engineer,10-20 years,R,A personal computer or laptop,I do not use machine learning methods
4,30-34,Man,Master’s degree,Data Scientist,5-10 years,Python,"A cloud computing platform (AWS, Azure, GCP, h...",3-4 years
7,22-24,Man,Bachelor’s degree,Student,3-5 years,Python,A personal computer or laptop,Under 1 year


# Full Kaggle Data Set Experiment
In this section we test different architectures and carry the best-performing
one into the longer experiment: 30 trials at each of several missing-data
rates.

In [None]:
# Encode the categorical answers and record the matrix shape.
encoder = WOEncoder()
X_raw = encoder.fit_transform(subset_df)
X = X_raw.values
dims = X.shape

### Missing data introduction
# M is the observation mask: 1 = observed, 0 = treated as missing.
p_miss = .2 # 20% missing data
p_miss_vec = p_miss * np.ones((dims[1], 1))
M = np.zeros(dims)

for i, column_p_miss in enumerate(p_miss_vec):
    A = np.random.uniform(0., 1., size=[dims[0], ])
    B = A > column_p_miss
    M[:, i] = 1.*B

# Train / test division: one row permutation shared by data and mask.
idx = np.random.permutation(dims[0])

Train_no = int(dims[0] * .8)
Test_no = dims[0] - Train_no
train_rows, test_rows = idx[:Train_no], idx[Train_no:]

# Train / test features
trainX, testX = X[train_rows, :], X[test_rows, :]

# Train / test masks
trainM, testM = M[train_rows, :], M[test_rows, :]

In [None]:
# Layer widths: the first layer takes 2 * n_features inputs (GAIN.forward
# concatenates two matrices with n_features columns each) and the last layer
# emits one value per feature.
n_features = dims[1]
weights = [n_features*2, 64, 128, 64, n_features]

hyper_params = GAINHyper(lr=0.001, batch_size=128, epochs=40,
                         alpha=1000, p_hint=.9)

# Rows and masks travel through two separate, unshuffled loaders so that
# batch k of one lines up with batch k of the other.
loader_options = dict(batch_size=hyper_params.batch_size, shuffle=False)
data_loader = DataLoader(torch.tensor(trainX, device=device).float(),
                         **loader_options)
mask_loader = DataLoader(torch.tensor(trainM, device=device).float(),
                         **loader_options)

# Same architecture for both players (the generator is created first).
Generator = GAIN(weights=weights).to(device)
Discriminator = GAIN(weights=weights).to(device)

# One Adam optimizer per network, sharing the learning rate.
optimizer_G = optim.Adam(Generator.parameters(), lr=hyper_params.lr)
optimizer_D = optim.Adam(Discriminator.parameters(), lr=hyper_params.lr)

In [None]:
# Fit the generator/discriminator pair on the training split (verbose output).
train_GAIN(
    nets=(Generator, Discriminator),
    dataloaders=(data_loader, mask_loader),
    optimizers=(optimizer_G, optimizer_D),
    epochs=hyper_params.epochs,
    batch_size=hyper_params.batch_size,
    alpha=hyper_params.alpha,
    p_hint=hyper_params.p_hint,
)

Epoch: 0
	Iter: 0
	Train RMSE: 1.842

	Iter: 100
	Train RMSE: 0.3432

Epoch: 1
	Iter: 200
	Train RMSE: 0.1237

Epoch: 2
	Iter: 300
	Train RMSE: 0.08545

Epoch: 3
	Iter: 400
	Train RMSE: 0.06142

Epoch: 4
	Iter: 500
	Train RMSE: 0.05136

Epoch: 5
	Iter: 600
	Train RMSE: 0.04736

Epoch: 6
	Iter: 700
	Train RMSE: 0.03733

Epoch: 7
	Iter: 800
	Train RMSE: 0.03746

Epoch: 8
	Iter: 900
	Train RMSE: 0.03742

Epoch: 9
	Iter: 1000
	Train RMSE: 0.03548

Epoch: 10
	Iter: 1100
	Train RMSE: 0.03206

Epoch: 11
	Iter: 1200
	Train RMSE: 0.02585

Epoch: 12
	Iter: 1300
	Train RMSE: 0.03202

Epoch: 13
	Iter: 1400
	Train RMSE: 0.02742

Epoch: 14
	Iter: 1500
	Train RMSE: 0.03389

Epoch: 15
	Iter: 1600
	Train RMSE: 0.02969

Epoch: 16
	Iter: 1700
	Train RMSE: 0.0262

Epoch: 17
	Iter: 1800
	Train RMSE: 0.02617

Epoch: 18
	Iter: 1900
	Train RMSE: 0.02489

Epoch: 19
	Iter: 2000
	Train RMSE: 0.02269

Epoch: 20
	Iter: 2100
	Train RMSE: 0.02285

Epoch: 21
	Iter: 2200
	Train RMSE: 0.02055

Epoch: 22
	Iter: 2300
	Tr

### Testing and metrics for single trial
It is important to split the data into a train and a test set so that
overfitting can be detected. If the model generalizes, the RMSE on the test
set should be relatively close to the train RMSE.

In [None]:
# Test set: impute the held-out rows and compare against their true answers.
imputed_data, rmse = imputator(Generator, testX, testM, encoder, cuda=True)
testDecoded = encoder.smart_decode(testX)

# Share of matching answers per column (observed and imputed entries alike).
means = [
    np.mean(np.where(imputed_data[col].values == testDecoded[col].values, 1, 0))
    for col in imputed_data.columns
]

accuracy = np.mean(np.array(means))
print("RMSE: {}\nAccuracy: {}".format(rmse, accuracy))

RMSE: 0.00819122496624373
Accuracy: 0.9559923664122139


In [None]:
# Entire set: impute every row and compare against the original answers.
imputed_data, rmse = imputator(Generator, X, M, encoder, cuda=True)

# Share of matching answers per column (observed and imputed entries alike).
means = [
    np.mean(np.where(imputed_data[col].values == test_df[col].values, 1, 0))
    for col in imputed_data.columns
]

accuracy = np.mean(np.array(means))
print("RMSE: {}\nAccuracy: {}".format(rmse, accuracy))

RMSE: 0.008448171974611719
Accuracy: 0.9539284841822401


## Multiple Trial Training and Testing
#### Full Kaggle Data Test

In [None]:
# Repeated experiment on the full data: for every missing rate, train a fresh
# GAIN `trials` times and collect RMSE / accuracy on the entire data set.
trials = 30
X = X_raw.values
dims = X.shape
pmissing = [.1, .2, .3, .4]
avg_rmses = []
sd_rmses = []
avg_accs = []
sd_accs = []

# Architecture and hyper-parameters are the same for every trial, so they
# are built once instead of inside the loops.
weights = [dims[1]*2, 64, 128, 64, dims[1]]
hyper_params = GAINHyper(batch_size=128, lr=0.001, 
                         alpha=1000, epochs=40, p_hint=.9)

for p in pmissing:
    print("Test for {}% missing data".format(100*p))
    rmses = []
    accs = []
    for t in range(trials):
        print("Trial {}".format(t))
        #############
        # Preparing #
        #############

        # New random mask and train/test split for this trial
        Xt, mt, M = generate_trial_data(X, p_miss=p)
        trainX, testX = Xt
        trainM, testM = mt

        # Data Loaders for training (unshuffled so rows and masks stay aligned)
        data_loader = DataLoader(torch.tensor(trainX, device=device).float(), 
                                batch_size=hyper_params.batch_size, 
                                shuffle=False)
        mask_loader = DataLoader(torch.tensor(trainM, device=device).float(), 
                                batch_size=hyper_params.batch_size, 
                                shuffle=False)

        # Generator and Discriminator (fresh weights every trial)
        Generator = GAIN(weights=weights).to(device)
        Discriminator = GAIN(weights=weights).to(device)

        # Optimizers
        optimizer_G = optim.Adam(Generator.parameters(), lr=hyper_params.lr)
        optimizer_D = optim.Adam(Discriminator.parameters(), lr=hyper_params.lr)

        ############
        # Training #
        ############
        train_GAIN(nets=(Generator, Discriminator), 
            optimizers=(optimizer_G, optimizer_D),
            dataloaders=(data_loader, mask_loader),
            batch_size=hyper_params.batch_size,
            alpha=hyper_params.alpha,
            p_hint=hyper_params.p_hint,
            epochs=hyper_params.epochs, verbose=False)
        

        ###########
        # Testing #
        ###########
        # Test set
        imputed_data, rmse = imputator(Generator, testX, testM, encoder, cuda=True)
        testDecoded = encoder.smart_decode(testX)
        means = [
            np.mean(np.where(imputed_data[col].values == testDecoded[col].values, 1, 0))
            for col in imputed_data.columns
        ]

        accuracy = np.mean(np.array(means))
        print("\tTest RMSE: {0:.4f}".format(rmse))
        print("\tTest Accuracy: {0:.4f}".format(accuracy))


        # Entire set: compared against test_df, so the decoded test rows do
        # not have to be recomputed here.
        imputed_data, rmse = imputator(Generator, X, M, encoder, cuda=True)
        means = [
            np.mean(np.where(imputed_data[col].values == test_df[col].values, 1, 0))
            for col in imputed_data.columns
        ]

        accuracy = np.mean(np.array(means))
        print("\tTotal RMSE: {0:.4f}".format(rmse))
        print("\tTotal Accuracy: {0:.4f}".format(accuracy))
        # Only the entire-set metrics enter the per-rate summary.
        rmses.append(rmse)
        accs.append(accuracy)


    ### Average metrics for experiment

    # RMSE
    mean_rmse = np.mean(np.array(rmses))
    avg_rmses.append(mean_rmse)
    sd_rmses.append(np.std(np.array(rmses)))
    # Accuracy
    mean_acc = np.mean(np.array(accs))
    avg_accs.append(mean_acc)
    sd_accs.append(np.std(np.array(accs)))
    print()
    print('#'*20)
    print("\tMetrics for {}% missing".format(100*p))
    print("\tAvg RMSE: {0:.4f}".format(mean_rmse))
    print("\tAvg Accuracy: {0:.4f}".format(mean_acc))
    print('#'*20)

Test for 10.0% missing data
Trial 0
	Test RMSE: 0.0053
	Test Accuracy: 0.9832
	Total RMSE: 0.0053
	Total Accuracy: 0.9832
Trial 1
	Test RMSE: 0.0047
	Test Accuracy: 0.9775
	Total RMSE: 0.0046
	Total Accuracy: 0.9763
Trial 2
	Test RMSE: 0.0014
	Test Accuracy: 0.9787
	Total RMSE: 0.0012
	Total Accuracy: 0.9778
Trial 3
	Test RMSE: 0.0030
	Test Accuracy: 0.9835
	Total RMSE: 0.0029
	Total Accuracy: 0.9843
Trial 4
	Test RMSE: 0.0035
	Test Accuracy: 0.9752
	Total RMSE: 0.0036
	Total Accuracy: 0.9751
Trial 5
	Test RMSE: 0.0033
	Test Accuracy: 0.9726
	Total RMSE: 0.0031
	Total Accuracy: 0.9733
Trial 6
	Test RMSE: 0.0008
	Test Accuracy: 0.9777
	Total RMSE: 0.0004
	Total Accuracy: 0.9777
Trial 7
	Test RMSE: 0.0069
	Test Accuracy: 0.9613
	Total RMSE: 0.0069
	Total Accuracy: 0.9606
Trial 8
	Test RMSE: 0.0076
	Test Accuracy: 0.9769
	Total RMSE: 0.0073
	Total Accuracy: 0.9776
Trial 9
	Test RMSE: 0.0017
	Test Accuracy: 0.9763
	Total RMSE: 0.0015
	Total Accuracy: 0.9769
Trial 10
	Test RMSE: 0.0007
	Tes

### Notes on first test
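The network performs better when there is less missing data: in the results
table below the average RMSE grows and the average accuracy drops as the
missing rate goes from 10% to 40%. How steep this degradation is may vary
depending on the dataset and its dimensions.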

In [None]:
# One row per missing-data rate: mean and standard deviation over the trials.
summary_columns = {
    'PMissing': pmissing,
    'FullKaggleRMSE': avg_rmses,
    'FullKaggleAcc': avg_accs,
    'FullKaggleSdRMSE': sd_rmses,
    'FullKaggleSdAcc': sd_accs,
}
results = pd.DataFrame(summary_columns)
results.head()

Unnamed: 0,PMissing,FullKaggleRMSE,FullKaggleAcc,FullKaggleSdRMSE,FullKaggleSdAcc
0,0.1,0.003842,0.978563,0.003149,0.006506
1,0.2,0.008603,0.958363,0.006427,0.015054
2,0.3,0.010202,0.941573,0.008683,0.019399
3,0.4,0.012585,0.91562,0.012169,0.021049


In [None]:
# Persist the full-data summary table next to the notebook.
results.to_csv("gain_results2_06-12-2020.csv")

# 1000 Samples Kaggle Data Experiment

In [None]:
# Encoding data and preparing for training
encoder = WOEncoder()
X_raw = encoder.fit_transform(subset_df)
X = X_raw.values

# select only 1000 samples
sampler = np.random.permutation(X.shape[0])
X = X[sampler[:1000], :]
dims = X.shape

# Ground-truth answers for the same rows. They are taken from subset_df,
# which always holds the full data, instead of from test_df itself: test_df
# is overwritten here, so indexing it with full-data row numbers fails with
# IndexError as soon as it has been subsampled once (re-run of this cell or
# any later subsample cell).
test_df = pd.DataFrame(subset_df.values[sampler[:1000], :], columns=subset_df.columns)

### Missing data introduction
# M is the observation mask: 1 = observed, 0 = treated as missing.
p_miss = .2 # 20% missing data
p_miss_vec = p_miss * np.ones((dims[1], 1))
M = np.zeros(dims)

for i in range(dims[1]):
    A = np.random.uniform(0., 1., size=[dims[0], ])
    B = A > p_miss_vec[i]
    M[:, i] = 1.*B

# Train Test division
idx = np.random.permutation(dims[0])

Train_no = int(dims[0] * .8)
Test_no = dims[0] - Train_no

# Train / Test Features
trainX = X[idx[:Train_no], :]
testX = X[idx[Train_no:], :]

# Train / Test Masks
trainM = M[idx[:Train_no], :]
testM = M[idx[Train_no:], :]

In [None]:
# Layer widths: the first layer takes 2 * n_features inputs (GAIN.forward
# concatenates two matrices with n_features columns each) and the last layer
# emits one value per feature.
n_features = dims[1]
weights = [n_features*2, 64, 128, 64, n_features]

hyper_params = GAINHyper(lr=0.001, batch_size=32, epochs=30,
                         alpha=100, p_hint=.9)

# Rows and masks travel through two separate, unshuffled loaders so that
# batch k of one lines up with batch k of the other.
loader_options = dict(batch_size=hyper_params.batch_size, shuffle=False)
data_loader = DataLoader(torch.tensor(trainX, device=device).float(),
                         **loader_options)
mask_loader = DataLoader(torch.tensor(trainM, device=device).float(),
                         **loader_options)

# Same architecture for both players (the generator is created first).
Generator = GAIN(weights=weights).to(device)
Discriminator = GAIN(weights=weights).to(device)

# One Adam optimizer per network, sharing the learning rate.
optimizer_G = optim.Adam(Generator.parameters(), lr=hyper_params.lr)
optimizer_D = optim.Adam(Discriminator.parameters(), lr=hyper_params.lr)

In [None]:
# Fit the generator/discriminator pair on the training split, without
# progress output.
train_GAIN(
    nets=(Generator, Discriminator),
    dataloaders=(data_loader, mask_loader),
    optimizers=(optimizer_G, optimizer_D),
    epochs=hyper_params.epochs,
    batch_size=hyper_params.batch_size,
    alpha=hyper_params.alpha,
    p_hint=hyper_params.p_hint,
    verbose=False,
)

### Testing and metrics for single experiment

In [None]:
# Test set: impute the held-out rows and compare against their true answers.
imputed_data, rmse = imputator(Generator, testX, testM, encoder, cuda=True)
testDecoded = encoder.smart_decode(testX)

# Share of matching answers per column (observed and imputed entries alike).
means = [
    np.mean(np.where(imputed_data[col].values == testDecoded[col].values, 1, 0))
    for col in imputed_data.columns
]

accuracy = np.mean(np.array(means))
print("RMSE: {}\nAccuracy: {}".format(rmse, accuracy))

RMSE: 0.0044505285752466225
Accuracy: 0.975625


In [None]:
# Entire set: impute every sampled row and compare against the original answers.
imputed_data, rmse = imputator(Generator, X, M, encoder, cuda=True)

# Share of matching answers per column (observed and imputed entries alike).
means = [
    np.mean(np.where(imputed_data[col].values == test_df[col].values, 1, 0))
    for col in imputed_data.columns
]

accuracy = np.mean(np.array(means))
print("RMSE: {}\nAccuracy: {}".format(rmse, accuracy))

RMSE: 0.0045422498766494995
Accuracy: 0.974


## Multiple Trial Training and Testing

In [None]:
# Encoding data and preparing for training
encoder = WOEncoder()
X_raw = encoder.fit_transform(subset_df)
X = X_raw.values

# select only 1000 samples
sampler = np.random.permutation(X.shape[0])
X = X[sampler[:1000], :]
dims = X.shape

# Ground-truth answers for the same rows. They are taken from subset_df,
# which always holds the full data, instead of from test_df itself: by this
# point test_df has already been cut down to a subsample, so indexing it with
# full-data row numbers raises IndexError.
test_df = pd.DataFrame(subset_df.values[sampler[:1000], :], columns=subset_df.columns)

In [None]:
# Repeated experiment on the subsample: for every missing rate, train a fresh
# GAIN `trials` times and collect RMSE / accuracy on the entire subsample.
trials = 30
pmissing = [.1, .2, .3, .4]
avg_rmses = []
sd_rmses = []
avg_accs = []
sd_accs = []

print("SurvGAIN test on Kaggle {} samples".format(dims[0]))

# Architecture and hyper-parameters are the same for every trial, so they
# are built once instead of inside the loops.
weights = [dims[1]*2, 64, 128, 64, dims[1]]
hyper_params = GAINHyper(batch_size=32, lr=0.001, 
                         alpha=100, epochs=30, p_hint=.9)

for p in pmissing:
    print("Test for {}% missing data".format(100*p))
    rmses = []
    accs = []
    for t in range(trials):
        print("Trial {}".format(t))
        #############
        # Preparing #
        #############

        # New random mask and train/test split for this trial
        Xt, mt, M = generate_trial_data(X, p_miss=p)
        trainX, testX = Xt
        trainM, testM = mt

        # Data Loaders for training (unshuffled so rows and masks stay aligned)
        data_loader = DataLoader(torch.tensor(trainX, device=device).float(), 
                                batch_size=hyper_params.batch_size, 
                                shuffle=False)
        mask_loader = DataLoader(torch.tensor(trainM, device=device).float(), 
                                batch_size=hyper_params.batch_size, 
                                shuffle=False)

        # Generator and Discriminator (fresh weights every trial)
        Generator = GAIN(weights=weights).to(device)
        Discriminator = GAIN(weights=weights).to(device)

        # Optimizers
        optimizer_G = optim.Adam(Generator.parameters(), lr=hyper_params.lr)
        optimizer_D = optim.Adam(Discriminator.parameters(), lr=hyper_params.lr)

        ############
        # Training #
        ############
        train_GAIN(nets=(Generator, Discriminator), 
            optimizers=(optimizer_G, optimizer_D),
            dataloaders=(data_loader, mask_loader),
            batch_size=hyper_params.batch_size,
            alpha=hyper_params.alpha,
            p_hint=hyper_params.p_hint,
            epochs=hyper_params.epochs, verbose=False)
        

        ###########
        # Testing #
        ###########
        # Test set
        imputed_data, rmse = imputator(Generator, testX, testM, encoder, cuda=True)
        testDecoded = encoder.smart_decode(testX)
        means = [
            np.mean(np.where(imputed_data[col].values == testDecoded[col].values, 1, 0))
            for col in imputed_data.columns
        ]

        accuracy = np.mean(np.array(means))
        print("\tTest RMSE: {0:.4f}".format(rmse))
        print("\tTest Accuracy: {0:.4f}".format(accuracy))


        # Entire set: compared against test_df, so the decoded test rows do
        # not have to be recomputed here.
        imputed_data, rmse = imputator(Generator, X, M, encoder, cuda=True)
        means = [
            np.mean(np.where(imputed_data[col].values == test_df[col].values, 1, 0))
            for col in imputed_data.columns
        ]

        accuracy = np.mean(np.array(means))
        print("\tTotal RMSE: {0:.4f}".format(rmse))
        print("\tTotal Accuracy: {0:.4f}".format(accuracy))
        # Only the entire-set metrics enter the per-rate summary.
        rmses.append(rmse)
        accs.append(accuracy)


    ### Average metrics for experiment

    # RMSE
    mean_rmse = np.mean(np.array(rmses))
    avg_rmses.append(mean_rmse)
    sd_rmses.append(np.std(np.array(rmses)))
    # Accuracy
    mean_acc = np.mean(np.array(accs))
    avg_accs.append(mean_acc)
    sd_accs.append(np.std(np.array(accs)))
    print()
    print('#'*20)
    print("\tMetrics for {}% missing".format(100*p))
    print("\tAvg RMSE: {0:.4f}".format(mean_rmse))
    print("\tAvg Accuracy: {0:.4f}".format(mean_acc))
    print('#'*20)

SurvGAIN test on Kaggle 1000 samples
Test for 10.0% missing data
Trial 0
	Test RMSE: 0.0064
	Test Accuracy: 0.9888
	Total RMSE: 0.0068
	Total Accuracy: 0.9855
Trial 1
	Test RMSE: 0.0013
	Test Accuracy: 0.9888
	Total RMSE: 0.0018
	Total Accuracy: 0.9881
Trial 2
	Test RMSE: 0.0025
	Test Accuracy: 0.9756
	Total RMSE: 0.0038
	Total Accuracy: 0.9751
Trial 3
	Test RMSE: 0.0064
	Test Accuracy: 0.9675
	Total RMSE: 0.0081
	Total Accuracy: 0.9689
Trial 4
	Test RMSE: 0.0055
	Test Accuracy: 0.9862
	Total RMSE: 0.0058
	Total Accuracy: 0.9831
Trial 5
	Test RMSE: 0.0031
	Test Accuracy: 0.9856
	Total RMSE: 0.0027
	Total Accuracy: 0.9799
Trial 6
	Test RMSE: 0.0012
	Test Accuracy: 0.9900
	Total RMSE: 0.0022
	Total Accuracy: 0.9873
Trial 7
	Test RMSE: 0.0032
	Test Accuracy: 0.9888
	Total RMSE: 0.0034
	Total Accuracy: 0.9879
Trial 8
	Test RMSE: 0.0003
	Test Accuracy: 0.9744
	Total RMSE: 0.0002
	Total Accuracy: 0.9748
Trial 9
	Test RMSE: 0.0020
	Test Accuracy: 0.9719
	Total RMSE: 0.0034
	Total Accuracy: 0.

In [None]:
# Summary of the 1000-sample experiment: one row per missing-data rate, in
# the order of `pmissing`.
summary_columns = {
    '1000KaggleRMSE': avg_rmses,
    '1000KaggleAcc': avg_accs,
    '1000KaggleSdRMSE': sd_rmses,
    '1000KaggleSdAcc': sd_accs,
}
newresults = pd.DataFrame(summary_columns)
newresults.head()

Unnamed: 0,1000KaggleRMSE,1000KaggleAcc,1000KaggleSdRMSE,1000KaggleSdAcc
0,0.004023,0.982487,0.002832,0.006039
1,0.006596,0.959388,0.004383,0.010633
2,0.012725,0.938762,0.010018,0.012309
3,0.01393,0.916654,0.011426,0.019586


In [None]:
# Load the metrics saved by the earlier experiments.
results = pd.read_csv("gain_results.csv")

# Earlier saves wrote the DataFrame index as an extra header-less column, which
# read_csv surfaces as "Unnamed: 0", "Unnamed: 0.1", ...  Drop them so they do
# not pile up with every read/write cycle.
results = results.loc[:, ~results.columns.str.startswith("Unnamed")]

# If this cell is re-run after the file already contains these metrics, replace
# the old columns instead of appending duplicates.
results = results.drop(columns=newresults.columns, errors="ignore")

# Put the new metric columns next to the existing ones (rows align on the index).
results = pd.concat([results, newresults], axis=1)

In [None]:
# index=False: the default RangeIndex carries no information, and writing it
# makes the next read_csv produce a spurious "Unnamed: 0" column.
results.to_csv("gain_results.csv", index=False)

# 100 Samples Kaggle Data Experiment

In [None]:
# Encoding data and preparing for training
# Fit the encoder on subset_df and keep the encoded frame's values as a plain array.
encoder = WOEncoder()
X_raw = encoder.fit_transform(subset_df)
X = X_raw.values

# select only 100 samples
# The same shuffled row positions are applied to X here and to test_df below.
sampler = np.random.permutation(X.shape[0])
X = X[sampler[:100], :]
dims = X.shape

# NOTE(review): test_df is rebuilt from itself, so this cell is not idempotent.
# `sampler` holds row positions of the full encoded data; if test_df has fewer
# rows than subset_df (e.g. it was already subsampled by an earlier cell) these
# positions can be out of range -- confirm test_df is restored before running.
test_df = pd.DataFrame(test_df.values[sampler[:100], :], columns=test_df.columns)

### Missing data introduction
p_miss = .2 # 20% missing data
p_miss_vec = p_miss * np.ones((dims[1], 1))
M = np.zeros(dims)

# Build the mask one column at a time: an entry is 1.0 where its uniform draw
# exceeds p_miss and 0.0 otherwise (0 presumably marks the value as missing).
for i in range(dims[1]):
    A = np.random.uniform(0., 1., size=[dims[0], ])
    B = A > p_miss_vec[i]
    M[:, i] = 1.*B

# Train Test division
# One shuffled row order is shared by X and M so features and masks stay aligned.
idx = np.random.permutation(dims[0])

# 80% of the rows go to training, the remainder to testing.
Train_no = int(dims[0] * .8)
Test_no = dims[0] - Train_no

# Train / Test Features
trainX = X[idx[:Train_no], :]
testX = X[idx[Train_no:], :]

# Train / Test Masks
trainM = M[idx[:Train_no], :]
testM = M[idx[Train_no:], :]

In [None]:
# Hyper params and network architectures
# Layer widths handed to both GAIN networks: the first entry is twice the
# number of data columns, the last entry equals the number of data columns.
n_features = dims[1]
weights = [n_features * 2, 64, 128, 64, n_features]

hyper_params = GAINHyper(
    batch_size=4,
    lr=0.001,
    alpha=10,
    epochs=30,
    p_hint=.9,
)

# Training features and masks are served by two separate loaders. Neither one
# shuffles, so batch k of the data lines up with batch k of the masks.
train_features = torch.tensor(trainX, device=device).float()
train_masks = torch.tensor(trainM, device=device).float()
loader_options = dict(batch_size=hyper_params.batch_size, shuffle=False)
data_loader = DataLoader(train_features, **loader_options)
mask_loader = DataLoader(train_masks, **loader_options)

# Generator first, then Discriminator: both are GAIN instances built from the
# same layer widths and moved to the selected device.
Generator, Discriminator = (GAIN(weights=weights).to(device) for _ in range(2))

# One Adam optimizer per network, sharing the configured learning rate.
optimizer_G, optimizer_D = (
    optim.Adam(net.parameters(), lr=hyper_params.lr)
    for net in (Generator, Discriminator)
)

In [None]:
# Train the Generator/Discriminator pair with the settings held in hyper_params.
training_settings = {
    'nets': (Generator, Discriminator),
    'optimizers': (optimizer_G, optimizer_D),
    'dataloaders': (data_loader, mask_loader),
    'batch_size': hyper_params.batch_size,
    'alpha': hyper_params.alpha,
    'p_hint': hyper_params.p_hint,
    'epochs': hyper_params.epochs,
    'verbose': False,
}
train_GAIN(**training_settings)

## Testing and Metrics for single experiment

In [None]:
# Test set
# Impute the held-out rows. `use_cuda` (set next to `device` at the top of the
# notebook) replaces the hard-coded cuda=True so the flag always matches the
# device the Generator actually lives on.
imputed_data, rmse = imputator(Generator, testX, testM, encoder, cuda=use_cuda)

# Decode the complete (un-masked) test features to use as the reference answers.
testDecoded = encoder.smart_decode(testX)

# Per-column share of rows whose imputed value equals the reference value.
means = [
    np.mean(imputed_data[col].values == testDecoded[col].values)
    for col in imputed_data.columns
]

# Overall accuracy is the unweighted average of the per-column accuracies.
accuracy = np.mean(means)
print("RMSE: {}\nAccuracy: {}".format(rmse, accuracy))

RMSE: 0.01770074737839651
Accuracy: 0.9625


In [None]:
# Entire set
# Impute every row of X under the full mask M. `use_cuda` (set next to `device`
# at the top of the notebook) replaces the hard-coded cuda=True so the flag
# always matches the device the Generator actually lives on.
imputed_data, rmse = imputator(Generator, X, M, encoder, cuda=use_cuda)

# Per-column share of rows whose imputed value equals the value in test_df.
means = [
    np.mean(imputed_data[col].values == test_df[col].values)
    for col in imputed_data.columns
]

# Overall accuracy is the unweighted average of the per-column accuracies.
accuracy = np.mean(means)
print("RMSE: {}\nAccuracy: {}".format(rmse, accuracy))

RMSE: 0.0192599753542593
Accuracy: 0.95


## Multiple Trial Training and Testing

In [None]:
# Encoding data and preparing for training
encoder = WOEncoder()
X_raw = encoder.fit_transform(subset_df)
X = X_raw.values

# select only 100 samples
sampler = np.random.permutation(X.shape[0])
X = X[sampler[:100], :]
dims = X.shape

test_df = pd.DataFrame(test_df.values[sampler[:100], :], columns=test_df.columns)

In [None]:
trials = 30
pmissing = [.1, .2, .3, .4]
avg_rmses = []
sd_rmses = []
avg_accs = []
sd_accs = []


def _mean_column_accuracy(imputed, reference):
    """Average per-column agreement between two frames.

    For every column of `imputed`, compute the share of rows whose value equals
    the value at the same position in `reference[col]`, then return the
    unweighted mean of those shares.
    """
    return np.mean([
        np.mean(imputed[col].values == reference[col].values)
        for col in imputed.columns
    ])


print("SurvGAIN test on Kaggle {} samples".format(dims[0]))

for p in pmissing:
    print("Test for {}% missing data".format(100*p))
    rmses = []
    accs = []
    for t in range(trials):
        print("Trial {}".format(t))
        #############
        # Preparing #
        #############

        # New train/test split and masks for this trial; M is the mask that is
        # later paired with the whole of X.
        Xt, mt, M = generate_trial_data(X, p_miss=p)
        trainX, testX = Xt
        trainM, testM = mt

        # Hyper params and network architectures
        weights = [dims[1]*2, 64, 128, 64, dims[1]]

        hyper_params = GAINHyper(batch_size=32, lr=0.001,
                                 alpha=100, epochs=30, p_hint=.9)

        # Data Loaders for training (unshuffled so data and mask batches align)
        data_loader = DataLoader(torch.tensor(trainX, device=device).float(),
                                 batch_size=hyper_params.batch_size,
                                 shuffle=False)
        mask_loader = DataLoader(torch.tensor(trainM, device=device).float(),
                                 batch_size=hyper_params.batch_size,
                                 shuffle=False)

        # Generator and Discriminator: fresh networks for every trial
        Generator = GAIN(weights=weights).to(device)
        Discriminator = GAIN(weights=weights).to(device)

        # Optimizers
        optimizer_G = optim.Adam(Generator.parameters(), lr=hyper_params.lr)
        optimizer_D = optim.Adam(Discriminator.parameters(), lr=hyper_params.lr)

        ############
        # Training #
        ############
        train_GAIN(nets=(Generator, Discriminator),
                   optimizers=(optimizer_G, optimizer_D),
                   dataloaders=(data_loader, mask_loader),
                   batch_size=hyper_params.batch_size,
                   alpha=hyper_params.alpha,
                   p_hint=hyper_params.p_hint,
                   epochs=hyper_params.epochs, verbose=False)

        ###########
        # Testing #
        ###########
        # Test set
        # `use_cuda` (set next to `device` at the top of the notebook) replaces
        # the hard-coded cuda=True so the flag matches where the model lives.
        imputed_data, rmse = imputator(Generator, testX, testM, encoder, cuda=use_cuda)
        testDecoded = encoder.smart_decode(testX)
        accuracy = _mean_column_accuracy(imputed_data, testDecoded)
        print("\tTest RMSE: {0:.4f}".format(rmse))
        print("\tTest Accuracy: {0:.4f}".format(accuracy))

        # Entire set
        # Scored against test_df, so the test features are not decoded again here.
        imputed_data, rmse = imputator(Generator, X, M, encoder, cuda=use_cuda)
        accuracy = _mean_column_accuracy(imputed_data, test_df)
        print("\tTotal RMSE: {0:.4f}".format(rmse))
        print("\tTotal Accuracy: {0:.4f}".format(accuracy))

        # Only the entire-set ("Total") metrics feed the per-level summary.
        rmses.append(rmse)
        accs.append(accuracy)

    ### Average metrics for experiment

    # RMSE
    mean_rmse = np.mean(rmses)
    avg_rmses.append(mean_rmse)
    sd_rmses.append(np.std(rmses))
    # Accuracy
    mean_acc = np.mean(accs)
    avg_accs.append(mean_acc)
    sd_accs.append(np.std(accs))
    print()
    print('#'*20)
    print("\tMetrics for {}% missing".format(100*p))
    print("\tAvg RMSE: {0:.4f}".format(mean_rmse))
    print("\tAvg Accuracy: {0:.4f}".format(mean_acc))
    print('#'*20)

SurvGAIN test on Kaggle 100 samples
Test for 10.0% missing data
Trial 0
	Test RMSE: 0.0090
	Test Accuracy: 0.9500
	Total RMSE: 0.0060
	Total Accuracy: 0.9550
Trial 1
	Test RMSE: 0.0042
	Test Accuracy: 0.9812
	Total RMSE: 0.0194
	Total Accuracy: 0.9675
Trial 2
	Test RMSE: 0.0127
	Test Accuracy: 0.9313
	Total RMSE: 0.0098
	Total Accuracy: 0.9575
Trial 3
	Test RMSE: 0.0033
	Test Accuracy: 0.9688
	Total RMSE: 0.0017
	Total Accuracy: 0.9525
Trial 4
	Test RMSE: 0.0003
	Test Accuracy: 0.9313
	Total RMSE: 0.0054
	Total Accuracy: 0.9325
Trial 5
	Test RMSE: 0.0097
	Test Accuracy: 0.9500
	Total RMSE: 0.0043
	Total Accuracy: 0.9587
Trial 6
	Test RMSE: 0.0199
	Test Accuracy: 0.9375
	Total RMSE: 0.0017
	Total Accuracy: 0.9438
Trial 7
	Test RMSE: 0.0216
	Test Accuracy: 0.9500
	Total RMSE: 0.0060
	Total Accuracy: 0.9563
Trial 8
	Test RMSE: 0.0211
	Test Accuracy: 0.9688
	Total RMSE: 0.0042
	Total Accuracy: 0.9587
Trial 9
	Test RMSE: 0.0186
	Test Accuracy: 0.9625
	Total RMSE: 0.0022
	Total Accuracy: 0.9

In [None]:
# Summary table for the 100-sample runs: one column per aggregated metric,
# one row per entry accumulated in the metric lists.
summary_columns = {
    '100KaggleRMSE': avg_rmses,
    '100KaggleAcc': avg_accs,
    '100KaggleSdRMSE': sd_rmses,
    '100KaggleSdAcc': sd_accs,
}
newresults = pd.DataFrame(summary_columns)
newresults.head()

Unnamed: 0,100KaggleRMSE,100KaggleAcc,100KaggleSdRMSE,100KaggleSdAcc
0,0.005987,0.953417,0.004021,0.009054
1,0.00979,0.907167,0.007816,0.011775
2,0.013827,0.860583,0.009955,0.013547
3,0.028705,0.807792,0.022383,0.018579


In [None]:
# Load the metrics saved by the earlier experiments.
results = pd.read_csv("gain_results.csv")

# Earlier saves wrote the DataFrame index as an extra header-less column, which
# read_csv surfaces as "Unnamed: 0", "Unnamed: 0.1", ...  Drop them so they do
# not pile up with every read/write cycle.
results = results.loc[:, ~results.columns.str.startswith("Unnamed")]

# If this cell is re-run after the file already contains these metrics, replace
# the old columns instead of appending duplicates.
results = results.drop(columns=newresults.columns, errors="ignore")

# Put the new metric columns next to the existing ones (rows align on the index).
results = pd.concat([results, newresults], axis=1)
results.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,PMissing,FullKaggleRMSE,FullKaggleAcc,FullKaggleSdRMSE,FullKaggleSdAcc,1000KaggleRMSE,1000KaggleAcc,1000KaggleSdRMSE,1000KaggleSdAcc,100KaggleRMSE,100KaggleAcc,100KaggleSdRMSE,100KaggleSdAcc
0,0,0,0.1,0.003842,0.978563,0.003149,0.006506,0.004023,0.982487,0.002832,0.006039,0.005987,0.953417,0.004021,0.009054
1,1,1,0.2,0.008603,0.958363,0.006427,0.015054,0.006596,0.959387,0.004383,0.010633,0.00979,0.907167,0.007816,0.011775
2,2,2,0.3,0.010202,0.941573,0.008683,0.019399,0.012725,0.938763,0.010018,0.012309,0.013827,0.860583,0.009955,0.013547
3,3,3,0.4,0.012585,0.91562,0.012169,0.021049,0.01393,0.916654,0.011426,0.019586,0.028705,0.807792,0.022383,0.018579


In [None]:
# index=False: the default RangeIndex carries no information, and writing it
# makes the next read_csv produce a spurious "Unnamed: 0" column.
results.to_csv("gain_results.csv", index=False)