<a href="https://colab.research.google.com/github/arrow789/cs760-project/blob/master/Data_Augmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# conda install pytorch-cpu torchvision-cpu -c pytorch

In [None]:
''' Adapted from: 
Data augmentation for Tabular Data. Data Science Blog von lschmiddey. (2021, March 14).
 Retrieved May 12, 2022, from https://lschmiddey.github.io/fastpages_/2021/03/14/tabular-data-variational-autoencoder.html 

'''

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn, optim
from torch.autograd import Variable

import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw CT measurements from a CSV file in the working directory.
df = pd.read_csv("CT Data.csv")

# Encode a missing death time as 0; rows with a 0 here are filtered out further down.
df['DEATH [d from CT]'] = df['DEATH [d from CT]'].fillna(0)
# Disabled: would add a "Death" column holding the same values divided by 365.
#df["Death"] = df['DEATH [d from CT]']/365

In [None]:
df.columns

Index(['Age at CT', 'DEATH [d from CT]', 'L1_HU_BMD', 'TAT Area (cm2)',
       'Total Body                Area EA (cm2)', 'VAT Area (cm2)',
       'SAT Area (cm2)', 'VAT/SAT     Ratio', 'Muscle HU',
       ' Muscle Area (cm2)', 'L3 SMI (cm2/m2)', 'AoCa        Agatston',
       'Liver HU    (Median)'],
      dtype='object')

In [None]:
# Coerce the columns at positions 7 and 12 to numeric; entries that cannot be
# parsed become NaN. The columns are replaced by label rather than written
# through `df.iloc[:, i] = ...`, because a positional write may set values in
# place and keep the column's previous (object) dtype on recent pandas.
for col_pos in (7, 12):
    col_name = df.columns[col_pos]
    df[col_name] = pd.to_numeric(df[col_name], errors='coerce')

# Keep only rows with a non-zero death time. `.copy()` gives an independent
# frame so later modifications of `death_df` are not chained assignments on `df`.
death_df = df[df['DEATH [d from CT]'] != 0].copy()

In [None]:
# Mean-impute missing values in the columns at positions 2..12.
# Each column is assigned back by label on an explicit copy: calling
# `fillna(..., inplace=True)` on the temporary Series returned by
# `.iloc[:, i]` is chained assignment and is not guaranteed to change
# `death_df` (it is a no-op under pandas Copy-on-Write).
death_df = death_df.copy()
for col_name in death_df.columns[2:13]:
    death_df[col_name] = death_df[col_name].fillna(death_df[col_name].mean())



In [None]:
# Write the cleaned rows to disk without the index column.
death_df.to_csv("Cleaned CT Data",index = False)

In [None]:
# Keep the column labels; they are reused to label the reconstructed and synthetic frames.
cols = death_df.columns

In [None]:
# Location of the cleaned CSV that the data-loading code reads.
DATA_PATH = "Cleaned CT Data"

In [None]:
# Reload the cleaned data as the "real" frame that the synthetic rows are appended to.
df_base = pd.read_csv(DATA_PATH, sep=',')

In [None]:
df_base.head()

Unnamed: 0,Age at CT,DEATH [d from CT],L1_HU_BMD,TAT Area (cm2),Total Body Area EA (cm2),VAT Area (cm2),SAT Area (cm2),VAT/SAT Ratio,Muscle HU,Muscle Area (cm2),L3 SMI (cm2/m2),AoCa Agatston,Liver HU (Median)
0,73,967.0,135.0,407.463977,721.279362,212.316698,199.007129,1.220394,18.2,170.1,58.7,5473.6,35.0
1,55,359.0,192.0,485.5,694.3,183.5,302.0,0.61,16.2,124.0,48.4,2709.1,52.0
2,60,2351.0,149.0,289.1,584.5,144.0,145.1,0.99,30.8,212.3,69.1,2586.6,57.0
3,88,658.0,106.0,315.5,588.9,202.3,113.2,1.79,-3.2,168.9,47.8,431.5,53.0
4,82,1970.0,77.0,213.5,452.9,113.6,99.9,1.14,1.5,89.5,30.9,1600.0,49.0


## Build Data Loader

In [None]:
def load_and_standardize_data(path):
    """Read a CSV file, split it 70/30 and standardize both parts.

    Missing values are replaced by the sentinel -99 before the split. The
    scaler is fitted on the training part only and then applied to the test
    part.

    Args:
        path: location of the comma-separated file to read.

    Returns:
        Tuple ``(X_train, X_test, scaler)``: the two standardized arrays and
        the fitted ``StandardScaler``.
    """
    frame = pd.read_csv(path, sep=',').fillna(-99)
    n_features = frame.shape[1]
    values = frame.values.reshape(-1, n_features).astype('float32')

    # A fixed random_state keeps the split identical between calls.
    train_part, test_part = train_test_split(values, test_size=0.3, random_state=42)

    scaler = preprocessing.StandardScaler()
    train_scaled = scaler.fit_transform(train_part)
    test_scaled = scaler.transform(test_part)
    return train_scaled, test_scaled, scaler

In [None]:
from torch.utils.data import Dataset, DataLoader
class DataBuilder(Dataset):
    """Dataset serving the standardized train or test split of a CSV file.

    Args:
        path: CSV file handed to ``load_and_standardize_data``.
        train: if True serve the training split, otherwise the test split.

    Attributes:
        x: tensor holding the selected split.
        len: number of rows in ``x``.
        standardizer: the scaler returned by ``load_and_standardize_data``.
    """
    def __init__(self, path, train=True):
        # Use the caller's `path`; reading the module-level DATA_PATH here
        # would silently ignore the argument.
        X_train, X_test, self.standardizer = load_and_standardize_data(path)
        self.x = torch.from_numpy(X_train if train else X_test)
        self.len = self.x.shape[0]

    def __getitem__(self, index):
        """Return the entry of the selected split at ``index``."""
        return self.x[index]

    def __len__(self):
        """Return the number of rows in the selected split."""
        return self.len

In [None]:
# Build the train and test datasets from the cleaned CSV.
traindata_set=DataBuilder(DATA_PATH, train=True)
testdata_set=DataBuilder(DATA_PATH, train=False)

# Wrap both datasets in loaders that yield batches of at most 1024 rows.
trainloader=DataLoader(dataset=traindata_set,batch_size=1024)
testloader=DataLoader(dataset=testdata_set,batch_size=1024)

In [None]:
type(trainloader.dataset.x), type(testloader.dataset.x)

(torch.Tensor, torch.Tensor)

In [None]:
trainloader.dataset.x.shape, testloader.dataset.x.shape

(torch.Size([384, 13]), torch.Size([165, 13]))

In [None]:
# NOTE(review): this cell is only a literal tuple and computes nothing from the
# data; it looks like an output pasted from an earlier run — consider removing it.
(torch.Size([124, 14]), torch.Size([54, 14]))

(torch.Size([124, 14]), torch.Size([54, 14]))

## Build Model

In [None]:
class Autoencoder(nn.Module):
    """Variational autoencoder made of fully connected layers with BatchNorm.

    The encoder maps ``D_in`` features through ``H`` and ``H2`` hidden units
    to two ``latent_dim``-sized vectors (mean and log-variance). The decoder
    mirrors that path back to ``D_in`` outputs.

    Args:
        D_in: number of input (and output) features.
        H: width of the first encoder / last decoder hidden layer.
        H2: width of the inner hidden layers.
        latent_dim: size of the latent vector.
    """
    def __init__(self,D_in,H=50,H2=12,latent_dim=3):
        super().__init__()

        # Encoder
        self.linear1=nn.Linear(D_in,H)
        self.lin_bn1 = nn.BatchNorm1d(num_features=H)
        self.linear2=nn.Linear(H,H2)
        self.lin_bn2 = nn.BatchNorm1d(num_features=H2)
        self.linear3=nn.Linear(H2,H2)
        self.lin_bn3 = nn.BatchNorm1d(num_features=H2)

        # Latent vectors: fc21 produces mu, fc22 produces the log-variance
        self.fc1 = nn.Linear(H2, latent_dim)
        self.bn1 = nn.BatchNorm1d(num_features=latent_dim)
        self.fc21 = nn.Linear(latent_dim, latent_dim)
        self.fc22 = nn.Linear(latent_dim, latent_dim)

        # Layers applied to the sampled latent vector
        self.fc3 = nn.Linear(latent_dim, latent_dim)
        self.fc_bn3 = nn.BatchNorm1d(latent_dim)
        self.fc4 = nn.Linear(latent_dim, H2)
        self.fc_bn4 = nn.BatchNorm1d(H2)

        # Decoder
        self.linear4=nn.Linear(H2,H2)
        self.lin_bn4 = nn.BatchNorm1d(num_features=H2)
        self.linear5=nn.Linear(H2,H)
        self.lin_bn5 = nn.BatchNorm1d(num_features=H)
        self.linear6=nn.Linear(H,D_in)
        self.lin_bn6 = nn.BatchNorm1d(num_features=D_in)

        self.relu = nn.ReLU()

    def encode(self, x):
        """Encode ``x`` and return the pair ``(mu, logvar)``."""
        lin1 = self.relu(self.lin_bn1(self.linear1(x)))
        lin2 = self.relu(self.lin_bn2(self.linear2(lin1)))
        lin3 = self.relu(self.lin_bn3(self.linear3(lin2)))

        fc1 = F.relu(self.bn1(self.fc1(lin3)))

        r1 = self.fc21(fc1)
        r2 = self.fc22(fc1)

        return r1, r2

    def reparameterize(self, mu, logvar):
        """Return a latent sample in training mode, or ``mu`` otherwise.

        In training mode the result is ``mu + eps * exp(logvar / 2)`` with
        ``eps`` drawn from a standard normal.
        """
        if self.training:
            std = torch.exp(0.5 * logvar)
            # randn_like replaces the deprecated Variable / .data.new() idiom
            # and creates the noise with std's dtype and device.
            eps = torch.randn_like(std)
            return mu + eps * std
        return mu

    def decode(self, z):
        """Map latent vectors ``z`` back to ``D_in`` outputs."""
        fc3 = self.relu(self.fc_bn3(self.fc3(z)))
        fc4 = self.relu(self.fc_bn4(self.fc4(fc3)))

        lin4 = self.relu(self.lin_bn4(self.linear4(fc4)))
        lin5 = self.relu(self.lin_bn5(self.linear5(lin4)))
        # The output layer is batch-normalized but has no activation.
        return self.lin_bn6(self.linear6(lin5))

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for the batch ``x``."""
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

In [None]:
class customLoss(nn.Module):
    """Sum of a squared-error reconstruction term and a KL-divergence term."""

    def __init__(self):
        super().__init__()
        # reduction="sum" adds up the squared errors over all elements.
        self.mse_loss = nn.MSELoss(reduction="sum")

    def forward(self, x_recon, x, mu, logvar):
        """Return the scalar loss for one batch.

        Args:
            x_recon: reconstructed batch.
            x: original batch.
            mu: latent means.
            logvar: latent log-variances.
        """
        reconstruction_term = self.mse_loss(x_recon, x)

        # KL term: -0.5 * sum(1 + logvar - mu^2 - exp(logvar))
        kl_elements = 1 + logvar - mu.pow(2) - logvar.exp()
        kl_term = -0.5 * torch.sum(kl_elements)

        return reconstruction_term + kl_term

In [None]:
# Define the compute device here: the model and every batch are moved with
# `.to(device)`, and no other cell defines `device`, so without this line a
# fresh kernel fails with NameError.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The input width is the number of columns of the training tensor.
D_in = trainloader.dataset.x.shape[1]
H = 50
H2 = 12
model = Autoencoder(D_in, H, H2).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
# Training criterion. Despite the name, customLoss returns the summed squared
# error plus the KL term, not the MSE alone.
loss_mse = customLoss()

## Train Model

In [None]:
# Number of passes over the training data and containers for the loss history.
# NOTE(review): `log_interval` and `val_losses` are not referenced by any other
# cell in this notebook; train()/test() log every 200 epochs regardless of
# `log_interval`.
epochs = 1500
log_interval = 50
val_losses = []
train_losses = []
test_losses = []

In [None]:
def train(epoch, log_every=200):
    """Run one optimization pass over ``trainloader``.

    Args:
        epoch: epoch number, used only to decide when to log.
        log_every: print the average loss and append it to ``train_losses``
            when ``epoch`` is a multiple of this value. The default of 200
            keeps the interval that used to be hard-coded.

    Returns:
        The summed loss of this pass divided by the number of training rows.
    """
    model.train()
    train_loss = 0
    for data in trainloader:
        data = data.to(device)
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        loss = loss_mse(recon_batch, data, mu, logvar)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()

    avg_loss = train_loss / len(trainloader.dataset)
    if epoch % log_every == 0:
        print('====> Epoch: {} Average training loss: {:.4f}'.format(
            epoch, avg_loss))
        # The history is only extended on logging epochs.
        train_losses.append(avg_loss)
    return avg_loss

In [None]:
def test(epoch, log_every=200):
    """Evaluate the model on ``testloader`` without updating any state.

    The model is switched to evaluation mode so BatchNorm layers use their
    running statistics (and do not update them from test data) and the
    latent vector is the mean rather than a random sample.

    Args:
        epoch: epoch number, used only to decide when to print.
        log_every: print the average loss when ``epoch`` is a multiple of
            this value. The default of 200 keeps the interval that used to
            be hard-coded.

    Returns:
        The summed loss over the test set divided by the number of test rows.
    """
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for data in testloader:
            data = data.to(device)
            recon_batch, mu, logvar = model(data)
            test_loss += loss_mse(recon_batch, data, mu, logvar).item()

    # Report and record once per epoch, after all batches have been summed.
    avg_loss = test_loss / len(testloader.dataset)
    if epoch % log_every == 0:
        print('====> Epoch: {} Average test loss: {:.4f}'.format(
            epoch, avg_loss))
    test_losses.append(avg_loss)
    return avg_loss

In [None]:
# Alternate one training pass and one evaluation pass per epoch (epochs count from 1).
for epoch in range(1, epochs + 1):
    train(epoch)
    test(epoch)

====> Epoch: 200 Average training loss: 15.9407
====> Epoch: 200 Average test loss: 17.8016
====> Epoch: 400 Average training loss: 12.6176
====> Epoch: 400 Average test loss: 13.3210
====> Epoch: 600 Average training loss: 11.3283
====> Epoch: 600 Average test loss: 12.4122
====> Epoch: 800 Average training loss: 10.5410
====> Epoch: 800 Average test loss: 12.0615
====> Epoch: 1000 Average training loss: 9.9562
====> Epoch: 1000 Average test loss: 11.0453
====> Epoch: 1200 Average training loss: 9.8385
====> Epoch: 1200 Average test loss: 10.8062
====> Epoch: 1400 Average training loss: 9.2621
====> Epoch: 1400 Average test loss: 10.4874


In [None]:
# Reconstruct the test data. Evaluation mode makes BatchNorm use its running
# statistics and makes the latent vector equal to mu instead of a random draw;
# the last train() call otherwise leaves the model in training mode.
model.eval()
with torch.no_grad():
    for batch_idx, data in enumerate(testloader):
        data = data.to(device)
        recon_batch, mu, logvar = model(data)
# recon_batch, mu and logvar now hold the results of the last batch.

In [None]:
# Undo the standardization, using the scaler fitted on the training split, for
# row 0 of the last reconstructed batch and row 0 of the test tensor (the same
# sample as long as the test set fits in a single batch).
# NOTE(review): both inputs are 1-D; confirm the installed scikit-learn accepts
# 1-D input to inverse_transform, otherwise reshape to (1, -1).
scaler = trainloader.dataset.standardizer
recon_row = scaler.inverse_transform(recon_batch[0].cpu().numpy())
real_row = scaler.inverse_transform(testloader.dataset.x[0].cpu().numpy())

In [None]:
# Side-by-side view: row 0 is the reconstruction, row 1 the original values.
# NOTE(review): this rebinds `df`, which earlier held the raw CSV data.
df = pd.DataFrame(np.stack((recon_row, real_row)), columns = cols)
df

Unnamed: 0,Age at CT,DEATH [d from CT],L1_HU_BMD,TAT Area (cm2),Total Body Area EA (cm2),VAT Area (cm2),SAT Area (cm2),VAT/SAT Ratio,Muscle HU,Muscle Area (cm2),L3 SMI (cm2/m2),AoCa Agatston,Liver HU (Median)
0,59.926189,2592.822754,164.614563,310.39682,605.156067,150.213226,181.656113,0.947205,27.276741,154.115799,51.387344,1514.487305,59.838516
1,50.0,2930.0,181.0,211.599991,442.700012,55.299995,156.300003,0.35,33.5,120.300003,47.0,8.9e-05,62.0


In [None]:
# Standard deviation from the log-variance: sigma = exp(logvar / 2).
sigma = torch.exp(logvar/2)

In [None]:
# Draw `no_samples` latent vectors from one Normal distribution whose mean and
# scale are the averages of mu and sigma over axis 0.
# NOTE(review): no random seed is set, so the samples differ between runs.
no_samples = 400
q = torch.distributions.Normal(mu.mean(axis=0), sigma.mean(axis=0))
z = q.rsample(sample_shape=torch.Size([no_samples]))

In [None]:
# Decode the sampled latent vectors. In evaluation mode the decoder's BatchNorm
# layers use their running statistics; in training mode they would normalize
# with the statistics of the sampled batch itself.
model.eval()
with torch.no_grad():
    pred = model.decode(z).cpu().numpy()

In [None]:
# Map the decoded samples back to the original units and show the array shape.
fake_data = scaler.inverse_transform(pred)
fake_data.shape

(400, 13)

In [None]:
# Label the synthetic rows with the column names of the cleaned data.
df_fake = pd.DataFrame(fake_data, columns = cols)

In [None]:
df_fake.head(10)

Unnamed: 0,Age at CT,DEATH [d from CT],L1_HU_BMD,TAT Area (cm2),Total Body Area EA (cm2),VAT Area (cm2),SAT Area (cm2),VAT/SAT Ratio,Muscle HU,Muscle Area (cm2),L3 SMI (cm2/m2),AoCa Agatston,Liver HU (Median)
0,65.196548,2516.894043,158.098267,796.908142,1229.049683,501.648468,312.72113,2.031366,11.879823,206.169464,68.523666,4519.041992,44.4202
1,72.960014,2390.637939,122.997078,349.223877,647.512695,183.032776,179.626282,1.169327,13.084172,130.681839,46.695396,2450.358398,52.684757
2,69.081055,1717.504272,146.523453,596.362732,922.082214,364.152344,246.299118,1.738507,13.295793,164.35704,53.725204,5343.875977,49.279922
3,62.556129,2805.172607,154.640213,380.483643,672.694275,183.638428,200.471115,1.041703,26.518488,157.24527,53.579075,1506.351929,57.784134
4,73.694305,2205.321045,117.404503,242.025192,515.926208,109.727127,145.774551,0.902599,12.56631,116.709084,43.046181,3398.472412,54.70892
5,61.703285,2762.822754,158.515854,363.083923,656.7005,164.100464,197.635254,0.980402,27.397604,155.723984,53.269917,1249.78064,58.443344
6,75.236465,2133.245605,114.668694,238.729111,515.514832,114.766457,129.795792,0.995354,11.665176,116.504745,42.952633,3417.105225,54.007248
7,72.883804,1931.361328,127.980095,454.724579,759.879089,260.809662,195.576263,1.557112,13.101714,146.596649,49.968761,4173.647949,48.603863
8,69.354202,2561.164551,134.685959,311.081238,586.609436,143.128693,171.185379,0.958821,19.624866,132.94278,46.619251,2427.425537,56.790905
9,65.280243,942.707214,135.228149,116.418282,379.224121,49.052773,62.400974,0.950647,15.91188,71.642044,29.938686,2461.429688,54.189777


In [None]:
df_base


Unnamed: 0,Age at CT,DEATH [d from CT],L1_HU_BMD,TAT Area (cm2),Total Body Area EA (cm2),VAT Area (cm2),SAT Area (cm2),VAT/SAT Ratio,Muscle HU,Muscle Area (cm2),L3 SMI (cm2/m2),AoCa Agatston,Liver HU (Median)
0,73,967.0,135.0,407.463977,721.279362,212.316698,199.007129,1.220394,18.200000,170.100000,58.700000,5473.6,35.0
1,55,359.0,192.0,485.500000,694.300000,183.500000,302.000000,0.610000,16.200000,124.000000,48.400000,2709.1,52.0
2,60,2351.0,149.0,289.100000,584.500000,144.000000,145.100000,0.990000,30.800000,212.300000,69.100000,2586.6,57.0
3,88,658.0,106.0,315.500000,588.900000,202.300000,113.200000,1.790000,-3.200000,168.900000,47.800000,431.5,53.0
4,82,1970.0,77.0,213.500000,452.900000,113.600000,99.900000,1.140000,1.500000,89.500000,30.900000,1600.0,49.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
544,49,337.0,259.0,757.400000,1318.100000,355.900000,401.600000,0.890000,21.200000,228.300000,76.500000,122.4,56.0
545,70,917.0,138.0,339.800000,585.400000,146.000000,193.800000,0.750000,18.200000,70.600000,28.500000,2688.1,55.0
546,63,757.0,118.0,457.400000,930.100000,148.200000,309.200000,0.480000,8.700000,99.100000,38.700000,0.0,43.0
547,42,105.0,271.0,407.463977,721.279362,212.316698,199.007129,1.220394,20.822527,150.863736,52.223938,149.4,55.0


In [None]:
# Append the synthetic rows below the real rows and renumber the index.
df2 = pd.concat([df_base, df_fake], ignore_index=True, sort=False)

In [None]:
df2.shape

(949, 13)

In [None]:
# Save the combined real + synthetic data without the index column.
df2.to_csv("Augmented CT Data",index = False)