In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from copulas.multivariate import GaussianMultivariate
from statsmodels.regression.linear_model import OLS
from stargazer.stargazer import Stargazer

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the labelled dataset from a CSV file in the working directory.
df = pd.read_csv('MOX2_5_data_labelled.csv')

In [None]:
# Columns used for the correlation heatmaps and the table-evaluator comparisons below.
final_cols = ['Sedentary', 'LPA', 'MPA', 'VPA', 'Steps', 'active']

In [None]:
# Number of rows loaded.
len(df)

In [None]:
# Preview the first rows before any imputation.
df.head()

In [None]:
# Fill missing values with each column's median. numeric_only=True keeps the median
# computable if the frame contains a non-numeric column (pandas >= 2.0 raises on those
# otherwise); such columns are simply left unfilled.
df = df.fillna(df.median(numeric_only=True))

In [None]:
# Preview again after the imputation step.
df.head()

In [None]:
# List every column present in the frame.
df.columns

In [None]:
# Pearson correlation heatmap of the selected columns in the real data,
# annotated with two-decimal coefficients on a grey colour map without a colour bar.
fig, ax = plt.subplots()
sns.heatmap(df[final_cols].corr(method='pearson'), annot=True, fmt='.2f', 
            cmap=plt.get_cmap('Greys'), cbar=False, linewidths=1, ax=ax)
# Re-apply the existing y tick labels with horizontal orientation.
ax.set_yticklabels(ax.get_yticklabels(), rotation="horizontal")
# Write the figure to disk at 300 dpi with a tight bounding box.
plt.savefig('result.png', bbox_inches='tight', pad_inches=0.0, dpi=300)

In [None]:
# Predictor columns, target column, and their concatenation (used to select the
# modelling columns from each frame).
X = ['Sedentary', 'LPA', 'MPA', 'VPA', 'Steps']
Y = ['active']
all_X = X+Y

In [None]:
#Synthetic data GP
# Fit a GaussianMultivariate copula model on the predictors and the label together.
# Note that s_df is the fitted model object, not a DataFrame.
s_df = GaussianMultivariate()
s_df.fit(df[all_X])

In [None]:
# Draw as many synthetic rows as there are real rows.
s_data = s_df.sample(len(df))
s_data.head()

In [None]:
# Row count of the synthetic sample.
len(s_data)

In [None]:
# Summary statistics of the real modelling columns (one row per column).
df[all_X].describe().transpose().round(2)

In [None]:
# The same summary for the Gaussian-copula sample, for side-by-side comparison.
s_data[all_X].describe().transpose().round(2)

In [None]:
# Covariance matrix of the real data (.iloc[:,:] selects everything, so it changes nothing).
df[all_X].cov().round(2).iloc[:,:]

In [None]:
# Covariance matrix of the synthetic sample.
s_data[all_X].cov().round(2).iloc[:,:]

In [None]:
# 3-D scatter of LPA / MPA / VPA: real data in the left panel, synthetic sample in the right.
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(1,2,1,projection='3d')
ax1.scatter(df.LPA, df.MPA, df.VPA)

ax2 = fig.add_subplot(1,2,2,projection='3d')
ax2.scatter(s_data.LPA, s_data.MPA, s_data.VPA)

In [None]:
# Regress 'active' on the five predictors for the real data (fit1) and the synthetic
# sample (fit2) and show both fits in one Stargazer table. At this point
# s_data['active'] still holds the copula-sampled values; it is overwritten with
# rule-based labels further down.
# NOTE(review): no constant column is added to the regressors, so these fits
# presumably have no intercept -- confirm that is intended.
fit1 = OLS(df[Y], df[X]).fit()
fit2 = OLS(s_data[Y], s_data[X]).fit()
Stargazer([fit1, fit2])

In [None]:
def calculate_lebel(df):
    """Assign an activity level (0-4) to one record from its steps and activity minutes.

    Despite its name, ``df`` is a single row (for example the Series that
    ``DataFrame.apply(..., axis=1)`` passes in) or any mapping that exposes the
    keys ``'Steps'``, ``'MPA'`` and ``'VPA'``.

    The activity score is ``(VPA*2 + MPA)*7``: VPA counts double and the sum is
    scaled by 7 (presumably a daily-to-weekly conversion -- confirm).

    Rules, evaluated in this order (the first match wins):

    * ``Steps < 5000``                                    -> 0
    * ``90 <= score < 210``  or ``5000 <= Steps < 7500``   -> 1
    * ``210 <= score < 300`` or ``7500 <= Steps < 10000``  -> 2
    * ``300 <= score < 360`` or ``10000 <= Steps < 12500`` -> 3
    * ``score >= 360``       or ``Steps >= 12500``         -> 4

    The step bands are half-open. Earlier they were closed integer ranges
    (``<= 7499``, ``<= 9999``, ``<= 12499``), so a fractional step count such as
    7499.5 -- which non-integer synthetic data can contain -- matched no rule
    and came back as ``None``. Integer step counts are classified exactly as before.

    Returns:
        int in ``0..4``; ``None`` only when ``Steps`` is NaN (no comparison holds).
    """
    steps = df['Steps']
    score = (df['VPA']*2 + df['MPA'])*7

    # Below 5000 steps every rule path ends in level 0, whatever the score is.
    if steps < 5000:
        return 0

    if steps >= 5000:  # False for NaN, which falls through to the final return
        if 90 <= score < 210 or steps < 7500:
            return 1
        if 210 <= score < 300 or steps < 10000:
            return 2
        if 300 <= score < 360 or steps < 12500:
            return 3
        # Only steps >= 12500 can reach this point, so the last rule always applies.
        return 4

    return None

In [None]:
# Overwrite the copula-sampled 'active' values with labels computed row by row
# by calculate_lebel.
s_data['active'] = s_data.apply(calculate_lebel, axis=1)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
 
# count plot on single categorical variable
# (frequency of each 'active' label in the real data)
sns.countplot(x ='active', data = df, palette = "Set2", saturation = 0.1) 
plt.savefig('result_1.png', bbox_inches='tight', pad_inches=1.0, dpi=300)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
 
# count plot on single categorical variable
# (frequency of each 'active' label in the relabelled Gaussian-copula sample)
sns.countplot(x ='active', data = s_data, palette = "Set2", saturation = 0.1) 
plt.savefig('result_2.png', bbox_inches='tight', pad_inches=1.0, dpi=300)

In [None]:
# Pearson correlation heatmap of the Gaussian-copula sample (after relabelling).
fig, ax = plt.subplots()
sns.heatmap(s_data[final_cols].corr(method='pearson'), annot=True, fmt='.2f', 
            cmap=plt.get_cmap('Greys'), cbar=False, linewidths=1, ax=ax)
ax.set_yticklabels(ax.get_yticklabels(), rotation="horizontal")
# Saved under its own name: this cell used to write 'result.png', the same path as
# the real-data heatmap, so one figure silently replaced the other.
plt.savefig('result_GC_corr.png', bbox_inches='tight', pad_inches=0.0, dpi=300)

In [None]:
# Summary statistics of the copula sample again, now that 'active' holds rule-based labels.
s_data[all_X].describe().transpose().round(2)

In [None]:
# Repeat the real-vs-synthetic regression with the relabelled synthetic target.
fit1 = OLS(df[Y], df[X]).fit()
fit2 = OLS(s_data[Y], s_data[X]).fit()
Stargazer([fit1, fit2])

In [None]:
# Export the Gaussian-copula sample (the DataFrame index is written as the first column).
s_data.to_csv('augmented_data_GC.csv')

In [None]:
#CTGAN
import pandas as pd
from ctgan.synthesizer import CTGANSynthesizer
from sdv.tabular import CTGAN

In [None]:
# Work on a copy of the imputed real data for the CTGAN experiments.
data = df.copy()
data.columns.values

In [None]:
data.head()

In [None]:
# (shape tuple, row count, column count)
data.shape, data.shape[0], data.shape[1]

In [None]:
# Column list handed to CTGANSynthesizer.fit below.
all_ = ['ID', 'Sedentary', 'LPA', 'MPA', 'VPA', 'Steps', 'active']

In [None]:
# Train the stand-alone CTGAN synthesizer for 200 epochs.
# NOTE(review): the second positional argument of CTGANSynthesizer.fit is
# presumably `discrete_columns`; passing every column would treat the continuous
# measurements as categorical -- confirm against the installed ctgan version.
ctgan = CTGANSynthesizer()
ctgan.fit(data, all_, epochs=200)

In [None]:
# Synthetic copy
# Sample as many rows as the real data has.
samples = ctgan.sample(data.shape[0])
samples.head()

In [None]:
# Row count of the CTGANSynthesizer sample.
len(samples)

In [None]:
#Create new samples
# Separate copy of the real data used to train the SDV CTGAN model.
data_aug = df.copy()

In [None]:
# SDV's tabular CTGAN model, configured with 'ID' as the primary key.
model = CTGAN(primary_key='ID')

In [None]:
model.fit(data_aug)

In [None]:
# Sample as many synthetic rows as there are real rows.
augmented_data = model.sample(data.shape[0])
augmented_data.head()

In [None]:
len(augmented_data)

In [None]:
# Keep only the analysis columns listed in final_cols.
augmented_data = augmented_data[final_cols]

In [None]:
# Replace the sampled 'active' values with labels computed by calculate_lebel.
augmented_data['active'] = augmented_data.apply(calculate_lebel, axis=1)

In [None]:
# Pearson correlation heatmap of the CTGAN sample (after relabelling).
fig, ax = plt.subplots()
sns.heatmap(augmented_data[final_cols].corr(method='pearson'), annot=True, fmt='.2f', 
            cmap=plt.get_cmap('Greys'), cbar=False, linewidths=1, ax=ax)
ax.set_yticklabels(ax.get_yticklabels(), rotation="horizontal")
plt.savefig('result_3.png', bbox_inches='tight', pad_inches=0.0, dpi=300)

In [None]:
# count plot on single categorical variable
# (frequency of each 'active' label in the CTGAN sample)
sns.countplot(x ='active', data = augmented_data, palette = "Set2", saturation = 0.1) 
plt.savefig('result_4.png', bbox_inches='tight', pad_inches=1.0, dpi=300)

In [None]:
# Export the CTGAN sample (the DataFrame index is written as the first column).
augmented_data.to_csv('augmented_data_CTGAN.csv')

In [None]:
# Summary statistics of the real modelling columns.
data[all_X].describe().transpose().round(2)

In [None]:
# The same summary for the CTGAN sample.
augmented_data[all_X].describe().transpose().round(2)

In [None]:
# Covariance matrix of the real data (.iloc[:,:] selects everything, so it changes nothing).
data[all_X].cov().round(2).iloc[:,:]

In [None]:
# Covariance matrix of the CTGAN sample.
augmented_data[all_X].cov().round(2).iloc[:,:]

In [None]:
# 3-D scatter of LPA / MPA / VPA: real data in the left panel, CTGAN sample in the right.
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(1,2,1,projection='3d')
ax1.scatter(data.LPA, data.MPA, data.VPA)

ax2 = fig.add_subplot(1,2,2,projection='3d')
ax2.scatter(augmented_data.LPA, augmented_data.MPA, augmented_data.VPA)

In [None]:
# Real (fit1) versus CTGAN (fit2) regression of 'active' on the predictors.
fit1 = OLS(data[Y], data[X]).fit()
fit2 = OLS(augmented_data[Y], augmented_data[X]).fit()
Stargazer([fit1, fit2])

In [None]:
#TABGAN
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense
from numpy.random import randn
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [None]:
# Copy of the imputed real data for the hand-written GAN section.
data2 = df.copy()
data2.head()

In [None]:
# (rows, columns); the column count is used below to size the networks.
data2.shape

In [None]:
# Column names, reused later to label the generator's output.
data2.columns

In [None]:
#Generate Synthetic Data
def generate_latent_points(latent_dim, n_samples):
    """Draw standard-normal noise vectors to feed the generator.

    Returns an array of shape ``(n_samples, latent_dim)``.
    """
    # Sampling straight into the 2-D shape consumes the random stream in the same
    # order as drawing one flat vector and reshaping it row by row.
    return randn(n_samples, latent_dim)

In [None]:
# use the generator to generate n fake examples, with class labels
def generate_fake_samples(generator, latent_dim, n_samples):
    """Produce a batch of generator outputs together with their 'fake' targets.

    Args:
        generator: object whose ``predict`` maps latent points to samples.
        latent_dim: width of each latent vector.
        n_samples: number of samples to generate.

    Returns:
        ``(X, y)`` where ``X`` is ``generator.predict`` applied to fresh latent
        points and ``y`` is an ``(n_samples, 1)`` array of zeros.
    """
    noise = generate_latent_points(latent_dim, n_samples)
    fake_labels = np.zeros((n_samples, 1))
    return generator.predict(noise), fake_labels

In [None]:
# generate n real samples with class labels; We randomly select n samples from the real data
#The label for the real data sample is 1
def generate_real_samples(n, real_data=None):
    """Randomly pick ``n`` real rows and pair them with 'real' targets.

    Args:
        n: number of rows to sample.
        real_data: DataFrame to sample from. Defaults to ``data2``, the frame
            this GAN section defines and sizes its networks from; the function
            previously reached for ``data``, a global owned by the CTGAN section.

    Returns:
        ``(X, y)`` where ``X`` is a DataFrame of ``n`` sampled rows and ``y`` is
        an ``(n, 1)`` array of ones.
    """
    if real_data is None:
        real_data = data2
    X = real_data.sample(n)
    y = np.ones((n, 1))
    return X, y

In [None]:
def define_generator(latent_dim, n_outputs=7):
    """Build the (uncompiled) generator network.

    Two ReLU hidden layers of 15 and 30 units map a ``latent_dim``-wide input
    to ``n_outputs`` linear outputs.
    """
    layers = [
        Dense(15, activation='relu',  kernel_initializer='he_uniform', input_dim=latent_dim),
        Dense(30, activation='relu'),
        Dense(n_outputs, activation='linear'),
    ]
    return Sequential(layers)

In [None]:
# Build a generator with a 10-wide latent input and one output per data2 column,
# then print its layer summary.
generator1 = define_generator(10, data2.shape[1])
generator1.summary()

In [None]:
def define_discriminator(n_inputs=7):
    """Build and compile the discriminator network.

    Two ReLU hidden layers of 25 and 50 units feed a single sigmoid output.
    The model is compiled with binary cross-entropy, the Adam optimizer and
    an accuracy metric.
    """
    classifier = Sequential([
        Dense(25, activation='relu', kernel_initializer='he_uniform', input_dim=n_inputs),
        Dense(50, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return classifier

In [None]:
# Build a discriminator with one input per data2 column and print its layer summary.
discriminator1 = define_discriminator(data2.shape[1])
discriminator1.summary()

In [None]:
# define the combined generator and discriminator model, for updating the generator
def define_gan(generator, discriminator):
    """Stack generator and discriminator into one compiled model.

    The returned Sequential model feeds the generator's output into the
    discriminator and is compiled with binary cross-entropy and Adam.

    Note: this sets ``discriminator.trainable = False`` on the object passed in.
    The flag is flipped before the combined model is compiled, presumably so
    that training the stacked model only updates the generator -- keep this
    order when editing.
    """
    # make weights in the discriminator not trainable
    discriminator.trainable = False
    model = Sequential()
    # add generator
    model.add(generator)
    # add the discriminator
    model.add(discriminator)
    # compile model
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

In [None]:
# create a line plot of the discriminator and generator loss histories and display it
def plot_history(d_hist, g_hist):
    """Display the discriminator and generator loss curves in one figure.

    Args:
        d_hist: sequence of discriminator losses, one per recorded iteration.
        g_hist: sequence of generator losses, one per recorded iteration.

    The figure is shown and then closed; nothing is written to disk.
    """
    plt.subplot(1, 1, 1)
    plt.plot(d_hist, label='d')
    plt.plot(g_hist, label='gen')
    plt.xlabel('iteration')
    plt.ylabel('loss')
    # Without a legend the label= arguments above are never rendered.
    plt.legend()
    plt.show()
    plt.close()

In [None]:
def train(g_model, d_model, gan_model, latent_dim, n_epochs=10000, n_batch=128, n_eval=200):
    """Alternately train the discriminator and the generator.

    Each iteration trains the discriminator on half a batch of real rows and
    half a batch of generated rows, then trains the generator through
    ``gan_model`` on a full batch of latent points labelled as real.

    Args:
        g_model: generator; used to produce fake samples and saved to disk.
        d_model: compiled discriminator (its ``train_on_batch`` must return
            loss and accuracy).
        gan_model: combined model returned by ``define_gan``.
        latent_dim: width of the latent vectors.
        n_epochs: number of training iterations (one batch each).
        n_batch: batch size for the generator update; the discriminator sees
            ``n_batch // 2`` real and ``n_batch // 2`` fake rows.
        n_eval: plot the loss history and save the generator every ``n_eval``
            iterations, and always after the last one. This parameter used to
            be ignored, so a figure was drawn and the model file rewritten on
            every single iteration.

    Side effects:
        Prints one loss line per iteration and writes
        'trained_generated_model.h5' at each checkpoint.
    """
    # determine half the size of one batch, for updating the  discriminator
    half_batch = int(n_batch / 2)
    d_history = []
    g_history = []
    # manually enumerate epochs
    for epoch in range(n_epochs):
        # prepare real samples
        x_real, y_real = generate_real_samples(half_batch)
        # prepare fake examples
        x_fake, y_fake = generate_fake_samples(g_model, latent_dim, half_batch)
        # update discriminator (the accuracy values are not used)
        d_loss_real, _ = d_model.train_on_batch(x_real, y_real)
        d_loss_fake, _ = d_model.train_on_batch(x_fake, y_fake)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
        # prepare points in latent space as input for the generator
        x_gan = generate_latent_points(latent_dim, n_batch)
        # create inverted labels for the fake samples
        y_gan = np.ones((n_batch, 1))
        # update the generator via the discriminator's error
        g_loss_fake = gan_model.train_on_batch(x_gan, y_gan)
        print('>%d, d1=%.3f, d2=%.3f d=%.3f g=%.3f' % (epoch+1, d_loss_real, d_loss_fake, d_loss,  g_loss_fake))
        d_history.append(d_loss)
        g_history.append(g_loss_fake)
        # Checkpoint periodically and on the final iteration, so the saved file
        # always reflects the fully trained generator. A falsy n_eval disables
        # the periodic checkpoints (and avoids a modulo by zero).
        is_last = epoch + 1 == n_epochs
        if is_last or (n_eval and (epoch + 1) % n_eval == 0):
            plot_history(d_history, g_history)
            g_model.save('trained_generated_model.h5')

In [None]:
# size of the latent space
# size of the latent space
latent_dim = 10
# Size both networks from the training frame instead of relying on the default
# width of 7: the generator's output is later wrapped in a DataFrame with
# data2.columns, so the widths have to match whatever the CSV contains.
# create the discriminator
discriminator = define_discriminator(data2.shape[1])
# create the generator
generator = define_generator(latent_dim, data2.shape[1])
# create the gan
gan_model = define_gan(generator, discriminator)
# train model
train(generator, discriminator, gan_model, latent_dim)

In [None]:
#Evaluate the Quality of Generated Fake Data With Model
from keras.models import load_model
# Reload the generator that train() saved to disk.
# Note: this rebinds `model`, which earlier held the SDV CTGAN model.
model =load_model('trained_generated_model.h5')

In [None]:
# Generate one synthetic row per real row and label the output with data2's column names.
latent_points = generate_latent_points(latent_dim, data2.shape[0])
XX = model.predict(latent_points)
data_fake = pd.DataFrame(data=XX,  columns=data2.columns)
data_fake.head()

In [None]:
# Keep only the analysis columns listed in final_cols.
data_fake = data_fake[final_cols]

In [None]:
# Replace the generated 'active' values with labels computed by calculate_lebel.
data_fake['active'] = data_fake.apply(calculate_lebel, axis=1)

In [None]:
# count plot on single categorical variable
# (frequency of each 'active' label in the GAN-generated data)
sns.countplot(x ='active', data = data_fake, palette = "Set2", saturation = 0.1) 
# Saved under its own name: this cell used to write 'result_1.png', which is also
# the file of the real-data count plot, so it silently replaced that figure.
plt.savefig('result_TABGAN_count.png', bbox_inches='tight', pad_inches=1.0, dpi=300)

In [None]:
# Pearson correlation heatmap of the GAN-generated data (after relabelling).
fig, ax = plt.subplots()
sns.heatmap(data_fake[final_cols].corr(method='pearson'), annot=True, fmt='.2f', 
            cmap=plt.get_cmap('Greys'), cbar=False, linewidths=1, ax=ax)
ax.set_yticklabels(ax.get_yticklabels(), rotation="horizontal")
# Saved under its own name: this cell used to write 'result.png', the same path as
# the real-data heatmap, so one figure silently replaced the other.
plt.savefig('result_TABGAN_corr.png', bbox_inches='tight', pad_inches=0.0, dpi=300)

In [None]:
# Export the GAN-generated data (the DataFrame index is written as the first column).
data_fake.to_csv("augmented_data_TBGAN.csv")

In [None]:
# Summary statistics of the GAN-generated modelling columns.
data_fake[all_X].describe().transpose().round(2)

In [None]:
# Covariance matrix of the GAN-generated data (.iloc[:,:] changes nothing).
data_fake[all_X].cov().round(2).iloc[:,:]

In [None]:
# 3-D scatter of LPA / MPA / VPA: real data in the left panel, GAN-generated data in the right.
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(1,2,1,projection='3d')
ax1.scatter(data.LPA, data.MPA, data.VPA)

ax2 = fig.add_subplot(1,2,2,projection='3d')
ax2.scatter(data_fake.LPA, data_fake.MPA, data_fake.VPA)

In [None]:
# Real (fit1) versus GAN-generated (fit2) regression of 'active' on the predictors.
fit1 = OLS(data[Y], data[X]).fit()
fit2 = OLS(data_fake[Y], data_fake[X]).fit()
Stargazer([fit1, fit2])

In [None]:
# All four regressions side by side: real data and the three synthetic datasets.
fit1 = OLS(data[Y], data[X]).fit()
fit2 = OLS(s_data[Y], s_data[X]).fit() #GC
fit3 = OLS(augmented_data[Y], augmented_data[X]).fit() #CTGAN
fit4 = OLS(data_fake[Y], data_fake[X]).fit() #TABGAN
Stargazer([fit1, fit2, fit3, fit4])

In [None]:
from table_evaluator import load_data, TableEvaluator

# Compare the real data with the CTGAN sample, using 'active' as the target column.
table_evaluator = TableEvaluator(data[final_cols], augmented_data[final_cols])
table_evaluator.evaluate(target_col='active')

In [None]:
# Plots for the real-vs-CTGAN comparison.
table_evaluator.visual_evaluation()

In [None]:
from table_evaluator import load_data, TableEvaluator

# Compare the real data with the Gaussian-copula sample (rebinds table_evaluator).
table_evaluator = TableEvaluator(data[final_cols], s_data[final_cols])
table_evaluator.evaluate(target_col='active')

In [None]:
# Plots for the real-vs-Gaussian-copula comparison.
table_evaluator.visual_evaluation()

In [None]:
# Baseline: random forest trained and tested on a 70/30 split of the real data.
X1 = data[X]
y1 = data[Y]

X_true_train, X_true_test, y_true_train, y_true_test = train_test_split(X1, y1, test_size=0.30, random_state=42)

# random_state pins the forest's own randomness so reruns give the same scores;
# ravel() hands scikit-learn a 1-D target instead of a one-column frame.
clf_true = RandomForestClassifier(n_estimators=100, random_state=42)
clf_true.fit(X_true_train, y_true_train.values.ravel())
y_true_pred=clf_true.predict(X_true_test)

print("Base Accuracy:",metrics.accuracy_score(y_true_test, y_true_pred))
print("Base classification report:",metrics.classification_report(y_true_test, y_true_pred))

In [None]:
# Random forest trained and tested on a 70/30 split of the GAN-generated data.
X_fake_created = data_fake[X]
y_fake_created = data_fake[Y]


X_fake_train, X_fake_test, y_fake_train, y_fake_test = train_test_split(X_fake_created, y_fake_created, test_size=0.30, random_state=42)

# random_state pins the forest's own randomness so reruns give the same scores;
# ravel() hands scikit-learn a 1-D target instead of a one-column frame.
clf_fake = RandomForestClassifier(n_estimators=100, random_state=42)
clf_fake.fit(X_fake_train, y_fake_train.values.ravel())
y_fake_pred=clf_fake.predict(X_fake_test)

# The labels name the generator: three cells used to print identical text, so
# their outputs could not be told apart.
print("Accuracy of fake data model (TABGAN):",metrics.accuracy_score(y_fake_test, y_fake_pred))
print("Classification report of fake data model (TABGAN):",metrics.classification_report(y_fake_test, y_fake_pred))

In [None]:
# Random forest trained and tested on a 70/30 split of the CTGAN sample.
X_fake_created = augmented_data[X]
y_fake_created = augmented_data[Y]


X_fake_train, X_fake_test, y_fake_train, y_fake_test = train_test_split(X_fake_created, y_fake_created, test_size=0.30, random_state=42)

# random_state pins the forest's own randomness so reruns give the same scores;
# ravel() hands scikit-learn a 1-D target instead of a one-column frame.
clf_fake = RandomForestClassifier(n_estimators=100, random_state=42)
clf_fake.fit(X_fake_train, y_fake_train.values.ravel())
y_fake_pred=clf_fake.predict(X_fake_test)

# The labels name the generator: three cells used to print identical text, so
# their outputs could not be told apart.
print("Accuracy of fake data model (CTGAN):",metrics.accuracy_score(y_fake_test, y_fake_pred))
print("Classification report of fake data model (CTGAN):",metrics.classification_report(y_fake_test, y_fake_pred))

In [None]:
# Random forest trained and tested on a 70/30 split of the Gaussian-copula sample.
X_fake_created = s_data[X]
y_fake_created = s_data[Y]


X_fake_train, X_fake_test, y_fake_train, y_fake_test = train_test_split(X_fake_created, y_fake_created, test_size=0.30, random_state=42)

# random_state pins the forest's own randomness so reruns give the same scores;
# ravel() hands scikit-learn a 1-D target instead of a one-column frame.
clf_fake = RandomForestClassifier(n_estimators=100, random_state=42)
clf_fake.fit(X_fake_train, y_fake_train.values.ravel())
y_fake_pred=clf_fake.predict(X_fake_test)

# The labels name the generator: three cells used to print identical text, so
# their outputs could not be told apart.
print("Accuracy of fake data model (Gaussian copula):",metrics.accuracy_score(y_fake_test, y_fake_pred))
print("Classification report of fake data model (Gaussian copula):",metrics.classification_report(y_fake_test, y_fake_pred))