In [1]:
import sys
import os

sys.path.append('../src/')

from read_data import DataObject
from constants import spend_categories
from visualizations import KDEPlot
from dataframe_utilities import get_filter_index, subgroup_sum, upsample

In [2]:
# Reference dataset (source "OVS", version "1619").
OVS1619 = DataObject(source="OVS", version="1619")

# One synthetic dataset per generative model; only the `model` argument differs.
TVAE, TGAN, CTGAN = (
    DataObject(source="Synthetic", version="?", model=model_name)
    for model_name in ("TVAE", "TGAN", "CTGAN")
)

In [3]:
# Print the `size` attribute of every loaded dataset, reference data first.
for loaded_object in (OVS1619, TVAE, TGAN, CTGAN):
    print(loaded_object.size)

(88731, 293)
(9905, 293)
(9905, 293)
(9905, 293)


In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [5]:
import pandas as pd

# Recode tables: code -> display label.

# b10 (printed below as "Length of Stay"): codes 0..7 get an explicit day count,
# with the singular form for exactly one day.
# All other codes: greater than or equal to 8 days.
recode_b10 = {
    days: ('1 day' if days == 1 else f'{days} days')
    for days in range(8)
}

# a4: the codes that keep their own nationality label.
# All other codes: other nationalities.
recode_a4 = dict([
    (3, 'American'),
    (25, 'Chinese'),
    (38, 'Emirati'),
    (39, 'English'),
    (57, 'Indian'),
    (65, 'Japanese'),
    (79, 'Malaysian'),
    (115, 'South Korean'),
])

def recode(input_df, colname, recode_dict, fill_label = ""):
    """Return a copy of ``input_df`` with one column mapped through ``recode_dict``.

    Each value in ``input_df[colname]`` that is a key of ``recode_dict`` is
    replaced by the corresponding dictionary value; every other value is
    replaced by ``fill_label``. ``input_df`` itself is not modified.

    Parameters
    ----------
    input_df : pd.DataFrame
        Frame holding the column to recode.
    colname : str
        Name of the column to recode; must exist in ``input_df``.
    recode_dict : dict
        Mapping from original values to their replacement labels.
    fill_label : str, optional
        Replacement for values that are not keys of ``recode_dict``. It must
        not coincide with any value of ``recode_dict``.

    Returns
    -------
    pd.DataFrame
        A recoded copy of ``input_df``.

    Raises
    ------
    AssertionError
        If one of the arguments fails the checks below.
    Exception
        Any error raised while recoding is reported on stdout with a timestamp
        and then re-raised, so a caller never receives ``None`` in place of a
        dataframe.
    """
    assert isinstance(input_df, pd.DataFrame), "Require input_df to be a pandas dataframe."
    assert colname in input_df.columns, "The column you wish to recode does not exist in input_df."
    assert isinstance(recode_dict, dict), "Require recode_dict to be specified as a dictionary."
    assert fill_label not in list(recode_dict.values()), f"'{fill_label}' is already present in the recode_dict. Please specify an unused filler."

    try:
        output_df = input_df.copy()
        # One pass over the column and one positional assignment: avoids a
        # per-row .loc write (slow on large frames) and does not depend on the
        # index labels being unique.
        output_df[colname] = [recode_dict.get(item, fill_label)
                              for item in output_df[colname]]
        return output_df

    except Exception as e:
        # Imported here so the error report works regardless of which names
        # the surrounding notebook has imported.
        from datetime import datetime

        print("Failed to recode dataframe. Terminate.")
        current_dateTime = str(datetime.now())[0:19]
        print(current_dateTime + ': ' + str(e))
        raise

In [6]:
# (column, code -> label table, label given to every code missing from the table)
recode_specs = (
    ('b10', recode_b10, 'Greater than or equal to 8 days'),
    ('a4', recode_a4, 'Other Nationalities'),
)

for data_object in (OVS1619, TVAE, TGAN, CTGAN):
    for colname, mapping, filler in recode_specs:
        final_labels = [*mapping.values(), filler]
        # Keep the cell safe to re-run: if the column already holds only final
        # labels, a second recode would find no raw codes and overwrite every
        # label with the filler.
        if data_object.data[colname].isin(final_labels).all():
            continue
        data_object.data = recode(data_object.data, colname, mapping, filler)

In [7]:
# Show the first ten rows of a handful of columns, including the recoded b10 and a4.
preview_columns = ['month', 'Mode of Transport (m1)', 'b10', 'a4', 'a2.r.10l']
OVS1619.data[preview_columns].head(10)

Unnamed: 0,month,Mode of Transport (m1),b10,a4,a2.r.10l
0,1,1,6 days,Indian,4
1,1,1,Greater than or equal to 8 days,Indian,3
2,1,1,7 days,Indian,5
3,1,1,4 days,Chinese,6
4,1,1,6 days,American,10
5,1,1,3 days,Other Nationalities,5
6,1,1,Greater than or equal to 8 days,Other Nationalities,10
7,1,1,Greater than or equal to 8 days,Other Nationalities,5
8,1,1,2 days,Other Nationalities,4
9,1,1,3 days,Japanese,3


In [8]:
columns = spend_categories

# A subgroup is plotted only when every dataset has strictly more rows than this.
MIN_SUBGROUP_ROWS = 30

# `labels` and `data_objects` are parallel: the i-th label names the i-th dataset.
labels = ['OVS1619', 'TVAE', 'TGAN', 'CTGAN']
data_objects = (OVS1619, TVAE, TGAN, CTGAN)

# Subgroup levels are taken from the reference data and do not change while
# looping, so they are computed once.
transport_modes = OVS1619.data['Mode of Transport (m1)'].unique()
age_groups = OVS1619.data['a2.r.10l'].unique()
lengths_of_stay = OVS1619.data['b10'].unique()

subgroup_number = 1
for mode in transport_modes:
    for age_group in age_groups:
        for length_of_stay in lengths_of_stay:
            print(f"SUBGROUP #{subgroup_number}:\n\
            Mode of Transport == {mode}\n\
            Age Group == {age_group}\n\
            Length of Stay == {length_of_stay}")
            subgroup_number += 1

            value_filters = {'Mode of Transport (m1)': mode,
                             'a2.r.10l': age_group,
                             'b10': length_of_stay}

            # Filter every dataset down to the current subgroup. Building the
            # tuple from `data_objects` keeps it aligned with `labels`, so each
            # curve is drawn from the dataset its label names.
            input_dfs = tuple(
                data_object.data.iloc[get_filter_index(data_object.data, value_filters)]
                for data_object in data_objects
            )

            if all(subset.shape[0] > MIN_SUBGROUP_ROWS for subset in input_dfs):
                KDEPlot(input_dfs, columns, labels,
                        figname = f"Mode{mode}_Age{age_group}_LOS{length_of_stay}",
                        save = True)

SUBGROUP #1:
            Mode of Transport == 1
            Age Group == 4
            Length of Stay == 6 days
SUBGROUP #2:
            Mode of Transport == 1
            Age Group == 4
            Length of Stay == Greater than or equal to 8 days
SUBGROUP #3:
            Mode of Transport == 1
            Age Group == 4
            Length of Stay == 7 days
SUBGROUP #4:
            Mode of Transport == 1
            Age Group == 4
            Length of Stay == 4 days
SUBGROUP #5:
            Mode of Transport == 1
            Age Group == 4
            Length of Stay == 3 days
SUBGROUP #6:
            Mode of Transport == 1
            Age Group == 4
            Length of Stay == 2 days
SUBGROUP #7:
            Mode of Transport == 1
            Age Group == 4
            Length of Stay == 0 days
SUBGROUP #8:
            Mode of Transport == 1
            Age Group == 4
            Length of Stay == 5 days
SUBGROUP #9:
            Mode of Transport == 1
            Age Group == 4
    

SUBGROUP #73:
            Mode of Transport == 1
            Age Group == 8
            Length of Stay == 6 days
SUBGROUP #74:
            Mode of Transport == 1
            Age Group == 8
            Length of Stay == Greater than or equal to 8 days
SUBGROUP #75:
            Mode of Transport == 1
            Age Group == 8
            Length of Stay == 7 days
SUBGROUP #76:
            Mode of Transport == 1
            Age Group == 8
            Length of Stay == 4 days
SUBGROUP #77:
            Mode of Transport == 1
            Age Group == 8
            Length of Stay == 3 days
SUBGROUP #78:
            Mode of Transport == 1
            Age Group == 8
            Length of Stay == 2 days
SUBGROUP #79:
            Mode of Transport == 1
            Age Group == 8
            Length of Stay == 0 days
SUBGROUP #80:
            Mode of Transport == 1
            Age Group == 8
            Length of Stay == 5 days
SUBGROUP #81:
            Mode of Transport == 1
            Age Group 

SUBGROUP #146:
            Mode of Transport == 3
            Age Group == 9
            Length of Stay == Greater than or equal to 8 days
SUBGROUP #147:
            Mode of Transport == 3
            Age Group == 9
            Length of Stay == 7 days
SUBGROUP #148:
            Mode of Transport == 3
            Age Group == 9
            Length of Stay == 4 days
SUBGROUP #149:
            Mode of Transport == 3
            Age Group == 9
            Length of Stay == 3 days
SUBGROUP #150:
            Mode of Transport == 3
            Age Group == 9
            Length of Stay == 2 days
SUBGROUP #151:
            Mode of Transport == 3
            Age Group == 9
            Length of Stay == 0 days
SUBGROUP #152:
            Mode of Transport == 3
            Age Group == 9
            Length of Stay == 5 days
SUBGROUP #153:
            Mode of Transport == 3
            Age Group == 9
            Length of Stay == 1 day
SUBGROUP #154:
            Mode of Transport == 3
            Ag

SUBGROUP #220:
            Mode of Transport == 2
            Age Group == 10
            Length of Stay == 4 days
SUBGROUP #221:
            Mode of Transport == 2
            Age Group == 10
            Length of Stay == 3 days
SUBGROUP #222:
            Mode of Transport == 2
            Age Group == 10
            Length of Stay == 2 days
SUBGROUP #223:
            Mode of Transport == 2
            Age Group == 10
            Length of Stay == 0 days
SUBGROUP #224:
            Mode of Transport == 2
            Age Group == 10
            Length of Stay == 5 days
SUBGROUP #225:
            Mode of Transport == 2
            Age Group == 10
            Length of Stay == 1 day
SUBGROUP #226:
            Mode of Transport == 2
            Age Group == 1
            Length of Stay == 6 days
SUBGROUP #227:
            Mode of Transport == 2
            Age Group == 1
            Length of Stay == Greater than or equal to 8 days
SUBGROUP #228:
            Mode of Transport == 2
        

In [8]:
import pandas as pd

# Pad each synthetic dataset with upsampled rows until it has as many rows as
# the reference data, then renumber the index from 0.
target_rows = OVS1619.size[0]

padded_frames = []
for synthetic in (TVAE, TGAN, CTGAN):
    rows_missing = target_rows - synthetic.size[0]
    padded_frames.append(
        pd.concat([synthetic.data, upsample(synthetic.data, nrows = rows_missing)],
                  ignore_index = True)
    )
resampled_TVAE, resampled_TGAN, resampled_CTGAN = padded_frames

# Reference size first, then the three padded shapes for comparison.
print(OVS1619.size)
for padded in padded_frames:
    print(padded.shape)

(88731, 293)
(88731, 293)
(88731, 293)
(88731, 293)


In [None]:
df1 = OVS1619.data
df2 = resampled_TVAE
df3 = resampled_TGAN
df4 = resampled_CTGAN

# (row label, frame) pairs in the order the rows are printed.
summary_frames = (('Target', df1), ('TVAE', df2), ('TGAN', df3), ('CTGAN', df4))

# Left margin shared by every printed table line.
ROW_INDENT = " " * 12
# Rule printed between the target row and the synthetic rows.
ROW_RULE = "---------------------------------------------------------------------------------"


def _subgroup_row(label, frame, value_filters):
    """Format one table row: padded label, then the rounded subgroup sums.

    The sums cover the first five entries of ``spend_categories`` and are
    separated by tabs; each one is ``subgroup_sum(frame, category,
    value_filters)`` rounded to two decimals.
    """
    totals = "\t".join(
        f"{np.round(subgroup_sum(frame, category, value_filters),2)}"
        for category in spend_categories[:5]
    )
    return f"{ROW_INDENT}{label:<6} |\t{totals}"


# Subgroup levels are taken from the reference data and do not change while
# looping, so they are computed once.
transport_modes = OVS1619.data['Mode of Transport (m1)'].unique()
age_groups = OVS1619.data['a2.r.10l'].unique()
lengths_of_stay = OVS1619.data['b10'].unique()

n = 1
for PERM1 in transport_modes:
    for PERM2 in age_groups:
        for PERM4 in lengths_of_stay:
            print(f"SUBGROUP #{n}:\n\
            Mode of Transport == {PERM1}\n\
            Age Group == {PERM2}\n\
            Length of Stay == {PERM4}")
            n += 1

            value_filters = {'Mode of Transport (m1)': PERM1,
                             'a2.r.10l': PERM2,
                             'b10': PERM4}

            # get subgroup sums: target row, rule, then one row per synthetic model
            rows = [_subgroup_row(label, frame, value_filters)
                    for label, frame in summary_frames]
            table_lines = [rows[0], ROW_INDENT + ROW_RULE, *rows[1:]]
            # Leading and trailing newlines set the table apart from the header.
            print("\n" + "\n".join(table_lines) + "\n")

SUBGROUP #1:
            Mode of Transport == 1
            Age Group == 4
            Length of Stay == 6 days

            Target |	286088.95	124003.58	38026.22	62546.88	118807.01
            ---------------------------------------------------------------------------------
            TVAE   |	395516.27	205492.58	66051.04	115568.89	204283.54
            TGAN   |	176104.81	124592.09	31681.97	56008.03	87648.86
            CTGAN  |	195865.06	128934.39	32658.67	63608.75	95774.93

SUBGROUP #2:
            Mode of Transport == 1
            Age Group == 4
            Length of Stay == Greater than or equal to 8 days

            Target |	396066.45	164040.59	56671.95	55655.24	171866.26
            ---------------------------------------------------------------------------------
            TVAE   |	205762.06	110977.25	35736.11	51767.83	105104.84
            TGAN   |	56467.53	51152.73	15882.99	38386.16	83401.87
            CTGAN  |	61090.44	56583.52	17064.21	41622.69	77898.88

SUBGROUP #3:
 

#### LOS >= 8 days

In [10]:
columns = spend_categories

# `labels` and `data_objects` are parallel: the i-th label names the i-th dataset.
labels = ['OVS1619', 'TVAE', 'TGAN', 'CTGAN']
data_objects = (OVS1619, TVAE, TGAN, CTGAN)

# One figure per length-of-stay level found in the reference data.
for length_of_stay in OVS1619.data['b10'].unique():
    print(f"Length of Stay == {length_of_stay}")

    value_filters = {'b10': length_of_stay}

    # Filter every dataset down to the current level. Building the tuple from
    # `data_objects` keeps it aligned with `labels`, so each curve is drawn from
    # the dataset its label names.
    input_dfs = tuple(
        data_object.data.iloc[get_filter_index(data_object.data, value_filters)]
        for data_object in data_objects
    )

    KDEPlot(input_dfs, columns, labels,
            figname = f"LOS{length_of_stay}",
            save = True)

Length of Stay == 6 days
Length of Stay == Greater than or equal to 8 days
Length of Stay == 7 days
Length of Stay == 4 days
Length of Stay == 3 days
Length of Stay == 2 days
Length of Stay == 0 days
Length of Stay == 5 days
Length of Stay == 1 day
