# Prepare Environment

In [23]:
# Mount Google Drive at /content/drive; the project folder referenced by
# `drive_path` below lives underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

import json
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation and dtype
# warnings are hidden as well — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Root of the shared project folder on the mounted Drive; the dataset paths
# used for loading and saving are joined onto this.
drive_path = "/content/drive/MyDrive/Kuliah/Tugas Akhir/Final Project Shared Folder"

# NHANES data page URL (query string selects CycleBeginYear=2017).
base_url = 'https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?CycleBeginYear=2017'

# Names of the dataset groups this project refers to.
dataset_names = [
    'Demographics',
    'Dietary',
    'Examination',
    'Laboratory',
    'Questionnaire',
]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Ingest Data

In [2]:
# Combined dataset file, given relative to the shared Drive folder.
data_path = "Dataset/Data Versioning/Combined_All_V7.csv"

# The first CSV column is read as a placeholder index and is then replaced
# by the SEQN column, which becomes the row key of the raw frame.
df_raw = (
    pd.read_csv(os.path.join(drive_path, data_path), index_col=0)
    .set_index('SEQN')
)

# Preview the first rows.
df_raw.head()

Unnamed: 0_level_0,Demog1_RIAGENDR,Demog1_RIDAGEYR,Demog1_RIDRETH3,Demog1_DMDMARTL,Demog1_DMDHHSIZ,Demog1_DMDFMSIZ,Demog1_DMDHHSZA,Demog1_DMDHHSZB,Demog1_DMDHHSZE,Demog1_DMDEDUC,...,Quest20_PFQ061C,Quest20_PFQ061H,Quest22_SMQ020,Quest22_SMQ890,Quest22_SMQ900,Quest23_SMD470,Quest3_CDQ009,Quest6_DED1225,Quest21_SLD123,Quest21_SLQ3032
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
93703.0,2.0,2.0,6.0,5.0,5.0,5.0,3.0,5.397605e-79,5.397605e-79,1.0,...,9.0,9.0,9.0,0.0,0.0,0,0.0,132.184626,8.057854,870.5
93704.0,1.0,2.0,3.0,5.0,4.0,4.0,2.0,5.397605e-79,5.397605e-79,1.0,...,9.0,9.0,9.0,0.0,0.0,0,0.0,132.184626,8.057854,870.5
93705.0,2.0,66.0,4.0,3.0,1.0,1.0,5.397605e-79,5.397605e-79,1.0,2.0,...,1.0,1.0,1.0,2.0,2.0,0,0.0,132.184626,8.0,1380.0
93706.0,1.0,18.0,6.0,5.0,5.0,5.0,5.397605e-79,5.397605e-79,1.0,4.0,...,9.0,9.0,2.0,2.0,2.0,0,0.0,161.279272,11.0,720.0
93707.0,1.0,13.0,7.0,5.0,7.0,7.0,5.397605e-79,3.0,5.397605e-79,1.0,...,9.0,9.0,9.0,0.0,0.0,1,0.0,132.184626,8.057854,870.5


In [3]:
# Summarise the raw frame: index range, column dtypes and non-null counts.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9254 entries, 93703.0 to 102956.0
Data columns (total 83 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Demog1_RIAGENDR  9254 non-null   float64
 1   Demog1_RIDAGEYR  9254 non-null   float64
 2   Demog1_RIDRETH3  9254 non-null   float64
 3   Demog1_DMDMARTL  9254 non-null   float64
 4   Demog1_DMDHHSIZ  9254 non-null   float64
 5   Demog1_DMDFMSIZ  9254 non-null   float64
 6   Demog1_DMDHHSZA  9254 non-null   float64
 7   Demog1_DMDHHSZB  9254 non-null   float64
 8   Demog1_DMDHHSZE  9254 non-null   float64
 9   Demog1_DMDEDUC   9254 non-null   float64
 10  Dieta1_DR1TKCAL  9254 non-null   float64
 11  Dieta1_DR1TPROT  9254 non-null   float64
 12  Dieta1_DR1TCARB  9254 non-null   float64
 13  Dieta1_DR1TSUGR  9254 non-null   float64
 14  Dieta1_DR1TFIBE  9254 non-null   float64
 15  Dieta1_DR1TTFAT  9254 non-null   float64
 16  Dieta1_DR1TSFAT  9254 non-null   float64
 17  Dieta1_DR

# Clean Data

In [4]:
# Clean on a copy so that df_raw stays available unchanged.
df = df_raw.copy()

# ----------------------------------------------------
# CVD status for respondents aged 20 or younger
# ----------------------------------------------------
# An earlier experiment zero-filled Quest16_MCQ160B for this age group; it is
# no longer applied. Rows whose label is missing are removed by the dropna()
# at the end of this cell.

# ----------------------------------------------------
# Fill in Asthma and others
# ----------------------------------------------------

# Quest16_MCQ010: the value 9 and missing entries both become 0.
df['Quest16_MCQ010'] = df['Quest16_MCQ010'].replace({9: 0}).fillna(0)

# Missing entries in these three columns are filled with 9.
# NOTE(review): 9 is presumably the questionnaire's "don't know" code — confirm.
fill_with_9 = ['Quest16_MCQ220', 'Quest16_MCQ300C', 'Quest16_MCQ300A']
df[fill_with_9] = df[fill_with_9].fillna(9)

# ----------------------------------------------------
# Drop unnecessary columns
# ----------------------------------------------------

unnecessary_cols = ['Quest16_MCQ092', 'Quest16_MCQ366A', 'Quest16_MCQ366B']

# ----------------------------------------------------
# Label of heart failure disease: the other Quest16_MCQ160* columns are
# dropped, leaving Quest16_MCQ160B as the only label column
# ----------------------------------------------------

other_label_cols = ['Quest16_MCQ160C', 'Quest16_MCQ160D', 'Quest16_MCQ160E', 'Quest16_MCQ160F']

df = df.drop(columns=unnecessary_cols + other_label_cols)

# Keep only rows that have no missing value in any remaining column.
df = df.dropna()

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5569 entries, 93705.0 to 102956.0
Data columns (total 76 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Demog1_RIAGENDR  5569 non-null   float64
 1   Demog1_RIDAGEYR  5569 non-null   float64
 2   Demog1_RIDRETH3  5569 non-null   float64
 3   Demog1_DMDMARTL  5569 non-null   float64
 4   Demog1_DMDHHSIZ  5569 non-null   float64
 5   Demog1_DMDFMSIZ  5569 non-null   float64
 6   Demog1_DMDHHSZA  5569 non-null   float64
 7   Demog1_DMDHHSZB  5569 non-null   float64
 8   Demog1_DMDHHSZE  5569 non-null   float64
 9   Demog1_DMDEDUC   5569 non-null   float64
 10  Dieta1_DR1TKCAL  5569 non-null   float64
 11  Dieta1_DR1TPROT  5569 non-null   float64
 12  Dieta1_DR1TCARB  5569 non-null   float64
 13  Dieta1_DR1TSUGR  5569 non-null   float64
 14  Dieta1_DR1TFIBE  5569 non-null   float64
 15  Dieta1_DR1TTFAT  5569 non-null   float64
 16  Dieta1_DR1TSFAT  5569 non-null   float64
 17  Dieta1_DR

In [5]:
# Columns in which every value greater than 1 is set to 0; values <= 1 are
# left as they are.
# The former follow-up `.replace({2: 0, 9: 0})` has been removed: it could
# never match anything, because 2 and 9 are already zeroed by the "> 1" rule.
non_pos_to_zero = ['Quest15_KIQ022', 'Quest15_KIQ026', 'Quest16_MCQ010', 'Quest16_MCQ220',
                   'Quest1_ALQ111', 'Quest9_DLQ050', 'Quest3_CDQ008', 'Quest3_CDQ010',
                   'Quest22_SMQ020', 'Quest22_SMQ890', 'Quest22_SMQ900'
                   ]

# mask() replaces only where the condition is True, so entries that do not
# compare greater than 1 (including missing ones) are left unchanged, exactly
# like the previous per-column `.loc` assignment.
df[non_pos_to_zero] = df[non_pos_to_zero].mask(df[non_pos_to_zero] > 1, 0)

# Columns recoded as 7 -> 0, 9 -> 0, 2 -> 1, 3 -> 1; afterwards anything
# still below 1 is snapped to exactly 0.
# NOTE(review): the "< 1" rule presumably targets tiny positive floats such as
# the 5.397605e-79 visible in the raw data preview — confirm for these columns.
case_2 = ['Quest17_DPQ020', 'Quest17_DPQ030', 'Quest17_DPQ040']
case_2_recoded = df[case_2].replace({7: 0, 9: 0, 2: 1, 3: 1})
df[case_2] = case_2_recoded.mask(case_2_recoded < 1, 0)

In [6]:
# Quest16_MCQ300A / Quest16_MCQ300C: the values 7 and 2 both become 0.
# NOTE(review): the value 9 (also used earlier to fill missing entries in
# these columns) is not recoded here — confirm that is intended.
mcq300_cols = ['Quest16_MCQ300A', 'Quest16_MCQ300C']
mcq300_recode = {7: 0, 2: 0}
df[mcq300_cols] = df[mcq300_cols].replace(mcq300_recode)

# Quest7_DIQ010: 2 -> 0, 3 -> 2, 9 -> 0. All substitutions come from one
# mapping passed to a single replace() call.
diq010_recode = {2: 0, 3: 2, 9: 0}
df['Quest7_DIQ010'] = df['Quest7_DIQ010'].replace(diq010_recode)

In [7]:
# Columns collapsed to a 0/1 indicator: 1, 7 and 9 map to 0, while
# 2, 3, 4 and 5 map to 1.
special_treatment_cols = [
    'Quest20_PFQ061B',
    'Quest20_PFQ061C',
    'Quest20_PFQ061H',
]

pfq061_recode = {1: 0, 7: 0, 9: 0, 2: 1, 3: 1, 4: 1, 5: 1}

# One frame-wide replace instead of a per-column loop; the same mapping is
# applied to every listed column.
df[special_treatment_cols] = df[special_treatment_cols].replace(pfq061_recode)

In [8]:
# In these columns every value below 1 is set to exactly 0. The raw data
# preview shows entries such as 5.397605e-79 in them, which this turns into 0.
above_0 = ['Demog1_DMDHHSZA', 'Demog1_DMDHHSZB', 'Demog1_DMDHHSZE']

selected = df[above_0]
df[above_0] = selected.mask(selected < 1, 0)

In [9]:
# Columns removed from the cleaned frame before feature selection.
# Re-running this cell on an already-reduced `df` raises KeyError because the
# columns are then gone.
cols_to_remove = [
    'Quest14_IND235', 'Quest10_ECQ020', 'Quest18_OCQ210',
    'Quest20_PFQ020', 'Quest4_CBD111', 'Quest4_CBD121',
    'Labor1_LBXTC', 'Quest14_INQ020', 'Quest14_INQ012',
]

df = df.drop(cols_to_remove, axis=1)

In [10]:
# Check the cleaned frame: remaining columns, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5569 entries, 93705.0 to 102956.0
Data columns (total 67 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Demog1_RIAGENDR  5569 non-null   float64
 1   Demog1_RIDAGEYR  5569 non-null   float64
 2   Demog1_RIDRETH3  5569 non-null   float64
 3   Demog1_DMDMARTL  5569 non-null   float64
 4   Demog1_DMDHHSIZ  5569 non-null   float64
 5   Demog1_DMDFMSIZ  5569 non-null   float64
 6   Demog1_DMDHHSZA  5569 non-null   float64
 7   Demog1_DMDHHSZB  5569 non-null   float64
 8   Demog1_DMDHHSZE  5569 non-null   float64
 9   Demog1_DMDEDUC   5569 non-null   float64
 10  Dieta1_DR1TKCAL  5569 non-null   float64
 11  Dieta1_DR1TPROT  5569 non-null   float64
 12  Dieta1_DR1TCARB  5569 non-null   float64
 13  Dieta1_DR1TSUGR  5569 non-null   float64
 14  Dieta1_DR1TFIBE  5569 non-null   float64
 15  Dieta1_DR1TTFAT  5569 non-null   float64
 16  Dieta1_DR1TSFAT  5569 non-null   float64
 17  Dieta1_DR

# Select Feature

## Variables that can be used


1. RIDAGEYR - Age
2. Smoking
- SMQ890 - Cigarette
- SMQ900 - E-cigarette
3. Sleep Time (Average for weekend & weekdays)
- SLQ300 - Usual sleep time on weekdays or workdays
- SLQ320 - Usual sleep time on weekends
- SLD012 - Sleep hours - weekdays or workdays
- SLD013 - Sleep hours - weekends
- SLQ330 - Usual wake time on weekends
4. Pain in Chest Area
- CDQ008	Severe pain in chest more than half hour
5. Dietary
- DRDINT	Number of days of intake
- DR1DAY	Intake day of the week
- DR1TKCAL	Energy (kcal)
- DR1TPROT	Protein (gm)
- DR1TCARB	Carbohydrate (gm)
- DR1TSUGR	Total sugars (gm)
- DR1TFIBE	Dietary fiber (gm)
- DR1TTFAT	Total fat (gm)
- DR1TSFAT	Total saturated fatty acids (gm)
- DR1TMFAT	Total monounsaturated fatty acids (gm)
- DR1TPFAT	Total polyunsaturated fatty acids (gm)
- DR1TCHOL	Cholesterol (mg)
- DR1TCALC	Calcium (mg)
6. Activity (SUM)
- PAD615	Minutes vigorous-intensity work
- PAD645	Minutes walk/bicycle for transportation
- PAD660	Minutes vigorous recreational activities
7. Height & Weight
- BMXWT	Weight (kg)
- BMXHT	Standing Height (cm)
- BMXBMI	Body Mass Index (kg/m**2)
8. Systolic & Diastolic Pressure
- BPXSY1	Systolic: Blood pres (1st rdg) mm Hg
- BPXDI1	Diastolic: Blood pres (1st rdg) mm Hg
- BPXSY2	Systolic: Blood pres (2nd rdg) mm Hg
- BPXDI2	Diastolic: Blood pres (2nd rdg) mm Hg
- BPXSY3	Systolic: Blood pres (3rd rdg) mm Hg
- BPXDI3	Diastolic: Blood pres (3rd rdg) mm Hg


## Feature Selection & Engineering

In [14]:
# Every selection below picks its columns first and copies only that slice
# (`df[[...]].copy()`), rather than copying the entire frame for each group.
# Each result is an independent object, so later edits never touch `df`.

# -----------------
# 1. RIDAGEYR - Age
# -----------------

age = df[['Demog1_RIDAGEYR']].copy()

# -----------------
# 2. Smoking
# -----------------

smoking = df[['Quest22_SMQ890', 'Quest22_SMQ900']].copy()

# -----------------
# 3. Sleep Time (Average)
# -----------------

# The two sleep columns are taken as they exist in the input data; no
# averaging is done in this notebook.
# NOTE(review): Quest21_SLD123 / Quest21_SLQ3032 are presumably the
# weekday/weekend averages of sleep duration and sleep time computed
# upstream — confirm against the dataset build step.
sleep = df[['Quest21_SLD123', 'Quest21_SLQ3032']].copy()

# -----------------
# 4. Pain in Chest Area
# -----------------

# Kept as a Series (single-bracket selection); it is joined by its name later.
pain = df['Quest3_CDQ008'].copy()

# -----------------
# 5. Dietary
# -----------------

food = df[[
    'Dieta1_DR1TKCAL', 'Dieta1_DR1TPROT', 'Dieta1_DR1TCARB', 'Dieta1_DR1TSUGR',
    'Dieta1_DR1TFIBE', 'Dieta1_DR1TTFAT', 'Dieta1_DR1TSFAT', 'Dieta1_DR1TMFAT',
    'Dieta1_DR1TPFAT', 'Dieta1_DR1TCHOL', 'Dieta1_DR1TCALC',
]].copy()

# -----------------
# 6. Activity
# -----------------

# Sum of the three activity-minute columns, added left to right, stored as a
# single-column frame.
# NOTE(review): the total is named "VigorousActivity" although PAD645 is
# listed above as walk/bicycle transportation minutes — confirm the naming.
activity = (
    df['Quest19_PAD615'] + df['Quest19_PAD645'] + df['Quest19_PAD660']
).to_frame('Quest19_VigorousActivity')

# -----------------
# 7. Height & Weight
# -----------------

height_weight = df[['Exami2_BMXWT', 'Exami2_BMXHT', 'Exami2_BMXBMI']].copy()

# -----------------
# 8. Systolic & Diastolic Pressure
# -----------------

pressure = df[['Exami1_SysPulse', 'Exami1_DiaPulse']].copy()


# -----------------
# 9. Label
# -----------------

# The label column is taken as-is; no recoding is applied to it in this
# notebook.
label = df[['Quest16_MCQ160B']].copy()


# Rearrange Column

Lifestyle - Characteristics - Label

In [15]:

# Assemble the modelling frame in the order Lifestyle -> Characteristics ->
# Label. Every join is on the shared row index; join() returns a new frame,
# so the per-group frames are not modified.

# Lifestyle
df_custom = smoking.join(sleep).join(food).join(activity)

lifestyle_cols = df_custom.columns.tolist()

# Characteristics
df_custom = df_custom.join(age).join(height_weight).join(pain).join(pressure)

# Preserve column order. The previous set difference produced an ordering
# that could change from one kernel run to the next (string hashing is
# randomised), which made anything derived from charac_cols non-reproducible.
charac_cols = [col for col in df_custom.columns if col not in lifestyle_cols]

# Label
df_custom = df_custom.join(label)
label_cols = label.columns.tolist()

df_custom.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5569 entries, 93705.0 to 102956.0
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Quest22_SMQ890            5569 non-null   float64
 1   Quest22_SMQ900            5569 non-null   float64
 2   Quest21_SLD123            5569 non-null   float64
 3   Quest21_SLQ3032           5569 non-null   float64
 4   Dieta1_DR1TKCAL           5569 non-null   float64
 5   Dieta1_DR1TPROT           5569 non-null   float64
 6   Dieta1_DR1TCARB           5569 non-null   float64
 7   Dieta1_DR1TSUGR           5569 non-null   float64
 8   Dieta1_DR1TFIBE           5569 non-null   float64
 9   Dieta1_DR1TTFAT           5569 non-null   float64
 10  Dieta1_DR1TSFAT           5569 non-null   float64
 11  Dieta1_DR1TMFAT           5569 non-null   float64
 12  Dieta1_DR1TPFAT           5569 non-null   float64
 13  Dieta1_DR1TCHOL           5569 non-null   float64
 14  Die

In [16]:
# Show the lifestyle columns; these are the ones the discretization loop
# iterates over.
lifestyle_cols

['Quest22_SMQ890',
 'Quest22_SMQ900',
 'Quest21_SLD123',
 'Quest21_SLQ3032',
 'Dieta1_DR1TKCAL',
 'Dieta1_DR1TPROT',
 'Dieta1_DR1TCARB',
 'Dieta1_DR1TSUGR',
 'Dieta1_DR1TFIBE',
 'Dieta1_DR1TTFAT',
 'Dieta1_DR1TSFAT',
 'Dieta1_DR1TMFAT',
 'Dieta1_DR1TPFAT',
 'Dieta1_DR1TCHOL',
 'Dieta1_DR1TCALC',
 'Quest19_VigorousActivity']

In [17]:
# Frequency of each value in Quest22_SMQ890 after recoding.
df_custom['Quest22_SMQ890'].value_counts()

Quest22_SMQ890
0.0    3545
1.0    2024
Name: count, dtype: int64

In [None]:
# Draw one random row whose label equals 1 and show its values as a nested
# list (one inner list per sampled row). No seed is set, so the row differs
# between runs.
label_is_one = df_custom['Quest16_MCQ160B'] == 1
df_custom.loc[label_is_one].sample(1).values.tolist()

[[2.0,
  2.0,
  1140.0,
  360.0,
  10.5,
  1823.1,
  66.528,
  227.35550000000003,
  103.9145,
  16.322499999999998,
  74.31949999999999,
  25.42265,
  24.74745,
  16.916775,
  294.875,
  840.575,
  295.08816665959523,
  76.0,
  79.98,
  164.92666666666668,
  29.42,
  9.0,
  66.24786324786324,
  66.24786324786324,
  1.0]]

In [None]:
# Column names of df_custom, in order. The earlier version filtered on the
# label and drew a random row first, but list(DataFrame) only yields the
# column names, so the filter and the sampling had no effect on the result.
df_custom.columns.tolist()

['Quest22_SMQ890',
 'Quest22_SMQ900',
 'Quest21_SLQ300',
 'Quest21_SLQ330',
 'Quest21_SLD012',
 'Dieta1_DR1TKCAL',
 'Dieta1_DR1TPROT',
 'Dieta1_DR1TCARB',
 'Dieta1_DR1TSUGR',
 'Dieta1_DR1TFIBE',
 'Dieta1_DR1TTFAT',
 'Dieta1_DR1TSFAT',
 'Dieta1_DR1TMFAT',
 'Dieta1_DR1TPFAT',
 'Dieta1_DR1TCHOL',
 'Dieta1_DR1TCALC',
 'Quest19_VigorousActivity',
 'Demog1_RIDAGEYR',
 'Exami2_BMXWT',
 'Exami2_BMXHT',
 'Exami2_BMXBMI',
 'Quest3_CDQ008',
 'Exami1_SysPulse',
 'Exami1_DiaPulse',
 'Quest16_MCQ160B']

# Discretization

In [18]:
# Quartile-bin every continuous lifestyle column and record, per column, how
# each stored code maps back to its original value or interval.
discretized_df = df_custom.copy()

# column name -> {str(code): original value   (columns left as they are)
#                            or interval text (columns that were binned)}
variable_discrete_val = {}

for ls in lifestyle_cols:
    variable_discrete_val[ls] = {}

    # Don't discretize columns that are already discrete (<= 5 distinct values).
    if ls == 'Dieta2_DR1DAY' or discretized_df[ls].nunique() <= 5:
        # Read the values from the frame being discretized rather than `df`:
        # engineered columns such as Quest19_VigorousActivity are not in `df`
        # and would raise KeyError there. .tolist() yields plain Python
        # scalars, which json.dump can always serialise.
        for k in discretized_df[ls].unique().tolist():
            variable_discrete_val[ls][str(k)] = k
        continue

    print(f'Categorizing column: {ls}, with value: ', end=' ')
    try:
        # A single qcut call provides both the interval labels and the integer
        # codes, so the code -> interval mapping cannot drift out of sync.
        # q=4 requests quartiles; duplicates='drop' merges identical bin edges,
        # so a column may end up with fewer than four bins.
        binned = pd.qcut(discretized_df[ls].to_numpy(), q=4, duplicates='drop')

        codes = binned.codes.astype('int64')
        if (codes < 0).any():
            # Categorical codes use -1 for missing values; store NaN instead,
            # which is what qcut(..., labels=False) yields for them.
            codes = np.where(codes < 0, np.nan, codes)

        # Category position i is exactly the code i written to the column.
        variable_discrete_val[ls] = {
            str(code): str(interval)
            for code, interval in enumerate(binned.categories)
        }
        discretized_df[ls] = codes

    except Exception as error:
        # Best effort: report the problem and leave this column undiscretized.
        print(f'{ls} cannot be discretized')
        print(f"ERROR DUE TO {error}")
        continue

print(variable_discrete_val)

Categorizing column: Quest21_SLD123, with value:  Categorizing column: Quest21_SLQ3032, with value:  Categorizing column: Dieta1_DR1TKCAL, with value:  Categorizing column: Dieta1_DR1TPROT, with value:  Categorizing column: Dieta1_DR1TCARB, with value:  Categorizing column: Dieta1_DR1TSUGR, with value:  Categorizing column: Dieta1_DR1TFIBE, with value:  Categorizing column: Dieta1_DR1TTFAT, with value:  Categorizing column: Dieta1_DR1TSFAT, with value:  Categorizing column: Dieta1_DR1TMFAT, with value:  Categorizing column: Dieta1_DR1TPFAT, with value:  Categorizing column: Dieta1_DR1TCHOL, with value:  Categorizing column: Dieta1_DR1TCALC, with value:  Categorizing column: Quest19_VigorousActivity, with value:  {'Quest22_SMQ890': {'0.0': 0.0, '1.0': 1.0}, 'Quest22_SMQ900': {'0.0': 0.0, '1.0': 1.0}, 'Quest21_SLD123': {'1': '(7.0, 8.0]', '0': '(1.999, 7.0]', '2': '(8.0, 9.0]', '3': '(9.0, 14.0]'}, 'Quest21_SLQ3032': {'3': '(1350.0, 1425.0]', '2': '(1260.0, 1350.0]', '1': '(517.0, 1260.0

In [19]:
# Persist the code -> value/interval lookup built during discretization.
# `json` is imported with the other imports at the top of the notebook.
# NOTE(review): the path is relative, so the file is written to the kernel's
# current working directory rather than under `drive_path` — confirm that this
# is the intended location.
with open('vAPP_variable_discrete_value.json', 'w') as json_file:
    json.dump(variable_discrete_val, json_file)

In [20]:
# Preview the discretized frame: lifestyle columns now hold bin codes, while
# the other columns keep their original values.
discretized_df.head()

Unnamed: 0_level_0,Quest22_SMQ890,Quest22_SMQ900,Quest21_SLD123,Quest21_SLQ3032,Dieta1_DR1TKCAL,Dieta1_DR1TPROT,Dieta1_DR1TCARB,Dieta1_DR1TSUGR,Dieta1_DR1TFIBE,Dieta1_DR1TTFAT,...,Dieta1_DR1TCALC,Quest19_VigorousActivity,Demog1_RIDAGEYR,Exami2_BMXWT,Exami2_BMXHT,Exami2_BMXBMI,Quest3_CDQ008,Exami1_SysPulse,Exami1_DiaPulse,Quest16_MCQ160B
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
93705.0,0.0,0.0,1,3,0,0,0,1,0,1,...,0,0,66.0,79.5,158.3,31.7,0.0,157.72237,69.961686,2.0
93708.0,0.0,0.0,1,2,0,0,0,0,2,1,...,0,0,66.0,53.5,150.2,23.7,0.0,139.094017,75.777778,2.0
93709.0,1.0,1.0,0,2,1,2,1,2,2,2,...,2,1,75.0,88.8,151.1,38.9,0.0,118.666667,66.666667,2.0
93711.0,0.0,0.0,1,2,3,3,3,3,3,3,...,3,1,56.0,62.1,170.6,21.3,0.0,101.333333,66.666667,2.0
93713.0,0.0,0.0,0,3,2,1,1,2,1,3,...,1,0,67.0,74.9,178.6,23.5,0.0,104.666667,72.0,2.0


# Save Data

In [21]:
# Destination of the exported training table, relative to the shared Drive
# folder.
target_path = "Dataset/Data Versioning/Trained_App-4.csv"

# Export a copy of the discretized frame; the SEQN index is written as the
# first CSV column.
final = discretized_df.copy()
output_file = os.path.join(drive_path, target_path)
final.to_csv(output_file)

In [22]:
# Number of rows in each bin of the summed activity column.
discretized_df['Quest19_VigorousActivity'].value_counts()

Quest19_VigorousActivity
1    1403
0    1401
3    1391
2    1374
Name: count, dtype: int64