# Step 2: Data Cleaning and file export

---

## 1. Importing the necessary libraries📗

In [1]:
# Import libraries for data manipulation and visualisation
import sys
sys.path.append("../utilities")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import ticker
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import seaborn as sns
from color import color
import warnings

# Silence every warning category for the remainder of the session
warnings.simplefilter("ignore")

# Render all matplotlib figures with the ggplot style sheet
plt.style.use("ggplot")

Loading dataset

In [2]:
# Read the raw train and test splits (train first, then test)
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../dataset/data_train.csv", "../dataset/data_test.csv")
)

- No null values
- No duplicate rows
- Drop column `Id`
- Drop column `CholCheck`
- Drop column `AnyHealthcare`
- Drop column `Education`

In [3]:
# The same columns are removed from both splits so their schemas stay aligned
dropped_columns = ['Id', 'CholCheck', 'AnyHealthcare', 'Education']
df_train = df_train.drop(columns=dropped_columns)
df_test = df_test.drop(columns=dropped_columns)

# Removing columns can make previously distinct training rows identical,
# so the training split is de-duplicated again.
# The test split is deliberately left untouched (its de-duplication is not run).
df_train = df_train.drop_duplicates()

- Convert `MentHlth` and `PhysHlth` into binary categorical values (0 if the value is at most 3, otherwise 1)

In [4]:
def convert_to_binary(value, threshold=3):
    """Collapse a numeric value into a 0/1 flag.

    Parameters
    ----------
    value : int, float or str
        Anything accepted by ``int()``. Floats are truncated toward zero
        before the comparison (e.g. 3.9 is treated as 3).
    threshold : int, default 3
        Largest value that still maps to 0. The default reproduces the
        original hard-coded cut-off.

    Returns
    -------
    int
        0 when ``int(value) <= threshold``, otherwise 1.

    Raises
    ------
    ValueError, TypeError
        Propagated from ``int()`` when ``value`` cannot be converted
        (for example NaN or None).
    """
    return 0 if int(value) <= threshold else 1

# Apply the same binarisation rule to both health columns of both splits
for frame in (df_train, df_test):
    for health_col in ('MentHlth', 'PhysHlth'):
        frame[health_col] = frame[health_col].apply(convert_to_binary)

# Binarising can make previously distinct training rows identical,
# so the training split is de-duplicated once more.
# The test split is deliberately left untouched (its de-duplication is not run).
df_train = df_train.drop_duplicates()

In [5]:
# Write both cleaned splits to CSV; index=False keeps the row index out of the files
for cleaned_frame, target_path in (
    (df_train, "../dataset/cleaned_train.csv"),
    (df_test, "../dataset/cleaned_test.csv"),
):
    cleaned_frame.to_csv(target_path, index=False)

New processed dataset information:

In [6]:
# Read the cleaned CSV files back in, train first and then test
df_train, df_test = map(
    pd.read_csv,
    ("../dataset/cleaned_train.csv", "../dataset/cleaned_test.csv"),
)

In [7]:
# Heading for the preview; color.END resets the styling so bold/red does not
# bleed into whatever is printed after it
print(color.BOLD + color.RED + 'Train Data Overview: ' + color.END)
# Last expression of the cell: rich display of the first five training rows
df_train.head(5)

[1m[91mTrain Data Overview: 


Unnamed: 0,HighBP,HighChol,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,...,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Income,ExtraMedTest,ExtraAlcoholTest,Status
0,0,0,24,1,0,0,1,0,1,0,...,2,0,0,0,0,8,5,60,0,0
1,0,0,28,0,0,0,1,1,1,0,...,1,0,0,0,0,2,8,0,-64,0
2,0,0,36,1,0,0,1,1,0,0,...,3,1,1,1,0,3,1,-46,0,0
3,0,1,35,0,0,0,1,1,1,0,...,3,0,0,0,0,8,8,-83,-188,0
4,0,1,27,0,0,0,1,0,1,0,...,3,0,0,0,0,9,4,-58,0,0


In [8]:
# Heading / value pairs describing the reloaded training split
train_summary = (
    ('Training data shape:', df_train.shape),
    ('Training data Duplication Row Count: ', df_train.duplicated().sum()),
    ('Training data Null Value Count:', df_train.isna().sum()),
)

# Print each heading in bold red, reset the styling, then print the value on its own line
for heading, value in train_summary:
    print(color.BOLD + color.RED + heading + color.END)
    print(value)

[1m[91mTraining data shape:[0m
(202329, 21)
[1m[91mTraining data Duplication Row Count: [0m
0
[1m[91mTraining data Null Value Count:[0m
HighBP                  0
HighChol                0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Income                  0
ExtraMedTest            0
ExtraAlcoholTest        0
Status                  0
dtype: int64


In [9]:
# Heading for the preview; color.END resets the styling so bold/blue does not
# bleed into whatever is printed after it
print(color.BOLD + color.BLUE + 'Test Data Overview: ' + color.END)
# Last expression of the cell: rich display of the first five test rows
df_test.head(5)

[1m[94mTest Data Overview: 


Unnamed: 0,HighBP,HighChol,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,...,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Income,ExtraMedTest,ExtraAlcoholTest,Status
0,1,1,30,1,0,1,1,1,1,0,...,2,0,0,0,1,11,7,0,-145,
1,0,0,27,1,0,0,0,0,1,0,...,1,0,1,1,1,13,5,28,-145,
2,0,0,21,0,0,0,1,1,1,0,...,3,0,1,0,0,10,8,-26,72,
3,0,0,24,1,0,0,1,1,1,0,...,4,0,1,0,0,9,7,0,0,
4,1,0,34,0,0,0,1,1,0,0,...,4,0,0,1,0,11,7,-73,-63,


In [10]:
# Heading / value pairs describing the reloaded test split
test_summary = (
    ('Test data shape:', df_test.shape),
    ('Test data Duplication Row Count: ', df_test.duplicated().sum()),
    ('Test data Null Value Count:', df_test.isna().sum()),
)

# Print each heading in bold blue, reset the styling, then print the value on its own line
for heading, value in test_summary:
    print(color.BOLD + color.BLUE + heading + color.END)
    print(value)

[1m[94mTest data shape:[0m
(50736, 21)
[1m[94mTest data Duplication Row Count: [0m
50
[1m[94mTest data Null Value Count:[0m
HighBP                      0
HighChol                    0
BMI                         0
Smoker                      0
Stroke                      0
HeartDiseaseorAttack        0
PhysActivity                0
Fruits                      0
Veggies                     0
HvyAlcoholConsump           0
NoDocbcCost                 0
GenHlth                     0
MentHlth                    0
PhysHlth                    0
DiffWalk                    0
Sex                         0
Age                         0
Income                      0
ExtraMedTest                0
ExtraAlcoholTest            0
Status                  50736
dtype: int64
