# Imports
---

In [1]:
import pandas as pd
import numpy as np
import math

# Import Dataset
---
**Column Description (Data Source: [Smoking and Drinking Dataset with body signal on Kaggle](https://www.kaggle.com/datasets/sooyoungher/smoking-drinking-dataset/data))**


In [27]:
def percentage_missing_values(data):
    """Report how many cells of ``data`` are missing, as a formatted string.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect. Only ``.size`` and ``.isnull()`` are used.

    Returns
    -------
    str
        ``"Percentage of missing values: X.XX%"`` where ``X.XX`` is the share
        of null cells over all cells, rounded to two decimals. An empty frame
        has no cells and therefore reports ``0.00%``.
    """
    # `.size` is rows * columns; it replaces `np.product(data.shape)`, whose
    # `np.product` alias was deprecated and then removed in NumPy 2.0.
    total_cells = data.size

    # First `.sum()` counts nulls per column, the second adds the columns up.
    missing_cells = data.isnull().sum().sum()

    # Guard the 0 / 0 case so an empty frame does not render as "nan%".
    if total_cells == 0:
        percentage_missing = 0.0
    else:
        percentage_missing = (missing_cells / total_cells) * 100
    return f"Percentage of missing values: {percentage_missing:.2f}%"


In [28]:
# Load the dataset once. `df` is the frame as read from disk; `df_copy` is the
# working copy that the following cells modify.
df = pd.read_csv('Data/sd.csv')
df_copy = df.copy()

# Write the loaded frame back out without the pandas index.
# NOTE(review): 'outpu.csv' looks like a typo for 'output.csv' — confirm before renaming.
df.to_csv('outpu.csv', index=False)

# Keep the preview as the LAST expression: Jupyter only renders the value of a
# cell's final expression, so a `head()` in the middle of the cell is discarded.
df_copy.head()

Unnamed: 0,sex,age,height,weight,waistline,sight_left,sight_right,hear_left,hear_right,SBP,...,LDL_chole,triglyceride,hemoglobin,urine_protein,serum_creatinine,SGOT_AST,SGOT_ALT,gamma_GTP,SMK_stat_type_cd,DRK_YN
0,Male,35,170,75,90.0,1.0,1.0,1.0,1.0,120.0,...,126.0,92.0,17.1,1.0,1.0,21.0,35.0,40.0,1.0,Y
1,Male,30,180,80,89.0,0.9,1.2,1.0,1.0,130.0,...,148.0,121.0,15.8,1.0,0.9,20.0,36.0,27.0,3.0,N
2,Male,40,165,75,91.0,1.2,1.5,1.0,1.0,120.0,...,74.0,104.0,15.8,1.0,0.9,47.0,32.0,68.0,1.0,N
3,Male,50,175,80,91.0,1.5,1.2,1.0,1.0,145.0,...,104.0,106.0,17.6,1.0,1.1,29.0,34.0,18.0,1.0,N
4,Male,50,165,60,80.0,1.0,1.2,1.0,1.0,138.0,...,117.0,104.0,13.8,1.0,0.8,19.0,12.0,25.0,1.0,N


In [29]:
# Sanity check: report the share of missing cells in the working frame.
percentage_missing_values(df_copy)

'Percentage of missing values: 0.00%'

# Create Variable Mappings
---

In [30]:
# Inspect which codes occur in `hear_left` (and how often) before they are mapped to labels.
df_copy['hear_left'].value_counts()

1.0    960124
2.0     31222
Name: hear_left, dtype: int64

In [31]:
# Replace numeric category codes with readable labels.
#
# The codes are read from the raw `df` and the labels written into `df_copy`,
# which makes this cell idempotent. Mapping `df_copy` onto itself would only
# work once: on a second run the values are already strings, none of them is a
# key of the mapping, and `Series.map` would turn the whole column into NaN.
CATEGORY_LABELS = {
    # Smoking state: 1 (never) = N, 2 (used to smoke but quit) = Q, 3 (still smoke) = S
    'SMK_stat_type_cd': {1: 'N', 2: 'Q', 3: 'S'},
    'hear_left': {1: 'Normal', 2: 'Abnormal'},
    'hear_right': {1: 'Normal', 2: 'Abnormal'},
    'urine_protein': {1: '-', 2: '+/-', 3: '+1', 4: '+2', 5: '+3', 6: '+4'},
}

for column_name, code_to_label in CATEGORY_LABELS.items():
    # Codes missing from the mapping become NaN (standard `Series.map` behaviour).
    df_copy[column_name] = df[column_name].map(code_to_label)

In [32]:
# List the columns whose dtype is object or bool, i.e. the non-numeric columns of the working frame.
print(df_copy.select_dtypes(include=[object,bool]).columns)

Index(['sex', 'hear_left', 'hear_right', 'urine_protein', 'SMK_stat_type_cd',
       'DRK_YN'],
      dtype='object')


# Encoding 
---

In [33]:
def one_hot_encode_dataframe(df, columns):
    """Return a one-hot encoded copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified.
    columns : list of str
        Names of the columns to expand into indicator columns. Every other
        column is carried over unchanged.

    Returns
    -------
    pandas.DataFrame
        New frame in which each listed column is replaced by one
        ``<column>_<value>`` indicator column per distinct value.
    """
    # Encode the frame that was passed in. The previous version ignored the
    # `df` argument and always encoded the notebook-global `df_copy`.
    # `pd.get_dummies` already returns a new frame, so no defensive copy is needed.
    return pd.get_dummies(df, columns=columns)

In [36]:
# Only these categorical columns are encoded here. The one-hot helper can be
# re-used with a different list depending on whether we want to predict
# smoker or drinker.
columns = [
    'sex',
    'hear_left',
    'hear_right',
    'urine_protein',
]
ohe_df = one_hot_encode_dataframe(df=df_copy, columns=columns)

In [37]:
# Preview the encoded frame to check the generated indicator columns.
ohe_df.head()

Unnamed: 0,age,height,weight,waistline,sight_left,sight_right,SBP,DBP,BLDS,tot_chole,...,hear_left_Abnormal,hear_left_Normal,hear_right_Abnormal,hear_right_Normal,urine_protein_+/-,urine_protein_+1,urine_protein_+2,urine_protein_+3,urine_protein_+4,urine_protein_-
0,35,170,75,90.0,1.0,1.0,120.0,80.0,99.0,193.0,...,0,1,0,1,0,0,0,0,0,1
1,30,180,80,89.0,0.9,1.2,130.0,82.0,106.0,228.0,...,0,1,0,1,0,0,0,0,0,1
2,40,165,75,91.0,1.2,1.5,120.0,70.0,98.0,136.0,...,0,1,0,1,0,0,0,0,0,1
3,50,175,80,91.0,1.5,1.2,145.0,87.0,95.0,201.0,...,0,1,0,1,0,0,0,0,0,1
4,50,165,60,80.0,1.0,1.2,138.0,82.0,101.0,199.0,...,0,1,0,1,0,0,0,0,0,1
