# Preprocessing of the smoking.csv dataset

In [45]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Read the raw dataset from disk and preview its first five rows.
smoking_data = pd.read_csv("data/smoking.csv")
smoking_data.head()

Unnamed: 0,ID,gender,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),...,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,oral,dental caries,tartar,smoking
0,0,F,40,155,60,81.3,1.2,1.0,1.0,1.0,...,12.9,1.0,0.7,18.0,19.0,27.0,Y,0,Y,0
1,1,F,40,160,60,81.0,0.8,0.6,1.0,1.0,...,12.7,1.0,0.6,22.0,19.0,18.0,Y,0,Y,0
2,2,M,55,170,60,80.0,0.8,0.8,1.0,1.0,...,15.8,1.0,1.0,21.0,16.0,22.0,Y,0,N,1
3,3,M,40,165,70,88.0,1.5,1.5,1.0,1.0,...,14.7,1.0,1.0,19.0,26.0,18.0,Y,0,Y,0
4,4,F,40,155,60,86.0,1.0,1.0,1.0,1.0,...,12.5,1.0,0.6,16.0,14.0,22.0,Y,0,N,0


### The dataset contains no missing values, so no imputation or row removal is needed


In [46]:
# Count the missing entries in every column.
smoking_data.isna().sum()

ID                     0
gender                 0
age                    0
height(cm)             0
weight(kg)             0
waist(cm)              0
eyesight(left)         0
eyesight(right)        0
hearing(left)          0
hearing(right)         0
systolic               0
relaxation             0
fasting blood sugar    0
Cholesterol            0
triglyceride           0
HDL                    0
LDL                    0
hemoglobin             0
Urine protein          0
serum creatinine       0
AST                    0
ALT                    0
Gtp                    0
oral                   0
dental caries          0
tartar                 0
smoking                0
dtype: int64

### The columns `gender`, `oral` and `tartar` hold character values rather than numbers

In [47]:
# Print each column's dtype and non-null count to spot non-numeric columns.
smoking_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55692 entries, 0 to 55691
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   55692 non-null  int64  
 1   gender               55692 non-null  object 
 2   age                  55692 non-null  int64  
 3   height(cm)           55692 non-null  int64  
 4   weight(kg)           55692 non-null  int64  
 5   waist(cm)            55692 non-null  float64
 6   eyesight(left)       55692 non-null  float64
 7   eyesight(right)      55692 non-null  float64
 8   hearing(left)        55692 non-null  float64
 9   hearing(right)       55692 non-null  float64
 10  systolic             55692 non-null  float64
 11  relaxation           55692 non-null  float64
 12  fasting blood sugar  55692 non-null  float64
 13  Cholesterol          55692 non-null  float64
 14  triglyceride         55692 non-null  float64
 15  HDL                  55692 non-null 

### Here we convert the character values to their numerical equivalents
### Y is replaced by 1 and N by 0; likewise, M is replaced by 1 and F by 0

In [48]:
# Encode the two-valued character columns as integers (Y/M -> 1, N/F -> 0).
binary_encodings = {
    "tartar": {"Y": 1, "N": 0},
    "gender": {"M": 1, "F": 0},
    "oral": {"Y": 1, "N": 0},
}
for column, mapping in binary_encodings.items():
    # Skip columns that are already numeric so the cell can be re-run safely.
    if pd.api.types.is_numeric_dtype(smoking_data[column]):
        continue
    # `map` + explicit `astype` guarantees an integer dtype instead of relying
    # on `replace` implicitly downcasting an object column (deprecated in
    # recent pandas). A label missing from the mapping becomes NaN, which makes
    # the integer cast fail loudly instead of leaving stray strings behind.
    smoking_data[column] = smoking_data[column].map(mapping).astype("int64")

### We want to make sure that each column has more than one distinct value, so that the data contains no redundant columns

In [49]:
# List the columns that hold a single distinct value across all rows.
smoking_data.columns[smoking_data.nunique().eq(1).to_numpy()]

Index(['oral'], dtype='object')

### The column `oral` contains only one unique value, so it is redundant and we can drop it
### We can also drop the column `ID`, as it carries no useful information for us

In [50]:
# Remove the row identifier and the constant `oral` column.
smoking_data = smoking_data.drop(columns=["ID", "oral"])

In [18]:
# Derive a body-mass-index feature: weight in kg divided by the squared height
# in metres, rounded to two decimals.
# BMI lets us evaluate whether a patient is obese.
smoking_data["bmi"] = (
    smoking_data["weight(kg)"] / (smoking_data["height(cm)"] / 100) ** 2
).round(2)

In [22]:
# Split the preprocessed data into training (80%) and test (20%) sets and
# export both to CSV.
from sklearn.model_selection import train_test_split

# A fixed random_state makes the shuffle reproducible, so re-running the
# notebook regenerates exactly the same train/test files.
train, test = train_test_split(
    smoking_data, test_size=0.2, train_size=0.8, random_state=42
)

# The row index is written as the first (unnamed) column of each CSV file.
train.to_csv("data/train_data.csv")
test.to_csv("data/test_data.csv")