# Imports
---

In [37]:
import math

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report

# Import Dataset
---
**Column Description (Data Source: [Smoking and Drinking Dataset with body signal on Kaggle](https://www.kaggle.com/datasets/sooyoungher/smoking-drinking-dataset/data))**

- Sex - male, female
- Age - rounded up to the nearest 5 years
- Height - rounded up to the nearest 5 cm [cm]
- Weight [kg]
- Waistline
- Sight_left - eyesight(left) the column shows how well the left eye sees, where perfect vision = 1.0
- Sight_right - eyesight(right) the column shows how well the right eye sees where perfect vision = 1.0
- Hear_left - hearing left, 1(normal), 2(abnormal)
- Hear_right - hearing right, 1(normal), 2(abnormal)
- SBP - Systolic blood pressure[mmHg]
- DBP - Diastolic blood pressure[mmHg]
- BLDS - BLDS or FSG(fasting blood glucose)[mg/dL]
- Tot_chole - total cholesterol[mg/dL]
- HDL_chole - HDL cholesterol[mg/dL] - the only fraction of cholesterol that is called "good", "useful" cholesterol.
- LDL_chole - LDL cholesterol[mg/dL]
- Triglyceride - triglyceride[mg/dL]
- Hemoglobin - hemoglobin[g/dL]
- Urine_protein - protein in urine, 1(-), 2(+/-), 3(+1), 4(+2), 5(+3), 6(+4)
- Serum_creatinine - serum(blood) creatinine[mg/dL]
- SGOT_AST - SGOT(Glutamate-oxaloacetate transaminase) AST(Aspartate transaminase)[IU/L]
- SGOT_ALT - ALT(Alanine transaminase)[IU/L]
- Gamma_GTP - γ-glutamyl transpeptidase[IU/L]
- SMK_stat_type_cd - Smoking state, 1(never), 2(used to smoke but quit), 3(still smoke)
- DRK_YN - Drinker or Not


In [42]:
def percentage_missing_values(data):
    """Report the share of missing (null) cells in a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    str
        Message of the form ``"Percentage of missing values: 12.50%"``.
        A frame with zero cells is reported as ``0.00%``.
    """
    # DataFrame.size is rows * columns. It replaces np.product(data.shape):
    # np.product was deprecated in NumPy 1.25 and removed in NumPy 2.0.
    total_cells = data.size

    if total_cells == 0:
        # Nothing to inspect; avoid dividing zero by zero.
        percentage_missing = 0.0
    else:
        missing_cells = data.isnull().sum().sum()
        percentage_missing = (missing_cells / total_cells) * 100

    return f"Percentage of missing values: {percentage_missing:.2f}%"

In [45]:
# Load the dataset from a path relative to the notebook's working directory.
df = pd.read_csv('data/sd.csv')
# Work on a copy so the frame as loaded stays available in `df`.
df_copy = df.copy()
# NOTE(review): head() is not the last expression of the cell, so its result
# is not displayed.
df_copy.head()
# NOTE(review): this writes the unmodified frame to 'outpu.csv' (possibly a
# typo for 'output.csv') — confirm that this export is intended.
df.to_csv('outpu.csv', index=False)

In [46]:
percentage_missing_values(df_copy)

'Percentage of missing values: 0.00%'

# Useful functions
---

In [47]:
def remove_outliers_from_columns(df, column_names):
    """Drop rows that fall outside the 1.5 * IQR fences of the given columns.

    Columns are handled one after another. Each column's quartiles are
    computed on the rows that survived the filters of the columns before it,
    so the result can depend on the order of ``column_names``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column_names : iterable of str
        Columns to filter on, in processing order.

    Returns
    -------
    pandas.DataFrame
        The rows whose value lies within ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``
        (bounds inclusive) for every listed column. The original index
        labels are kept.
    """
    filtered = df
    for col in column_names:
        values = filtered[col]

        # Quartiles and interquartile range of the rows still present.
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        spread = third_quartile - first_quartile

        low_fence = first_quartile - 1.5 * spread
        high_fence = third_quartile + 1.5 * spread

        # between() is inclusive on both ends by default.
        filtered = filtered[values.between(low_fence, high_fence)]

    return filtered



In [48]:
def percentage_missing_values(data):
    """Report the share of missing (null) cells in a DataFrame.

    NOTE(review): a function with this name is also defined in an earlier
    cell of the notebook; this later definition shadows it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    str
        Message of the form ``"Percentage of missing values: 12.50%"``.
        A frame with zero cells is reported as ``0.00%``.
    """
    # DataFrame.size is rows * columns. It replaces np.product(data.shape):
    # np.product was deprecated in NumPy 1.25 and removed in NumPy 2.0.
    total_cells = data.size

    if total_cells == 0:
        # Nothing to inspect; avoid dividing zero by zero.
        percentage_missing = 0.0
    else:
        missing_cells = data.isnull().sum().sum()
        percentage_missing = (missing_cells / total_cells) * 100

    return f"Percentage of missing values: {percentage_missing:.2f}%"


In [49]:
def duplicate_data(data):
    """Count the rows that repeat an earlier row in every column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    str
        Message of the form ``"Number of duplicates: 3"``. The first
        occurrence of a repeated row is not counted.
    """
    # Call duplicated() explicitly. The previous code indexed the frame with
    # the bound method itself (data[data.duplicated]), which only worked
    # through pandas' callable-indexer path — apparently with the frame passed
    # as the method's `subset` argument — and built a full copy of the
    # duplicated rows just to read its length.
    duplicate_count = int(data.duplicated().sum())

    return f"Number of duplicates: {duplicate_count}"

# Clean data
---
- Remove outliers
- Remove duplicates
- Impute missing values

## Remove Outliers
---

In [25]:
df_copy.describe()

Unnamed: 0,age,height,weight,waistline,sight_left,sight_right,SBP,DBP,BLDS,tot_chole,HDL_chole,LDL_chole,triglyceride,hemoglobin,serum_creatinine,SGOT_AST,SGOT_ALT,gamma_GTP
count,676654.0,676654.0,676654.0,676654.0,676654.0,676654.0,676654.0,676654.0,676654.0,676654.0,676654.0,676654.0,676654.0,676654.0,676654.0,676654.0,676654.0,676654.0
mean,46.909824,161.324872,60.784027,79.02745,0.944601,0.942262,119.854122,74.367279,94.720488,192.44153,58.757409,112.6261,105.011242,13.968156,0.827369,22.006646,19.127022,22.095479
std,14.451683,9.171283,11.280088,8.972096,0.327505,0.327287,12.957561,8.642721,11.145641,34.25217,14.969314,31.830963,50.361684,1.527757,0.18852,5.59677,7.797004,10.720541
min,20.0,130.0,25.0,53.6,0.1,0.1,84.0,54.0,63.0,97.0,1.0,1.0,1.0,1.0,0.3,6.0,1.0,1.0
25%,35.0,155.0,50.0,72.5,0.7,0.7,110.0,69.0,87.0,168.0,48.0,90.0,67.0,13.0,0.7,18.0,13.0,14.0
50%,45.0,160.0,60.0,79.0,1.0,1.0,120.0,74.0,94.0,191.0,57.0,111.0,94.0,13.9,0.8,21.0,18.0,19.0
75%,55.0,170.0,70.0,85.0,1.2,1.2,130.0,80.0,101.0,215.0,67.0,133.0,132.0,15.1,1.0,25.0,23.0,28.0
max,85.0,190.0,125.0,108.3,1.9,1.9,159.0,97.0,130.0,292.0,933.0,1750.0,272.0,25.0,1.4,41.0,44.0,56.0


In [50]:
# Trim the listed measurements with the 1.5 * IQR rule, in this order.
# NOTE: the cell rebinds df_copy to its own filtered result, so running it
# again recomputes the quartiles on the already-trimmed rows and may drop more.
df_copy = remove_outliers_from_columns(
    df_copy,
    [
        'waistline',
        'SBP',
        'DBP',
        'BLDS',
        'tot_chole',
        'triglyceride',
        'serum_creatinine',
        'SGOT_AST',
        'SGOT_ALT',
        'sight_left',
        'sight_right',
        'gamma_GTP',
    ],
)

In [27]:
df_copy.describe()

Unnamed: 0,age,height,weight,waistline,sight_left,sight_right,SBP,DBP,BLDS,tot_chole,HDL_chole,LDL_chole,triglyceride,hemoglobin,serum_creatinine,SGOT_AST,SGOT_ALT,gamma_GTP
count,573651.0,573651.0,573651.0,573651.0,573651.0,573651.0,573651.0,573651.0,573651.0,573651.0,573651.0,573651.0,573651.0,573651.0,573651.0,573651.0,573651.0,573651.0
mean,46.323357,160.955006,59.812952,78.142515,0.946419,0.943691,119.112279,73.912123,93.743605,191.479112,59.591089,112.161415,98.252654,13.861863,0.811364,21.275268,17.880037,19.97701
std,14.470566,9.077929,10.846816,8.725124,0.327462,0.327193,12.892416,8.579736,10.136665,33.300378,14.898888,31.045891,43.55867,1.511501,0.176734,4.956651,6.706855,8.431347
min,20.0,130.0,25.0,54.0,0.1,0.1,84.0,54.0,66.0,100.0,1.0,1.0,1.0,1.0,0.4,8.0,1.0,1.0
25%,35.0,155.0,50.0,72.0,0.7,0.7,110.0,68.0,87.0,168.0,49.0,90.0,65.0,12.9,0.7,18.0,13.0,14.0
50%,45.0,160.0,60.0,78.0,1.0,1.0,119.0,74.0,93.0,190.0,58.0,111.0,89.0,13.8,0.8,21.0,17.0,18.0
75%,55.0,165.0,65.0,84.0,1.2,1.2,129.0,80.0,100.0,214.0,68.0,132.0,124.0,14.9,0.9,24.0,22.0,25.0
max,85.0,190.0,120.0,103.7,1.9,1.9,159.0,96.0,122.0,284.0,933.0,1750.0,227.0,25.0,1.2,35.0,38.0,44.0


## Remove Duplicates
---

In [51]:
duplicate_data(df_copy)

'Number of duplicates: 14'

## Impute Missing Values 
---

In [52]:
percentage_missing_values(df_copy)

'Percentage of missing values: 0.00%'

# Create Variable Mappings
---

In [30]:
# NOTE(review): this cell is disabled — the mapping code is wrapped in a string
# literal, so none of it runs; the cell only evaluates to that string.
# NOTE(review): the saved df_copy.info() output further down lists hear_left and
# hear_right with 0 non-null values, which suggests these mappings produced NaN
# when they were last run (presumably applied to already-mapped columns) —
# TODO confirm before re-enabling.
"""df_copy['SMK_stat_type_cd'] = df_copy['SMK_stat_type_cd'].map({1: 'N', 2: 'Q', 3: 'S'}) # Smoking state, 1(never) = N , 2(used to smoke but quit) = Q, 3(still smoke) = S
df_copy['hear_left'] = df_copy['hear_left'].map({1: 'Normal', 2: 'Abnormal'})
df_copy['hear_right'] = df_copy['hear_right'].map({1: 'Normal', 2: 'Abnormal'})
df_copy['urine_protein'] = df_copy['urine_protein'].map({1: '-', 2: '+/-', 3: '+1', 4: '+2', 5: '+3', 6: '+4'})"""

In [31]:
"""print(df_copy.select_dtypes(include=[object,bool]).columns)"""

Index(['sex', 'hear_left', 'hear_right', 'urine_protein', 'SMK_stat_type_cd',
       'DRK_YN'],
      dtype='object')


In [32]:
"""df_copy.info()"""

<class 'pandas.core.frame.DataFrame'>
Index: 573651 entries, 0 to 991344
Data columns (total 24 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   sex               573651 non-null  object 
 1   age               573651 non-null  int64  
 2   height            573651 non-null  int64  
 3   weight            573651 non-null  int64  
 4   waistline         573651 non-null  float64
 5   sight_left        573651 non-null  float64
 6   sight_right       573651 non-null  float64
 7   hear_left         0 non-null       object 
 8   hear_right        0 non-null       object 
 9   SBP               573651 non-null  float64
 10  DBP               573651 non-null  float64
 11  BLDS              573651 non-null  float64
 12  tot_chole         573651 non-null  float64
 13  HDL_chole         573651 non-null  float64
 14  LDL_chole         573651 non-null  float64
 15  triglyceride      573651 non-null  float64
 16  hemoglobin        573651 

### Modeling with One Hot Encoding

In [33]:
import random
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, r2_score, mean_squared_error
import matplotlib.pyplot as plt
import seaborn 
%matplotlib inline

In [53]:
# Text-like columns (object / category dtype) are one-hot encoded.
categ = df_copy.select_dtypes(include=["object", "category"]).columns.tolist()

# Every other column is standardised, except the column that is later used as
# the prediction target; list.remove raises ValueError if it is not present.
num = df_copy.select_dtypes(exclude=["object", "category"]).columns.tolist()
num.remove('SMK_stat_type_cd')

preprocessor = make_column_transformer(
    (OneHotEncoder(), categ),
    (StandardScaler(), num),
)


Classifier Function

In [54]:
def evaluate_classifier_metrics(classifier, X_train, y_train, X_test, y_test):
    """Fit a classifier and report its classification metrics on both splits.

    Parameters
    ----------
    classifier : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train
        Training features and labels used for fitting.
    X_test, y_test
        Held-out features and labels.

    Returns
    -------
    dict
        ``{"train": <report>, "test": <report>}`` holding the same
        ``classification_report`` text that is printed. Previously the
        function returned ``None`` although callers store its result.
    """
    classifier.fit(X_train, y_train)

    reports = {
        "train": classification_report(y_train, classifier.predict(X_train)),
        "test": classification_report(y_test, classifier.predict(X_test)),
    }

    print('Train Set Classification Results:')
    print(reports["train"])
    print('\nTest Set Classification Results:')
    print(reports["test"])

    return reports

KNN Classifier

In [55]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Separate the features from the target column.
X = df_copy.drop('SMK_stat_type_cd', axis=1)
y = df_copy['SMK_stat_type_cd']

# Split BEFORE fitting the preprocessor, so the scaler statistics and encoder
# categories are learned from the training rows only. Fitting on the full data
# set first leaks information about the test rows into training.
# X itself stays the raw feature frame; only the split parts are transformed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

knn_classifier = KNeighborsClassifier(n_neighbors=4)
knn_metrics = evaluate_classifier_metrics(knn_classifier, X_train, y_train, X_test, y_test)

Logistic Regression

In [None]:
logistic_classifier = LogisticRegression()
logistic_metrics = evaluate_classifier_metrics(logistic_classifier, X_train, y_train, X_test, y_test)
print("Logistic Regression Metrics:")