# Imports
---

In [4]:
import pandas as pd
import numpy as np
import math

# Import Dataset
---
**Column Description (Data Source: [Smoking and Drinking Dataset with body signal on Kaggle](https://www.kaggle.com/datasets/sooyoungher/smoking-drinking-dataset/data))**

- Sex - male, female
- Age - round up to 5 year
- Height - round up to 5 cm[cm]
- Weight [kg]
- Waistline
- Sight_left - eyesight(left) the column shows how well the left eye sees, where perfect vision = 1.0
- Sight_right - eyesight(right) the column shows how well the right eye sees where perfect vision = 1.0
- Hear_left - hearing left, 1(normal), 2(abnormal)
- Hear_right - hearing right, 1(normal), 2(abnormal)
- SBP - Systolic blood pressure[mmHg]
- DBP - Diastolic blood pressure[mmHg]
- BLDS - BLDS or FSG(fasting blood glucose)[mg/dL]
- Tot_chole - total cholesterol[mg/dL]
- HDL_chole - HDL cholesterol[mg/dL] - the only fraction of cholesterol that is called "good", "useful" cholesterol.
- LDL_chole - LDL cholesterol[mg/dL]
- Triglyceride - triglyceride[mg/dL]
- Hemoglobin - hemoglobin[g/dL]
- Urine_protein - protein in urine, 1(-), 2(+/-), 3(+1), 4(+2), 5(+3), 6(+4)
- Serum_creatinine - serum(blood) creatinine[mg/dL]
- SGOT_AST - SGOT(Glutamate-oxaloacetate transaminase) AST(Aspartate transaminase)[IU/L]
- SGOT_ALT - ALT(Alanine transaminase)[IU/L]
- Gamma_GTP - y-glutamyl transpeptidase[IU/L]
- SMK_stat_type_cd - Smoking state, 1(never), 2(used to smoke but quit), 3(still smoke)
- DRK_YN - Drinker or Not


In [16]:
df = pd.read_csv('Data/sd.csv')
df_copy = df.copy()

# Useful functions
---

In [17]:
def remove_outliers_from_columns(df, column_names):

    for column_name in column_names:
        # Calculate Q1, Q3, and IQR
        Q1 = df[column_name].quantile(0.25)
        Q3 = df[column_name].quantile(0.75)
        IQR = Q3 - Q1

        # Define bounds
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Filter out outliers
        df = df[(df[column_name] >= lower_bound) & (df[column_name] <= upper_bound)]

    return df



In [18]:
def percentage_missing_values(data):
    total_cells = np.product(data.shape)

    missing_cells = data.isnull().sum().sum()

    percentage_missing = (missing_cells / total_cells) * 100
    return f"Percentage of missing values: {percentage_missing:.2f}%"


In [19]:
def duplicate_data(data):

    duplicates_cells = data[data.duplicated].shape
    
    return f"Number of duplicates: {duplicates_cells[0]}"

# Clean data
---
- Remove outliers
- Remove duplicates
- impute missing values

## Remove Outliers
---

In [57]:
df_copy.describe()

Unnamed: 0,age,height,weight,waistline,sight_left,sight_right,hear_left,hear_right,SBP,DBP,...,HDL_chole,LDL_chole,triglyceride,hemoglobin,urine_protein,serum_creatinine,SGOT_AST,SGOT_ALT,gamma_GTP,SMK_stat_type_cd
count,544456.0,544456.0,544456.0,544456.0,544456.0,544456.0,544456.0,544456.0,544456.0,544456.0,...,544456.0,544456.0,544456.0,544456.0,544456.0,544456.0,544456.0,544456.0,544456.0,544456.0
mean,46.158386,160.846276,59.538539,77.894888,0.946704,0.943934,1.027685,1.026878,118.848678,73.764778,...,59.858299,111.987174,96.39992,13.826581,1.065186,0.808354,21.123082,17.466982,19.380848,1.432323
std,14.489838,9.044924,10.763349,8.709568,0.327453,0.327151,0.164067,0.161728,12.811646,8.559513,...,14.9034,30.943446,41.953641,1.506384,0.34019,0.176191,4.896659,6.29644,7.828522,0.738428
min,20.0,130.0,25.0,53.6,0.1,0.1,1.0,1.0,84.0,54.0,...,1.0,1.0,1.0,1.0,1.0,0.4,8.0,1.0,1.0,1.0
25%,35.0,155.0,50.0,71.2,0.7,0.7,1.0,1.0,110.0,68.0,...,49.0,90.0,64.0,12.9,1.0,0.7,18.0,13.0,13.0,1.0
50%,45.0,160.0,60.0,78.0,1.0,1.0,1.0,1.0,119.0,73.0,...,58.0,110.0,88.0,13.8,1.0,0.8,21.0,16.0,18.0,1.0
75%,55.0,165.0,65.0,84.0,1.2,1.2,1.0,1.0,128.0,80.0,...,69.0,132.0,122.0,14.9,1.0,0.9,24.0,21.0,24.0,2.0
max,85.0,190.0,120.0,104.5,1.9,1.9,2.0,2.0,157.0,96.0,...,933.0,1750.0,218.0,25.0,6.0,1.2,35.0,35.0,41.0,3.0


In [58]:
df_copy = remove_outliers_from_columns(df_copy, ['waistline','SBP', 'DBP','BLDS','tot_chole','triglyceride','serum_creatinine','SGOT_AST', 'SGOT_ALT','sight_left','sight_right','gamma_GTP'])

In [59]:
df_copy.describe()

Unnamed: 0,age,height,weight,waistline,sight_left,sight_right,hear_left,hear_right,SBP,DBP,...,HDL_chole,LDL_chole,triglyceride,hemoglobin,urine_protein,serum_creatinine,SGOT_AST,SGOT_ALT,gamma_GTP,SMK_stat_type_cd
count,514490.0,514490.0,514490.0,514490.0,514490.0,514490.0,514490.0,514490.0,514490.0,514490.0,...,514490.0,514490.0,514490.0,514490.0,514490.0,514490.0,514490.0,514490.0,514490.0,514490.0
mean,45.888278,160.794554,59.291434,77.635536,0.948281,0.94526,1.027064,1.026238,118.574266,73.624008,...,60.124424,111.732883,94.428313,13.799538,1.064654,0.805935,20.856471,17.067026,19.026622,1.423658
std,14.471204,9.004381,10.647231,8.639888,0.327297,0.327096,0.162269,0.159841,12.720438,8.542271,...,14.871205,30.535743,40.000063,1.503407,0.33819,0.175724,4.65078,5.942221,7.567117,0.734103
min,20.0,130.0,25.0,53.6,0.1,0.1,1.0,1.0,84.0,54.0,...,1.0,1.0,1.0,1.0,1.0,0.4,9.0,1.0,1.0,1.0
25%,35.0,155.0,50.0,71.0,0.7,0.7,1.0,1.0,110.0,68.0,...,50.0,90.0,63.0,12.9,1.0,0.7,17.0,13.0,13.0,1.0
50%,45.0,160.0,60.0,77.5,1.0,1.0,1.0,1.0,118.0,73.0,...,59.0,110.0,87.0,13.7,1.0,0.8,20.0,16.0,17.0,1.0
75%,55.0,165.0,65.0,84.0,1.2,1.2,1.0,1.0,128.0,80.0,...,69.0,132.0,119.0,14.8,1.0,0.9,24.0,21.0,23.0,2.0
max,85.0,190.0,120.0,103.1,1.9,1.9,2.0,2.0,155.0,96.0,...,933.0,1750.0,206.0,25.0,6.0,1.2,33.0,33.0,40.0,3.0


## Remove Duplicates
---

In [60]:
duplicate_data(df_copy)

'Number of duplicates: 0'

## Impute Missing Values 
---

In [20]:
percentage_missing_values(df_copy)

'Percentage of missing values: 0.00%'

# Create Variable Mappings
---

In [62]:
df_copy['SMK_stat_type_cd'] = df_copy['SMK_stat_type_cd'].map({1: 'N', 2: 'Q', 3: 'S'}) # Smoking state, 1(never) = N , 2(used to smoke but quit) = Q, 3(still smoke) = S
df_copy['hear_left'] = df_copy['hear_left'].map({1: 'Normal', 2: 'Abnormal'})
df_copy['hear_right'] = df_copy['hear_right'].map({1: 'Normal', 2: 'Abnormal'})
df_copy['urine_protein'] = df_copy['urine_protein'].map({1: '-', 2: '+/-', 3: '+1', 4: '+2', 5: '+3', 6: '+4'})

In [63]:
print(df_copy.select_dtypes(include=[object,bool]).columns)

Index(['sex', 'hear_left', 'hear_right', 'urine_protein', 'SMK_stat_type_cd',
       'DRK_YN'],
      dtype='object')


# Encoding 
---

In [11]:
def one_hot_encode_dataframe(df,columns):
    copy = df.copy()
    ohe = pd.get_dummies(df_copy, columns=columns)
    return ohe

In [12]:
columns = ['sex','hear_left','hear_right','urine_protein','DRK_YN'] # We can then re-use the one hot encode method depending if we want to predict smoker or drinker
ohe_df = one_hot_encode_dataframe(df_copy, columns)

In [13]:
ohe_df.head()

Unnamed: 0,age,height,weight,waistline,sight_left,sight_right,SBP,DBP,BLDS,tot_chole,...,hear_right_Abnormal,hear_right_Normal,urine_protein_+/-,urine_protein_+1,urine_protein_+2,urine_protein_+3,urine_protein_+4,urine_protein_-,DRK_YN_N,DRK_YN_Y
0,35,170,75,90.0,1.0,1.0,120.0,80.0,99.0,193.0,...,0,1,0,0,0,0,0,1,0,1
1,30,180,80,89.0,0.9,1.2,130.0,82.0,106.0,228.0,...,0,1,0,0,0,0,0,1,1,0
2,40,165,75,91.0,1.2,1.5,120.0,70.0,98.0,136.0,...,0,1,0,0,0,0,0,1,1,0
3,50,175,80,91.0,1.5,1.2,145.0,87.0,95.0,201.0,...,0,1,0,0,0,0,0,1,1,0
4,50,165,60,80.0,1.0,1.2,138.0,82.0,101.0,199.0,...,0,1,0,0,0,0,0,1,1,0


In [14]:
ohe_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 991346 entries, 0 to 991345
Data columns (total 33 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   age                  991346 non-null  int64  
 1   height               991346 non-null  int64  
 2   weight               991346 non-null  int64  
 3   waistline            991346 non-null  float64
 4   sight_left           991346 non-null  float64
 5   sight_right          991346 non-null  float64
 6   SBP                  991346 non-null  float64
 7   DBP                  991346 non-null  float64
 8   BLDS                 991346 non-null  float64
 9   tot_chole            991346 non-null  float64
 10  HDL_chole            991346 non-null  float64
 11  LDL_chole            991346 non-null  float64
 12  triglyceride         991346 non-null  float64
 13  hemoglobin           991346 non-null  float64
 14  serum_creatinine     991346 non-null  float64
 15  SGOT_AST         

### Modeling with One Hot Encoding

In [15]:
import random
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, r2_score, mean_squared_error
import matplotlib.pyplot as plt
import seaborn 
%matplotlib inline

Linear Regression

In [20]:
categ = df_copy.select_dtypes(include=[object, "category",bool])
num = df_copy.select_dtypes(exclude=["object", "category"])


prep = make_column_transformer( 
        (OneHotEncoder(), categ),(
        StandardScaler(),num))

l  = LinearRegression()
pipe = make_pipeline(prep,l)
pipe
ohe_df

Unnamed: 0,age,height,weight,waistline,sight_left,sight_right,SBP,DBP,BLDS,tot_chole,...,hear_right_Abnormal,hear_right_Normal,urine_protein_+/-,urine_protein_+1,urine_protein_+2,urine_protein_+3,urine_protein_+4,urine_protein_-,DRK_YN_N,DRK_YN_Y
0,35,170,75,90.0,1.0,1.0,120.0,80.0,99.0,193.0,...,0,1,0,0,0,0,0,1,0,1
1,30,180,80,89.0,0.9,1.2,130.0,82.0,106.0,228.0,...,0,1,0,0,0,0,0,1,1,0
2,40,165,75,91.0,1.2,1.5,120.0,70.0,98.0,136.0,...,0,1,0,0,0,0,0,1,1,0
3,50,175,80,91.0,1.5,1.2,145.0,87.0,95.0,201.0,...,0,1,0,0,0,0,0,1,1,0
4,50,165,60,80.0,1.0,1.2,138.0,82.0,101.0,199.0,...,0,1,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991341,45,175,80,92.1,1.5,1.5,114.0,80.0,88.0,198.0,...,0,1,0,0,0,0,0,1,1,0
991342,35,170,75,86.0,1.0,1.5,119.0,83.0,83.0,133.0,...,0,1,0,0,0,0,0,1,1,0
991343,40,155,50,68.0,1.0,0.7,110.0,70.0,90.0,205.0,...,0,1,0,0,0,0,0,1,0,1
991344,25,175,60,72.0,1.5,1.0,119.0,74.0,69.0,122.0,...,0,1,0,0,0,0,0,1,1,0


In [21]:
X = df_copy.drop('age', axis =1)
y = df_copy['age'] 

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)


pipe.fit(X_train, y_train)

linear_predictions = pipe.predict(X_test) 


linear_mse = mean_squared_error(y_test, linear_predictions) 
linear_r2 = r2_score(y_test, linear_predictions) 
linear_rmse = np.sqrt(linear_mse) 
linear_meany = np.mean(y)
linear_rmsem = linear_rmse/ linear_meany 

print("Linear Regression Model:")
print("Mean Squared Error:", linear_mse)
print("R-squared:", linear_r2)
print("RMSEM:", linear_rmsem)

ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed

KNN Regression 


In [None]:
knn_regressor = KNeighborsRegressor(n_neighbors=5)  

pipeline_knn = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', knn_regressor)
])

X = ohe_df.drop('', axis=1)
y = ohe_df['']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

pipeline_knn.fit(X_train, y_train)

knn_predictions = pipeline_knn.predict(X_test)

knn_mse = mean_squared_error(y_test, knn_predictions)
knn_r2 = r2_score(y_test, knn_predictions)
knn_rmse = np.sqrt(knn_mse)
knn_meany = np.mean(y)
knn_rmsem = knn_rmse / knn_meany

print("KNN Regression Model:")
print("Mean Squared Error:", knn_mse)
print("R-squared:", knn_r2)
print("RMSEM:", knn_rmsem)

Classifiers Function 

In [17]:
def evaluate_classifier_metrics(classifier, X_train, y_train, X_test, y_test):
    categ = ohe_df.select_dtypes(include=["object", "category"])
    num = ohe_df.select_dtypes(exclude=["object", "category"])

    preprocessor = make_column_transformer(
        (OneHotEncoder(), categ),
        (StandardScaler(), num)
    )

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', classifier)
    ])

    pipeline.fit(X_train, y_train)

    test_predictions = pipeline.predict(X_test)
    train_predictions = pipeline.predict(X_train)

    accuracy_test = accuracy_score(y_test, test_predictions)
    roc_auc_test = roc_auc_score(y_test, test_predictions)
    kappa_test = cohen_kappa_score(y_test, test_predictions)
    f1_test = f1_score(y_test, test_predictions)

    accuracy_train = accuracy_score(y_train, train_predictions)
    roc_auc_train = roc_auc_score(y_train, train_predictions)
    kappa_train = cohen_kappa_score(y_train, train_predictions)
    f1_train = f1_score(y_train, train_predictions)

    return {
        'Test Accuracy': accuracy_test,
        'Test ROC AUC': roc_auc_test,
        'Test Kappa': kappa_test,
        'Test F1 Score': f1_test,
        'Train Accuracy': accuracy_train,
        'Train ROC AUC': roc_auc_train,
        'Train Kappa': kappa_train,
        'Train F1 Score': f1_train
    }

KNN Classifier 

In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

X = ohe_df.drop('SMK_stat_type_cd',axis=1)
y = ohe_df['SMK_stat_type_cd'] 


knn_classifier = KNeighborsClassifier(n_neighbors=4)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=42)
knn_metrics = evaluate_classifier_metrics(knn_classifier, X_train, y_train, X_test, y_test)
print("KNN Metrics:")
for metric, value in knn_metrics.items(): 
    print(f"{metric}: {value}")

ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed

Logistic Regression

In [None]:
logistic_classifier = LogisticRegression()
logistic_metrics = evaluate_classifier_metrics(logistic_classifier, X_train, y_train, X_test, y_test)
print("Logistic Regression Metrics:")
for metric, value in logistic_metrics.items(): 
    print(f"{metric}: {value}")