# Imports
---

In [1]:
import pandas as pd
import numpy as np
import math

# Import Dataset
---
**Column Description (Data Source: [Smoking and Drinking Dataset with body signal on Kaggle](https://www.kaggle.com/datasets/sooyoungher/smoking-drinking-dataset/data))**


In [2]:
def percentage_missing_values(data):
    total_cells = np.product(data.shape)

    missing_cells = data.isnull().sum().sum()

    percentage_missing = (missing_cells / total_cells) * 100
    return f"Percentage of missing values: {percentage_missing:.2f}%"


In [3]:
df = pd.read_csv('Data/sd.csv')
df_copy = df.copy()
df_copy.head()
df.to_csv('outpu.csv', index=False)

In [4]:
percentage_missing_values(df_copy)

'Percentage of missing values: 0.00%'

# Create Variable Mappings
---

In [5]:
df_copy['hear_left'].value_counts()

hear_left
1.0    960124
2.0     31222
Name: count, dtype: int64

In [6]:
df_copy['SMK_stat_type_cd'] = df_copy['SMK_stat_type_cd'].map({1: 'N', 2: 'Q', 3: 'S'}) # Smoking state, 1(never) = N , 2(used to smoke but quit) = Q, 3(still smoke) = S
df_copy['hear_left'] = df_copy['hear_left'].map({1: 'Normal', 2: 'Abnormal'})
df_copy['hear_right'] = df_copy['hear_right'].map({1: 'Normal', 2: 'Abnormal'})
df_copy['urine_protein'] = df_copy['urine_protein'].map({1: '-', 2: '+/-', 3: '+1', 4: '+2', 5: '+3', 6: '+4'})

In [7]:
print(df_copy.select_dtypes(include=[object,bool]).columns)

Index(['sex', 'hear_left', 'hear_right', 'urine_protein', 'SMK_stat_type_cd',
       'DRK_YN'],
      dtype='object')


# Encoding 
---

In [8]:
def one_hot_encode_dataframe(df,columns):
    copy = df.copy()
    ohe = pd.get_dummies(df_copy, columns=columns)
    return ohe

In [9]:
columns = ['sex','hear_left','hear_right','urine_protein'] # We can then re-use the one hot encode method depending if we want to predict smoker or drinker
ohe_df = one_hot_encode_dataframe(df_copy, columns)

In [10]:
ohe_df.head()

Unnamed: 0,age,height,weight,waistline,sight_left,sight_right,SBP,DBP,BLDS,tot_chole,...,hear_left_Abnormal,hear_left_Normal,hear_right_Abnormal,hear_right_Normal,urine_protein_+/-,urine_protein_+1,urine_protein_+2,urine_protein_+3,urine_protein_+4,urine_protein_-
0,35,170,75,90.0,1.0,1.0,120.0,80.0,99.0,193.0,...,False,True,False,True,False,False,False,False,False,True
1,30,180,80,89.0,0.9,1.2,130.0,82.0,106.0,228.0,...,False,True,False,True,False,False,False,False,False,True
2,40,165,75,91.0,1.2,1.5,120.0,70.0,98.0,136.0,...,False,True,False,True,False,False,False,False,False,True
3,50,175,80,91.0,1.5,1.2,145.0,87.0,95.0,201.0,...,False,True,False,True,False,False,False,False,False,True
4,50,165,60,80.0,1.0,1.2,138.0,82.0,101.0,199.0,...,False,True,False,True,False,False,False,False,False,True


In [13]:
ohe_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 991346 entries, 0 to 991345
Data columns (total 32 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   age                  991346 non-null  int64  
 1   height               991346 non-null  int64  
 2   weight               991346 non-null  int64  
 3   waistline            991346 non-null  float64
 4   sight_left           991346 non-null  float64
 5   sight_right          991346 non-null  float64
 6   SBP                  991346 non-null  float64
 7   DBP                  991346 non-null  float64
 8   BLDS                 991346 non-null  float64
 9   tot_chole            991346 non-null  float64
 10  HDL_chole            991346 non-null  float64
 11  LDL_chole            991346 non-null  float64
 12  triglyceride         991346 non-null  float64
 13  hemoglobin           991346 non-null  float64
 14  serum_creatinine     991346 non-null  float64
 15  SGOT_AST         

### Modeling with One Hot Encoding

In [11]:
import random
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, r2_score, mean_squared_error
import matplotlib.pyplot as plt
import seaborn 
%matplotlib inline

Linear Regression

In [12]:
categ = ohe_df.select_dtypes(include=["object", "category"])
num = ohe_df.select_dtypes(exclude=["object", "category"])


prep = make_column_transformer( 
        (OneHotEncoder(), categ),(
        StandardScaler(),num))

l  = LinearRegression()
pipe = make_pipeline(prep,l)
pipe

In [18]:
X = ohe_df.drop('age', axis =1)
y = ohe_df['age'] 

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)


pipe.fit(X_train, y_train)

linear_predictions = pipe.predict(X_test) 


linear_mse = mean_squared_error(y_test, linear_predictions) 
linear_r2 = r2_score(y_test, linear_predictions) 
linear_rmse = np.sqrt(linear_mse) 
linear_meany = np.mean(y)
linear_rmsem = linear_rmse/ linear_meany 

print("Linear Regression Model:")
print("Mean Squared Error:", linear_mse)
print("R-squared:", linear_r2)
print("RMSEM:", linear_rmsem)

ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed

KNN Regression 


In [None]:
knn_regressor = KNeighborsRegressor(n_neighbors=5)  

pipeline_knn = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', knn_regressor)
])

X = ohe_df.drop('', axis=1)
y = ohe_df['']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

pipeline_knn.fit(X_train, y_train)

knn_predictions = pipeline_knn.predict(X_test)

knn_mse = mean_squared_error(y_test, knn_predictions)
knn_r2 = r2_score(y_test, knn_predictions)
knn_rmse = np.sqrt(knn_mse)
knn_meany = np.mean(y)
knn_rmsem = knn_rmse / knn_meany

print("KNN Regression Model:")
print("Mean Squared Error:", knn_mse)
print("R-squared:", knn_r2)
print("RMSEM:", knn_rmsem)

Classifiers Function 

In [17]:
def evaluate_classifier_metrics(classifier, X_train, y_train, X_test, y_test):
    categ = ohe_df.select_dtypes(include=["object", "category"])
    num = ohe_df.select_dtypes(exclude=["object", "category"])

    preprocessor = make_column_transformer(
        (OneHotEncoder(), categ),
        (StandardScaler(), num)
    )

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', classifier)
    ])

    pipeline.fit(X_train, y_train)

    test_predictions = pipeline.predict(X_test)
    train_predictions = pipeline.predict(X_train)

    accuracy_test = accuracy_score(y_test, test_predictions)
    roc_auc_test = roc_auc_score(y_test, test_predictions)
    kappa_test = cohen_kappa_score(y_test, test_predictions)
    f1_test = f1_score(y_test, test_predictions)

    accuracy_train = accuracy_score(y_train, train_predictions)
    roc_auc_train = roc_auc_score(y_train, train_predictions)
    kappa_train = cohen_kappa_score(y_train, train_predictions)
    f1_train = f1_score(y_train, train_predictions)

    return {
        'Test Accuracy': accuracy_test,
        'Test ROC AUC': roc_auc_test,
        'Test Kappa': kappa_test,
        'Test F1 Score': f1_test,
        'Train Accuracy': accuracy_train,
        'Train ROC AUC': roc_auc_train,
        'Train Kappa': kappa_train,
        'Train F1 Score': f1_train
    }

KNN Classifier 

In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

X = ohe_df.drop('SMK_stat_type_cd',axis=1)
y = ohe_df['SMK_stat_type_cd'] 


knn_classifier = KNeighborsClassifier(n_neighbors=4)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=42)
knn_metrics = evaluate_classifier_metrics(knn_classifier, X_train, y_train, X_test, y_test)
print("KNN Metrics:")
for metric, value in knn_metrics.items(): 
    print(f"{metric}: {value}")

ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed

Logistic Regression

In [None]:
logistic_classifier = LogisticRegression()
logistic_metrics = evaluate_classifier_metrics(logistic_classifier, X_train, y_train, X_test, y_test)
print("Logistic Regression Metrics:")
for metric, value in logistic_metrics.items(): 
    print(f"{metric}: {value}")