In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report


In [2]:
# Read the processed dataset (file is named as already normalized) from disk.
normalized_df = pd.read_csv('../data/processed/normalized_df.csv')

# Work on an independent copy so the loaded frame stays untouched;
# DataFrame.copy() is deep by default.
df = normalized_df.copy()

# Bare last expression -> rich notebook display of the frame.
df

Unnamed: 0,age,eyesight(left),eyesight(right),systolic,relaxation,fasting blood sugar,Cholesterol,triglyceride,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking,BMI
0,0.230769,0.45,0.45,0.290123,0.358491,0.135279,0.471795,0.146317,0.185915,0.075847,0.919753,0.0,0.078261,0.050738,0.039135,0.123370,1.0,1.0,0.537363
1,0.000000,0.35,0.45,0.296296,0.367925,0.111406,0.400000,0.121090,0.188732,0.060785,0.679012,0.0,0.086957,0.011993,0.008239,0.028084,1.0,0.0,0.768256
2,0.384615,0.45,0.45,0.240741,0.377358,0.090186,0.353846,0.113017,0.149296,0.059710,0.543210,0.4,0.043478,1.000000,0.480261,0.274824,0.0,0.0,0.453737
3,0.384615,0.40,0.35,0.537037,0.452830,0.538462,0.397436,0.361251,0.118310,0.048413,0.740741,0.0,0.069565,0.023985,0.012015,0.034102,0.0,0.0,0.536405
4,0.000000,0.75,0.05,0.234568,0.226415,0.143236,0.317949,0.193744,0.121127,0.048951,0.617284,0.0,0.095652,0.018450,0.009269,0.013039,0.0,0.0,0.275719
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33462,0.307692,0.75,0.75,0.240741,0.207547,0.119363,0.338462,0.076690,0.211268,0.048413,0.691358,0.0,0.069565,0.020295,0.014418,0.034102,1.0,1.0,0.291787
33463,0.307692,0.20,0.30,0.222222,0.188679,0.124668,0.228205,0.045409,0.160563,0.038193,0.456790,0.0,0.034783,0.011070,0.005836,0.019057,1.0,0.0,0.275719
33464,0.384615,0.75,0.60,0.339506,0.301887,0.119363,0.441026,0.092836,0.202817,0.069930,0.469136,0.2,0.043478,0.015683,0.003433,0.010030,0.0,0.0,0.306033
33465,0.307692,0.75,0.75,0.148148,0.273585,0.148541,0.387179,0.040363,0.211268,0.061861,0.438272,0.0,0.043478,0.016605,0.006522,0.015045,0.0,1.0,0.256053


In [3]:
# Separate the binary target from the feature matrix.
# Both are derived from the untouched `normalized_df` instead of dropping the
# column from `df` in place, so this cell is idempotent: re-running it no
# longer raises a KeyError because 'smoking' was already removed from `df`.
y = normalized_df['smoking']
df = normalized_df.drop(columns=['smoking'])

# Class balance of the target, followed by the value distribution of every
# remaining feature column.
print(y.value_counts())
for col in df.columns:
    print(df[col].value_counts())

smoking
0.0    21209
1.0    12258
Name: count, dtype: int64
age
0.307692    9118
0.384615    4235
0.615385    3657
0.461538    3347
0.538462    3024
0.230769    2715
0.153846    2439
0.076923    2114
0.000000     975
0.692308     792
0.769231     504
0.846154     363
0.923077     171
1.000000      13
Name: count, dtype: int64
eyesight(left)
0.60    7616
0.50    7379
0.75    4709
0.45    3107
0.40    3103
0.35    2672
0.30    1519
0.25    1266
0.20     737
0.15     526
0.10     278
1.00     252
0.05     219
0.00      59
0.80      13
0.65       8
0.55       2
0.90       1
0.95       1
Name: count, dtype: int64
eyesight(right)
0.60    7563
0.50    7495
0.75    4561
0.40    3187
0.45    3174
0.35    2582
0.30    1444
0.25    1300
0.20     809
0.15     510
0.10     309
1.00     231
0.05     223
0.00      60
0.80      12
0.65       5
0.55       2
Name: count, dtype: int64
systolic
0.240741    2116
0.302469    2070
0.364198    2010
0.290123    1805
0.277778     943
            ... 
0.771605  

In [5]:
# Hold out 20% of the rows for testing. The split is stratified on the target
# and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    stratify=y,
    random_state=15,
)

In [None]:
def scale_dataset(x, scaler=None):
    """Scale `x`, fitting a new StandardScaler only when none is supplied.

    Args:
        x: data to scale; passed straight to the scaler.
        scaler: an already fitted scaler to reuse. When None, a fresh
            StandardScaler is created and fitted on `x`.

    Returns:
        A `(scaled_x, scaler)` tuple, where `scaler` is the object that
        performed the transformation (the supplied one or the new one).
    """
    # Reuse path: apply the existing scaler without refitting it.
    if scaler is not None:
        return scaler.transform(x), scaler

    # Fit path: learn the scaling from `x` and apply it in one step.
    fitted = StandardScaler()
    return fitted.fit_transform(x), fitted

# Fit the scaler on the training split only, then reuse that same fitted
# scaler for the test split (scale_dataset does not refit a supplied scaler).
x_train, scaler = scale_dataset(X_train)
x_test, scaler = scale_dataset(X_test, scaler=scaler)

### Support Vector Machine (SVM)

In [9]:
from sklearn.svm import SVC

# Baseline SVM with an RBF kernel; fit() returns the estimator itself, so
# construction and training are chained.
svc = SVC(C=1.0, kernel='rbf', gamma='scale').fit(x_train, y_train)

# Predict on the scaled hold-out split.
y_pred = svc.predict(x_test)

# Overall accuracy, then per-class precision / recall / F1.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.7336420675231551
              precision    recall  f1-score   support

         0.0       0.77      0.84      0.80      4242
         1.0       0.66      0.56      0.61      2452

    accuracy                           0.73      6694
   macro avg       0.71      0.70      0.70      6694
weighted avg       0.73      0.73      0.73      6694



#### SVC with Hyperparameter Tuning

In [10]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Candidate hyperparameters: 4 x 5 x 2 = 40 combinations in total.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 0.001, 0.01, 0.1, 1],
    'kernel': ['rbf', 'poly']
}

svc = SVC()

# An exhaustive GridSearchCV over all 40 combinations with 5-fold CV means
# 200 SVC fits on the full training split; the previous run of this cell had
# to be interrupted before it finished. Sample a fixed budget of combinations
# from the same space instead, with a seed so the search is repeatable and
# verbose output so progress is visible while it runs.
# Raise n_iter for a more thorough (and slower) search.
grid = RandomizedSearchCV(
    estimator=svc,
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=15,
    verbose=2,
)

grid.fit(x_train, y_train)

print("Best parameters:", grid.best_params_)
# best_estimator_ is the winning configuration refit on the whole training split.
best_svc = grid.best_estimator_

y_pred = best_svc.predict(x_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


KeyboardInterrupt: 

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import label_binarize

# Baseline logistic regression: L2 penalty, lbfgs solver, generous iteration cap.
lg_model = LogisticRegression(solver="lbfgs", penalty='l2', max_iter=1000)
lg_model = lg_model.fit(x_train, y_train)

# Hard class predictions for accuracy, class probabilities for ROC AUC.
y_pred = lg_model.predict(x_test)
y_pred_prob = lg_model.predict_proba(x_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))

# The target has two classes (0.0 / 1.0), so ROC AUC is computed from the
# probability of the positive class, i.e. the second predict_proba column.
# No label binarization or multi_class option is needed for a binary target.
print("AUC Score: ", roc_auc_score(y_test, y_pred_prob[:, 1]))

In [None]:
from sklearn.model_selection import GridSearchCV

# Value ranges shared by the sub-grids below.
c_values = np.logspace(-4, 4, 20)
max_iters = [100, 200, 300, 500]
all_solvers = ['lbfgs', 'newton-cg', 'sag', 'saga']

# A single cartesian grid of penalty x solver contains combinations that
# LogisticRegression rejects ('l1' with lbfgs/newton-cg/sag, 'elasticnet'
# without saga or without an l1_ratio), so those fits only produce errors and
# NaN scores. List only the valid combinations instead.
param_grid = [
    # L2 is supported by every solver in the list.
    {'penalty': ['l2'], 'solver': all_solvers,
     'C': c_values, 'max_iter': max_iters},
    # Of the listed solvers, only saga supports L1.
    {'penalty': ['l1'], 'solver': ['saga'],
     'C': c_values, 'max_iter': max_iters},
    # Elastic-net needs saga and an explicit l1_ratio.
    {'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.5],
     'C': c_values, 'max_iter': max_iters},
    # No regularization: C has no effect, so it is not varied here.
    # NOTE(review): None replaces the older string 'none'; confirm the
    # installed scikit-learn version accepts penalty=None.
    {'penalty': [None], 'solver': all_solvers, 'max_iter': max_iters},
]

lg_model = LogisticRegression()
grid_search = GridSearchCV(lg_model, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
grid_search.fit(x_train, y_train)

# Evaluate the best configuration (refit on the full training split) on the
# hold-out data.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)
y_pred_prob = best_model.predict_proba(x_test)
print(grid_search.best_params_)
print("Accuracy: ", accuracy_score(y_test, y_pred))

### Clustering

*TODO: this section has no content yet.*

### Naive Bayes Classifier

*TODO: this section has no content yet.*

### Neural Networks

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras import Sequential
from keras.layers import Dense, Dropout
import keras_tuner as kt
from sklearn.metrics import r2_score, accuracy_score

def build_model(hp):
    """Build and compile the binary classifier used by the tuner search.

    Architecture: an 18-unit ReLU layer declared with input_shape=(18,),
    then a tunable number of hidden blocks (each a ReLU Dense layer followed
    by Dropout(0.5)), and a single sigmoid output unit.

    Tuned hyperparameters:
        num_of_layers: number of hidden blocks, 1 to 3.
        num_nodes<i>: units of hidden block i, min 2, max 16, step 3.
        optimizer: 'RMSprop' or 'Adam'.

    Args:
        hp: hyperparameter object supplied by the tuner; queried through
            hp.Int and hp.Choice.

    Returns:
        The compiled model (binary cross-entropy loss, accuracy metric).
    """
    network = Sequential()
    network.add(Dense(18, activation='relu', input_shape=(18,)))

    hidden_blocks = hp.Int('num_of_layers', min_value=1, max_value=3)
    for block_idx in range(hidden_blocks):
        # Each block registers its own width hyperparameter: num_nodes0, num_nodes1, ...
        units = hp.Int(f'num_nodes{block_idx}', min_value=2, max_value=16, step=3)
        network.add(Dense(units, activation='relu'))
        network.add(Dropout(0.5))

    # Single sigmoid unit -> probability of the positive class.
    network.add(Dense(1, activation='sigmoid'))

    optimizer_name = hp.Choice('optimizer', values=['RMSprop', 'Adam'])
    network.compile(
        optimizer=optimizer_name,
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [None]:
# Grid search over the hyperparameters declared in build_model, ranked by
# validation accuracy.
tuner = kt.GridSearch(
    build_model,
    objective='val_accuracy',
    directory="mydir",
    project_name="dib38",
)

# Short 5-epoch trials, holding out 20% of the training data for validation.
tuner.search(x_train, y_train, epochs=5, validation_split=0.2)

# Display the hyperparameter values of the best trial.
tuner.get_best_hyperparameters()[0].values

In [None]:
import matplotlib.pyplot as plt

def plot_history(history):
    """Plot training and validation loss per epoch.

    Args:
        history: object whose `.history` mapping holds the 'loss' and
            'val_loss' sequences (e.g. the value returned by model.fit).
    """
    curves = history.history
    # Training loss is drawn first, validation loss second, which is the
    # order the legend entries below rely on.
    for series in ('loss', 'val_loss'):
        plt.plot(curves[series])

    plt.title('Model Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Training Loss', 'Validation Loss'], loc='upper left')
    plt.show()

In [None]:
# roc_auc_score is imported here because the Neural Networks import cell does
# not bring it in; without this the cell only works if the Logistic
# Regression cell happened to run earlier in the same kernel.
from sklearn.metrics import roc_auc_score

# Take the best model found by the tuner and continue training it from epoch 5
# up to epoch 55, keeping 20% of the training data for validation.
model = tuner.get_best_models(num_models=1)[0]
hist = model.fit(
    x_train,
    y_train,
    epochs=55,
    initial_epoch=5,
    validation_split=0.2,
    verbose=0,
)

# Show the loss curves: the history was recorded but previously never plotted.
plot_history(hist)

# The sigmoid output is used directly as the score for ROC AUC.
y_pred = model.predict(x_test)
loss, accuracy = model.evaluate(x_test, y_test)
print("Accuracy: ", accuracy)
print("AUC score: ", roc_auc_score(y_test, y_pred))