In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report


In [None]:
# Read the already-normalized dataset from the processed-data folder.
normalized_df = pd.read_csv('../data/processed/normalized_df.csv')

# Work on an independent copy (`DataFrame.copy()` is deep by default) so that
# `normalized_df` keeps the data exactly as it was loaded.
df = normalized_df.copy()
df

In [None]:
# Separate the target column from the feature columns.
# Both objects are derived from the untouched `normalized_df` instead of
# dropping the column from `df` in place: the in-place drop made this cell fail
# with KeyError on a second run, because 'smoking' was already gone from `df`.
y = normalized_df['smoking']
df = normalized_df.drop(columns=['smoking'])

# Class balance of the target, followed by the value distribution of every
# remaining feature column.
print(y.value_counts())
for col in df.columns:
    print(df[col].value_counts())

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on `y` and uses
# a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    random_state=15,
    stratify=y,
)

In [None]:
def scale_dataset(x, scaler=None):
    """Scale `x` and return the scaled data together with the scaler used.

    Parameters
    ----------
    x : data passed straight to the scaler's `fit_transform` / `transform`.
    scaler : optional scaler object. When omitted, a new `StandardScaler` is
        created and fitted on `x`; when given, it is used as-is and only its
        `transform` method is called (it is not refitted).

    Returns
    -------
    tuple
        `(scaled_x, scaler)` where `scaler` is either the one passed in or the
        newly fitted `StandardScaler`.
    """
    # A scaler was supplied: apply it without refitting.
    if scaler is not None:
        return scaler.transform(x), scaler

    # No scaler yet: fit a new one on this data and transform in one step.
    fitted_scaler = StandardScaler()
    return fitted_scaler.fit_transform(x), fitted_scaler

# Fit the scaler on the training split only, then reuse that same fitted scaler
# for the test split (scale_dataset does not refit a scaler it is given).
x_train, scaler = scale_dataset(X_train)
x_test = scale_dataset(X_test, scaler=scaler)[0]

### Support Vector Machine (SVM)

In [None]:
from sklearn.svm import SVC

# Baseline RBF-kernel SVM. `fit` returns the estimator itself, so the fitted
# model can be bound in a single statement.
svc = SVC(kernel='rbf', C=1.0, gamma='scale').fit(x_train, y_train)

# Predict the held-out split and report accuracy plus per-class metrics.
y_pred = svc.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



In [None]:
# This is the first cell that uses confusion_matrix, plt and sns, and nothing
# earlier in the notebook imports them, so a fresh-kernel "Run All" raised
# NameError here. Import them in this cell so it runs top to bottom.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion matrix of the current `y_pred` against the held-out labels,
# rendered as an annotated heatmap with integer cell counts.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

#### SVC with Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values for the exhaustive search.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 0.001, 0.01, 0.1, 1],
    'kernel': ['rbf', 'poly'],
}

svc = SVC()

# 5-fold cross-validated grid search scored on accuracy; n_jobs=-1 asks for
# all available processors.
grid = GridSearchCV(svc, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid.fit(x_train, y_train)

print("Best parameters:", grid.best_params_)

# Evaluate the best estimator found by the search on the held-out split.
best_svc = grid.best_estimator_
y_pred = best_svc.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



In [None]:
import optuna
# cross_val_score is provided by sklearn.model_selection; importing it from
# sklearn.metrics raises ImportError and stopped this cell from running.
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Optuna objective: mean 3-fold cross-validated accuracy of an SVC.

    Parameters
    ----------
    trial : optuna trial object used to obtain this trial's values for
        "svc_C", "svc_gamma" and "svc_kernel".

    Returns
    -------
    float
        Mean of the per-fold accuracy scores on `x_train` / `y_train`
        (module-level variables defined earlier in the notebook).
    """
    svc_C = trial.suggest_float("svc_C", 0.1, 1000)
    svc_gamma = trial.suggest_float("svc_gamma", 0.0001, 1)
    svc_kernel = trial.suggest_categorical("svc_kernel", ['rbf', 'poly'])

    model = SVC(C=svc_C, gamma=svc_gamma, kernel=svc_kernel)

    # Accuracy is what a classifier's default scorer reports; naming it
    # explicitly keeps the returned value unambiguous.
    fold_scores = cross_val_score(model, x_train, y_train, cv=3, scoring="accuracy")
    return fold_scores.mean()
# Explicit grid handed to optuna's GridSampler. The keys are the same parameter
# names that `objective` asks the trial for.
search_space = {
    'svc_C': [0.1, 1, 10, 100, 1000],
    'svc_gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'svc_kernel': ['rbf', 'poly'],
}

# Maximize the value returned by `objective` (mean CV accuracy).
# NOTE(review): no n_trials is given; this presumably relies on GridSampler
# ending the study once the grid is exhausted - confirm against optuna docs.
study = optuna.create_study(
    sampler=optuna.samplers.GridSampler(search_space),
    direction="maximize",
)
study.optimize(objective)


### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score 
from sklearn.preprocessing import label_binarize

# Baseline logistic regression: L2 penalty, lbfgs solver, up to 1000 iterations.
# `fit` returns the estimator, so construction and fitting are chained.
lg_model = LogisticRegression(solver="lbfgs", penalty='l2', max_iter=1000).fit(x_train, y_train)

# Hard class predictions and per-class probabilities for the held-out split.
y_pred = lg_model.predict(x_test)
y_pred_prob = lg_model.predict_proba(x_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))

# NOTE(review): the one-vs-rest AUC computation below is disabled, so
# `y_pred_prob` is currently not used in this cell.
# class_labels = sorted(np.unique(y))
# y_test_bin = label_binarize(y_test, classes = class_labels)
# print("AUC Score: ", roc_auc_score(y_test, y_pred_prob, multi_class = 'ovr'))

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrix of the current `y_pred` against the held-out labels, drawn
# as an annotated heatmap with integer cell counts.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')

# Label the axes the heatmap was drawn on (the current axes of the new figure).
heatmap_ax.set_xlabel("Predicted")
heatmap_ax.set_ylabel("Actual")
heatmap_ax.set_title("Confusion Matrix for Logistic Regression")
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV

# The lbfgs and newton-cg solvers only support the L2 penalty (or none), so a
# single grid that crosses them with 'l1' makes every l1 candidate fail to fit.
# The grid is therefore split into solver-compatible sub-grids: L2 keeps the
# original two solvers, L1 uses liblinear, which does support it.
# NOTE(review): liblinear is chosen on the assumption that the target is
# binary - confirm; 'saga' is the L1-capable alternative otherwise.
c_values = np.logspace(-4, 4, 20)
max_iter_values = [100, 200, 300, 500]
param_grid = [
    {
        'penalty': ['l2'],
        'C': c_values,
        'solver': ['lbfgs', 'newton-cg'],
        'max_iter': max_iter_values,
    },
    {
        'penalty': ['l1'],
        'C': c_values,
        'solver': ['liblinear'],
        'max_iter': max_iter_values,
    },
]

lg_model = LogisticRegression()

# 10-fold cross-validated search scored on accuracy; n_jobs=-1 asks for all
# available processors.
grid_search = GridSearchCV(lg_model, param_grid, cv = 10, scoring = 'accuracy', n_jobs = -1)
grid_search.fit(x_train, y_train)

# Evaluate the best estimator found by the search on the held-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)
y_pred_prob = best_model.predict_proba(x_test)
print(grid_search.best_params_)
print("Accuracy: ", accuracy_score(y_test, y_pred))

### Clustering 

### Naive Bayes Classifier

### Neural Networks

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras import Sequential
from keras.layers import Dense, Dropout
import keras_tuner as kt
from sklearn.metrics import r2_score, accuracy_score

def build_model(hp):
    """Build and compile a Keras Sequential model for keras-tuner.

    The network has a fixed 16-unit ReLU input layer declared with
    input_shape=(16,), then between 1 and 3 tunable ReLU hidden layers (each
    followed by Dropout(0.5)), and a single sigmoid output unit. It is compiled
    with binary cross-entropy loss and the accuracy metric.

    Parameters
    ----------
    hp : hyperparameter object supplied by the tuner; queried for
        'num_of_layers', 'num_nodes<i>' for each hidden layer, and 'optimizer'.

    Returns
    -------
    The compiled Sequential model.
    """
    nn_model = Sequential()
    nn_model.add(Dense(16, activation='relu', input_shape=(16,)))

    # Number of tunable hidden blocks for this trial.
    hidden_layer_count = hp.Int('num_of_layers', min_value=1, max_value=3)
    for layer_idx in range(hidden_layer_count):
        # Each hidden layer gets its own width hyperparameter.
        layer_units = hp.Int('num_nodes' + str(layer_idx), min_value=2, max_value=16, step=3)
        nn_model.add(Dense(layer_units, activation='relu'))
        nn_model.add(Dropout(0.5))

    nn_model.add(Dense(1, activation='sigmoid'))

    optimizer_name = hp.Choice('optimizer', values=['Adam'])
    nn_model.compile(
        optimizer=optimizer_name,
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return nn_model

In [None]:
# Grid-search tuner over the hyperparameters declared in `build_model`,
# selecting on validation accuracy; trial artifacts go under mydir/dib30.
tuner = kt.GridSearch(
    build_model,
    objective='val_accuracy',
    directory="mydir",
    project_name="dib30",
)

# Each trial trains for 3 epochs, holding out 20% of the training data for
# validation.
tuner.search(x_train, y_train, epochs=3, validation_split=0.2)

# Show the hyperparameter values of the best trial.
tuner.get_best_hyperparameters()[0].values

In [None]:
import matplotlib.pyplot as plt

def plot_history(history):
    """Plot training and validation loss curves from a fit history.

    Parameters
    ----------
    history : object whose `.history` mapping contains the 'loss' and
        'val_loss' series (one value per epoch), e.g. the return value of
        `model.fit`.
    """
    # Draw both curves in the same order as the legend entries below.
    for series_key in ('loss', 'val_loss'):
        plt.plot(history.history[series_key])

    plt.title('Model Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Training Loss', 'Validation Loss'], loc='upper left')
    plt.show()

In [None]:
# Take the best model found by the tuner and keep training it on the training
# data, holding out 20% for validation.
# NOTE(review): initial_epoch=5 with epochs=55 - presumably meant to resume
# after the tuner's epochs, but the tuner above ran 3; confirm the intent.
# NOTE(review): `hist` is not plotted in this cell; plot_history(hist) looks
# like the intended follow-up.
model = tuner.get_best_models(num_models=1)[0]
hist = model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    initial_epoch=5,
    epochs=55,
    verbose=0,
)

# Sigmoid outputs for the held-out split, then loss/accuracy on the same split.
y_pred = model.predict(x_test)
loss, accuracy = model.evaluate(x_test, y_test)
print("Accuracy: ", accuracy)
print("AUC score: ", roc_auc_score(y_test, y_pred))