In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the raw training CSV once; `data` stays untouched while every later
# step works on `df`, an independent copy (DataFrame.copy is deep by default).
data = pd.read_csv("../data/raw/train_dataset.csv")
df = data.copy()
df

In [None]:
# Overwrite every eyesight reading equal to 9.9 with 0, for both eyes.
# NOTE(review): 9.9 is presumably a sentinel code rather than a real
# measurement - confirm that 0 is the intended replacement value.
for eye_col in ('eyesight(left)', 'eyesight(right)'):
    df.loc[df[eye_col] == 9.9, eye_col] = 0

df

In [None]:
# Number of rows that exactly repeat an earlier row (first occurrence not counted).
df.duplicated(keep='first').sum()

In [None]:
# Missing-value count per column.
df.isna().sum()

In [None]:
# Remove repeated rows in place (the first occurrence of each is kept), then
# re-count duplicates as a sanity check. The index is not reset here.
df.drop_duplicates(keep='first', inplace=True)
df.duplicated(keep='first').sum()

In [None]:
# Print the column/dtype/non-null overview, then display summary statistics
# (describe() is the cell's last expression, so it is what gets rendered).
df.info()
df.describe()

In [None]:
import smoker_stat_pred.plots as custom_plots
# Call the project's plot_histogram helper for the 'eyesight(left)' column and
# render the object it returns.
fig = custom_plots.plot_histogram(df, 'eyesight(left)')
fig.show()

In [None]:
# Same helper call as for the left eye, this time for 'eyesight(right)'.
fig = custom_plots.plot_histogram(df, 'eyesight(right)')
fig.show()

In [None]:
# hearing
def merge_hearing(row):
    """Collapse the two per-ear hearing codes of one row into a single score.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            'hearing(left)' and 'hearing(right)'.

    Returns:
        0 when both codes are 1 (normal in both ears), 2 when both codes are 2
        (abnormal in both ears), and 1 for every other combination.
    """
    ears = (row['hearing(left)'], row['hearing(right)'])
    if ears == (1, 1):
        return 0   # normal in both ears
    if ears == (2, 2):
        return 2   # abnormal in both ears
    return 1       # anything else is treated as abnormal in one ear only

# Build the combined 'hearing' score row by row, then remove the two per-ear
# columns it was derived from (in place).
df['hearing'] = df.apply(merge_hearing, axis='columns')
df.drop(['hearing(left)', 'hearing(right)'], axis='columns', inplace=True)
df

In [None]:
# Call the project's plot_histogram helper for the merged 'hearing' column and
# render the object it returns.
fig = custom_plots.plot_histogram(df, 'hearing')
fig.show()

In [None]:
# Remove the merged 'hearing' column again (in place).
# NOTE(review): the per-ear columns were already dropped when 'hearing' was
# built, so after this line no hearing information is left in df - confirm
# that this is intentional.
del df["hearing"]

In [None]:
# Call the project's plot_correlation_matrix helper on the whole frame (the
# target column is still part of df at this point) and render the result.
fig = custom_plots.plot_correlation_matrix(df)
fig.show()

In [None]:
# Display the frame as it stands before scaling.
df

In [None]:
# scaling
# Min-max scale every feature column to [0, 1]. The target 'smoking' is left
# out so its labels keep their original encoding, and a constant column maps
# to 0 instead of NaN (0/0), which would otherwise break the model fit later.
# NOTE(review): min/max are taken over the whole frame, i.e. before the
# train/test split further down, and a StandardScaler is applied after the
# split as well - confirm this extra rescaling step is still wanted.
cols_to_norm = df.columns.drop('smoking', errors='ignore')
col_min = df[cols_to_norm].min()
# A zero range means the column is constant; divide by 1 so it becomes 0.
col_span = (df[cols_to_norm].max() - col_min).replace(0, 1)
df[cols_to_norm] = (df[cols_to_norm] - col_min) / col_span
df

In [None]:
# Split the target off the feature frame: pop() returns the 'smoking' column
# and removes it from df in one step.
y = df.pop('smoking')

# Class balance of the target, followed by the value distribution of every
# remaining feature column.
print(y.value_counts())
for col, values in df.items():
    print(values.value_counts())

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. The split is stratified on y so both
# parts keep the same class proportions; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    random_state=15,
    stratify=y,
)

In [None]:
from sklearn.preprocessing import StandardScaler

def scale_dataset(x, scaler=None):
    """Standardise ``x``, fitting a new scaler only when none is supplied.

    Args:
        x: Feature data accepted by the scaler's ``fit_transform``/``transform``.
        scaler: An already fitted scaler to reuse. When ``None`` a fresh
            ``StandardScaler`` is created and fitted on ``x``.

    Returns:
        A ``(scaled_x, scaler)`` tuple; ``scaler`` is the object that performed
        the transform, so it can be passed back in for further data.
    """
    if scaler is not None:
        # Reuse the given scaler as-is: transform only, never refit.
        return scaler.transform(x), scaler

    fitted = StandardScaler()
    return fitted.fit_transform(x), fitted

In [None]:
# Fit the scaler on the training features only, then reuse that same fitted
# scaler for the test features. Lower-case x_* are the scaled arrays; the
# upper-case X_* frames from the split are left untouched.
x_train, scaler = scale_dataset(X_train, scaler=None)
x_test, scaler = scale_dataset(X_test, scaler=scaler)

In [None]:
# Shape of the full target vector (before the train/test split).
y.shape

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score 
from sklearn.preprocessing import label_binarize

# Baseline: L2-regularised logistic regression trained on the standardised
# training features and scored on the held-out test features.
lg_model = LogisticRegression(solver = "lbfgs", penalty = 'l2', max_iter = 1000)
lg_model = lg_model.fit(x_train, y_train)
y_pred = lg_model.predict(x_test)
y_pred_prob = lg_model.predict_proba(x_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))

# roc_auc_score expects the positive-class probabilities (second column) for
# a binary target, and the full probability matrix plus `multi_class` for
# more than two classes.
if y_pred_prob.shape[1] == 2:
    auc_score = roc_auc_score(y_test, y_pred_prob[:, 1])
else:
    auc_score = roc_auc_score(y_test, y_pred_prob, multi_class = 'ovr')
print("AUC Score: ", auc_score)

In [None]:
from sklearn.decomposition import KernelPCA

# Project the features with an RBF kernel PCA that is fitted on the training
# data only; the test data is mapped with the same fitted transformer.
# NOTE(review): n_components is not set and gamma = 15 is applied to
# standardised features - both may be worth revisiting (cost of the fit on a
# large training set, and how informative the kernel is); confirm.
kpca = KernelPCA(kernel = 'rbf', gamma = 15)
x_kpca = kpca.fit_transform(x_train)
x_test_kpca = kpca.transform(x_test)

# Default-parameter logistic regression on the projected features.
lg_kpca_model = LogisticRegression().fit(x_kpca, y_train)

# Note: y_pred / y_pred_prob from the baseline model are overwritten here.
y_pred = lg_kpca_model.predict(x_test_kpca)
y_pred_prob = lg_kpca_model.predict_proba(x_test_kpca)
print("Accuracy: ", accuracy_score(y_test, y_pred))

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search for logistic regression.
#
# The grid is a list of sub-grids so that only solver/penalty pairings that
# LogisticRegression accepts are tried. A single dict would cross every
# penalty with every solver, and the invalid pairings just fail to fit:
#   * 'l1' and 'elasticnet' are only supported by 'saga' among these solvers,
#   * 'elasticnet' additionally needs an l1_ratio,
#   * "no penalty" is spelled None (the string 'none' was deprecated in
#     scikit-learn 1.2 and removed in 1.4), and C has no effect without one.
c_values = np.logspace(-4,4,20)
max_iter_values = [100, 200, 300, 500]
all_solvers = ['lbfgs','newton-cg','sag','saga']

param_grid = [
    {
        'penalty' : ['l2'],
        'C' : c_values,
        'solver' : all_solvers,
        'max_iter' : max_iter_values,
    },
    {
        'penalty' : ['l1'],
        'C' : c_values,
        'solver' : ['saga'],
        'max_iter' : max_iter_values,
    },
    {
        'penalty' : ['elasticnet'],
        'C' : c_values,
        'solver' : ['saga'],
        'l1_ratio' : [0.25, 0.5, 0.75],
        'max_iter' : max_iter_values,
    },
    {
        'penalty' : [None],
        'solver' : all_solvers,
        'max_iter' : max_iter_values,
    },
]

# Note: this rebinds lg_model to a fresh, unfitted estimator; GridSearchCV
# clones it for every candidate, and the refitted winner is best_model.
lg_model = LogisticRegression()
grid_search = GridSearchCV(lg_model, param_grid, cv = 10, scoring = 'accuracy', n_jobs = -1)
grid_search.fit(x_train, y_train)
best_model = grid_search.best_estimator_

# Evaluate the best configuration on the held-out test set.
y_pred = best_model.predict(x_test)
y_pred_prob = best_model.predict_proba(x_test)
print(grid_search.best_params_)
print("Accuracy: ", accuracy_score(y_test, y_pred))