### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split,GridSearchCV,RepeatedStratifiedKFold, cross_val_score
from sklearn.model_selection import KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn import metrics

from tqdm.auto import tqdm
import xgboost as xgb
from xgboost import XGBClassifier


### Data preparation and data clearning

**Some observation:**

According to the introduction of the competition, the data is interpreted as follows:
- `id`: Unique identifier for each data point.
- `age`: Age of the individual, categorized in 5-year intervals.
- `height(cm)`: Height of the individual in centimeters.
- `weight(kg)`: Weight of the individual in kilograms.
- `waist(cm)`: Waist circumference of the individual in centimeters.
- `eyesight(left/right)`: Eyesight measurements for the left and right eyes.
- `hearing(left/right)`: Hearing ability for the left and right ears, represented as binary.
- `systolic`: Systolic blood pressure measurement.
- `relaxation`: Diastolic blood pressure measurement.
- `fasting blood sugar`: Fasting blood sugar level.
- `Cholesterol`: Total cholesterol level.
- `triglyceride`: Triglyceride level.
- `HDL`: High-density lipoprotein cholesterol level.
- `LDL`: Low-density lipoprotein cholesterol level.
- `hemoglobin`: Hemoglobin level in the blood.
- `Urine protein`: Level of protein in urine, categorized.
- `serum creatinine`: Serum creatinine level.
- `AST`: Level of aspartate aminotransferase enzyme.
- `ALT`: Level of alanine aminotransferase enzyme.
- `Gtp`: Level of gamma-glutamyl transferase enzyme.
- `dental caries`: Presence (1) or absence (0) of dental cavities.
- `smoking`: Target variable indicating if the individual is a smoker (1) or not (0).

In [None]:
df = pd.read_csv('smoker_train_dataset.csv')
pg = pd.read_csv('train.csv') 
df.shape , pg.shape

In [None]:
df = pd.concat([pg, df])
df.head()

In [None]:
df.drop(columns=['id'], inplace=True)

In [None]:
def summary(df):
    sum = pd.DataFrame(df.dtypes, columns=['dtypes'])
    sum['missing #'] = df.isna().sum()
    sum['missing %'] = (df.isna().sum())/len(df)
    sum['uniques'] = df.nunique().values
    sum['count'] = df.count().values
    return sum
summary(df)

### EDA, feature importance analysis


In [None]:
df.columns

In [None]:
target = ['smoking']
discrete = ['age', 'height(cm)', 'weight(kg)', 'systolic', 'relaxation', 
           'fasting blood sugar', 'Cholesterol', 'triglyceride', 'HDL',
           'LDL', 'AST', 'ALT', 'Gtp']
continous = ['waist(cm)', 'eyesight(left)', 'eyesight(right)', 'hemoglobin',
             'serum creatinine']
binary = ['hearing(left)', 'hearing(right)', 'dental caries']
nominal = ['Urine protein']

num_var = discrete + continous


In [None]:
# for var in num_var:
#     sns.histplot(data=df, x=var, kde=True)
#     plt.title(f"Distribution of {var}")
#     plt.show()   

In [None]:
df[num_var].describe().T

In [None]:
cat_var = target + binary + nominal

for var in cat_var:
    sns.countplot(data=df, x=var)
    plt.title(f"Count of {var}")
    plt.show()  

In [None]:
selected_var = ['age', 'relaxation', 'systolic', 'hemoglobin', 'Gtp', 'ALT', 'AST', 'LDL', 'HDL', 'Cholesterol', 'eyesight(right)', 'eyesight(left)']
for var in selected_var:
    sns.kdeplot(data=df, x=var, hue=target[0])
    plt.title(f"Distribution of {var} for smokers v. non-smokers")
    plt.show()   

In [None]:
corr_matrix = df[num_var].corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

plt.figure(figsize=(15, 12))
sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='Blues',  fmt='.2f', linewidths=2 )

plt.title('Correlation Matrix', fontsize=15)
plt.show()

Since the correlation values between any two variables is not greater than 90%, none of the attributes were removed.



In [None]:
df['bmi'] = df['weight(kg)'] / (df['height(cm)'] / 100) ** 2
features = list(df.columns)
features.pop(-2)

### Model selection process and parameter tuning

In [None]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=1)

full_train = df_train_full.copy()
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train_full = df_train_full.smoking.values
y_train = df_train.smoking.values
y_val = df_val.smoking.values
y_test = df_test.smoking.values

del full_train['smoking']
del df_train['smoking']
del df_val['smoking']
del df_test['smoking']

X = df.drop(columns = ['smoking'], axis = 1)
Y = df['smoking']

In [None]:
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

def graph_roc(y,y_pred):
    fpr , tpr , thresholds = metrics.roc_curve(y, y_pred)
    plt.figure(figsize=(5, 5))
    plt.plot(fpr, tpr,label='Model')
    plt.plot([0, 1], [0, 1],label='Random', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve')
    plt.show()
    # print(metrics.auc(fpr, tpr))


def met(y,y_pred,above_average):
    print('Model Accuracy:',round(metrics.accuracy_score(y, above_average),4))
    print('ROC:',round(roc_auc_score(y, y_pred),4))
    print((metrics.confusion_matrix(y, above_average)/metrics.confusion_matrix(y, above_average).sum()).round(4))


#### Logistic Regression

In [None]:
def train(df, y,  c=1.0):
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=1)
    model.fit(df, y)      
    return model

def predict(df,model):
    y_pred = model.predict_proba(df)[:, 1]    
    return y_pred

model = train(df_train, y_train)
y_pred = predict(df_val,model)
above_average = (y_pred >= 0.5)
met(y_val,y_pred,above_average)
graph_roc(y_val,y_pred)

Determine best C for logistic regression

In [None]:
n_splits = 5
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)
skf = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 1, random_state = 42)

for c in tqdm([0.01, 0.1, 1.0 , 0.5, 10,15]):

    scores = []

    for train_idx ,val_idx in kfold.split(df_train_full):
        df_train = df_train_full.iloc[train_idx]
        df_val = df_train_full.iloc[val_idx]

        y_train = df_train.smoking.values
        y_val = df_val.smoking.values
        
        del df_train['smoking']
        del df_val['smoking']
        
        model = train(df_train,y_train,c)
        y_pred = predict(df_val,model)

        roc_auc = metrics.roc_auc_score(y_val,y_pred)
        scores.append(roc_auc)

    print('C=%s %f +- %f' % (c,np.mean(scores),np.std(scores)))

In [None]:
model = train(df_train, y_train,1.0)
y_pred = predict(df_val,model)
above_average = (y_pred >= 0.5)
met(y_val,y_pred,above_average)
graph_roc(y_val,y_pred)

Select C=1.0 as best tuning parameter for logistic regression

C=1.0 0.827558 +- 0.002522


#### Random Forest

In [None]:
rf = RandomForestClassifier(random_state=29)
rf.fit(df_train, y_train)

In [None]:
y_pred = rf.predict_proba(df_val)[:, 1]
above_average = (y_pred >= 0.5)
met(y_val,y_pred,above_average)
graph_roc(y_val,y_pred)

In [None]:
rf.get_params()

The untuned RF has an accuracy of 86.05% on the validation dataset.

In [None]:
n_estimators = [100,150, 200,250, 300,350]
max_depth =  [10,15,20,25,30,35]
all_scores =[]
for m in max_depth:
    for n in n_estimators:
        random_rf = RandomForestClassifier(n_estimators=n,max_depth=m,random_state=29,n_jobs=-1)
        random_rf.fit(full_train, y_train_full)
        y_pred_rf = random_rf.predict_proba(df_val)[:, 1] 
        all_scores.append((m,n,metrics.roc_auc_score(y_val, y_pred_rf ),metrics.accuracy_score(y_val,y_pred_rf>=0.5)))

In [None]:
df_all_scores = pd.DataFrame(all_scores, columns=['max_depth','n_estimator','roc','acc'])
df_all_scores.round(4).sort_values(by='acc',ascending=False)

In [None]:
roc_pivot = df_all_scores.pivot(index='max_depth', columns=['n_estimator'],values=['roc'])
roc_pivot.round(4)
plt.figure(figsize = (16,5))
sns.heatmap(roc_pivot,annot=True,fmt='.3f')

In [None]:
acc_pivot = df_all_scores.pivot(index='max_depth', columns=['n_estimator'],values=['acc'])
acc_pivot.round(4)
plt.figure(figsize = (16,5))
sns.heatmap(acc_pivot,annot=True,fmt='.3f')

In [None]:
rf = RandomForestClassifier(n_estimators=25,max_depth=200,random_state=29,n_jobs=-1)
rf.fit(full_train, y_train_full)

In [None]:
rf_pred = rf.predict_proba(df_test)[:, 1] 
print(metrics.accuracy_score(y_test,rf_pred>=0.5))
print(metrics.roc_auc_score(y_test,rf_pred))


In [None]:
importances = rf.feature_importances_
feature_importances = pd.DataFrame({'Feature': features, 'Importance': importances})
feature_importances.sort_values(by='Importance', ascending=False)

#### XGboost

In [None]:
dtrain = xgb.DMatrix(df_train, label= y_train , feature_names = features)
dval = xgb.DMatrix(df_val, label= y_val , feature_names = features)

watchlist = [(dtrain,'train'),(dval,'val')]

In [None]:
xgb_scores = []

for x in range(3,10):    
    xgb_params = {
        'eta': 0.1, 
        'max_depth': x,
        'min_child_weight': 1,
        'objective': 'binary:logistic',
        'nthread': 8,
        'eval_metric': 'auc',
        'seed': 29,
        'verbosity': 1,
    }
    xgb_model = xgb.train(xgb_params,dtrain,verbose_eval=5,evals=watchlist,num_boost_round=100)
    y_pred = xgb_model.predict(dval)
    xgb_scores.append((x,metrics.roc_auc_score(y_val, y_pred ),metrics.accuracy_score(y_val,y_pred>=0.5)))

picking best parameters for xgboost 

max_depth = 6

eta = 0.1

In [None]:
xgb_all_scores = pd.DataFrame(xgb_scores, columns=['max_depth','roc','acc'])
xgb_all_scores.round(3).sort_values(by='roc',ascending=False)

In [None]:
xgb_all_scores.round(3).sort_values(by='acc',ascending=False)

#### Picking XGboost to be less comples than Random Forest with the same performance.

In [None]:
# XGB_cv_routine = cross_val_score(XGBClassifier(tree_method='hist',
#                                                n_estimators=600,
#                                                learning_rate=0.1,
#                                                colsample_bytree=0.5),
#                                  X,
#                                  Y,
#                                  scoring='roc_auc',
#                                  cv=skf,
#                                  n_jobs=-1)

# print(
#     f"The average oof ROC-AUC score of the XGB model is {XGB_cv_routine.mean()}")

In [33]:
c = df.tail(1).to_json()
c

'{"age":{"38983":55},"height(cm)":{"38983":175},"weight(kg)":{"38983":60},"waist(cm)":{"38983":81.1},"eyesight(left)":{"38983":1.0},"eyesight(right)":{"38983":1.0},"hearing(left)":{"38983":1},"hearing(right)":{"38983":1},"systolic":{"38983":114},"relaxation":{"38983":66},"fasting blood sugar":{"38983":86},"Cholesterol":{"38983":212},"triglyceride":{"38983":57},"HDL":{"38983":64},"LDL":{"38983":137},"hemoglobin":{"38983":13.9},"Urine protein":{"38983":1},"serum creatinine":{"38983":1.0},"AST":{"38983":18},"ALT":{"38983":12},"Gtp":{"38983":16},"dental caries":{"38983":0},"smoking":{"38983":1},"bmi":{"38983":19.5918367347}}'