In [9]:
import sklearn
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import  accuracy_score,log_loss, r2_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import OneHotEncoder


import warnings
warnings.filterwarnings("ignore")


In [10]:
df = pd.read_csv(r'..\Datasets\Cases\human-resources-analytics\HR_comma_sep.csv')
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.1,0.77,6,247,4,0,1,0,sales,low
3,0.92,0.85,5,259,5,0,1,0,sales,low
4,0.89,1.0,5,224,5,0,1,0,sales,low


In [11]:
X = df.drop(columns=['salary'], axis=1)
y = df['salary']

In [12]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.1,0.77,6,247,4,0,1,0,sales,low
3,0.92,0.85,5,259,5,0,1,0,sales,low
4,0.89,1.0,5,224,5,0,1,0,sales,low


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14995 entries, 0 to 14994
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14995 non-null  float64
 1   last_evaluation        14995 non-null  float64
 2   number_project         14995 non-null  int64  
 3   average_montly_hours   14995 non-null  int64  
 4   time_spend_company     14995 non-null  int64  
 5   Work_accident          14995 non-null  int64  
 6   left                   14995 non-null  int64  
 7   promotion_last_5years  14995 non-null  int64  
 8   Department             14995 non-null  object 
 9   salary                 14995 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24, stratify=y)

**GridSearchCV**

In [15]:
ohe = OneHotEncoder(
    sparse_output=False, 
    drop='first'
).set_output(transform='pandas')


ct = make_column_transformer(
    ('passthrough', make_column_selector(dtype_exclude=[
        'object',
        'category',
    ])),
    (ohe, make_column_selector(dtype_include=[
        'object',
        'category',
    ])),
    verbose_feature_names_out=False,
).set_output(transform='pandas')


In [16]:
dtc = DecisionTreeClassifier(random_state=24)
lr = LogisticRegression(random_state=24)
nb = GaussianNB()
vote = VotingClassifier([('DT', dtc), ('LR', lr), ('NB', nb)], voting='soft')


params = { 
    # 'DT__criterion': ['absolute_error'],
    # 'DT__criterion': ['absolute_error', 'friedman_mse','squared_error', 'poisson'],
    'VT__LR__C': np.linspace(0.001, 3, 5),
    'VT__DT__max_depth': [None, 2, 3],
    'VT__DT__min_samples_leaf':  [2, 10, 20],
    'VT__DT__min_samples_split':  [1, 10, 20],
}




In [17]:
pipe = Pipeline([
    ('CT', ct),
    ('VT', vote),
])

kfold = StratifiedKFold(n_splits=5, random_state=24, shuffle=True)

gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    cv=kfold,
    verbose=2,
)



In [18]:
gcv.fit(X_train, y_train)

Fitting 5 folds for each of 135 candidates, totalling 675 fits
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=2, VT__DT__min_samples_split=1, VT__LR__C=0.001; total time=   0.0s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=2, VT__DT__min_samples_split=1, VT__LR__C=0.001; total time=   0.0s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=2, VT__DT__min_samples_split=1, VT__LR__C=0.001; total time=   0.0s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=2, VT__DT__min_samples_split=1, VT__LR__C=0.001; total time=   0.0s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=2, VT__DT__min_samples_split=1, VT__LR__C=0.001; total time=   0.0s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=2, VT__DT__min_samples_split=1, VT__LR__C=0.75075; total time=   0.0s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=2, VT__DT__min_samples_split=1, VT__LR__C=0.75075; total time=   0.0s
[CV] END VT__DT__max_depth=None, VT__DT__min_sample

[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=2, VT__DT__min_samples_split=20, VT__LR__C=1.5005; total time=   0.8s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=2, VT__DT__min_samples_split=20, VT__LR__C=2.25025; total time=   1.1s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=2, VT__DT__min_samples_split=20, VT__LR__C=2.25025; total time=   0.8s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=2, VT__DT__min_samples_split=20, VT__LR__C=2.25025; total time=   0.9s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=2, VT__DT__min_samples_split=20, VT__LR__C=2.25025; total time=   0.8s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=2, VT__DT__min_samples_split=20, VT__LR__C=2.25025; total time=   0.8s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=2, VT__DT__min_samples_split=20, VT__LR__C=3.0; total time=   0.8s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=2, VT__DT__min_samples_split=20, VT__LR__C=3

[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=10, VT__DT__min_samples_split=20, VT__LR__C=0.001; total time=   1.2s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=10, VT__DT__min_samples_split=20, VT__LR__C=0.75075; total time=   0.9s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=10, VT__DT__min_samples_split=20, VT__LR__C=0.75075; total time=   1.2s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=10, VT__DT__min_samples_split=20, VT__LR__C=0.75075; total time=   1.4s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=10, VT__DT__min_samples_split=20, VT__LR__C=0.75075; total time=   1.0s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=10, VT__DT__min_samples_split=20, VT__LR__C=0.75075; total time=   0.7s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=10, VT__DT__min_samples_split=20, VT__LR__C=1.5005; total time=   0.5s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=10, VT__DT__min_samples_split=20, V

[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=20, VT__DT__min_samples_split=10, VT__LR__C=2.25025; total time=   1.0s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=20, VT__DT__min_samples_split=10, VT__LR__C=2.25025; total time=   1.0s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=20, VT__DT__min_samples_split=10, VT__LR__C=3.0; total time=   1.1s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=20, VT__DT__min_samples_split=10, VT__LR__C=3.0; total time=   1.1s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=20, VT__DT__min_samples_split=10, VT__LR__C=3.0; total time=   1.1s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=20, VT__DT__min_samples_split=10, VT__LR__C=3.0; total time=   1.1s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=20, VT__DT__min_samples_split=10, VT__LR__C=3.0; total time=   1.0s
[CV] END VT__DT__max_depth=None, VT__DT__min_samples_leaf=20, VT__DT__min_samples_split=20, VT__LR__C=0.001; t

[CV] END VT__DT__max_depth=2, VT__DT__min_samples_leaf=2, VT__DT__min_samples_split=10, VT__LR__C=0.75075; total time=   0.2s
[CV] END VT__DT__max_depth=2, VT__DT__min_samples_leaf=2, VT__DT__min_samples_split=10, VT__LR__C=0.75075; total time=   0.2s
[CV] END VT__DT__max_depth=2, VT__DT__min_samples_leaf=2, VT__DT__min_samples_split=10, VT__LR__C=1.5005; total time=   0.9s
[CV] END VT__DT__max_depth=2, VT__DT__min_samples_leaf=2, VT__DT__min_samples_split=10, VT__LR__C=1.5005; total time=   0.9s
[CV] END VT__DT__max_depth=2, VT__DT__min_samples_leaf=2, VT__DT__min_samples_split=10, VT__LR__C=1.5005; total time=   0.8s
[CV] END VT__DT__max_depth=2, VT__DT__min_samples_leaf=2, VT__DT__min_samples_split=10, VT__LR__C=1.5005; total time=   1.1s
[CV] END VT__DT__max_depth=2, VT__DT__min_samples_leaf=2, VT__DT__min_samples_split=10, VT__LR__C=1.5005; total time=   0.9s
[CV] END VT__DT__max_depth=2, VT__DT__min_samples_leaf=2, VT__DT__min_samples_split=10, VT__LR__C=2.25025; total time=   0.

[CV] END VT__DT__max_depth=2, VT__DT__min_samples_leaf=10, VT__DT__min_samples_split=1, VT__LR__C=3.0; total time=   0.0s
[CV] END VT__DT__max_depth=2, VT__DT__min_samples_leaf=10, VT__DT__min_samples_split=10, VT__LR__C=0.001; total time=   0.9s
[CV] END VT__DT__max_depth=2, VT__DT__min_samples_leaf=10, VT__DT__min_samples_split=10, VT__LR__C=0.001; total time=   0.6s
[CV] END VT__DT__max_depth=2, VT__DT__min_samples_leaf=10, VT__DT__min_samples_split=10, VT__LR__C=0.001; total time=   0.9s
[CV] END VT__DT__max_depth=2, VT__DT__min_samples_leaf=10, VT__DT__min_samples_split=10, VT__LR__C=0.001; total time=   0.9s
[CV] END VT__DT__max_depth=2, VT__DT__min_samples_leaf=10, VT__DT__min_samples_split=10, VT__LR__C=0.001; total time=   1.0s
[CV] END VT__DT__max_depth=2, VT__DT__min_samples_leaf=10, VT__DT__min_samples_split=10, VT__LR__C=0.75075; total time=   1.0s
[CV] END VT__DT__max_depth=2, VT__DT__min_samples_leaf=10, VT__DT__min_samples_split=10, VT__LR__C=0.75075; total time=   1.0s

[CV] END VT__DT__max_depth=2, VT__DT__min_samples_leaf=20, VT__DT__min_samples_split=1, VT__LR__C=2.25025; total time=   0.0s
[CV] END VT__DT__max_depth=2, VT__DT__min_samples_leaf=20, VT__DT__min_samples_split=1, VT__LR__C=2.25025; total time=   0.0s
[CV] END VT__DT__max_depth=2, VT__DT__min_samples_leaf=20, VT__DT__min_samples_split=1, VT__LR__C=2.25025; total time=   0.0s
[CV] END VT__DT__max_depth=2, VT__DT__min_samples_leaf=20, VT__DT__min_samples_split=1, VT__LR__C=3.0; total time=   0.0s
[CV] END VT__DT__max_depth=2, VT__DT__min_samples_leaf=20, VT__DT__min_samples_split=1, VT__LR__C=3.0; total time=   0.0s
[CV] END VT__DT__max_depth=2, VT__DT__min_samples_leaf=20, VT__DT__min_samples_split=1, VT__LR__C=3.0; total time=   0.0s
[CV] END VT__DT__max_depth=2, VT__DT__min_samples_leaf=20, VT__DT__min_samples_split=1, VT__LR__C=3.0; total time=   0.0s
[CV] END VT__DT__max_depth=2, VT__DT__min_samples_leaf=20, VT__DT__min_samples_split=1, VT__LR__C=3.0; total time=   0.0s
[CV] END VT_

KeyboardInterrupt: 

In [None]:
print(gcv.best_score_)
print(gcv.best_params_)

best_model = gcv.best_estimator_
best_model

In [None]:
y_pred = best_model.predict(X_test)
print('Accuracy Score:', accuracy_score(y_test, y_pred))

In [None]:
y_pred_prob = best_model.predict_proba(X_test)[:, 1] #predict_proba works with only voting='soft'
print(roc_auc_score(y_test, y_pred_prob))