In [2]:
import numpy as np 
import pandas as pd 

# Data loading and preparing 

In [3]:
data = pd.read_csv('employee-dataset/Employee.csv')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4653 entries, 0 to 4652
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Education                  4653 non-null   object
 1   JoiningYear                4653 non-null   int64 
 2   City                       4653 non-null   object
 3   PaymentTier                4653 non-null   int64 
 4   Age                        4653 non-null   int64 
 5   Gender                     4653 non-null   object
 6   EverBenched                4653 non-null   object
 7   ExperienceInCurrentDomain  4653 non-null   int64 
 8   LeaveOrNot                 4653 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 327.3+ KB


In [5]:
data['PaymentTier'] = data['PaymentTier'].astype(str)
data['Gender'] = (data['Gender']=='Male').astype(int)
data['EverBenched'] = (data['EverBenched']=='Yes').astype(int)

# ML 

In [40]:
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline,Pipeline

In [41]:
X,y = data.drop(['LeaveOrNot'],axis=1), data['LeaveOrNot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,shuffle=True,random_state=100)

In [42]:
from sklearn.compose import ColumnTransformer

cat_features = list(X_train.select_dtypes(object).columns)
ct = ColumnTransformer([('encode_cats', OneHotEncoder(drop='first'), cat_features)],
                       remainder='passthrough')

## Naive Bayes

In [43]:
from sklearn.naive_bayes import GaussianNB


model = Pipeline([
    ('ct',ct),   
    ('rf', GaussianNB())
])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(f'Score: {f1_score(y_test, y_pred)}')

Score: 0.537917087967644


## Logistic Regression

In [44]:
from sklearn.linear_model import LogisticRegression
model = Pipeline([
    ('ct',ct),   
    ('rf', LogisticRegression(fit_intercept=False))
])

model.fit(X_train, y_train)
y_pred_l = model.predict(X_test)

print(f'Score: {f1_score(y_test, y_pred_l)}')

Score: 0.51338199513382


## Random Forest 

### Base model 

In [45]:
from sklearn.ensemble import RandomForestClassifier

model = Pipeline([
    ('ct',ct),   
    ('rf', RandomForestClassifier())
])


model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f'Score: {f1_score(y_test, y_pred)}')

Score: 0.7410358565737052


### Tuning parameters 

In [46]:
import optuna
from optuna.visualization import plot_optimization_history
from sklearn.model_selection import cross_val_score

In [47]:
model = Pipeline([
    ('ct',ct),   
    ('rf', RandomForestClassifier())
])

def objective(trial):
    params = {
        'rf__criterion' : trial.suggest_categorical('rf__criterion',["gini", "entropy", "log_loss"]),
        'rf__n_estimators': trial.suggest_int('rf__n_estimators', 50, 1000),
        'rf__max_depth': trial.suggest_int('rf__max_depth', 4, 50),
        'rf__min_samples_split': trial.suggest_int('rf__min_samples_split', 2, 150),
        'rf__min_samples_leaf':trial.suggest_int('rf__min_samples_leaf', 1, 60)
    }
    model.set_params(**params)

    return  np.mean(cross_val_score(model, X_train, y_train, cv=3, n_jobs=-1,scoring='f1'))

In [48]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2023-12-04 16:16:48,471] A new study created in memory with name: no-name-1052845e-8522-4b53-a8d3-3846c675fa08
[I 2023-12-04 16:16:55,185] Trial 0 finished with value: 0.6409899654344099 and parameters: {'rf__criterion': 'gini', 'rf__n_estimators': 740, 'rf__max_depth': 7, 'rf__min_samples_split': 114, 'rf__min_samples_leaf': 34}. Best is trial 0 with value: 0.6409899654344099.
[I 2023-12-04 16:16:58,179] Trial 1 finished with value: 0.6786143279714297 and parameters: {'rf__criterion': 'gini', 'rf__n_estimators': 724, 'rf__max_depth': 20, 'rf__min_samples_split': 27, 'rf__min_samples_leaf': 29}. Best is trial 1 with value: 0.6786143279714297.
[I 2023-12-04 16:17:00,066] Trial 2 finished with value: 0.7039057039316597 and parameters: {'rf__criterion': 'gini', 'rf__n_estimators': 278, 'rf__max_depth': 35, 'rf__min_samples_split': 103, 'rf__min_samples_leaf': 2}. Best is trial 2 with value: 0.7039057039316597.
[I 2023-12-04 16:17:01,736] Trial 3 finished with value: 0.6410158467807778 

[I 2023-12-04 16:17:49,584] Trial 30 finished with value: 0.6817172445524203 and parameters: {'rf__criterion': 'entropy', 'rf__n_estimators': 761, 'rf__max_depth': 23, 'rf__min_samples_split': 13, 'rf__min_samples_leaf': 25}. Best is trial 24 with value: 0.7346643927294733.
[I 2023-12-04 16:17:51,344] Trial 31 finished with value: 0.7308563652505494 and parameters: {'rf__criterion': 'log_loss', 'rf__n_estimators': 669, 'rf__max_depth': 38, 'rf__min_samples_split': 49, 'rf__min_samples_leaf': 4}. Best is trial 24 with value: 0.7346643927294733.
[I 2023-12-04 16:17:53,154] Trial 32 finished with value: 0.7262450497546018 and parameters: {'rf__criterion': 'log_loss', 'rf__n_estimators': 665, 'rf__max_depth': 46, 'rf__min_samples_split': 48, 'rf__min_samples_leaf': 5}. Best is trial 24 with value: 0.7346643927294733.
[I 2023-12-04 16:17:55,224] Trial 33 finished with value: 0.7365267062433404 and parameters: {'rf__criterion': 'log_loss', 'rf__n_estimators': 748, 'rf__max_depth': 33, 'rf__m

[I 2023-12-04 16:18:51,678] Trial 60 finished with value: 0.7333127905415034 and parameters: {'rf__criterion': 'log_loss', 'rf__n_estimators': 780, 'rf__max_depth': 14, 'rf__min_samples_split': 7, 'rf__min_samples_leaf': 7}. Best is trial 55 with value: 0.7424400827170242.
[I 2023-12-04 16:18:54,192] Trial 61 finished with value: 0.7426168461897461 and parameters: {'rf__criterion': 'log_loss', 'rf__n_estimators': 922, 'rf__max_depth': 12, 'rf__min_samples_split': 24, 'rf__min_samples_leaf': 2}. Best is trial 61 with value: 0.7426168461897461.
[I 2023-12-04 16:18:56,613] Trial 62 finished with value: 0.7387528320207143 and parameters: {'rf__criterion': 'log_loss', 'rf__n_estimators': 920, 'rf__max_depth': 8, 'rf__min_samples_split': 22, 'rf__min_samples_leaf': 1}. Best is trial 61 with value: 0.7426168461897461.
[I 2023-12-04 16:18:59,058] Trial 63 finished with value: 0.7351741826942559 and parameters: {'rf__criterion': 'log_loss', 'rf__n_estimators': 942, 'rf__max_depth': 8, 'rf__min_

[I 2023-12-04 16:19:34,984] Trial 90 finished with value: 0.7397675784964886 and parameters: {'rf__criterion': 'log_loss', 'rf__n_estimators': 572, 'rf__max_depth': 22, 'rf__min_samples_split': 20, 'rf__min_samples_leaf': 5}. Best is trial 85 with value: 0.7477531081916228.
[I 2023-12-04 16:19:36,453] Trial 91 finished with value: 0.7431010545242698 and parameters: {'rf__criterion': 'log_loss', 'rf__n_estimators': 508, 'rf__max_depth': 15, 'rf__min_samples_split': 11, 'rf__min_samples_leaf': 4}. Best is trial 85 with value: 0.7477531081916228.
[I 2023-12-04 16:19:38,045] Trial 92 finished with value: 0.7440784873544168 and parameters: {'rf__criterion': 'log_loss', 'rf__n_estimators': 545, 'rf__max_depth': 18, 'rf__min_samples_split': 13, 'rf__min_samples_leaf': 2}. Best is trial 85 with value: 0.7477531081916228.
[I 2023-12-04 16:19:39,523] Trial 93 finished with value: 0.745423459427272 and parameters: {'rf__criterion': 'log_loss', 'rf__n_estimators': 501, 'rf__max_depth': 18, 'rf__mi

In [51]:
model.set_params(**study.best_params)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print(f'Score: {f1_score(y_test, y_pred)}')

Score: 0.7752100840336135


## Catboost 

### Base model 

In [49]:
X_train_small, X_val, y_train_small, y_val = train_test_split(X_train, y_train, test_size=0.33,shuffle=True,random_state=100)

In [50]:
from catboost import CatBoostClassifier

clf = CatBoostClassifier()
clf.fit(X_train_small, y_train_small, 
        cat_features=cat_features, 
        eval_set=(X_val, y_val), 
        verbose=False
)
y_pred = clf.predict(X_test)
print(f'Score: {f1_score(y_test, y_pred)}')

Score: 0.7657754010695187


# Neural-FCA

In [None]:
import torch
from fcapy.context import FormalContext
from fcapy.lattice import ConceptLattice

plt.rcParams['figure.facecolor'] = (1,1,1,1)