### **Importando bibliotecas**

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
from warnings import filterwarnings

In [68]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, recall_score, precision_score, 
                             f1_score, roc_auc_score, classification_report)
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [58]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping

### **Obtendo dados**

In [30]:
df = pd.read_csv("data/IBM_Employee_Attrition_clean.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   int64 
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EnvironmentSatisfaction   1470 non-null   int64 
 9   Gender                    1470 non-null   object
 10  HourlyRate                1470 non-null   int64 
 11  JobInvolvement            1470 non-null   int64 
 12  JobLevel                  1470 non-null   int64 
 13  JobRole                   1470 non-null   object
 14  JobSatisfaction         

In [31]:
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,1,11,3,1,0,8,0,1,6,4,0,5
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,0,23,4,4,1,10,3,3,10,7,1,7
2,37,1,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,1,15,3,2,0,7,3,3,0,0,0,0
3,33,0,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,1,11,3,3,0,8,3,3,8,7,3,0
4,27,0,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,0,12,3,4,1,6,3,3,2,2,2,2


### **Seleção de features**

In [33]:
y = df["Attrition"]
X = df.drop("Attrition", axis=1)

In [37]:
X_cat = X.select_dtypes("object")
X_num = X.drop(X_cat, axis=1)

In [40]:
num_pipe = Pipeline([
    ("scaler", StandardScaler())
])

cat_pipe = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", num_pipe, X_num.columns),
    ("cat", cat_pipe, X_cat.columns)
])

X_processed = preprocessor.fit_transform(X)

In [46]:
x_train, x_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42, stratify=y)

### **Modelos do Scikit-learn**

In [77]:
sklearn_models = {
    "Random Forest": RFC(random_state=42),
    "Gaussian Naive Bayes": GaussianNB(),
    "Support Vector Machine": SVC(random_state=42),
    "K Nearest Neighbors": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(random_state=42)
}

sklearn_models_params = {
    "Random Forest": {"n_estimators": [100, 200, 300], "max_depth": [5, 10, 20], "max_features": ["sqrt", "log2"]},
    "Gaussian Naive Bayes": {"var_smoothing": [1e-9, 1e-7, 1e-5]},
    "Support Vector Machine": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"], "gamma": ["scale"]},
    "K Nearest Neighbors": {"n_neighbors": [3, 5, 7], "weights": ["uniform", "distance"], "algorithm": ["ball_tree", "kd_tree"]},
    "Logistic Regression": {"C": [0.1, 1, 10], "solver": ["lbfgs", "liblinear"]}
}

results = {
    "Model": [],
    "Best Parms": [],
    "Best Cross Val Score": [],
    "Accuracy": [],
    "Precision": [],
    "Recall": [],
    "F1": [],
    "ROC_AUC": []
}

In [78]:
for name, model in sklearn_models.items():
    clf = GridSearchCV(model, sklearn_models_params[name], cv=5)
    clf.fit(x_train, y_train)
    best_model = clf.best_estimator_
    y_pred = best_model.predict(x_test)
    
    results["Best Parms"].append(clf.best_params_)
    results["Best Cross Val Score"].append(clf.best_score_)
    results["Model"].append(name)
    results["Accuracy"].append(accuracy_score(y_test, y_pred))
    results["Precision"].append(precision_score(y_test, y_pred))
    results["Recall"].append(recall_score(y_test, y_pred))
    results["F1"].append(f1_score(y_test, y_pred))
    results["ROC_AUC"].append(roc_auc_score(y_test, y_pred))
    
    print(model)
    print(classification_report(y_test, y_pred))
    print()

results_df = pd.DataFrame(results)
results_df

RandomForestClassifier(random_state=42)
              precision    recall  f1-score   support

           0       0.85      0.99      0.92       247
           1       0.67      0.09      0.15        47

    accuracy                           0.85       294
   macro avg       0.76      0.54      0.53       294
weighted avg       0.82      0.85      0.79       294


GaussianNB()
              precision    recall  f1-score   support

           0       0.91      0.64      0.75       247
           1       0.26      0.66      0.37        47

    accuracy                           0.65       294
   macro avg       0.58      0.65      0.56       294
weighted avg       0.80      0.65      0.69       294


SVC(random_state=42)
              precision    recall  f1-score   support

           0       0.89      0.99      0.93       247
           1       0.84      0.34      0.48        47

    accuracy                           0.88       294
   macro avg       0.86      0.66      0.71       29

Unnamed: 0,Model,Best Parms,Best Cross Val Score,Accuracy,Precision,Recall,F1,ROC_AUC
0,Random Forest,"{'max_depth': 20, 'max_features': 'log2', 'n_e...",0.863956,0.846939,0.666667,0.085106,0.150943,0.538505
1,Gaussian Naive Bayes,{'var_smoothing': 1e-05},0.646354,0.646259,0.260504,0.659574,0.373494,0.65165
2,Support Vector Machine,"{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}",0.888615,0.884354,0.842105,0.340426,0.484848,0.66414
3,K Nearest Neighbors,"{'algorithm': 'ball_tree', 'n_neighbors': 7, '...",0.846091,0.843537,0.538462,0.148936,0.233333,0.562322
4,Logistic Regression,"{'C': 10, 'solver': 'lbfgs'}",0.885222,0.857143,0.592593,0.340426,0.432432,0.647946


### **Modelo do Tensorflow (Redes Neurais)**

In [138]:
tf_model = Sequential([
    Input(shape=(x_train.shape[1],)),
    Dense(48, activation="relu"),
    Dropout(0.2),
    Dense(32, activation="relu"),
    Dropout(0.3),
    Dense(1, activation="sigmoid")
])

In [149]:
tf_model.summary()

In [139]:
tf_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
early_stopping = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

history = tf_model.fit(x_train, y_train, validation_split=0.2, epochs=100, callbacks=[early_stopping], batch_size=32)

Epoch 1/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.7474 - loss: 0.5531 - val_accuracy: 0.8305 - val_loss: 0.4255
Epoch 2/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.8487 - loss: 0.4346 - val_accuracy: 0.8305 - val_loss: 0.4018
Epoch 3/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8463 - loss: 0.4162 - val_accuracy: 0.8305 - val_loss: 0.3887
Epoch 4/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8318 - loss: 0.4156 - val_accuracy: 0.8305 - val_loss: 0.3775
Epoch 5/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8230 - loss: 0.4187 - val_accuracy: 0.8305 - val_loss: 0.3669
Epoch 6/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8478 - loss: 0.3791 - val_accuracy: 0.8347 - val_loss: 0.3598
Epoch 7/100
[1m30/30[0m [32m━━

In [140]:
y_pred = tf_model.predict(x_test)
y_pred = np.where(y_pred > 0.5, 1, 0)

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


In [150]:
results = {
    "Model": "Sequential Model",
    "Accuracy": accuracy_score(y_test, y_pred),
    "Precision": precision_score(y_test, y_pred),
    "Recall": recall_score(y_test, y_pred),
    "F1": f1_score(y_test, y_pred),
    "ROC_AUC": roc_auc_score(y_test, y_pred)
}

In [151]:
tf_model_result = pd.DataFrame(results, index=[0])
tf_model_result

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,ROC_AUC
0,Sequential Model,0.884354,0.782609,0.382979,0.514286,0.681368


#### **Saving Model**

In [148]:
tf_model.save("D:\\ds-projects\\employee_attrition\\models\\model\\model.keras")