In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import (
    RandomizedSearchCV,
    cross_val_score,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

from .utils._logger import logger
from .utils._validation import config_args

In [None]:
from pandas import DataFrame

# Lift pandas' column display limit so wide frames are shown in full.
pd.set_option("display.max_columns", None)

# Read the CSV named by the project configuration. Any failure is logged
# and re-raised, so the notebook stops at this cell instead of continuing
# without `df`.
try:
    df: DataFrame = pd.read_csv(config_args.data_path)
except FileNotFoundError:
    logger.error("File not found.  Check the file path.")
    raise
except Exception as exc:
    logger.error(f"Error loading CSV: {exc}")
    raise

# Preview the first rows.
df.head()

In [None]:
# Data Exploration
# Logs a handful of summary statistics about the raw frame `df`.
logger.info("Initial data summary...")

employees_count = df["EmployeeNumber"].nunique()
logger.info(f"Number of unique Employee Number: \n{employees_count}")

marital_count = df["MaritalStatus"].value_counts()
logger.info(f"Marital Status Value Counts: \n{marital_count}")

over18_count = df["Over18"].value_counts()
logger.info(f"Over18 Value Counts: \n{over18_count}")

# Five columns with the most missing values.
null_count = df.isnull().sum().sort_values(ascending=False).head(5)
logger.info(f"Number of null values: \n{null_count}")

# Select leavers by label ("Yes") instead of `value_counts()[1]`: an integer key
# on a string-labelled index relies on pandas' deprecated positional fallback and
# only hits "Yes" when it happens to be the second most frequent value.
left_fraction = (df["Attrition"] == "Yes").mean()
attrition_rate: float = round(left_fraction, 2) * 100
logger.info(f"Employee rate for left the company: \n%{attrition_rate}")

In [None]:
# Grid of 10-bin histograms for the frame's columns, drawn without grid lines.
df.hist(bins=10, figsize=(20, 15), grid=False)

In [None]:
# Pairwise correlations of the numeric columns, drawn as an annotated heatmap.
corr_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(16, 10))
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Data Preprocessing
# Transformer instances shared by the cells below.

# NOTE(review): `imputer` is created here but none of the visible cells pass it
# to the ColumnTransformer - confirm whether missing values need handling.
imputer = SimpleImputer(strategy="mean")

# Rescales numeric columns to the [0, 1] range.
scaler = MinMaxScaler()

# handle_unknown="ignore": a category that was not present when the encoder was
# fitted (e.g. one that only appears in the test split or in a cross-validation
# fold) is encoded as all zeros instead of raising at transform time.
onehot = OneHotEncoder(handle_unknown="ignore")

In [None]:
# Integer codes used for the MaritalStatus column; a value missing from this
# mapping raises KeyError.
marital_status: dict[str, int] = {"Married": 0, "Single": 1, "Divorced": 2}

# Build the modelling frame in one chain, leaving `df` untouched:
#   * drop three columns that are not used for modelling,
#   * Attrition -> 1 for "Yes", 0 otherwise,
#   * Gender    -> 1 for "Female", 0 otherwise,
#   * MaritalStatus -> integer code from `marital_status`.
df_copy = df.drop(columns=["EmployeeCount", "StandardHours", "Over18"]).assign(
    Attrition=lambda frame: frame["Attrition"].eq("Yes").astype("int64"),
    Gender=lambda frame: frame["Gender"].eq("Female").astype("int64"),
    MaritalStatus=lambda frame: frame["MaritalStatus"].apply(
        lambda status: marital_status[status]
    ),
)

In [None]:
# Numerical and categorical columns
# Every column except the target is a candidate feature.
feature_cols: list[str] = [name for name in df_copy.columns if name != "Attrition"]

# Features with a numeric dtype go to the scaler ...
num_cols: list[str] = [
    name for name in feature_cols if pd.api.types.is_numeric_dtype(df_copy[name])
]
# ... and whatever is left is treated as categorical.
cat_cols: list[str] = [name for name in feature_cols if name not in num_cols]

In [None]:
# Route each column group to its transformer: `scaler` for the numeric columns,
# `onehot` for the categorical ones (output keeps this order).
column_steps = [
    ("numeric", scaler, num_cols),
    ("categorical", onehot, cat_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [None]:
# Split Data
# Target is the Attrition column; features are everything else.
y = df_copy["Attrition"]
X = df_copy.drop(columns="Attrition")

# 80/20 shuffled hold-out, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [None]:
# Fit the preprocessing on the training rows only, then apply it to both splits.
# NOTE: X_train / X_test are rebound to the transformed outputs, so re-running this
# cell without re-running the split cell would feed already-transformed data back in.
X_train, X_test = (
    preprocessor.fit_transform(X_train),
    preprocessor.transform(X_test),
)

In [None]:
# Baseline model: 1000-tree random forest, seeded, with balanced class weights.
model = RandomForestClassifier(
    n_estimators=1000,
    class_weight="balanced",
    random_state=42,
)
model.fit(X_train, y_train)

In [None]:
# Predictions for the held-out split, using the fitted forest.
y_pred = model.predict(X=X_test)

In [None]:
# Evaluate the model: per-class text report first, then the confusion matrix plot.
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
confusion_matrix_display = ConfusionMatrixDisplay(confusion_matrix=cm)
confusion_matrix_display.plot()
plt.show()

In [None]:
# Cross-Validation
logger.info("Cross-Validation...")

# Preprocessing and model are bundled into one pipeline so that each fold fits
# both steps on its own training part. Runs on the untransformed X / y.
preprocessor_cv = Pipeline(steps=[("pre", preprocessor), ("model", model)])
cross_validation_score = cross_val_score(
    estimator=preprocessor_cv,
    X=X,
    y=y,
    scoring="accuracy",
    cv=5,
)
# Mean accuracy over the 5 folds.
print(f"Cross-Validation Score: {cross_validation_score.mean()}")

In [None]:
# Inspect the fitted forest's per-feature importances
# (the attribute read here is feature_importances_, not coef_).
model.feature_importances_

In [None]:
# Match feature importances to the features the model was actually trained on.
# The forest was fitted on the ColumnTransformer output (numeric columns first,
# then the one-hot expanded categorical columns), so the names must come from the
# fitted preprocessor. Zipping with df_copy.columns mislabels the importances:
# that index still contains the target and has no entries for the one-hot columns,
# and zip() silently truncates to the shorter sequence.
feature_names = preprocessor.get_feature_names_out().tolist()
importances = model.feature_importances_.tolist()
assert len(feature_names) == len(importances), (
    "preprocessor output columns and model importances are out of sync"
)
feature_dict = dict(zip(feature_names, importances))
feature_dict

In [None]:
# Visualize feature importance: one bar per entry of feature_dict.
feature_df = pd.DataFrame(feature_dict, index=[0])
# Transpose so that features become the bar categories on the x-axis.
importance_by_feature = feature_df.transpose()
importance_by_feature.plot.bar(legend=False, title="Feature Importance")

In [None]:
# Setup RandomizedSearchCV
# Candidate values for each hyperparameter; the search draws n_iter combinations.
param_grid = {
    "n_estimators": [10, 50, 100, 200],
    "max_depth": [None, 5, 10, 15],
    # Floats are interpreted as a fraction of the samples.
    # NOTE(review): 11 points over [0.1, 1] gives a 0.09 step - confirm that
    # 10 points (0.1 step) was not the intent.
    "min_samples_split": np.linspace(0.1, 1, 11),
    "bootstrap": [True, False],
}

forest_rs = RandomizedSearchCV(
    # Seed the forest itself: the search's own random_state only fixes which
    # candidates are drawn, so an unseeded estimator makes scores and
    # best_params_ vary between runs. class_weight matches the baseline model
    # trained earlier, so the search tunes the same kind of classifier.
    estimator=RandomForestClassifier(random_state=42, class_weight="balanced"),
    param_distributions=param_grid,
    cv=5,
    n_iter=10,
    verbose=True,
    random_state=42,
)

# Runs on the already preprocessed training split.
forest_rs.fit(X_train, y_train)
forest_rs.best_params_