In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import os

# Cores I want to use. NOTE(review): presumably read by loky/joblib when it
# sizes its worker pool -- confirm against the installed joblib version.
os.environ.update({"LOKY_MAX_CPU_COUNT": "4"})

In [3]:
# Location of the HR attrition CSV, relative to the notebook's working directory
file_path = 'WA_Fn-UseC_-HR-Employee-Attrition.csv'

# Read the raw dataset; `data` is kept as the unmodified source frame
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows (head() defaults to n=5)
print(data.head(n=5))

   Age Attrition     BusinessTravel  DailyRate              Department  \
0   41       Yes      Travel_Rarely       1102                   Sales   
1   49        No  Travel_Frequently        279  Research & Development   
2   37       Yes      Travel_Rarely       1373  Research & Development   
3   33        No  Travel_Frequently       1392  Research & Development   
4   27        No      Travel_Rarely        591  Research & Development   

   DistanceFromHome  Education EducationField  EmployeeCount  EmployeeNumber  \
0                 1          2  Life Sciences              1               1   
1                 8          1  Life Sciences              1               2   
2                 2          2          Other              1               4   
3                 3          4  Life Sciences              1               5   
4                 2          1        Medical              1               7   

   ...  RelationshipSatisfaction StandardHours  StockOptionLevel  \
0  ...

DATA CLEANING

In [6]:
# Columns removed before modelling; the drop returns a new frame and leaves `data` untouched
columns_to_drop = [
    'EmployeeCount',
    'EmployeeNumber',
    'Over18',
    'StandardHours',
]
data_cleaned = data.drop(labels=columns_to_drop, axis='columns')

In [8]:
# Count null entries in every column (isna is the canonical name for isnull)
null_mask = data_cleaned.isna()
missing_values = null_mask.sum()

print("Missing values per column:\n", missing_values)

Missing values per column:
 Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64


We saw that there are no missing values or duplicates in the dataset, so we will not be removing/dropping any rows.

DATA SUMMARIZATION

In [12]:
# Descriptive statistics (count, mean, std, quartiles, ...) for the numeric columns
numerical_summary = data_cleaned.describe()

# For the object-dtype (categorical) columns, count the distinct values in each
categorical_frame = data_cleaned.select_dtypes(include=['object'])
categorical_columns = categorical_frame.columns
categorical_summary = data_cleaned.loc[:, categorical_columns].nunique()

In [14]:
# Show both summaries computed in the previous cell, one after the other
for heading, summary in (
    ("Numerical Summary:\n", numerical_summary),
    ("\nCategorical Column Unique Values:\n", categorical_summary),
):
    print(heading, summary)

Numerical Summary:
                Age    DailyRate  DistanceFromHome    Education  \
count  1470.000000  1470.000000       1470.000000  1470.000000   
mean     36.923810   802.485714          9.192517     2.912925   
std       9.135373   403.509100          8.106864     1.024165   
min      18.000000   102.000000          1.000000     1.000000   
25%      30.000000   465.000000          2.000000     2.000000   
50%      36.000000   802.000000          7.000000     3.000000   
75%      43.000000  1157.000000         14.000000     4.000000   
max      60.000000  1499.000000         29.000000     5.000000   

       EnvironmentSatisfaction   HourlyRate  JobInvolvement     JobLevel  \
count              1470.000000  1470.000000     1470.000000  1470.000000   
mean                  2.721769    65.891156        2.729932     2.063946   
std                   1.093082    20.329428        0.711561     1.106940   
min                   1.000000    30.000000        1.000000     1.000000   
25%  

ONE-HOT ENCODING

In [17]:
# Convert 'Attrition' into a binary numeric column.
# The mapping reads from the untouched raw frame `data` instead of from
# `data_cleaned` itself, which makes this cell safe to re-run: mapping an
# already-converted 0/1 column through {'Yes': 1, 'No': 0} would match nothing
# and silently turn the whole target into NaN. `data_cleaned` was produced by
# dropping columns from `data`, so both frames share the same row index.
data_cleaned['Attrition'] = data['Attrition'].map({'Yes': 1, 'No': 0})

# Perform one-hot encoding on other categorical columns.
# 'Attrition' is numeric at this point, so it is not picked up by the
# object-dtype selection; drop_first=True omits the first level of each
# categorical column.
categorical_columns = data_cleaned.select_dtypes(include=['object']).columns
data_encoded = pd.get_dummies(data_cleaned, columns=categorical_columns, drop_first=True)

In [19]:
# Scaling Numerical Variables
# No scaler is fitted here on purpose. Fitting MinMaxScaler on the full dataset
# before the train/test split lets test-set statistics (column min/max) shape
# the training features, i.e. data leakage. The StandardScaler that is fitted
# on the training split only (see "SCALING THE DATA") is the single place where
# features are scaled. Standardization cancels any earlier per-column linear
# rescaling, so a min-max pass here would not change what the models see.
numerical_features = ['Age', 'MonthlyIncome', 'YearsAtCompany', 'DailyRate', 'HourlyRate']  # Add more if necessary

SPLITTING DATA

In [22]:
# Target: the binary 'Attrition' column; features: every other encoded column
y = data_encoded['Attrition']
X = data_encoded.drop(columns='Attrition')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified, so the class ratio in the test set
# can drift from the full dataset -- consider stratify=y if that matters.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [24]:
# Report (rows, columns) for both partitions, one per line
print(
    f"Training set shape: {X_train.shape}",
    f"Testing set shape: {X_test.shape}",
    sep="\n",
)

Training set shape: (1176, 44)
Testing set shape: (294, 44)


SCALING THE DATA

In [27]:
# Learn per-feature mean and variance from the training rows only, then apply
# that same transformation to both partitions so no test statistics are used.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

TRAINING AND COMPARING MULTIPLE MODELS

In [38]:
models = list()
# Filled by the training loop: model name -> dict of evaluation metrics
model_performance = dict()

# Candidate classifiers, keyed by display name; insertion order is the
# order in which they are trained and reported.
model_dict = dict([
    ("Logistic Regression", LogisticRegression(max_iter=5000, solver='liblinear')),  # Alternative solver
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("SVM", SVC(random_state=42)),
    ("KNN", KNeighborsClassifier()),
])



In [40]:
# Fit each candidate on the standardized training data, score it on the
# held-out test set, record the metrics, and print a per-model report.
for model_name, model in model_dict.items():
    # fit() returns the estimator itself, so training and prediction chain
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    # zero_division=1 substitutes 1 for a metric that would otherwise be
    # undefined (e.g. precision for a class that was never predicted)
    class_report = classification_report(y_test, y_pred, zero_division=1)

    # Keep the metrics for the later side-by-side comparison
    model_performance[model_name] = dict(zip(
        ('Accuracy', 'Confusion Matrix', 'Classification Report'),
        (accuracy, conf_matrix, class_report),
    ))

    print(f"\nModel: {model_name}")
    print(f"Accuracy: {accuracy:.2f}")
    print("Confusion Matrix:\n", conf_matrix)
    print("Classification Report:\n", class_report)


Model: Logistic Regression
Accuracy: 0.88
Confusion Matrix:
 [[241  14]
 [ 21  18]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.95      0.93       255
           1       0.56      0.46      0.51        39

    accuracy                           0.88       294
   macro avg       0.74      0.70      0.72       294
weighted avg       0.87      0.88      0.88       294


Model: Random Forest
Accuracy: 0.88
Confusion Matrix:
 [[254   1]
 [ 35   4]]
Classification Report:
               precision    recall  f1-score   support

           0       0.88      1.00      0.93       255
           1       0.80      0.10      0.18        39

    accuracy                           0.88       294
   macro avg       0.84      0.55      0.56       294
weighted avg       0.87      0.88      0.83       294


Model: Decision Tree
Accuracy: 0.76
Confusion Matrix:
 [[215  40]
 [ 32   7]]
Classification Report:
               precision    reca

In [42]:
# One line per model, in training order, showing only the test accuracy
print("\nModel Comparison:")
for model_name in model_performance:
    performance = model_performance[model_name]
    print(f"{model_name}: Accuracy = {performance['Accuracy']:.2f}")


Model Comparison:
Logistic Regression: Accuracy = 0.88
Random Forest: Accuracy = 0.88
Decision Tree: Accuracy = 0.76
SVM: Accuracy = 0.90
KNN: Accuracy = 0.88


CHOOSING THE BEST MODEL: compare the models above and pick the one with the HIGHEST ACCURACY (or the best-balanced metrics).

In [45]:
import joblib

# Select the best model from the recorded metrics instead of hard-coding its
# name, so the choice stays correct if the data, split, or candidates change.
# Criterion: highest test accuracy; ties go to the model evaluated first.
best_model_name = max(
    model_performance,
    key=lambda name: model_performance[name]['Accuracy'],
)
best_model = model_dict[best_model_name]

# Saving the model
joblib.dump(best_model, f'{best_model_name}_model.pkl')
# The model was trained on standardized features, so the fitted scaler must be
# persisted too; without it the saved model cannot be applied to new raw data.
joblib.dump(scaler, f'{best_model_name}_scaler.pkl')
print(f"{best_model_name} has been saved successfully as '{best_model_name}_model.pkl'.")

SVM has been saved successfully as 'SVM_model.pkl'.
