In [1]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# ---------------------------------------------------------------------------
# Employee-attrition baseline: load the HR CSV, encode the Yes/No columns,
# fit a RandomForest on five hand-picked features and report hold-out metrics
# plus feature importances.
# ---------------------------------------------------------------------------

# Tunable settings, collected in one place
DATA_PATH = 'HR-Employee-Attrition.csv'  # replace with the path to your copy of the dataset
YES_NO_CODES = {'Yes': 1, 'No': 0}
TEST_SIZE = 0.3      # fraction of rows held out for evaluation
SEED = 42            # shared by the split and the forest so the run is repeatable
N_TREES = 100


def encode_yes_no(column: pd.Series) -> pd.Series:
    """Map a 'Yes'/'No' column to 1/0.

    Args:
        column: Series whose non-missing values are expected to be exactly
            'Yes' or 'No'.

    Returns:
        A new Series with 'Yes' -> 1 and 'No' -> 0. Missing values stay missing.

    Raises:
        ValueError: if the column holds any other non-missing label. A plain
            ``Series.map`` would silently turn such labels into NaN.
    """
    unexpected = set(column.dropna().unique()) - set(YES_NO_CODES)
    if unexpected:
        raise ValueError(
            f"Column {column.name!r} has values other than 'Yes'/'No': "
            f"{sorted(map(repr, unexpected))}"
        )
    return column.map(YES_NO_CODES)


# Load dataset (CSV format)
raw_df = pd.read_csv(DATA_PATH)

# Data preprocessing: build the encoded frame from the raw one instead of
# overwriting columns in place, so raw_df stays available for inspection.
df = raw_df.assign(
    OverTime=encode_yes_no(raw_df['OverTime']),    # categorical feature -> numeric
    Attrition=encode_yes_no(raw_df['Attrition']),  # target variable
)

# Feature selection
features = ['Age', 'MonthlyIncome', 'YearsAtCompany', 'JobSatisfaction', 'OverTime']
X = df[features]
y = df['Attrition']

# Train-test split
# NOTE(review): the target is imbalanced (the recorded run has 61 positives out
# of 441 test rows, with recall 0.13 on class 1). Consider stratify=y here and
# class_weight='balanced' on the forest; left unchanged so the recorded output
# below the cell stays reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

# Model training using RandomForest
model = RandomForestClassifier(n_estimators=N_TREES, random_state=SEED)
model.fit(X_train, y_train)

# Predictions on the held-out rows
y_pred = model.predict(X_test)

# Evaluation
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Feature importance, most important feature first
importances = model.feature_importances_
feature_importance_df = (
    pd.DataFrame({'feature': features, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)
print("\nFeature Importance:\n", feature_importance_df)


Accuracy Score: 0.8390022675736961
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.95      0.91       380
           1       0.31      0.13      0.18        61

    accuracy                           0.84       441
   macro avg       0.59      0.54      0.55       441
weighted avg       0.79      0.84      0.81       441


Feature Importance:
            feature  importance
1    MonthlyIncome    0.386662
0              Age    0.254416
2   YearsAtCompany    0.186443
3  JobSatisfaction    0.088928
4         OverTime    0.083551
