In [10]:
# Import necessary libraries
import pandas as pd
import numpy as np

# For data preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# For model building
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# For evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [11]:
# Load the training split and the test split from their CSV files
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/project/data/employee_data_train.csv',
        '/project/data/employee_data_test.csv',
    )
)

In [12]:
# Columns treated as irrelevant to the model; removed before any feature work
drop_cols = ['Employee Name', 'Employee ID', 'Start Date', 'End Date']

# Stack the training rows on top of the test rows so both splits go through
# the same column handling, then remove the columns listed above
combined_df = pd.concat([train_df, test_df], sort=False).drop(columns=drop_cols)

In [13]:
# Feature Engineering
def feature_engineering(df, epsilon=1e-6):
    """
    Derive salary-growth and promotion-rate features for each row.

    Three columns are appended to a copy of ``df``:

    * ``Salary Percentage Change``: (current - starting) / starting salary.
    * ``Salary Raise Per Year``: (current - starting) salary / tenure.
    * ``Promotion Frequency``: promotion history / tenure.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Current Salary``, ``Starting Salary``, ``Tenure`` and
        ``Promotion History``.
    epsilon : float, default 1e-6
        Denominator substituted wherever ``Tenure`` is zero or negative, so the
        two tenure-based ratios never divide by zero.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus the three derived ones.
        The frame passed in is not modified.

    Raises
    ------
    KeyError
        If one of the four source columns is absent.
    """
    # Work on a copy: assigning columns on the argument itself would silently
    # modify the caller's frame.
    engineered = df.copy()

    salary_delta = engineered['Current Salary'] - engineered['Starting Salary']

    # Salary Percentage Change
    engineered['Salary Percentage Change'] = salary_delta / engineered['Starting Salary']

    # Only non-positive tenures are replaced by epsilon. A missing tenure fails
    # the `<= 0` comparison and therefore stays NaN, so the derived ratios are
    # NaN as well instead of being inflated by a division by epsilon.
    tenure = engineered['Tenure'].astype(float)
    adjusted_tenure = tenure.mask(tenure <= 0, epsilon)

    # Salary Raise Per Year
    engineered['Salary Raise Per Year'] = salary_delta / adjusted_tenure

    # Promotion Frequency
    engineered['Promotion Frequency'] = engineered['Promotion History'] / adjusted_tenure

    return engineered

In [14]:
# Add the engineered columns to the combined frame
combined_df = feature_engineering(combined_df)

# Columns the preprocessing pipeline one-hot encodes
categorical_features = ['Gender', 'Role', 'Department', 'Location', 'Contract']

# Columns the preprocessing pipeline imputes and scales; the last three are
# the ones produced by feature_engineering
numerical_features = [
    'Age',
    'Tenure',
    'Starting Salary',
    'Current Salary',
    'Years of Experience',
    'Average Monthly Working Hours',
    'Months in Role',
    'Promotion History',
    'Last Performance Review Score',
    'Salary Percentage Change',
    'Salary Raise Per Year',
    'Promotion Frequency',
]

# Target column versus everything else
y = combined_df['Turnover']
X = combined_df.drop(columns='Turnover')

# The training rows were concatenated first, so a positional cut at
# len(train_df) recovers the original train/test partition
n_train = len(train_df)
X_train, X_test = X.iloc[:n_train], X.iloc[n_train:]
y_train, y_test = y.iloc[:n_train], y.iloc[n_train:]

In [15]:
# Numeric columns: fill missing values with the median, then standardise
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode (handle_unknown='ignore' keeps transform from failing on
# categories that were not present during fit)
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Preprocessing and classifier live in one pipeline, so each cross-validation
# fold fits its imputers/scaler/encoder on that fold's training part only
model = RandomForestClassifier(random_state=42)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model),
])

# Random-forest settings explored by the grid search; the 'classifier__'
# prefix addresses the pipeline step of that name
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2],
}

# Exhaustive 5-fold search scored on accuracy, using all available cores
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Pipeline with the best-scoring parameter combination
best_model = grid_search.best_estimator_

# Predicted labels for the held-out test rows
y_pred = best_model.predict(X_test)

In [16]:
# Fraction of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy on test data: {accuracy:.2f}")

Accuracy on test data: 0.78


In [17]:
# Classification report
# A label that occurs in y_test but is never predicted has an undefined
# precision. zero_division=0 reports it as 0.00 (the same number shown by the
# default setting) without emitting an UndefinedMetricWarning per metric.
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.89      0.86        75
           1       0.55      0.46      0.50        24
           4       0.00      0.00      0.00         1

    accuracy                           0.78       100
   macro avg       0.46      0.45      0.45       100
weighted avg       0.76      0.78      0.77       100



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
# Confusion matrix: rows are true labels, columns are predicted labels
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Confusion Matrix:
[[67  8  0]
 [13 11  0]
 [ 0  1  0]]


In [19]:
# Function to predict attrition probability on new data
def predict_attrition(data, model):
    """
    Score every employee in ``data`` with a fitted model.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw employee records with the same columns as the training data.
        Columns listed in ``drop_cols`` are optional and ignored for scoring.
    model : fitted estimator
        Must expose ``predict_proba``. When it also exposes ``classes_`` the
        column belonging to label ``1`` is looked up there.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with one extra column, ``Attrition Probability``,
        holding the predicted probability of label ``1`` for each row.

    Raises
    ------
    ValueError
        If a column the model needs is missing from ``data``, or if the model
        reports its classes and label ``1`` is not among them.
    """
    # Keep a copy of the original data so the caller's frame is not modified
    data_original = data.copy()

    # Drop irrelevant columns; tolerate inputs that never had them
    features = data.drop(columns=drop_cols, errors='ignore')

    # Validate BEFORE feature engineering: feature_engineering reads several of
    # these columns and would otherwise fail first with a bare KeyError.
    # The engineered names must mirror the columns feature_engineering creates.
    engineered_columns = {'Salary Percentage Change', 'Salary Raise Per Year', 'Promotion Frequency'}
    required_columns = numerical_features + categorical_features
    missing_cols = {col for col in required_columns if col not in engineered_columns} - set(features.columns)
    if missing_cols:
        raise ValueError(f"The following required columns are missing: {missing_cols}")

    # Apply feature engineering
    features = feature_engineering(features)

    # Locate the probability column for label 1 instead of assuming it is the
    # second one; fall back to position 1 only when the model does not
    # advertise its class labels.
    class_labels = getattr(model, 'classes_', None)
    if class_labels is None:
        positive_index = 1
    else:
        class_labels = list(class_labels)
        if 1 not in class_labels:
            raise ValueError(f"Model was not trained with class 1; classes are {class_labels}")
        positive_index = class_labels.index(1)

    # The pipeline handles preprocessing and missing values
    probabilities = model.predict_proba(features)[:, positive_index]

    # Add probabilities to the original dataframe
    data_original['Attrition Probability'] = probabilities

    return data_original

In [20]:
# Extract feature importances and corresponding feature names
def get_feature_importances(best_model, categorical_features):
    """
    Pair each model input feature with its importance score.

    Parameters
    ----------
    best_model : fitted pipeline
        Needs a ``named_steps`` mapping with a ``'classifier'`` step exposing
        ``feature_importances_`` and a ``'preprocessor'`` step whose first
        transformer entry lists the numerical columns and whose ``'cat'``
        transformer contains an ``'onehot'`` encoder step.
    categorical_features : list of str
        Names handed to the encoder's ``get_feature_names_out``.

    Returns
    -------
    list of (feature_name, importance) tuples, highest importance first.
    """
    steps = best_model.named_steps

    # Importance scores, in the column order produced by the preprocessor
    scores = steps['classifier'].feature_importances_

    fitted_preprocessor = steps['preprocessor']

    # Numerical names come from the column list of the first transformer entry
    numeric_names = fitted_preprocessor.transformers[0][2]

    # One-hot names come from the fitted encoder inside the 'cat' transformer
    encoder = fitted_preprocessor.named_transformers_['cat'].named_steps['onehot']
    encoded_names = encoder.get_feature_names_out(categorical_features)

    # Numerical names first, then the expanded categorical names
    all_names = np.concatenate([numeric_names, encoded_names])

    importance_by_name = {name: score for name, score in zip(all_names, scores)}

    ranked = list(importance_by_name.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [21]:
# Rank all features of the tuned pipeline (grid_search.best_estimator_)
feature_importances = get_feature_importances(grid_search.best_estimator_, categorical_features)
top_features = feature_importances[:10]

# Print the top 10 features
print("\nTop 10 Features by Importance:")
for feature, importance in top_features:
    print(f"{feature}: {importance:.4f}")

# Same ten rows as a table; the CSV export on the next line is disabled
feature_importances_df = pd.DataFrame(top_features, columns=['Feature', 'Importance'])
# feature_importances_df.to_csv('feature_importance.csv', index=False)

# Score the test file. It is read again from disk so that the result keeps the
# columns that were dropped from combined_df.
new_data_df = pd.read_csv('/project/data/employee_data_test.csv')
result_df = predict_attrition(new_data_df, best_model)

# print(result_df[['Employee ID', 'Attrition Probability']])

# result_df.to_csv('predicted_ap.csv', index=False)


Top 10 Features by Importance:
Tenure: 0.1464
Salary Raise Per Year: 0.1130
Current Salary: 0.0741
Starting Salary: 0.0715
Salary Percentage Change: 0.0714
Age: 0.0693
Average Monthly Working Hours: 0.0693
Promotion Frequency: 0.0688
Months in Role: 0.0661
Years of Experience: 0.0652
