<a href="https://colab.research.google.com/github/cbonnin88/people-analytics/blob/main/Employee_attrition_and_salary_prediction_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,  confusion_matrix
import plotly.express as px
import polars as pl
from google.colab import auth
from google.cloud import bigquery
import datetime
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error,r2_score

In [2]:
auth.authenticate_user()
project_id = 'people-analytics-connectsphere'
client = bigquery.Client(project=project_id)
print('Authentication Successful')

Authentication Successful


In [3]:
master_query = f"""
WITH LatestSalaries AS (
  SELECT
    employee_id,
    salary,
    currency,
    salary_in_eur,
    ROW_NUMBER() OVER(PARTITION BY employee_id ORDER BY effective_date DESC) AS rn
  FROM `people-analytics-connectsphere.people_data.salaries`

)
SELECT
  emp.employee_id,
  emp.first_name,
  emp.last_name,
  emp.hire_date,
  emp.termination_date,
  emp.level,
  emp.department,
  emp.location,
  emp.country,
  r.job_title,
  p.performance_score,
  p.satisfaction_score,
  ls.salary AS latest_salary_local,
  ls.currency,
  ls.salary_in_eur AS latest_salary_eur
FROM
  `people-analytics-connectsphere.people_data.employees` AS emp
LEFT JOIN `people-analytics-connectsphere.people_data.job_roles` AS r
  ON emp.role_id = r.role_id
LEFT JOIN  `people-analytics-connectsphere.people_data.performance` AS p
  ON emp.employee_id = p.employee_id
LEFT JOIN LatestSalaries ls
  ON emp.employee_id = ls.employee_id
  AND ls.rn = 1
"""

In [4]:
df_people = client.query(master_query).to_dataframe()
df_people = pl.from_pandas(df_people)

In [5]:
df_people = df_people.with_columns([
    pl.col('hire_date'), # Assuming hire_date is already a date type
    pl.col('termination_date').cast(pl.Datetime, strict=False) # strict=False handles the Nulls
])

In [6]:
df_people = df_people.with_columns(
    pl.when(pl.col('termination_date').is_not_null()).then(1).otherwise(0).alias('is_churn'),
    (pl.lit(datetime.date.today())- pl.col('hire_date')).dt.total_days().alias('tenure_days')
)

In [7]:
df_people.head()

employee_id,first_name,last_name,hire_date,termination_date,level,department,location,country,job_title,performance_score,satisfaction_score,latest_salary_local,currency,latest_salary_eur,is_churn,tenure_days
i64,str,str,date,datetime[μs],str,str,str,str,str,i64,i64,i64,str,i64,i32,i64
1,"""John""","""Shepard""",2019-01-11,,"""T1""","""Leadership""","""Copenhagen""","""Denmark""","""Chief Executive Officer""",5,,1364750,"""DKK""",183188,0,2447
14,"""Victor""","""Morris""",2021-04-08,,"""T2""","""Finance""","""Copenhagen""","""Denmark""","""Financial Lead""",4,2.0,962731,"""DKK""",131784,0,1629
17,"""Elizabeth""","""White""",2024-05-11,,"""T2""","""Sales""","""Copenhagen""","""Denmark""","""Head of Sales""",3,5.0,896607,"""DKK""",120350,0,500
22,"""Shannon""","""Jones""",2022-06-18,,"""T2""","""Data""","""Copenhagen""","""Denmark""","""Head of Data""",2,4.0,880373,"""DKK""",125238,0,1193
24,"""Steven""","""White""",2022-02-12,,"""T2""","""Human Resources""","""Copenhagen""","""Denmark""","""HR Manager""",3,3.0,766500,"""DKK""",102886,0,1319


# **Selecting Features and Targets**

In [8]:
# For this model, we'll use tenure, performance, satisfaction, and role characteristics.
features = [
    'tenure_days',
    'performance_score',
    'satisfaction_score',
    'level',
    'department',
    'location'
]

target = 'is_churn'

In [9]:
# Converting to pandas for scikit-learn compatibility
df_ml = df_people.to_pandas()

In [10]:
# One-Hot Encode Categorical Features
# ML models need number, so I converted the text columns like 'department' into numerica format

X = pd.get_dummies(df_ml[features],drop_first=True)
y = df_ml[target]

In [11]:
# Split Data into Training and Testing Sets
# I am training the model on one part of the data and test it on another to see how it performs

X_train,X_test,y_train,y_test = train_test_split(
    X,y,
    test_size=0.25, # I am using 25% of the data
    random_state=42,
    stratify=y # Ensures the proportion of churners is the same in train and test sets
)

In [12]:
print('Data for Attrition Model is ready.')
print(f'Training data shape: {X_train.shape}')
print(f'Testing data shape: {X_test.shape}')

Data for Attrition Model is ready.
Training data shape: (3375, 16)
Testing data shape: (1125, 16)


# **Train and Evaluate the Model**

In [13]:
# Initialize and train the Model
# I used class_weight='balanced' to help the model handle the fact that far more employees stay than leave

rf_model = RandomForestClassifier(random_state=42,class_weight='balanced')
rf_model.fit(X_train,y_train)
print('Attrition model trained successfully')

Attrition model trained successfully


In [14]:
# Making Predictions and Evaluate
y_pred = rf_model.predict(X_test)

In [15]:
# Printing A report showing key metrics like precision and recall

print('\nClassificatioin Report:')
print(classification_report(y_test,y_pred,target_names=['Stayed','Churned']))


Classificatioin Report:
              precision    recall  f1-score   support

      Stayed       0.89      0.93      0.91       934
     Churned       0.54      0.42      0.47       191

    accuracy                           0.84      1125
   macro avg       0.72      0.67      0.69      1125
weighted avg       0.83      0.84      0.83      1125



In [16]:
# Computing the confusion matrix
cm = confusion_matrix(y_test,y_pred)

In [17]:
fig_matrix = px.imshow(
    cm,
    labels=dict(x='Predicted Label',y='True Label',color='Count'),
    x=['Stayed','Churned'],
    y=['Stayed','Churned'],
    text_auto=True,
    color_continuous_scale='Viridis',
    title='Confusion Matrix'

)

fig_matrix.show()

# **New Salary Prediction**

In [18]:
df_salary = df_ml[df_ml['is_churn']==0].copy()

In [19]:
features_salary = ['job_title','level','location','department']
target_salary = 'latest_salary_eur'

In [20]:
X_salary = df_salary[features_salary]
y_salary = df_salary[target_salary]

In [21]:
# Splitting the Data iinto Training and Testing Sets

X_train_sal, X_test_sal,y_train_sal,y_test_sal = train_test_split(
    X_salary,y_salary,test_size=0.25, random_state=42
)
print('Data for salary model is ready.')
print(f'Training data shape: {X_train_sal.shape}')
print(f'Testing data shape: {X_test_sal.shape}')

Data for salary model is ready.
Training data shape: (2800, 4)
Testing data shape: (934, 4)


# **Creating a Modeling Pipeline**

In [23]:
# Defining the Preprocessing Step
# I need to one-hot encode my categorical features.
# 'handle_unknown='ignore'' tells the model what to do if it ses a new category during prediction

preprocessor = ColumnTransformer(
    transformers=[
        ('cat',OneHotEncoder(handle_unknown='ignore'),features_salary)
    ]
)

In [24]:
# Defining the Model and Creating the Pipeline
# I am using a GradienBoostingRegressor, which is a powerful and accurate model for regression

gbr_pipeline = Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('regressor',GradientBoostingRegressor(random_state=42))
])

print('Salary model pipeline created successfully')

Salary model pipeline created successfully


# **Train and Evaluate the Salary Model**

In [25]:
# Training The Pipeline
gbr_pipeline.fit(X_train_sal,y_train_sal)

print('Salary model trained successfully')

Salary model trained successfully


In [29]:
# Make Predictions on the Test Data
y_pred_sal = gbr_pipeline.predict(X_test_sal)

In [32]:
# Evaluate Performance
rmse = mean_squared_error(y_test_sal,y_pred_sal)
r2 = r2_score(y_test_sal,y_pred_sal)

In [33]:
print(f'\nModel Performance on Test Data:')
print(f'Root Mean Squared Error (RMSE): €{rmse:,.2f}')
print(f'R-squared (R2): {r2:.2f}')
print(f'\nThis means the models predictions are, on average, off by about €{rmse:,.0f}.')
print(f'The model explains approximately {r2:.0%} of the variance in salaries based on the provided features')


Model Performance on Test Data:
Root Mean Squared Error (RMSE): €59,201,435.24
R-squared (R2): 0.89

This means the models predictions are, on average, off by about €59,201,435.
The model explains approximately 89% of the variance in salaries based on the provided features


# **Using the Model for a Prediction**

In [38]:
# Creating a DataFrame for the new hire

new_hire_data = pd.DataFrame({
    'job_title':['People Analyst'],
    'level':['T5'],
    'location':['Paris'],
    'department':['Human Resources']
})

In [39]:
# Using the trained pipeline to predict the salary

predicted_salary = gbr_pipeline.predict(new_hire_data)
print(f'\nPredicted salary for the new hire: €{predicted_salary[0]:,.0f}')


Predicted salary for the new hire: €48,765
