<a href="https://colab.research.google.com/github/cbonnin88/Equilibrium/blob/main/Predictive_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [42]:
import polars as pl
from google.cloud import bigquery
from google.colab import auth
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import plotly.express as px
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# **Data Extraction from BigQuery**

In [2]:
auth.authenticate_user()
project_id = 'project-equilibrium-474307'
client = bigquery.Client(project_id)
print('Authentication Successfull')

Authentication Successfull


In [3]:
master_query = f"""
WITH LatestPerformance AS (
  SELECT
    employee_id,
    performance_score,
    potential_score,
    ROW_NUMBER() OVER(PARTITION BY employee_id ORDER BY review_date DESC) AS rn
  FROM
    `project-equilibrium-474307.hr_analytics.performance`
),
LatestCompensation AS (
  SELECT
    employee_id,
    base_salary,
    bonus,
    stock_options,
    ROW_NUMBER() OVER(PARTITION BY employee_id ORDER BY effective_date DESC) AS rn
  FROM
    `project-equilibrium-474307.hr_analytics.compensation`
)
SELECT
  e.employee_id,
  e.gender,
  e.department,
  e.job_level,
  e.hire_date,
  e.termination_date,
  lc.base_salary,
  lc.bonus,
  lc.stock_options,
  lp.performance_score,
  lp.potential_score,
  CASE WHEN e.termination_date IS NOT NULL THEN 1 ELSE 0 END AS is_attrited,
  DATE_DIFF(COALESCE(e.termination_date, '2025-10-16'), e.hire_date, DAY) AS tenure_in_days
FROM
  `project-equilibrium-474307.hr_analytics.employees` AS e
LEFT JOIN LatestCompensation AS lc
  ON e.employee_id = lc.employee_id AND lc.rn = 1
LEFT JOIN LatestPerformance AS lp
  ON e.employee_id = lp.employee_id AND lp.rn = 1
WHERE
  lc.base_salary IS NOT NULL AND lp.performance_score IS NOT NULL;
"""

In [4]:
pandas_df = client.query(master_query).to_dataframe()
df_people = pl.from_pandas(pandas_df)

In [5]:
print(f'Data Successfully Extracted. Shape:{df_people.shape}')

Data Successfully Extracted. Shape:(2500, 13)


In [6]:
df_people.head()

employee_id,gender,department,job_level,hire_date,termination_date,base_salary,bonus,stock_options,performance_score,potential_score,is_attrited,tenure_in_days
i64,str,str,str,date,date,f64,i64,i64,i64,i64,i64,i64
8273,"""Male""","""Finance""","""T1""",2021-02-16,,132800.0,8951,0,5,2,0,1703
8790,"""Non-Binary""","""Sales""","""T1""",2025-07-18,2025-07-26,96000.0,15017,2000,5,3,1,8
7634,"""Female""","""Data""","""T5""",2023-09-06,2025-01-25,36800.0,7201,2000,2,2,1,507
3829,"""Female""","""Product""","""T1""",2024-05-25,2024-06-27,117600.0,12103,500,1,2,1,33
4618,"""Non-Binary""","""Product""","""T2""",2024-07-11,,98700.0,9211,1000,3,5,0,462


# **Feature Engineering & Preprocessing**

In [7]:
# Convert date columns to numerical features
df_hr = df_people.with_columns(
    pl.col('hire_date').dt.ordinal_day().alias('hire_date_ordinal'),
)

In [8]:
# One-Hot Encode Categorical Variables
df_hr = df_people.to_dummies(columns=['gender','ethnicity','department','job_level'])

In [10]:
# Handling any potential missing values
df_hr = df_people.fill_null(strategy='mean')

In [11]:
print('Feature engineering complete. New shape:',df_hr.shape)

Feature engineering complete. New shape: (2500, 13)


# **Model 1 - Attrition Prediction (Classification)**

In [16]:
# Defining features X and target y
# Dropping non-predictive columns and the target itself

features_attrition = df_hr.drop(['employee_id','hire_date','termination_date','is_attrited','base_salary'])
target_attrition = df_hr['is_attrited']

# One-Hot Encode Categorical Variables before splitting
features_attrition = features_attrition.to_dummies(columns=['gender','department','job_level'])

In [20]:
# Splitting the data into training and testing sets
X_train,X_test,y_train,y_test = train_test_split(
    features_attrition,target_attrition,test_size=0.25,random_state=42,stratify=target_attrition
)

# One-Hot Encode Categorical Variables after splitting
X_train = X_train.to_dummies(columns=['gender','department','job_level'])
X_test = X_test.to_dummies(columns=['gender','department','job_level'])

In [21]:
# Initializing and train the random Forest Classifier
attrition_model = RandomForestClassifier(n_estimators=100,random_state=42,class_weight='balanced')
attrition_model.fit(X_train,y_train)

In [22]:
# Making Predictions on the test set
y_pred = attrition_model.predict(X_test)

In [23]:
# Evaluating my model
print('\nClassification Report:')
print(classification_report(y_test,y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.97      0.89       515
           1       0.25      0.05      0.09       110

    accuracy                           0.80       625
   macro avg       0.54      0.51      0.49       625
weighted avg       0.73      0.80      0.75       625



In [27]:
print('Confusion Matrix')
cm = confusion_matrix(y_test,y_pred)
print(cm)

Confusion Matrix
[[497  18]
 [104   6]]


In [30]:
fig = px.imshow(
    cm,
    labels=dict(x='Predicted Label',y='True Label'),
    x=['Predicted Stay (0)','Predicted Leave (1)'],
    y=['True Stay (0)','True Leave (1)'],
    text_auto=True,
    title='Attrition Model Confusion Matrix',
    color_continuous_scale=px.colors.sequential.Viridis_r
)

fig.update_layout(
    xaxis=dict(side='top'),
    title_x=0.5
)

In [34]:
# Displaying Feature Importances
print('\nTop Five Drivers of Attrition (as %):')
feature_importances = pd.Series(attrition_model.feature_importances_,index=features_attrition.columns).sort_values(ascending=False)

# Converting to percentage, sort and format
feature_importances_percent = (feature_importances * 100).sort_values(ascending=False)
formatted_importances = feature_importances_percent.head(5).map('{:.0f}%'.format)

display(formatted_importances)


Top Five Drivers of Attrition (as %):


Unnamed: 0,0
tenure_in_days,33%
bonus,17%
performance_score,11%
stock_options,8%
potential_score,7%


In [35]:
feature_importances = pd.Series(attrition_model.feature_importances_, index=features_attrition.columns)
feature_importances_percent = (feature_importances * 100).sort_values(ascending=False)

In [36]:
# Creating a DataFrame for Plotly
importance_df = pl.DataFrame({
    'feature': feature_importances_percent.index,
    'importance_pct':feature_importances_percent.values
})


In [37]:
# Getting the top 15 features for the plot
top_15_features = importance_df.head(15)
display(top_15_features)

feature,importance_pct
str,f64
"""tenure_in_days""",32.913425
"""bonus""",16.932375
"""performance_score""",10.5957
"""stock_options""",7.504283
"""potential_score""",7.149789
…,…
"""job_level_T2""",1.831459
"""department_Human Resources""",1.813532
"""gender_Female""",1.796428
"""department_Data""",1.783673


In [41]:
fig_imp = px.bar(
    top_15_features,
    x='importance_pct',
    y='feature',
    orientation='h',
    title='Top 15 Feature Importances for Attrition Prediction',
    labels={'importance_pct':'Importance (%)','feature':'Feature'},
    text='importance_pct',
    color='feature',
    color_discrete_sequence=px.colors.qualitative.Pastel
)

fig_imp.update_layout(
    yaxis=dict(autorange='reversed'),
    title_x=0.5,
    showlegend=False
)
fig_imp.update_traces(texttemplate='%{text:.0f}%',textposition='outside')

# **Model 2 - Salary Prediction (Regression)**

---



In [43]:
# Defining features (X) and target (y)
features_salary = df_people.drop(['employee_id','hire_date','termination_date','is_attrited','base_salary'])
target_salary = df_people['base_salary']

In [46]:
# Splitting the Data
X_train_sal,X_test_sal,y_train_sal,y_test_sal = train_test_split(
    features_salary,target_salary,test_size=0.25,random_state=42
)

# One-Hot Encode Categorical Variables after splitting
X_train_sal = X_train_sal.to_dummies(columns=['gender','department','job_level'])
X_test_sal = X_test_sal.to_dummies(columns=['gender','department','job_level'])

In [47]:
# Initialize and train the Random Forest Regressor
salary_model = RandomForestRegressor(n_estimators=100,random_state=42)
salary_model.fit(X_train_sal,y_train_sal)

In [48]:
# Making predictions
y_pred_sal = salary_model.predict(X_test_sal)

In [49]:
# Evaluate the model
r2 = r2_score(y_test_sal, y_pred_sal)
mae = mean_absolute_error(y_test_sal,y_pred_sal)

In [51]:
print(f'\nR-squared: {r2:.2f}')
print(f'Mean Absolute Error (MAE): €{mae:,.2f}')


R-squared: 0.81
Mean Absolute Error (MAE): €7,957.63
