<a href="https://colab.research.google.com/github/cbonnin88/Equilibrium/blob/main/hr_data_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install Faker



In [None]:
import polars as pl
from faker import Faker
import random
import numpy as np
from datetime import datetime, timedelta

In [None]:
num_employees = 2500
attrition_rate = 0.18
promotion_rate = 0.30

In [None]:
fake = Faker()
faker_instance = Faker('fr_Fr')

In [None]:
# Defining Business specific Lists
departments = ['Engineering','Product','Sales','Human Resources','Finance','Data']
job_levels = ['T1','T2','T3','T4','T5']
locations = ['Paris','Bordeaux','Nantes','Lille','Marseille','Lyon','Limoges']
gender = ['Male','Female','Non-Binary']

In [None]:
# Generate 'employees' table
print('Generating employees table...')
employees_data= []
for i in range(1,num_employees + 1):
  hire_date = fake.date_between(start_date='-5y',end_date='today')
  genders = random.choice(gender)

  # Determine terminate date based on attrition rate
  termination_date = None
  if random.random() < attrition_rate:
    # Ensure terminate is after hire date
    termination_date = fake.date_between(start_date=hire_date, end_date='today')

  employees_data.append({
      'employee_id': fake.random_number(digits=4,fix_len=True),
      'first_name':faker_instance.first_name_male() if genders == 'Male' else faker_instance.first_name_female(),
      'last_name':faker_instance.last_name(),
      'gender':genders,
      'hire_date':hire_date,
      'termination_date':termination_date,
      'department':random.choice(departments),
      'job_level':random.choice(job_levels),
      'location':random.choice(locations)
  })

  employees_df = pl.DataFrame(employees_data)

Generating employees table...


In [None]:
employees_df

employee_id,first_name,last_name,gender,hire_date,termination_date,department,job_level,location
i64,str,str,str,date,date,str,str,str
8273,"""Nicolas""","""Barthelemy""","""Male""",2021-02-16,,"""Finance""","""T1""","""Paris"""
8790,"""Alexandria""","""Poirier""","""Non-Binary""",2025-07-18,2025-07-26,"""Sales""","""T1""","""Marseille"""
7634,"""Anouk""","""Bouchet""","""Female""",2023-09-06,2025-01-25,"""Data""","""T5""","""Bordeaux"""
3829,"""Alice""","""Lejeune""","""Female""",2024-05-25,2024-06-27,"""Product""","""T1""","""Lyon"""
4618,"""Martine""","""Noël""","""Non-Binary""",2024-07-11,,"""Product""","""T2""","""Lille"""
…,…,…,…,…,…,…,…,…
5970,"""Anouk""","""Blanc""","""Non-Binary""",2021-10-30,,"""Human Resources""","""T2""","""Marseille"""
3563,"""Pauline""","""Thomas""","""Non-Binary""",2020-10-30,,"""Human Resources""","""T3""","""Marseille"""
2643,"""Raymond""","""Torres""","""Male""",2023-03-18,,"""Data""","""T3""","""Nantes"""
8038,"""René""","""Dupuy""","""Male""",2022-02-07,,"""Sales""","""T2""","""Paris"""


# **Generating 'Compensation' Table**

In [None]:
print('Generating compensation table...')
# Base salary map to create logical salaries based on job level
job_level_salary_map = {
    'T5':40000,
    'T4':56000,
    'T3':76000,
    'T2':96000,
    'T1':130000
    }

Generating compensation table...


In [None]:
compensation_data = []
for row in employees_df.iter_rows(named=True):
  base_salary = job_level_salary_map[row['job_level']]

  # Adding a random variation
  salary_variation = base_salary * random.uniform(-0.1,0.1)
  final_salary = base_salary + salary_variation

  # I'm introducing a systemic bias for my analysis later
  # Introducing a slight pay gap for female and non-binary employees to create a problem to solve
  if row ['gender'] == 'Female' or row['gender'] == 'Non-Binary':
    final_salary *= 0.96 # 4% less on average

  compensation_data.append({
      'compensation_id':fake.random_number(digits=4,fix_len=True),
      'employee_id':row['employee_id'],
      'effective_date':row['hire_date'],
      'base_salary':round(final_salary,-2), # Rounding up to the nearest 100
      'bonus':round(final_salary * random.uniform(0.05,0.20)-2),
      'stock_options':random.choice([500,1000,1500,2000,0])
  })

compensation_df = pl.DataFrame(compensation_data)

In [None]:
compensation_df

compensation_id,employee_id,effective_date,base_salary,bonus,stock_options
i64,i64,date,f64,i64,i64
7999,8273,2021-02-16,132800.0,8951,0
5916,8790,2025-07-18,113600.0,9218,0
9635,7634,2023-09-06,36800.0,7201,2000
8896,3829,2024-05-25,117600.0,12103,500
6357,4618,2024-07-11,98700.0,9211,1000
…,…,…,…,…,…
6059,5970,2021-10-30,98500.0,10503,2000
6295,3563,2020-10-30,79700.0,14736,500
8586,2643,2023-03-18,73800.0,11180,1000
9555,8038,2022-02-07,95200.0,15205,1500


# **Generating Performance Reviews Table**

In [None]:
print('Generating performance_reviews table...')
performance_data = []
for row in employees_df.iter_rows(named=True):
  # Each employee gets 1 to 5 reviews
  num_reviews = random.randint(1,5)

  end_date = row['termination_date'] if row['termination_date'] else datetime.now().date()

  for j in range(num_reviews):
    # Ensure  review date is within employment period
    review_date = fake.date_between(start_date=row['hire_date'],end_date=end_date)

    # Create logical performance scores
    # If an employee has attritited, make their last performance score more likely to be low
    perf_score = random.randint(1,5)
    if j == num_reviews - 1 and row['termination_date'] is not None:
      if random.random() < 0.6: # 60% chance of a low last score if attrited
        perf_score = random.randint(1,3)

  performance_data.append({
       'review_id':fake.random_number(digits=4,fix_len=True),
       'employee_id':row['employee_id'],
       'review_date':review_date,
       'performance_score':perf_score,
       'potential_score':random.randint(1,5)
   })

performance_df = pl.DataFrame(performance_data)

Generating performance_reviews table...


In [None]:
performance_df

review_id,employee_id,review_date,performance_score,potential_score
i64,i64,date,i64,i64
3438,8273,2022-08-11,5,2
1531,8790,2025-07-18,2,3
6018,7634,2023-12-31,2,2
1113,3829,2024-06-25,1,2
9996,4618,2024-11-06,5,1
…,…,…,…,…
6847,5970,2023-12-12,3,4
5483,3563,2023-10-04,4,1
8956,2643,2023-08-10,5,3
9570,8038,2025-01-29,3,4


# **Generate Job History Table**

In [None]:
print('Generating job history table')
job_history_data = []
promoted_employees = employees_df.sample(fraction=promotion_rate)

level_map = {
    'T5':'T4',
    'T4':'T3',
    'T2':'T1'
}

Generating job history table


In [None]:
for row in promoted_employees.iter_rows(named=True):
  if row['job_level'] in level_map:
    # Calculate the potential start and end dates for promotion
    potential_start_date = row['hire_date'] + timedelta(days=365)
    end_date = row['termination_date'] if row['termination_date'] else datetime.now().date()

    # Only generate a promotion date if the start date is before the end date
    if potential_start_date < end_date:
      promotion_date = fake.date_between(
        start_date=potential_start_date,  # Min 1 year before promotion
        end_date=end_date
      )

      # Determine previous job level (simplified logic)
      previous_job_level = [k for k, v in level_map.items() if v == row['job_level']]
      if previous_job_level:
        job_history_data.append({
            'history_id':fake.random_number(digits=4,fix_len=True),
            'employee_id':row['employee_id'],
            'promotion_date':promotion_date,
            'previous_job_level':previous_job_level[0],
            'new_job_level':row['job_level']
        })

job_history_df = pl.DataFrame(job_history_data)

In [None]:
job_history_df

history_id,employee_id,promotion_date,previous_job_level,new_job_level
i64,i64,date,str,str
1466,8254,2025-06-06,"""T5""","""T4"""
9284,8254,2024-08-17,"""T5""","""T4"""
3640,8254,2024-07-20,"""T5""","""T4"""
3714,5052,2025-04-22,"""T5""","""T4"""
5465,6549,2025-05-27,"""T5""","""T4"""
…,…,…,…,…
5680,7319,2025-01-10,"""T5""","""T4"""
7159,2252,2024-11-30,"""T5""","""T4"""
4050,8445,2024-02-23,"""T5""","""T4"""
9647,5997,2024-12-02,"""T5""","""T4"""


# **Save to CSV Files**

In [None]:
print('Saving DataFrames to csv files...')
employees_df.write_csv('employees.csv')
compensation_df.write_csv('compensation.csv')
performance_df.write_csv('performance.csv')
job_history_df.write_csv('job_history.csv')


print('\nData generation complete, Four csv files have been created')

Saving DataFrames to csv files...

Data generation complete, Four csv files have been created
