<a href="https://colab.research.google.com/github/cbonnin88/people-analytics/blob/main/ConnectSphere_data_creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install Faker



In [None]:
import polars as pl
from faker import Faker
import numpy as np
import datetime
import random

In [None]:
# Seed both random sources so a fresh "Restart & Run All" regenerates the same dataset.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# One Faker generator per country, keyed by the country names used in country_map,
# so generated names follow the locale of the employee's office.
faker_instance = {
    'France': Faker('fr_FR'),
    'United Kingdom': Faker('en_GB'),
    'Denmark': Faker('da_DK'),  # Danish is language code "da"; "dk" is only the country code
    'Germany': Faker('de_DE')
}

In [None]:
# Head-count targets: the total, and how many of those are leadership / managers.
num_employees = 4500
num_leadership = 7
num_managers = 441

# Organisation structure.
departments = [
    'Engineering',
    'Product',
    'Sales',
    'Human Resources',
    'Finance',
    'Data',
    'Leadership',
]
non_leadership_depts = [name for name in departments if name != 'Leadership']

# Office cities and the country each one belongs to.
city = ['London', 'Paris', 'Berlin', 'Copenhagen']
country_map = dict(
    Paris='France',
    London='United Kingdom',
    Berlin='Germany',
    Copenhagen='Denmark',
)

In [None]:
# Job titles available in each department.
# Titles that also appear in `manager_titles` are flagged as manager roles when
# the roles table is built; the 'Leadership' entry holds the executive titles.

job_titles = {
    'Engineering':['Frontend Engineer','Backend Engineer','DevOps Engineer','QA Engineer','Engineering Manager','Cloud Engineer'],
    'Product':['Product Manager','Product Analyst','UX/UI Designer','Product Engineer','Head of Product'],
    'Sales':['Account Executive','Sales Development Representative','Sales Manager','Customer Success Manager','Sales Analyst','Head of Sales'],
    'Human Resources':['Talent Acquisition Specialist','HR Business Partner','People Analyst','HR Machine Learning Engineer','HR Data Scientist','HR Manager'],
    'Finance':['Credit Analyst','Accountant','Financial Analyst','Financial Planner','Auditor','Financial Lead'],
    'Data':['Data Analyst','Data Scientist','Analytics Engineer','Data Engineer','Machine Learning Engineer','Cloud Data Engineer','Head of Data'],
    'Leadership':['Chief Technology Officer','Director of Product','Head of Global Sales','Chief People Officer','Chief Financial Officer','Chief Data Officer','Chief Executive Officer']
}

In [None]:
# Titles treated as manager roles, listed in the same order as the departments.
manager_titles = [
    'Engineering Manager',  # Engineering
    'Head of Product',      # Product
    'Head of Sales',        # Sales
    'HR Manager',           # Human Resources
    'Financial Lead',       # Finance
    'Head of Data',         # Data
]

In [None]:
# Level codes: individual contributors are spread over three levels, while
# managers and leadership each use a single dedicated level.

ic_levels = 'T5 T4 T3'.split()
manager_level, leadership_level = 'T2', 'T1'

In [None]:
# Salary bands for levels T5-T2: (minimum, maximum) per office city and level.
# Declared per city for readability, then flattened into the
# (city, level) -> (min, max) lookup the rest of the notebook uses.

_bands_by_city = {
    'Paris': {'T5': (40000, 55999), 'T4': (56000, 75999), 'T3': (76000, 95999), 'T2': (96000, 120000)},
    'London': {'T5': (45000, 60999), 'T4': (61000, 85999), 'T3': (86000, 110999), 'T2': (111000, 140000)},
    'Copenhagen': {'T5': (42000, 58999), 'T4': (59000, 80999), 'T3': (81000, 100999), 'T2': (101000, 125000)},
    'Berlin': {'T5': (40000, 55999), 'T4': (56000, 75999), 'T3': (76000, 95999), 'T2': (96000, 120000)},
}

salary_bands = {
    (band_city, band_level): band_range
    for band_city, level_ranges in _bands_by_city.items()
    for band_level, band_range in level_ranges.items()
}

In [None]:
# Leadership salary bands: (minimum, maximum) keyed by office city only,
# because leadership uses a single level.

leadership_salary_bands = dict(
    Paris=(130000, 200000),
    London=(150000, 250000),
    Copenhagen=(135000, 200000),
    Berlin=(145000, 250000),
)

In [None]:
# Build the job-roles records: one record per (department, title) pair,
# numbered from 1 in department order and then title order.

department_title_pairs = [
    (dept_name, job_title)
    for dept_name in departments
    for job_title in job_titles[dept_name]
]

roles_data = [
    {
        'role_id': role_number,
        'department': dept_name,
        'job_title': job_title,
        'is_manager_role': job_title in manager_titles,
    }
    for role_number, (dept_name, job_title) in enumerate(department_title_pairs, start=1)
]

# Next unused role id, kept in step with the records above.
role_id_counter = len(roles_data) + 1

In [None]:
# Materialise the role records as a Polars DataFrame: one row per job title, with
# the columns role_id, department, job_title and is_manager_role.
df_roles = pl.DataFrame(roles_data)

In [None]:
# Generating Employees Table
# `employees_data` and `employee_id_counter` are shared accumulators: the manager and
# individual-contributor cells below keep appending to / incrementing them.
employees_data =[]
employee_id_counter = 1

# Generating Leadership employees
leadership_roles = df_roles.filter(pl.col('department') == 'Leadership')['role_id'].to_list()

# Hand every leader a distinct title while there are enough titles to go round, so the
# company does not end up with e.g. several CEOs and no CFO. Only fall back to drawing
# with replacement when more leaders than titles are requested.
if num_leadership <= len(leadership_roles):
  leadership_role_assignments = random.sample(leadership_roles, k=num_leadership)
else:
  leadership_role_assignments = random.choices(leadership_roles, k=num_leadership)

for leadership_role_id in leadership_role_assignments:
  location = random.choice(city)
  country = country_map[location]
  current_faker = faker_instance[country]

  # Leadership was hired between eight and two years ago
  hire_date = current_faker.date_between(start_date='-8y',end_date='-2y')
  employees_data.append({
      'employee_id': employee_id_counter,
      'first_name': current_faker.first_name(),
      'last_name': current_faker.last_name(),
      'email': current_faker.email(),
      'location': location, 'country':country,
      'department':'Leadership','role_id':leadership_role_id, 'level':leadership_level,
      'hire_date':hire_date, 'termination_date':None, # Ensure None is used for missing termination dates
      # The key must be spelled 'satisfaction_score' like in every other employee record;
      # a misspelled key would leave this column empty for leadership rows.
      'performance_score':random.randint(1,5), 'satisfaction_score':random.randint(1,5)
  })
  employee_id_counter += 1

In [None]:
# Generating the managers (num_managers employees at manager_level)

# Look up the manager role id of each non-leadership department.
# Departments without a manager role are simply absent from the map.
manager_roles_map = {}
for dept in non_leadership_depts:
  dept_manager_ids = df_roles.filter(
      (pl.col('department') == dept) & pl.col('is_manager_role')
  ).get_column('role_id')
  if dept_manager_ids.len() > 0:
    manager_roles_map[dept] = dept_manager_ids.item()

# Repeat the department list often enough to cover num_managers, trim to size,
# then shuffle so the departments are not generated in blocks.
full_rounds = num_managers // len(non_leadership_depts) + 1
manager_dept_distribution = (non_leadership_depts * full_rounds)[:num_managers]
random.shuffle(manager_dept_distribution)

for dept in manager_dept_distribution:
  # Only generate a manager if a manager role exists for the department
  if dept not in manager_roles_map:
    print(f"Skipping manager generation for department: {dept} as no manager role exists.")
    continue

  office = random.choice(city)
  office_country = country_map[office]
  fake = faker_instance[office_country]

  # Managers were hired between six and one year(s) ago
  hired_on = fake.date_between(start_date='-6y', end_date='-1y')

  manager_record = {
      'employee_id': employee_id_counter,
      'first_name': fake.first_name(),
      'last_name': fake.last_name(),
      'email': fake.email(),
      'location': office,
      'country': office_country,
      'department': dept,
      'role_id': manager_roles_map[dept],
      'level': manager_level,
      'hire_date': hired_on,
      'termination_date': None,  # managers are generated without a termination date
      'performance_score': random.randint(1, 5),
      'satisfaction_score': random.randint(1, 5),
  }
  employees_data.append(manager_record)
  employee_id_counter += 1

In [None]:
# Generating the Individual Contributors: everyone who is neither leadership nor a manager
num_ics = num_employees - num_leadership - num_managers

# Non-manager role ids available in each non-leadership department
ic_roles_map = {}
for dept in non_leadership_depts:
  non_manager_roles = df_roles.filter(
      (pl.col('department') == dept) & ~pl.col('is_manager_role')
  )
  ic_roles_map[dept] = non_manager_roles.get_column('role_id').to_list()

for _ in range(num_ics):
  ic_department = random.choice(non_leadership_depts)
  office = random.choice(city)
  office_country = country_map[office]
  fake = faker_instance[office_country]

  # Hire dates stop 180 days before today so that a termination date
  # (at least 180 days after hiring) can never start in the future.
  latest_hire_date = datetime.date.today() - datetime.timedelta(days=180)
  hired_on = fake.date_between(start_date='-5y', end_date=latest_hire_date)

  score = random.randint(1, 5)

  # Only low performers (score of 2 or less) can have left, with a 45% chance.
  # The random draw is skipped entirely for higher scores.
  left_on = None  # None marks an employee who is still active
  if score <= 2 and random.random() < 0.45:
    earliest_exit = hired_on + datetime.timedelta(days=180)
    left_on = fake.date_between(start_date=earliest_exit, end_date='today')

  ic_record = {
      'employee_id': employee_id_counter,
      'first_name': fake.first_name(),
      'last_name': fake.last_name(),
      'email': fake.email(),
      'location': office,
      'country': office_country,
      'department': ic_department,
      'role_id': random.choice(ic_roles_map[ic_department]),
      'level': random.choice(ic_levels),
      'hire_date': hired_on,
      'termination_date': left_on,
      'performance_score': score,
      'satisfaction_score': random.randint(1, 5),
  }
  employees_data.append(ic_record)
  employee_id_counter += 1

In [None]:
import polars as pl
import datetime

# Spell out every column's dtype instead of relying on inference, which matters
# most for the date columns and for termination_date, where most records hold None.
explicit_schema = dict(
    employee_id=pl.Int64,
    first_name=pl.Utf8,
    last_name=pl.Utf8,
    email=pl.Utf8,
    location=pl.Utf8,
    country=pl.Utf8,
    department=pl.Utf8,
    role_id=pl.Int64,
    level=pl.Utf8,
    hire_date=pl.Date,
    termination_date=pl.Date,  # None entries become nulls in this Date column
    performance_score=pl.Int64,
    satisfaction_score=pl.Int64,
)

df_employees = pl.DataFrame(employees_data, schema=explicit_schema)

In [None]:
# Generate Salaries Table
# Every employee gets a starting salary drawn from the band of their location and level,
# effective on their hire date. Employees with more than a year of tenure have a 40%
# chance of one additional raise record dated after their first year.

# Band amounts are treated as EUR; these rates convert them to the office's local
# currency. Offices that are not listed here are paid in EUR.
local_currency_by_location = {
    'London': ('GBP', 0.85),
    'Copenhagen': ('DKK', 7.45),
}


def _to_local_currency(amount_eur, location):
  """Return (local_amount, currency_code) for an EUR amount paid at `location`."""
  currency_code, rate = local_currency_by_location.get(location, ('EUR', 1))
  return int(amount_eur * rate), currency_code


salaries_data = []
salary_id_counter = 1
today = datetime.date.today()

# iter_rows(named=True) yields plain Python values (datetime.date for dates, None for
# nulls), so no pandas round-trip or Timestamp/NaT handling is needed.
for row in df_employees.iter_rows(named=True):
  location = row['location']
  hire_date = row['hire_date']
  termination_date = row['termination_date']

  # Indexing (instead of .get) fails with a KeyError naming the missing band
  if row['level'] == leadership_level:
    band_low, band_high = leadership_salary_bands[location]
  else:
    band_low, band_high = salary_bands[(location, row['level'])]
  base_salary_eur = random.randint(band_low, band_high)
  salary_local, currency = _to_local_currency(base_salary_eur, location)

  salaries_data.append({
      'salary_id': salary_id_counter,
      'employee_id': row['employee_id'],
      'salary': salary_local,
      'currency': currency,
      'salary_in_eur': base_salary_eur,
      'effective_date': hire_date
  })
  salary_id_counter += 1

  if (today - hire_date).days > 365 and random.random() < 0.4:
    # Any Faker instance can produce the date; the locale does not matter here
    increase_date = faker_instance['France'].date_between(
        start_date=hire_date + datetime.timedelta(days=365), end_date='today')
    # Skip raises that would take effect on or after the employee's termination
    if termination_date is None or increase_date < termination_date:
      # One raise factor for both columns: the local amount is derived from the raised
      # EUR amount, so 'salary' and 'salary_in_eur' always describe the same raise.
      raise_factor = random.uniform(1.05, 1.15)
      new_salary_eur = int(base_salary_eur * raise_factor)
      new_salary_local, _ = _to_local_currency(new_salary_eur, location)
      salaries_data.append({
          'salary_id': salary_id_counter,
          'employee_id': row['employee_id'],
          'salary': new_salary_local,
          'currency': currency,
          'salary_in_eur': new_salary_eur,
          'effective_date': increase_date
      })
      salary_id_counter += 1

In [None]:
# Build the salaries table, then split the two score columns off the employees
# table into a separate performance table keyed by employee_id.
score_columns = ['performance_score', 'satisfaction_score']

df_salaries = pl.DataFrame(salaries_data)
df_performance = df_employees.select(['employee_id', *score_columns])
df_employees = df_employees.drop(score_columns)

In [None]:
# Saving to CSV: one file per table, written to the current working directory
csv_exports = [
    (df_employees, 'employees.csv'),
    (df_roles, 'job_roles.csv'),
    (df_salaries, 'salaries.csv'),
    (df_performance, 'performance.csv'),
]
for table, csv_name in csv_exports:
  table.write_csv(csv_name)

In [None]:
# Verification
# The per-level counts are computed before printing. Keeping the quoted column name
# out of the f-strings avoids reusing the same quote character inside an f-string,
# which is a syntax error on Python versions before 3.12.
level_column = pl.col('level')
leadership_count = df_employees.filter(level_column == 'T1').height
manager_count = df_employees.filter(level_column == 'T2').height
ic_count = df_employees.filter(level_column.is_in(['T3','T4','T5'])).height

print('CSV file generation complete with explicit manager roles')
print('-' * 40)
print(f'Total Employees Generated: {df_employees.height}')
print(f'-T1 Leadership: {leadership_count}')
print(f'-T2 Managers: {manager_count}')
print(f'-T3-T5 ICs: {ic_count}')
print('-'*40)

CSV file generation complete with explicit manager roles
----------------------------------------
Total Employees Generated: 4500
-T1 Leadership: 7
-T2 Managers: 441
-T3-T5 ICs: 4052
----------------------------------------
