In [1]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

In [2]:
# Initialize Faker for generating fake names
fake = Faker()

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)

# Set the number of samples you want in your dataset
num_samples = 1000  # Adjust this number as needed

# Generate unique Employee IDs
employee_ids = random.sample(range(1000, 10000), num_samples)  # IDs between 1000 and 9999

# Define possible values for categorical variables
genders = ['Male', 'Female']
roles = [
    'Account Executive', 'Sales Manager', 'Marketing Specialist',
    'Marketing Manager', 'Software Engineer', 'Engineering Manager', 'Director'
]
locations = ['Remote', 'Office-Based']
contracts = ['Full-time', 'Part-time']

# Generate random data
data = []

today = datetime.today().date()  # Convert to date object

# Define start date range from 01-01-2010 to today
start_date_earliest = datetime(2010, 1, 1).date()
start_date_latest = today

In [3]:
for employee_id in employee_ids:
    employee_name = fake.name()
    gender = random.choice(genders)
    age = random.randint(22, 65)

    # Start Date
    max_start_date_based_on_age = today - timedelta(days=(age - 18) * 365)
    adjusted_start_date_earliest = max(start_date_earliest, max_start_date_based_on_age)
    start_date = fake.date_between_dates(date_start=adjusted_start_date_earliest, date_end=start_date_latest)

    # Determine if the employee has left
    if random.random() < 0.3:  # 30% turnover rate
        turnover = 1
        # End Date can be any date after start_date + 30 days (minimum tenure of 30 days)
        min_end_date = start_date + timedelta(days=30)
        max_end_date = today
        if min_end_date >= max_end_date:
            min_end_date = max_end_date - timedelta(days=1)
        end_date = fake.date_between_dates(date_start=min_end_date, date_end=max_end_date)
        tenure_days = (end_date - start_date).days
    else:
        turnover = 0
        end_date = None
        tenure_days = (today - start_date).days

    # Ensure minimum tenure of 30 days
    if tenure_days < 30:
        tenure_days = 30
        start_date = today - timedelta(days=tenure_days)
        if turnover == 1:
            end_date = today

    # Tenure Calculations
    tenure_years = round(tenure_days / 365.25, 1)
    tenure_months = tenure_days // 30

    # Months in Role
    max_months_in_role = tenure_months
    if max_months_in_role < 1:
        max_months_in_role = 1
    months_in_role = random.randint(1, max_months_in_role)

    # Years of Experience
    min_years_experience = int(tenure_years) + 1
    max_years_experience = age - 18
    if max_years_experience < min_years_experience:
        max_years_experience = min_years_experience
    years_experience = random.randint(min_years_experience, max_years_experience)


    # Promotion history: employees with tenure > 3 years have at least 1 promotion
    if tenure_years > 3:
        promotion_history = random.randint(1, 5)
    else:
        promotion_history = random.randint(0, 1)

    # Months in role calculation
    if promotion_history == 0:
        # No promotions, months_in_role is tenure in months, capped at 37
        months_in_role = min(tenure_months, 37)
    else:
        # Calculate the date to end promotions
        promotion_end_date = (end_date if end_date else today) - timedelta(days=1)
        # Generate promotion dates
        promotion_dates = sorted([
            fake.date_between_dates(
                date_start=start_date,
                date_end=promotion_end_date
            )
            for _ in range(promotion_history)
        ])
        # Ensure promotion dates are unique and sorted
        promotion_dates = sorted(set(promotion_dates))
        # Last promotion date
        last_promotion_date = promotion_dates[-1]
        # months_in_role is time from last promotion to end date (or today)
        months_since_last_promotion = ((end_date if end_date else today) - last_promotion_date).days // 30
        # Ensure months_in_role does not exceed 37 months and tenure in months
        months_in_role = min(months_since_last_promotion, 37, int(tenure_years * 12))
        # Ensure months_in_role is at least 1
        if months_in_role < 1:
            months_in_role = 1

    role = random.choice(roles)
    # Assign department based on role
    if role in ['Account Executive', 'Sales Manager']:
        department = 'Sales'
    elif role in ['Marketing Specialist', 'Marketing Manager']:
        department = 'Marketing'
    else:
        department = 'IT'

    starting_salary = random.randint(40000, 100000)
    current_salary = starting_salary + random.randint(0, 50000)
    location = random.choices(locations, weights=[3, 7], k=1)[0]
    contract = random.choices(contracts, weights=[9, 1], k=1)[0]
    avg_monthly_hours = random.randint(120, 200)

    # Adjust Performance Score Based on Promotion History
    if promotion_history > 0:
        # Employees with promotions are more likely to have higher performance scores
        last_performance_review_score = random.choices([3, 4, 5], weights=[1, 2, 3])[0]
    else:
        # Employees without promotions might have lower scores
        last_performance_review_score = random.choices([1, 2, 3, 4, 5], weights=[2, 2, 3, 2, 1])[0]

    data.append({
        'Employee Name': employee_name,
        'Employee ID': employee_id,
        'Gender': gender,
        'Age': age,
        'Tenure': tenure_years,
        'Role': role,
        'Department': department,
        'Starting Salary': starting_salary,
        'Current Salary': current_salary,
        'Location': location,
        'Contract': contract,
        'Years of Experience': years_experience,
        'Average Monthly Working Hours': avg_monthly_hours,
        'Months in Role': months_in_role,
        'Promotion History': promotion_history,
        'Last Performance Review Score': last_performance_review_score,
        'Start Date': start_date,
        'End Date': end_date,
        'Turnover': turnover
    })

# Create a DataFrame
df = pd.DataFrame(data)

In [4]:
# Save to CSV
# df.head(900).to_csv('employee_data_train.csv', index=False)
# df.tail(100).to_csv('employee_data_test.csv', index=False)

# Display the first few rows
print(df.head())

     Employee Name  Employee ID Gender  Age  Tenure               Role  \
0    Nicole Morgan         2824   Male   24     3.7  Account Executive   
1    Emily Elliott         1409   Male   48    14.6  Account Executive   
2     Meghan Irwin         5506   Male   65     4.2  Marketing Manager   
3     Kevin Vargas         5012   Male   63     0.3      Sales Manager   
4  Kimberly Walker         4657   Male   36     3.2  Account Executive   

  Department  Starting Salary  Current Salary      Location   Contract  \
0      Sales            55387           90615        Remote  Full-time   
1      Sales            85238          109521        Remote  Full-time   
2  Marketing            58080           70519  Office-Based  Part-time   
3      Sales            48252           85252        Remote  Full-time   
4      Sales            58171           67653  Office-Based  Full-time   

   Years of Experience  Average Monthly Working Hours  Months in Role  \
0                    5               