In [None]:
import numpy as np
import pandas as pd
import random

!pip install faker
from faker import Faker
from datetime import datetime, timedelta



Collecting faker
  Downloading faker-37.0.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.0.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.0.0


# Dataset generation

In [None]:
# Initialize Faker for UK names
fake = Faker("en_GB")

# The generator below draws from three separate random sources (the stdlib
# `random` module, NumPy's global RNG and Faker). Seed all of them so that
# re-running the notebook from a fresh kernel produces the same random draws.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)

# Define gender distribution (labels and their sampling probabilities, in matching order)
GENDER_DIST = ['Female', 'Male']
GENDER_PROB = [0.46, 0.54]

# Define UK regions and cities
UK_REGIONS_CITIES = {
    "London": ["London", "Croydon", "Harrow"],
    "South East": ["Brighton", "Reading", "Southampton"],
    "North West": ["Manchester", "Liverpool", "Preston"],
    "Midlands": ["Birmingham", "Leicester", "Nottingham"],
    "Scotland": ["Edinburgh", "Glasgow", "Aberdeen"]
}

# Define department probabilities
DEPARTMENTS = {
    "HR": 0.10, "Finance": 0.12, "IT": 0.15, "Marketing": 0.10,
    "Operations": 0.18, "Sales": 0.20, "Customer Service": 0.15
}

# Define job titles and probabilities within each department
JOB_TITLES = {
    "HR": [("HR Manager", 0.3), ("HR Officer", 0.4), ("Recruitment Consultant", 0.3)],
    "Finance": [("Finance Manager", 0.3), ("Accountant", 0.4), ("Auditor", 0.3)],
    "IT": [("Software Engineer", 0.4), ("Data Analyst", 0.3), ("IT Support Technician", 0.3)],
    "Marketing": [("Marketing Manager", 0.3), ("SEO Specialist", 0.4), ("Content Writer", 0.3)],
    "Operations": [("Operations Manager", 0.3), ("Supply Chain Analyst", 0.4), ("Logistics Coordinator", 0.3)],
    "Sales": [("Sales Manager", 0.3), ("Sales Executive", 0.4), ("Business Development Executive", 0.3)],
    "Customer Service": [("Customer Service Manager", 0.3), ("Customer Service Advisor", 0.4), ("Call Centre Agent", 0.3)]
}

# Define education levels for job titles
EDUCATION_LEVELS = {
    "HR Manager": "Master's", "HR Officer": "Bachelor's", "Recruitment Consultant": "Bachelor's",
    "Finance Manager": "Master's", "Accountant": "Bachelor's", "Auditor": "Bachelor's",
    "Software Engineer": "Bachelor's", "Data Analyst": "Bachelor's", "IT Support Technician": "Diploma",
    "Marketing Manager": "Master's", "SEO Specialist": "Bachelor's", "Content Writer": "Bachelor's",
    "Operations Manager": "Master's", "Supply Chain Analyst": "Bachelor's", "Logistics Coordinator": "Diploma",
    "Sales Manager": "Bachelor's", "Sales Executive": "Bachelor's", "Business Development Executive": "Diploma",
    "Customer Service Manager": "Bachelor's", "Customer Service Advisor": "Diploma", "Call Centre Agent": "A-Level"
}

# Define UK salary ranges (£ per year) as (low, high) pairs per job title
SALARY_RANGES = {
    "HR Manager": (45000, 65000), "HR Officer": (30000, 45000), "Recruitment Consultant": (28000, 40000),
    "Finance Manager": (55000, 75000), "Accountant": (35000, 50000), "Auditor": (32000, 48000),
    "Software Engineer": (40000, 70000), "Data Analyst": (35000, 55000), "IT Support Technician": (25000, 40000),
    "Marketing Manager": (45000, 60000), "SEO Specialist": (30000, 45000), "Content Writer": (28000, 42000),
    "Operations Manager": (50000, 75000), "Supply Chain Analyst": (35000, 55000), "Logistics Coordinator": (28000, 45000),
    "Sales Manager": (45000, 70000), "Sales Executive": (30000, 50000), "Business Development Executive": (28000, 45000),
    "Customer Service Manager": (35000, 50000), "Customer Service Advisor": (25000, 35000), "Call Centre Agent": (21000, 30000)
}

# Define probabilities for performance ratings
PERFORMANCE_RATINGS = ["Excellent", "Good", "Satisfactory", "Needs Improvement"]
PERFORMANCE_PROB = [0.25, 0.40, 0.25, 0.10]

# Define hire year probabilities (one probability per year in HIRE_YEARS)
HIRE_YEARS = list(range(2015, 2025))
HIRE_YEAR_PROB = [0.05, 0.07, 0.08, 0.08, 0.09, 0.10, 0.12, 0.13, 0.14, 0.14]

# Define termination probability
TERMINATION_PROB = 0.112  # 11.2% of employees will have termination dates


# Sanity checks on the tables above. They are cheap and fail here, next to the
# table that is wrong, instead of part-way through the generation loop.
def _sums_to_one(probabilities):
    """Return True when ``probabilities`` adds up to 1 within float tolerance."""
    return abs(sum(probabilities) - 1.0) < 1e-9


assert len(GENDER_DIST) == len(GENDER_PROB) and _sums_to_one(GENDER_PROB), \
    "GENDER_PROB must pair with GENDER_DIST and sum to 1"
assert _sums_to_one(DEPARTMENTS.values()), "DEPARTMENTS probabilities must sum to 1"
assert set(JOB_TITLES) == set(DEPARTMENTS), "every department needs a JOB_TITLES entry"
assert all(_sums_to_one([weight for _, weight in titles]) for titles in JOB_TITLES.values()), \
    "job-title probabilities must sum to 1 within each department"
assert len(PERFORMANCE_RATINGS) == len(PERFORMANCE_PROB) and _sums_to_one(PERFORMANCE_PROB), \
    "PERFORMANCE_PROB must pair with PERFORMANCE_RATINGS and sum to 1"
assert len(HIRE_YEARS) == len(HIRE_YEAR_PROB) and _sums_to_one(HIRE_YEAR_PROB), \
    "HIRE_YEAR_PROB must pair with HIRE_YEARS and sum to 1"
assert {title for titles in JOB_TITLES.values() for title, _ in titles} \
    == set(EDUCATION_LEVELS) == set(SALARY_RANGES), \
    "JOB_TITLES, EDUCATION_LEVELS and SALARY_RANGES must cover the same job titles"
assert all(low < high for low, high in SALARY_RANGES.values()), \
    "each salary range must be (low, high) with low < high"
assert 0.0 <= TERMINATION_PROB <= 1.0, "TERMINATION_PROB must be a probability"

# Function to generate employee records
def _years_before(day, years):
    """Return ``day`` moved ``years`` calendar years into the past.

    29 February falls back to 28 February when the target year is not a leap year.
    """
    try:
        return day.replace(year=day.year - years)
    except ValueError:
        return day.replace(year=day.year - years, day=28)


def generate_employee_data(n=7950, seed=None):
    """Generate a synthetic UK HR dataset with one row per employee.

    Parameters
    ----------
    n : int, default 7950
        Number of employee records to create. IDs run from ``E10000`` upwards.
    seed : int, optional
        When given, seeds the stdlib ``random`` module, NumPy's global RNG and
        the module-level ``fake`` instance before generating, so the same seed
        yields the same random draws. Must be valid for ``np.random.seed``
        (0 <= seed < 2**32). When ``None`` (default) no reseeding happens.

    Returns
    -------
    pandas.DataFrame
        Columns: Employee ID, First Name, Last Name, Gender, Region, City,
        Hire Date, Department, Job Title, Education Level, Performance Rating,
        Overtime, Salary (£), Birth Date, Termination Date, Adjusted Salary (£).
        ``Termination Date`` is ``None`` for employees who were not terminated.

    Notes
    -----
    * First names are drawn from the name pool matching the sampled gender.
    * Birth dates are derived from the hire date so that every employee is
      at least 22 and under 50 years old on the day they are hired.
    * ``Salary (£)`` is drawn with ``np.random.randint``, so the upper bound of
      each range in ``SALARY_RANGES`` is exclusive.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        fake.seed_instance(seed)

    # Age at hire is kept within [min_hire_age, max_hire_age).
    min_hire_age, max_hire_age = 22, 50

    # Loop-invariant lookups, built once instead of once per employee.
    regions = list(UK_REGIONS_CITIES.keys())
    department_names = list(DEPARTMENTS.keys())
    department_probs = list(DEPARTMENTS.values())
    last_termination_day = datetime(2025, 12, 31)

    employees = []

    for i in range(n):
        emp_id = f"E{10000 + i}"

        # Sample the gender first so the first name can be drawn to match it.
        gender = str(np.random.choice(GENDER_DIST, p=GENDER_PROB))
        first_name = fake.first_name_male() if gender == "Male" else fake.first_name_female()
        last_name = fake.last_name()

        region = random.choice(regions)
        city = random.choice(UK_REGIONS_CITIES[region])

        hire_year = int(np.random.choice(HIRE_YEARS, p=HIRE_YEAR_PROB))
        hire_date = fake.date_between_dates(datetime(hire_year, 1, 1), datetime(hire_year, 12, 31))

        department = str(np.random.choice(department_names, p=department_probs))
        title_options = JOB_TITLES[department]
        job_title, _ = random.choices(title_options, weights=[weight for _, weight in title_options])[0]
        education_level = EDUCATION_LEVELS[job_title]

        # Anchor the birth date to the hire date (not to today's date) so the
        # age at hire is always plausible. The +1 day keeps the oldest hire
        # one day short of their 50th birthday.
        earliest_birth = _years_before(hire_date, max_hire_age) + timedelta(days=1)
        latest_birth = _years_before(hire_date, min_hire_age)
        birth_date = fake.date_between_dates(earliest_birth, latest_birth)

        performance = str(np.random.choice(PERFORMANCE_RATINGS, p=PERFORMANCE_PROB))
        overtime = str(np.random.choice(["Yes", "No"], p=[0.30, 0.70]))

        salary = np.random.randint(*SALARY_RANGES[job_title])

        # A terminated employee leaves no sooner than 180 days after being hired.
        if random.random() < TERMINATION_PROB:
            termination_date = fake.date_between_dates(hire_date + timedelta(days=180), last_termination_day)
        else:
            termination_date = None

        # Adjusted salary applies a 2% uplift for "Male" and a 5% uplift for a Master's degree.
        adjusted_salary = salary * (1.02 if gender == "Male" else 1) * (1.05 if education_level == "Master's" else 1)

        employees.append([emp_id, first_name, last_name, gender, region, city, hire_date, department, job_title, education_level, performance, overtime, salary, birth_date, termination_date, adjusted_salary])

    return pd.DataFrame(employees, columns=["Employee ID", "First Name", "Last Name", "Gender", "Region", "City", "Hire Date", "Department", "Job Title", "Education Level", "Performance Rating", "Overtime", "Salary (£)", "Birth Date", "Termination Date", "Adjusted Salary (£)"])

# Build the synthetic employee table, then write it out as CSV without the row index
output_csv = "UK_HR_Dataset.csv"
df = generate_employee_data()
df.to_csv(output_csv, index=False)

NameError: name 'Faker' is not defined

In [None]:
from google.colab import drive
import pandas as pd

# Mount Google Drive under /content/drive
mount_point = '/content/drive'
drive.mount(mount_point)

# Load the CSV from the mounted Drive (change the path if the file lives in another folder)
dataset_path = '/content/drive/MyDrive/UK_HR_Dataset.csv'
df = pd.read_csv(dataset_path)

# Keep the frame as the cell's last expression so the notebook renders it
df

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,Employee ID,First Name,Last Name,Gender,Region,City,Hire Date,Department,Job Title,Education Level,Performance Rating,Overtime,Salary (£),Birth Date,Termination Date,Adjusted Salary (£)
0,E10000,Sandra,Wilson,Male,Scotland,Aberdeen,2017-02-03,IT,Software Engineer,Bachelor's,Excellent,No,62478,1966-02-02,,63727.560
1,E10001,Vanessa,Clarke,Female,North West,Manchester,2017-12-23,Sales,Sales Executive,Bachelor's,Satisfactory,No,30034,1995-05-01,,30034.000
2,E10002,Marie,Foster,Female,London,Harrow,2022-03-21,Customer Service,Customer Service Manager,Bachelor's,Good,No,40268,1985-07-14,,40268.000
3,E10003,Gemma,Hodgson,Female,London,London,2022-08-11,Finance,Accountant,Bachelor's,Excellent,Yes,36647,2000-01-18,,36647.000
4,E10004,Frances,Moore,Female,London,London,2018-05-16,IT,IT Support Technician,Diploma,Excellent,No,32791,1966-04-05,,32791.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7945,E17945,Bethany,Brown,Male,South East,Reading,2019-04-08,HR,HR Manager,Master's,Good,No,49329,1971-05-20,,52831.359
7946,E17946,Beth,Morgan,Male,London,London,2023-03-15,Sales,Sales Manager,Bachelor's,Good,No,68671,1967-09-14,,70044.420
7947,E17947,Clare,Allen,Female,South East,Brighton,2020-05-12,IT,Data Analyst,Bachelor's,Good,No,48223,1977-05-23,,48223.000
7948,E17948,Terence,Ryan,Female,South East,Southampton,2023-08-19,IT,IT Support Technician,Diploma,Good,Yes,26997,1980-09-28,,26997.000


In [None]:
# Print a structural summary of the loaded frame: index range, column names,
# non-null counts per column and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7950 entries, 0 to 7949
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Employee ID          7950 non-null   object 
 1   First Name           7950 non-null   object 
 2   Last Name            7950 non-null   object 
 3   Gender               7950 non-null   object 
 4   Region               7950 non-null   object 
 5   City                 7950 non-null   object 
 6   Hire Date            7950 non-null   object 
 7   Department           7950 non-null   object 
 8   Job Title            7950 non-null   object 
 9   Education Level      7950 non-null   object 
 10  Performance Rating   7950 non-null   object 
 11  Overtime             7950 non-null   object 
 12  Salary (£)           7950 non-null   int64  
 13  Birth Date           7950 non-null   object 
 14  Termination Date     907 non-null    object 
 15  Adjusted Salary (£)  7950 non-null   f