In [7]:
import pandas as pd
import numpy as np
from faker import Faker
import random

fake = Faker('de_CH')

# Set fixed random seeds for reproducibility.
# The generators below draw from Python's `random` module and from Faker,
# so both must be seeded; seeding NumPy alone does not affect them.
np.random.seed(42)
random.seed(42)
Faker.seed(42)

# Job titles offered by each department. The insertion order of the keys
# also defines the order of the department list derived below.
job_titles = {}
job_titles['HR'] = ['HR Manager', 'HR Specialist']
job_titles['IT'] = ['IT Support Specialist', 'Software Developer', 'Senior Analyst']
job_titles['Sales'] = ['Sales Representative', 'Sales Manager']
job_titles['Marketing'] = ['Marketing Coordinator', 'Marketing Manager']
job_titles['Finance'] = ['Accountant', 'Financial Analyst']
job_titles['Operations'] = ['Operations Manager', 'Logistics Coordinator']

# Department names, taken from the mapping so the two can never disagree
departments = list(job_titles)

# Build one synthetic employee record (a dict with one entry per field)
def generate_employee(department, id):
    """Build one synthetic employee record.

    Args:
        department: Key into the module-level ``job_titles`` mapping; the
            job title is picked at random from that department's list.
        id: Value stored unchanged under ``'Employee_ID'``.

    Returns:
        dict: One record whose keys, in order, are Employee_ID, Name, Age,
        Gender, Department, Job_Title, Salary, Date_of_Hire,
        Performance_Rating, Email_Address, Address, Phone_Number,
        Contract_Type and Seniority. Salary comes from ``generate_salary``;
        personal details come from the module-level ``fake`` generator.
    """
    title = random.choice(job_titles[department])

    # Fields are filled in one at a time, in the order the columns should appear.
    record = {}
    record['Employee_ID'] = id
    record['Name'] = fake.name()
    record['Age'] = random.randint(25, 65)
    record['Gender'] = random.choice(['Male', 'Female', 'Other'])
    record['Department'] = department
    record['Job_Title'] = title
    record['Salary'] = generate_salary(title)
    record['Date_of_Hire'] = fake.date_between(start_date='-12y', end_date='today')
    record['Performance_Rating'] = random.randint(1, 5)
    record['Email_Address'] = fake.email()
    record['Address'] = fake.address()
    record['Phone_Number'] = fake.phone_number()
    record['Contract_Type'] = random.choice(['Permanent', 'Temporary', 'Contract'])
    record['Seniority'] = random.choice(['Junior', 'Mid-level', 'Senior'])
    return record

def generate_salary(job_title):
    """Return the fixed salary associated with ``job_title``.

    Args:
        job_title: Title to look up.

    Returns:
        int: The salary listed for the title, or 75000 when the title is
        not in the table.
    """
    fallback = 75000
    pay_table = dict([
        ('HR Manager', 120000),
        ('HR Specialist', 90000),
        ('IT Support Specialist', 95000),
        ('Software Developer', 115000),
        ('Senior Analyst', 130000),
        ('Sales Representative', 85000),
        ('Sales Manager', 130000),
        ('Marketing Coordinator', 80000),
        ('Marketing Manager', 120000),
        ('Accountant', 100000),
        ('Financial Analyst', 110000),
        ('Operations Manager', 125000),
        ('Logistics Coordinator', 70000),
    ])
    if job_title in pay_table:
        return pay_table[job_title]
    return fallback

# Generate data for 200 employees: each gets a randomly chosen department
# and a sequential Employee_ID (0..199)
employees = [generate_employee(random.choice(departments), x) for x in range(200)]

# Convert list of dictionaries to DataFrame
df = pd.DataFrame(employees)

# Re-seed before generating permissions. The permission generator draws from
# Python's `random` module and from Faker, so those are the generators that
# need seeding; the NumPy seed alone does not influence them.
np.random.seed(42)
random.seed(42)
Faker.seed(42)

# Take the IDs from the Employee_ID column rather than the DataFrame index,
# so permissions reference the real key even if the index is ever changed.
employee_ids = df['Employee_ID'].tolist()

# Define a function to generate permissions for each employee
def generate_permissions(employee_id):
    """Create between one and five random leave records for one employee.

    Args:
        employee_id: Value stored unchanged under ``'Employee_ID'`` in every
            record.

    Returns:
        list[dict]: Records with the keys Employee_ID, Permission_Type,
        Start_Date, End_Date and Status. Start dates come from the
        module-level ``fake`` generator (within the last two years); the end
        date is the start date plus a duration of 1, 3, 7, 14 or 28 days.
    """
    leave_types = ['Holiday', 'Sick Leave', 'Personal Leave', 'Maternity Leave', 'Paternity Leave']
    day_options = [1, 3, 7, 14, 28]
    outcomes = ['Approved', 'Pending', 'Denied']

    records = []
    # The record count is drawn once, before any per-record values.
    for _ in range(random.randint(1, 5)):
        kind = random.choice(leave_types)
        begins = fake.date_between(start_date='-2y', end_date='today')
        length_days = random.choice(day_options)
        outcome = random.choice(outcomes)

        record = {
            'Employee_ID': employee_id,
            'Permission_Type': kind,
            'Start_Date': begins,
            'End_Date': begins + pd.Timedelta(days=length_days),
            'Status': outcome,
        }
        records.append(record)
    return records

# Collect every employee's permission records into one flat list
all_permissions = []
for employee_id in employee_ids:
    all_permissions += generate_permissions(employee_id)

# One DataFrame row per permission record
permissions_df = pd.DataFrame(data=all_permissions)

# Write both tables to CSV, leaving out the DataFrame index
df.to_csv(path_or_buf='2_HR_dataset.csv', index=False)
permissions_df.to_csv(path_or_buf='2_Permissions_dataset.csv', index=False)


In [11]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

# Initialize a Faker generator
fake = Faker()

# Seed both random sources used in this cell (Faker and Python's `random`)
# so repeated runs draw the same random sequence. Transaction dates are
# still computed relative to the current date.
Faker.seed(42)
random.seed(42)

# Parameters for data generation
n_customers = 1000  # number of customers
n_transactions = 50000  # number of transactions

# Customer table: IDs run from 1 to n_customers inclusive
customer_ids = range(1, n_customers + 1)

# Each column is generated in full before the next one starts
customers_data = {"Customer_ID": customer_ids}
customers_data["Name"] = [fake.name() for _ in range(n_customers)]
customers_data["Age"] = [random.randint(18, 70) for _ in range(n_customers)]
customers_data["Location"] = [fake.city() for _ in range(n_customers)]
customers_data["Email"] = [fake.email() for _ in range(n_customers)]

customers_df = pd.DataFrame(data=customers_data)

# Generate Transaction Data
transaction_ids = range(1, n_transactions + 1)

# Built once instead of once per transaction
product_categories = ['Electronics', 'Clothing', 'Home & Garden', 'Sports', 'Beauty']

# Capture the reference date a single time so every transaction is dated
# relative to the same day, even if generation runs across midnight.
today = datetime.now().date()

transaction_data = {
    "Transaction_ID": transaction_ids,
    # Each transaction belongs to a randomly chosen existing customer
    "Customer_ID": [random.choice(customer_ids) for _ in transaction_ids],
    # Dates fall within the last 365 days, inclusive of today
    "Date": [today - timedelta(days=random.randint(0, 365)) for _ in transaction_ids],
    # Amounts are rounded to two decimal places so they read as currency values
    "Amount": [round(random.uniform(10.0, 500.0), 2) for _ in transaction_ids],
    "Product_Category": [random.choice(product_categories) for _ in transaction_ids],
}
transactions_df = pd.DataFrame(transaction_data)

# Write both tables to CSV without the DataFrame index
customers_df.to_csv('1_CustomersData.csv', index=False)
transactions_df.to_csv('1_TransactionsData.csv', index=False)

In [10]:
# Display the customers table as the cell's output.
# NOTE(review): the saved output below shows row indices up to 9998, while the
# generation cell sets n_customers = 1000, and this cell's execution count
# (In [10]) precedes the generation cell's (In [11]). The output looks stale;
# re-run this cell after the generation cell to confirm.
customers_df

Unnamed: 0,Customer_ID,Name,Age,Location,Email
0,1,Amy Jones,49,Jamietown,tara82@example.com
1,2,Michelle Romero,40,East Michelleview,laurenowens@example.net
2,3,Alexis Cooley,37,Valdezhaven,ojones@example.org
3,4,Kendra Dennis,43,Alyssahaven,hunterward@example.net
4,5,Jenny Gregory,18,West Scottville,melody67@example.net
...,...,...,...,...,...
9995,9996,Calvin Sutton,67,New Christopher,morganrivera@example.net
9996,9997,Vincent Long,65,Sandrafurt,tylerangela@example.net
9997,9998,James Bishop,38,South Dana,suzanne56@example.net
9998,9999,Karen Reyes,33,Lake Todd,smedina@example.org
