In [None]:
import math

def calculate_n(confidence_level, tolerance):
    """Return the number of trials given by the Hoeffding sample-size bound.

    Computes the smallest integer ``n`` satisfying
    ``2 * exp(-2 * n * tolerance**2) <= alpha`` where
    ``alpha = 1 - confidence_level``, i.e.
    ``n = ceil(ln(2 / alpha) / (2 * tolerance**2))``.

    Args:
        confidence_level: Desired confidence as a fraction in ``[0, 1)``
            (e.g. ``0.98`` for 98%).
        tolerance: Allowed deviation. Must be non-zero; only its square is
            used, so the sign does not affect the result.

    Returns:
        int: The number of trials ``n``.

    Raises:
        ValueError: If ``confidence_level`` is outside ``[0, 1)`` (including
            NaN), or if ``tolerance`` is zero, NaN, or so small that its
            square underflows to zero.
    """
    # Written as a chained comparison so that NaN also fails the check.
    if not 0 <= confidence_level < 1:
        raise ValueError(
            f"confidence_level must satisfy 0 <= confidence_level < 1, got {confidence_level!r}"
        )

    denominator = 2 * tolerance ** 2
    # Guards against division by zero (tolerance == 0 or underflow) and NaN.
    if not denominator > 0:
        raise ValueError(f"tolerance must be non-zero, got {tolerance!r}")

    alpha = 1 - confidence_level
    return math.ceil(math.log(2 / alpha) / denominator)

confidence_level = 0.98  # 98% confidence level
tolerance = 0.05  # Error tolerance of 0.05

# Use Hoeffding's inequality to derive the number of trials from the
# confidence level and tolerance set above.
n = calculate_n(confidence_level, tolerance)
print(f"Number of trials needed (n): {n}")

In [None]:
import pandas as pd

def read_employee_data(file_path):
    """Load a CSV file and return its contents without the first column.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        numpy.ndarray: A 2-D array holding every row of the file and every
        column except the first one.
    """
    # Column 0 is sliced away; everything from column 1 onward is kept.
    return pd.read_csv(file_path).iloc[:, 1:].values

# Location of the input CSV.
# NOTE(review): this is an absolute path at the filesystem root — confirm it
# matches where the dataset lives in the environment running this notebook.
file_path = '/challenge_dataset.csv'

# Load the dataset as an array; read_employee_data drops the CSV's first column.
employee_data = read_employee_data(file_path)

In [None]:
import numpy as np

def divide_and_minimize_error(employee_data, num_trials, rng=None):
    """Randomly search for an even split of employees with balanced skill totals.

    Each trial shuffles the row indices, cuts them into two halves, and scores
    the split by the sum over columns of the absolute difference between the
    two halves' column sums. The lowest-scoring split seen is kept, printed,
    and returned.

    Args:
        employee_data: 2-D ``numpy.ndarray`` with one row per employee and one
            column per skill.
        num_trials: Number of random splits to try. If it is zero or negative
            no split is evaluated.
        rng: Optional ``numpy.random.Generator`` used to draw permutations,
            for reproducible runs. When ``None`` (the default) NumPy's global
            random state is used, so ``np.random.seed`` still applies.

    Returns:
        tuple: ``(best_group1, best_group2, best_errors)`` where the first two
        are the row subsets of ``employee_data`` for each half and
        ``best_errors`` is the per-column absolute difference of their sums.
        All three are ``None`` when no trial produced a finite-comparable
        score (e.g. ``num_trials <= 0`` or the data contains NaN).

    Raises:
        ValueError: If the number of rows is odd, or if ``employee_data`` is
            not 2-D (the shape unpacking fails).
    """
    # Unpacking two values also rejects arrays that are not 2-D.
    num_employees, _num_skills = employee_data.shape

    if num_employees % 2 != 0:
        raise ValueError("Number of employees must be even to divide into two equal groups.")

    half_size = num_employees // 2
    permute = np.random.permutation if rng is None else rng.permutation

    min_error = float('inf')
    best_errors = None
    # Initialised up front so the report below works even if no trial ever
    # improves on the initial infinite error.
    best_group1_indices = None
    best_group2_indices = None

    for _ in range(num_trials):
        indices = permute(num_employees)
        group1_indices = indices[:half_size]
        group2_indices = indices[half_size:]

        group1_sum = np.sum(employee_data[group1_indices], axis=0)
        group2_sum = np.sum(employee_data[group2_indices], axis=0)
        errors = np.abs(group1_sum - group2_sum)
        total_error = np.sum(errors)

        if total_error < min_error:
            min_error = total_error
            best_errors = errors
            best_group1_indices = group1_indices
            best_group2_indices = group2_indices

    # Materialise the row subsets once, for the winning split only.
    if best_group1_indices is None:
        best_group1 = None
        best_group2 = None
    else:
        best_group1 = employee_data[best_group1_indices]
        best_group2 = employee_data[best_group2_indices]

    print("Minimum total error achieved:", min_error)
    print("Best group 1:\n", best_group1_indices)
    print("Best group 2:\n", best_group2_indices)
    return best_group1, best_group2, best_errors

In [None]:
# Run the random search using the trial count n computed earlier in the
# notebook, then report the per-skill totals of the two resulting groups.
group1, group2, errors = divide_and_minimize_error(employee_data, n)
if group1 is None or group2 is None:
    print("Could not find a satisfactory division with the given number of trials.")
else:
    # Column-wise totals for each group, in group order.
    group1_skill_sums, group2_skill_sums = (
        np.sum(members, axis=0) for members in (group1, group2)
    )

    print("Group 1 skill sums:\n", group1_skill_sums)
    print("Group 2 skill sums:\n", group2_skill_sums)

    print("Error between Group 1 and Group 2 for each skill:\n", errors)
