# Evaluation code for filter_dp

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import opendp as dp

Load and clean the datasets

In [None]:
# Column labels assigned to the input CSVs through read_csv's `names=` argument.
# NOTE(review): with `names=` and no `header=`, read_csv treats the first line
# of the file as data — confirm the input files have no header row.
salary_columns = [
    "Department",
    "Department_Name",
    "Division",
    "Gender",
    "Base_Salary",
    "Overtime_Pay",
    "Longevity_Pay",
    "Grade",
]
student_columns = [
    "gender",
    "race_ethnicity",
    "parental_education",
    "lunch",
    "test_preparation",
    "math_score",
    "reading_score",
    "writing_score",
]

# Original (input) datasets, read with the explicit column labels above.
input_salaries = pd.read_csv('input/EmployeeSalaries.csv', names=salary_columns)
input_students = pd.read_csv('input/StudentsPerformance.csv', names=student_columns)

# Perturbed (output) datasets; no `names=` is passed, so the column labels
# come from the files themselves.
output_salaries = pd.read_csv('output/EmployeeSalaries.perturbed.csv')
output_students = pd.read_csv('output/StudentsPerformance.perturbed.csv')

In [None]:
# Both output frames carry one additional leading column; keep every column
# after it (positional slice, so the column's label does not matter).
output_salaries, output_students = (
    frame.iloc[:, 1:] for frame in (output_salaries, output_students)
)


def _row_mentions_quit(row):
    """Return True when any cell of *row*, rendered as text, contains 'Quit'."""
    return row.astype(str).str.contains('Quit').any()


# Drop every row of the students output that contains 'Quit' in some cell
# (described in the original notebook as the last line of that file).
quit_rows = output_students.apply(_row_mentions_quit, axis=1)
output_students = output_students[~quit_rows]

# Base salary, as floats, for the original and the perturbed salaries data.
input_salaries_base = input_salaries['Base_Salary'].astype(float)
output_salaries_base = output_salaries['Base_Salary'].astype(float)

# The three score columns, as integers, for the original and perturbed students data.
score_columns = ['math_score', 'reading_score', 'writing_score']
input_students_scores = input_students[score_columns].astype(int)
output_students_scores = output_students[score_columns].astype(int)

In [None]:
# Compare the original and perturbed datasets: print mean / standard deviation
# and overlay histograms of the compared columns.


def _shared_hist_range(*datasets):
    """Return one (min, max) range covering all *datasets*, or None.

    Giving every plt.hist call in a figure the same bin count and the same
    range yields identical bin edges, so overlaid bars describe the same
    intervals and their heights can be compared directly. Without this, each
    call derives its edges from its own data and the bins do not line up.

    Non-finite values are ignored. None is returned when no finite value is
    present, which lets plt.hist fall back to its own default range.
    """
    values = np.concatenate(
        [np.asarray(data, dtype=float).ravel() for data in datasets]
    )
    values = values[np.isfinite(values)]
    if values.size == 0:
        return None
    return float(values.min()), float(values.max())


# Salaries dataset analysis
print("Salaries Dataset:")
for label, series in (("Original", input_salaries_base), ("Perturbed", output_salaries_base)):
    print("{} Base_Salary - Mean: {:.2f}, Std: {:.2f}".format(label, series.mean(), series.std()))

# One common range so both salary histograms share the same 20 bins.
salary_range = _shared_hist_range(input_salaries_base, output_salaries_base)

plt.figure(figsize=(8, 6))
plt.hist(input_salaries_base, bins=20, range=salary_range, alpha=0.5, label='Original')
plt.hist(output_salaries_base, bins=20, range=salary_range, alpha=0.5, label='Perturbed')
plt.xlabel('Base_Salary')
plt.ylabel('Frequency')
plt.legend()
plt.show()

# Students dataset analysis
score_columns = ['math_score', 'reading_score', 'writing_score']

print("\nStudents Dataset:")
for score in score_columns:
    for label, frame in (("Original", input_students_scores), ("Perturbed", output_students_scores)):
        print("{} {} - Mean: {:.2f}, Std: {:.2f}".format(label, score, frame[score].mean(), frame[score].std()))

# All six score histograms are drawn on one figure, so they all use one range.
score_range = _shared_hist_range(
    input_students_scores[score_columns], output_students_scores[score_columns]
)

plt.figure(figsize=(8, 6))
for score in score_columns:
    plt.hist(input_students_scores[score], bins=20, range=score_range, alpha=0.5, label='Original ' + score)
    plt.hist(output_students_scores[score], bins=20, range=score_range, alpha=0.5, label='Perturbed ' + score)
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.legend()
plt.show()
