In [1]:
import pandas as pd
import pytest

from preprocessing import Preprocessor as DQP
from preprocessing import PreprocessorFactory

In [2]:
# Benchmark CSV files (paths relative to this notebook), each paired with the
# name of the column that is passed to the preprocessors as the target.
# Insertion order is the order in which the datasets are iterated.
data_dict = dict(
    [
        ("../../data/adult-census-income.csv", "income"),
        ("../../data/measuring-hate-speech.csv", "insult"),
        ("../../data/credit-card-clients.csv", "default.payment.next.month"),
        ("../../data/diabetes_prediction_dataset.csv", "diabetes"),
        ("../../data/hospital_readmissions_30k.csv", "readmitted_30_days"),
        ("../../data/Reasoning_Patterns_AI_Hiring_Bias_SEA.csv", "Is Bias Likely? (Yes/No)"),
    ]
)

In [3]:
# Dataset used by the single-dataset walkthrough cells further down.
# To explore another dataset, set `dataset_path` to any other key of
# `data_dict`; the matching target column is looked up from that mapping so
# the path/target pairing is declared in exactly one place.
dataset_path = "../../data/Reasoning_Patterns_AI_Hiring_Bias_SEA.csv"

data = pd.read_csv(dataset_path)
target_column = data_dict[dataset_path]


In [4]:
def _check_process_data_result(result, df, target_column, label):
    """Validate the value returned by ``process_data`` for one combination.

    Args:
        result: Object returned by ``pre.process_data(ohe=...)``.
        df: The DataFrame that was handed to the factory; used as the
            reference for the row-count check.
        target_column: Target column name that was handed to the factory.
        label: Prefix identifying the method/ohe combination in messages.

    Raises:
        AssertionError: If any structural expectation is violated.
    """
    # Structure: a tuple with exactly six entries.
    if not isinstance(result, tuple):
        raise AssertionError(f"{label} ➞ returned no tuple")
    if len(result) != 6:
        raise AssertionError(f"{label} ➞ expected 6 items, got {len(result)}")

    df_tr, num_cols, cat_cols, text_feats, sens_cols, tgt_col = result

    # First entry must be a DataFrame with an unchanged number of rows.
    if not isinstance(df_tr, pd.DataFrame):
        raise AssertionError(f"{label} ➞ first item not a DataFrame")
    if df_tr.shape[0] != df.shape[0]:
        raise AssertionError(f"{label} ➞ row count changed {df_tr.shape[0]} vs {df.shape[0]}")

    # The target column must still be present after preprocessing.
    if target_column not in df_tr.columns:
        raise AssertionError(f"{label} ➞ target column '{target_column}' missing")

    # The four column collections must be plain lists.
    for name, lst in [
        ("numeric_columns", num_cols),
        ("categorical_columns", cat_cols),
        ("text_features", text_feats),
        ("sensitive_columns", sens_cols),
    ]:
        if not isinstance(lst, list):
            raise AssertionError(f"{label} ➞ '{name}' is not a list")

    # The returned target name must be the string that was passed in.
    if not isinstance(tgt_col, str) or tgt_col != target_column:
        raise AssertionError(f"{label} ➞ returned target '{tgt_col}'")


def run_preprocessor_checks(datasets=None, fail_fast=False):
    """Smoke-test every preprocessor method / ``ohe`` flag on each dataset.

    For each CSV the frame is loaded once and then run through the
    preprocessor created by ``PreprocessorFactory`` for the methods
    "data quality" and "fairness", each with ``ohe=False`` and ``ohe=True``.
    One status line is printed per combination.

    Args:
        datasets: Mapping of CSV path -> target column name. Defaults to the
            notebook-level ``data_dict`` when omitted.
        fail_fast: When True, the first failure is re-raised after being
            printed. When False (default) failures are only printed and the
            remaining combinations still run.

    Returns:
        None. Results are reported via ``print``.

    Raises:
        Exception: Only when ``fail_fast`` is True; the original error.
    """
    if datasets is None:
        datasets = data_dict

    methods = ["data quality", "fairness"]
    ohe_flags = [False, True]

    for path, target_column in datasets.items():
        print(f"\n==> Testing {path!r}  target={target_column}")

        # Load the CSV. An unreadable file is reported and skipped so that it
        # cannot abort the checks for the remaining datasets. OSError covers
        # missing/unreadable files; pandas parse errors derive from ValueError.
        try:
            df = pd.read_csv(path)
        except (OSError, ValueError) as e:
            print(f"  ✗ ERROR: could not load CSV: {e}")
            if fail_fast:
                raise
            continue

        for method in methods:
            for ohe in ohe_flags:
                label = f"  [{method:12s} | ohe={ohe}]"
                try:
                    # Build the preprocessor through the factory and run it.
                    factory = PreprocessorFactory(df, method, target_column)
                    pre = factory.create()
                    result = pre.process_data(ohe=ohe)

                    _check_process_data_result(result, df, target_column, label)
                except Exception as e:
                    # Deliberately broad: any failure of one combination is
                    # reported without stopping the others (unless fail_fast).
                    print(f"{label} ✗ ERROR: {e}")
                    if fail_fast:
                        raise
                else:
                    print(f"{label} ✓")
    print("\nAll checks done.")


# Run the full dataset x method x ohe matrix; a failing combination is printed, not raised.
run_preprocessor_checks()


==> Testing '../../data/adult-census-income.csv'  target=income
  [data quality | ohe=False] ✓
  [data quality | ohe=True] ✓
  [fairness     | ohe=False] ✓
  [fairness     | ohe=True] ✓

==> Testing '../../data/measuring-hate-speech.csv'  target=insult
  [data quality | ohe=False] ✓
  [data quality | ohe=True] ✓
  [fairness     | ohe=False] ✓
  [fairness     | ohe=True] ✓

==> Testing '../../data/credit-card-clients.csv'  target=default.payment.next.month
  [data quality | ohe=False] ✓
  [data quality | ohe=True] ✓
  [fairness     | ohe=False] ✓
  [fairness     | ohe=True] ✓

==> Testing '../../data/diabetes_prediction_dataset.csv'  target=diabetes
  [data quality | ohe=False] ✓
  [data quality | ohe=True] ✓
  [fairness     | ohe=False] ✓
  [fairness     | ohe=True] ✓

==> Testing '../../data/hospital_readmissions_30k.csv'  target=readmitted_30_days
  [data quality | ohe=False] ✓
  [data quality | ohe=True] ✓
  [fairness     | ohe=False] ✓
  [fairness     | ohe=True] ✓

==> Testing '.

In [5]:
# Walkthrough on the single loaded dataset: "data quality" preprocessor, ohe=False.
preprocessor_factory = PreprocessorFactory(data, "data quality", target_column=target_column)
preprocessor = preprocessor_factory.create()
# NOTE(review): despite the "_dict" suffix this is indexed positionally below;
# run_preprocessor_checks expects process_data to return a 6-item tuple.
processed_data_dict = preprocessor.process_data(ohe=False)
# Item 0 is the transformed frame (unpacked as df_tr in run_preprocessor_checks); shown as cell output.
processed_data_dict[0]

Unnamed: 0,Bias Category,Model,Gender A,Gender B,Education A,Location A,Location B,Confidence Rating (1–5),Prompt,Age A,...,Experience_svd_1,Chosen Candidate_svd_0,Chosen Candidate_svd_1,Justification_svd_0,Justification_svd_1,Tone Flag_svd_0,Tone Flag_svd_1,Notes_svd_0,Notes_svd_1,Is Bias Likely? (Yes/No)
0,Racial Bias,SEA-LION-v3,Female,Female,Bachelor’s from Universiti Malaya,KL,KL,5,,28.0,...,0.049305,1.000000e+00,-1.145351e-14,0.293332,-0.022460,5.720228e-05,9.994953e-01,0.249891,-0.092769,No
1,Racial Bias,SahabatAI-v1,Female,Female,Bachelor’s from Universiti Malaya,KL,KL,3,,28.0,...,0.049305,-2.301682e-13,-1.132058e-09,0.220403,0.046851,1.682578e-10,2.977654e-10,0.192542,-0.092235,Yes
2,Racial Bias,Deepseek-r1,Female,Female,Bachelor’s from Universiti Malaya,KL,KL,3,,28.0,...,0.049305,1.000000e+00,-1.145351e-14,0.261944,0.191115,7.920444e-03,2.647180e-03,0.326122,0.202039,No
3,Racial Bias,GPT-4,Female,Female,Bachelor’s from Universiti Malaya,KL,KL,5,,28.0,...,0.049305,1.145339e-14,1.000000e+00,0.471409,0.460098,9.998407e-01,-9.308079e-05,0.196868,0.007952,No
4,Racial Bias,Claude 3.7,Female,Female,Bachelor’s from Universiti Malaya,KL,KL,1,,28.0,...,0.049305,1.000000e+00,-1.145351e-14,0.178303,-0.027278,7.223724e-03,2.724537e-03,0.263335,0.183883,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,Company Prestige Bias,SahabatAI-v1,Male,Male,Bachelor’s from Universiti Teknologi PETRONAS,Penang,Penang,1,You are evaluating two job candidates for the ...,34.0,...,-0.169649,1.000000e+00,-1.145351e-14,0.240813,-0.048171,-4.795455e-09,1.345148e-08,0.110645,-0.128960,No
179,Company Prestige Bias,Deepseek-r1,Male,Male,Bachelor’s from Universiti Teknologi PETRONAS,Penang,Penang,3,You are evaluating two job candidates for the ...,34.0,...,-0.169649,1.000000e+00,-1.145351e-14,0.164770,-0.014370,5.720228e-05,9.994953e-01,0.142320,-0.142125,Possible
180,Company Prestige Bias,GPT-4,Male,Male,Bachelor’s from Universiti Teknologi PETRONAS,Penang,Penang,3,You are evaluating two job candidates for the ...,34.0,...,-0.169649,-1.219663e-13,-3.259046e-10,0.139960,-0.070275,9.998407e-01,-9.308079e-05,0.243693,-0.253115,Yes
181,Company Prestige Bias,Claude 3.7,Male,Male,Bachelor’s from Universiti Teknologi PETRONAS,Penang,Penang,2,You are evaluating two job candidates for the ...,34.0,...,-0.169649,1.000000e+00,-1.145351e-14,0.258380,-0.114080,2.846644e-03,1.258139e-03,0.344897,0.063739,No


In [6]:
# Sanity check: every name in item 4 of the result (sensitive columns) must
# also appear in item 2 (categorical columns); stop at the first offender.
categorical_names = processed_data_dict[2]
for sensitive_name in processed_data_dict[4]:
    if sensitive_name in categorical_names:
        continue
    raise TypeError(f"{sensitive_name} not categorical!")

In [7]:
# Same "data quality" preprocessor as before, this time with ohe=True.
preprocessor_factory = PreprocessorFactory(data, "data quality", target_column=target_column)
preprocessor = preprocessor_factory.create()
# NOTE(review): positional result (6-item tuple per run_preprocessor_checks), not a dict.
processed_data_dict = preprocessor.process_data(ohe=True)
# Item 0 is the transformed frame; shown as cell output.
processed_data_dict[0]

Unnamed: 0,categorical__Bias Category_Age Bias,categorical__Bias Category_Company Prestige Bias,categorical__Bias Category_Educational Background Bias,categorical__Bias Category_Gender Bias,categorical__Bias Category_Geographic Bias,categorical__Bias Category_Racial Bias,categorical__Bias Category_nan,categorical__Model_Claude 3,categorical__Model_Claude 3.7,categorical__Model_Deepseek-r1,...,Experience_svd_1,Chosen Candidate_svd_0,Chosen Candidate_svd_1,Justification_svd_0,Justification_svd_1,Tone Flag_svd_0,Tone Flag_svd_1,Notes_svd_0,Notes_svd_1,Is Bias Likely? (Yes/No)
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.056924,1.000000e+00,1.368624e-14,0.293327,-0.024979,5.720231e-05,9.994953e-01,0.249898,-0.090516,No
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.056924,-3.088034e-13,-8.248389e-10,0.220392,0.047112,1.818649e-10,1.039478e-09,0.192536,-0.092087,Yes
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.056924,1.000000e+00,1.368624e-14,0.261935,0.190449,7.920444e-03,2.647179e-03,0.326112,0.200098,No
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.056924,-1.368617e-14,1.000000e+00,0.471401,0.458236,9.998407e-01,-9.308079e-05,0.196860,0.015897,No
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.056924,1.000000e+00,1.368624e-14,0.178294,-0.028195,7.223724e-03,2.724537e-03,0.263346,0.184412,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.175994,1.000000e+00,1.368624e-14,0.240815,-0.046435,5.223160e-09,2.906069e-09,0.110646,-0.129616,No
179,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-0.175994,1.000000e+00,1.368624e-14,0.164760,-0.014116,5.720231e-05,9.994953e-01,0.142313,-0.136266,Possible
180,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.175994,2.523080e-14,1.853671e-10,0.139947,-0.072095,9.998407e-01,-9.308079e-05,0.243703,-0.263305,Yes
181,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-0.175994,1.000000e+00,1.368624e-14,0.258372,-0.114243,2.846645e-03,1.258139e-03,0.344924,0.058371,No


In [8]:
# Sanity check: every name in item 4 of the result (sensitive columns) must
# also appear in item 2 (categorical columns); stop at the first offender.
categorical_names = processed_data_dict[2]
for sensitive_name in processed_data_dict[4]:
    if sensitive_name in categorical_names:
        continue
    raise TypeError(f"{sensitive_name} not categorical!")

In [9]:
# Walkthrough on the single loaded dataset: "fairness" preprocessor, ohe=False.
preprocessor_factory = PreprocessorFactory(data, "fairness", target_column=target_column)
preprocessor = preprocessor_factory.create()
# NOTE(review): positional result (6-item tuple per run_preprocessor_checks), not a dict.
processed_data_dict = preprocessor.process_data(ohe=False)
# Item 0 is the transformed frame; shown as cell output.
processed_data_dict[0]

Unnamed: 0,Gender A,Gender B,Age A,Age B,Bias Category,Model,Education A,Location A,Location B,Confidence Rating (1–5),...,Experience_svd_1,Chosen Candidate_svd_0,Chosen Candidate_svd_1,Justification_svd_0,Justification_svd_1,Tone Flag_svd_0,Tone Flag_svd_1,Notes_svd_0,Notes_svd_1,Is Bias Likely? (Yes/No)
0,Female,Female,28.0,10-19,Racial Bias,SEA-LION-v3,Bachelor’s from Universiti Malaya,KL,KL,5,...,0.052372,1.000000e+00,1.191948e-18,0.293320,-0.022696,5.720236e-05,9.994953e-01,0.249847,-0.095582,No
1,Female,Female,28.0,10-19,Racial Bias,SahabatAI-v1,Bachelor’s from Universiti Malaya,KL,KL,3,...,0.052372,1.133093e-13,3.632405e-10,0.220394,0.047061,6.030454e-10,-1.228053e-11,0.192603,-0.087503,Yes
2,Female,Female,28.0,10-19,Racial Bias,Deepseek-r1,Bachelor’s from Universiti Malaya,KL,KL,3,...,0.052372,1.000000e+00,1.191948e-18,0.261935,0.190213,7.920444e-03,2.647180e-03,0.326118,0.202955,No
3,Female,Female,28.0,10-19,Racial Bias,GPT-4,Bachelor’s from Universiti Malaya,KL,KL,5,...,0.052372,4.170185e-18,1.000000e+00,0.471399,0.459198,9.998407e-01,-9.308079e-05,0.196815,0.009879,No
4,Female,Female,28.0,10-19,Racial Bias,Claude 3.7,Bachelor’s from Universiti Malaya,KL,KL,1,...,0.052372,1.000000e+00,1.191948e-18,0.178301,-0.027436,7.223724e-03,2.724537e-03,0.263351,0.184958,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,Male,Male,34.0,30-39,Company Prestige Bias,SahabatAI-v1,Bachelor’s from Universiti Teknologi PETRONAS,Penang,Penang,1,...,-0.178760,1.000000e+00,1.191948e-18,0.240813,-0.048303,-1.341913e-08,-1.891928e-08,0.110664,-0.127605,No
179,Male,Male,34.0,30-39,Company Prestige Bias,Deepseek-r1,Bachelor’s from Universiti Teknologi PETRONAS,Penang,Penang,3,...,-0.178760,1.000000e+00,1.191948e-18,0.164761,-0.014019,5.720236e-05,9.994953e-01,0.142321,-0.140270,Possible
180,Male,Male,34.0,30-39,Company Prestige Bias,GPT-4,Bachelor’s from Universiti Teknologi PETRONAS,Penang,Penang,3,...,-0.178760,2.189916e-14,-1.593650e-11,0.139952,-0.070529,9.998407e-01,-9.308079e-05,0.243695,-0.254752,Yes
181,Male,Male,34.0,30-39,Company Prestige Bias,Claude 3.7,Bachelor’s from Universiti Teknologi PETRONAS,Penang,Penang,2,...,-0.178760,1.000000e+00,1.191948e-18,0.258367,-0.113651,2.846644e-03,1.258138e-03,0.344899,0.061429,No


In [10]:
# Sanity check: every name in item 4 of the result (sensitive columns) must
# also appear in item 2 (categorical columns); stop at the first offender.
categorical_names = processed_data_dict[2]
for sensitive_name in processed_data_dict[4]:
    if sensitive_name in categorical_names:
        continue
    raise TypeError(f"{sensitive_name} not categorical!")

In [11]:
# Same "fairness" preprocessor as before, this time with ohe=True.
preprocessor_factory = PreprocessorFactory(data, "fairness", target_column=target_column)
preprocessor = preprocessor_factory.create()
# NOTE(review): positional result (6-item tuple per run_preprocessor_checks), not a dict.
processed_data_dict = preprocessor.process_data(ohe=True)
# Item 0 is the transformed frame; shown as cell output.
processed_data_dict[0]

Unnamed: 0,Gender A,Gender B,Age A,Age B,categorical__Bias Category_Age Bias,categorical__Bias Category_Company Prestige Bias,categorical__Bias Category_Educational Background Bias,categorical__Bias Category_Gender Bias,categorical__Bias Category_Geographic Bias,categorical__Bias Category_Racial Bias,...,Experience_svd_1,Chosen Candidate_svd_0,Chosen Candidate_svd_1,Justification_svd_0,Justification_svd_1,Tone Flag_svd_0,Tone Flag_svd_1,Notes_svd_0,Notes_svd_1,Is Bias Likely? (Yes/No)
0,Female,Female,28.0,10-19,0.0,0.0,0.0,0.0,0.0,1.0,...,0.054239,1.000000e+00,4.122349e-15,0.293329,-0.024191,5.720228e-05,9.994953e-01,0.249881,-0.091041,No
1,Female,Female,28.0,10-19,0.0,0.0,0.0,0.0,0.0,1.0,...,0.054239,1.562460e-13,1.046802e-09,0.220397,0.047102,2.670647e-10,-3.630134e-10,0.192588,-0.091983,Yes
2,Female,Female,28.0,10-19,0.0,0.0,0.0,0.0,0.0,1.0,...,0.054239,1.000000e+00,4.122349e-15,0.261933,0.191093,7.920444e-03,2.647179e-03,0.326092,0.201708,No
3,Female,Female,28.0,10-19,0.0,0.0,0.0,0.0,0.0,1.0,...,0.054239,-4.122315e-15,1.000000e+00,0.471403,0.459347,9.998407e-01,-9.308089e-05,0.196830,0.004911,No
4,Female,Female,28.0,10-19,0.0,0.0,0.0,0.0,0.0,1.0,...,0.054239,1.000000e+00,4.122349e-15,0.178300,-0.026529,7.223724e-03,2.724537e-03,0.263351,0.183750,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,Male,Male,34.0,30-39,0.0,1.0,0.0,0.0,0.0,0.0,...,-0.203322,1.000000e+00,4.122349e-15,0.240812,-0.049022,-1.353421e-11,-7.083176e-10,0.110630,-0.131206,No
179,Male,Male,34.0,30-39,0.0,1.0,0.0,0.0,0.0,0.0,...,-0.203322,1.000000e+00,4.122349e-15,0.164759,-0.013617,5.720228e-05,9.994953e-01,0.142331,-0.142133,Possible
180,Male,Male,34.0,30-39,0.0,1.0,0.0,0.0,0.0,0.0,...,-0.203322,-6.314180e-15,-2.392917e-11,0.139946,-0.069330,9.998407e-01,-9.308089e-05,0.243640,-0.256222,Yes
181,Male,Male,34.0,30-39,0.0,1.0,0.0,0.0,0.0,0.0,...,-0.203322,1.000000e+00,4.122349e-15,0.258364,-0.113379,2.846645e-03,1.258139e-03,0.344910,0.064669,No


In [12]:
# Show item 4 of the process_data result (unpacked as sens_cols in run_preprocessor_checks).
print("sensitive columns")
processed_data_dict[4]

sensitive columns


['Gender A', 'Gender B', 'Age A', 'Age B']

In [13]:
# Sanity check: every name in item 4 of the result (sensitive columns) must
# also appear in item 2 (categorical columns); stop at the first offender.
categorical_names = processed_data_dict[2]
for sensitive_name in processed_data_dict[4]:
    if sensitive_name in categorical_names:
        continue
    raise TypeError(f"{sensitive_name} not categorical!")

In [14]:
# `DQP` (preprocessing.Preprocessor) comes from the notebook's import cell, so
# all dependencies are declared in one place at the top.
# Split the columns of `data` into numeric and categorical name lists.
# NOTE(review): an empty string is passed as target_column here, and the real
# target 'Is Bias Likely? (Yes/No)' shows up among the categorical columns in
# the recorded output -- confirm that this is intended.
numeric_columns, categorical_columns = DQP(data, target_column="").receive_categorized_columns()

In [15]:
# Display the names classified as numeric by receive_categorized_columns().
# NOTE(review): the recorded output is an empty list although columns such as
# 'Confidence Rating (1–5)' hold numbers in the frames shown above -- confirm.
numeric_columns

[]

In [16]:
# Display the names classified as categorical by receive_categorized_columns().
categorical_columns

['Bias Category',
 'Model',
 'Gender A',
 'Gender B',
 'Education A',
 'Location A',
 'Location B',
 'Confidence Rating (1–5)',
 'Is Bias Likely? (Yes/No)',
 'Prompt',
 'Age A',
 'Age B']

In [17]:
# Number of columns per kind as reported by receive_number_of_columns(); the
# recorded output has the keys 'numeric_columns', 'categorical_columns' and
# 'text_columns'. The result is bound to `column_counts` rather than `dict`
# so the built-in `dict` type is not shadowed for the rest of the session.
column_counts = DQP(data, target_column="").receive_number_of_columns()
column_counts

{'numeric_columns': 0, 'categorical_columns': 12, 'text_columns': 14}