In [1]:
import pandas as pd
import pytest

from preprocessing import Preprocessor as DQP
from preprocessing import PreprocessorFactory

In [2]:
# Registry of the CSV files exercised by this notebook.
# Key: path of the CSV (relative to the notebook); value: name of its target column.
data_dict = {
    "../../data/adult-census-income.csv": "income",
    "../../data/measuring-hate-speech.csv": "insult",
    "../../data/credit-card-clients.csv": "default.payment.next.month",
    "../../data/diabetes_prediction_dataset.csv": "diabetes",
    "../../data/hospital_readmissions_30k.csv": "readmitted_30_days",
    "../../data/Reasoning_Patterns_AI_Hiring_Bias_SEA.csv": "Is Bias Likely? (Yes/No)",
}

In [3]:
# Dataset used by the single-run cells below.
# To inspect another dataset, point dataset_path at any other key of data_dict;
# the target column is looked up from data_dict so path and target cannot drift apart.
dataset_path = "../../data/Reasoning_Patterns_AI_Hiring_Bias_SEA.csv"
data = pd.read_csv(dataset_path)
target_column = data_dict[dataset_path]


In [4]:
def _validate_preprocessor_result(result, expected_rows, target_column, label):
    """Raise AssertionError if a ``process_data`` result has an unexpected structure.

    The result must be a 6-tuple ``(frame, numeric_columns, categorical_columns,
    text_features, sensitive_columns, target_column)``.

    Args:
        result: Value returned by ``process_data``.
        expected_rows: Row count of the input frame before preprocessing.
        target_column: Name of the target column that was requested.
        label: Prefix identifying the method/ohe combination in error messages.
    """
    # Overall structure: a tuple with exactly six items.
    if not isinstance(result, tuple):
        raise AssertionError(f"{label} ➞ returned no tuple")
    if len(result) != 6:
        raise AssertionError(f"{label} ➞ expected 6 items, got {len(result)}")

    df_tr, num_cols, cat_cols, text_feats, sens_cols, tgt_col = result

    # The first item is a DataFrame with the same number of rows as the input.
    if not isinstance(df_tr, pd.DataFrame):
        raise AssertionError(f"{label} ➞ first item not a DataFrame")
    if df_tr.shape[0] != expected_rows:
        raise AssertionError(f"{label} ➞ row count changed {df_tr.shape[0]} vs {expected_rows}")

    # The target column must survive preprocessing.
    if target_column not in df_tr.columns:
        raise AssertionError(f"{label} ➞ target column '{target_column}' missing")

    # The four column collections must be plain lists.
    for name, lst in [
        ("numeric_columns", num_cols),
        ("categorical_columns", cat_cols),
        ("text_features", text_feats),
        ("sensitive_columns", sens_cols),
    ]:
        if not isinstance(lst, list):
            raise AssertionError(f"{label} ➞ '{name}' is not a list")

    # The returned target name must be the requested one.
    if not isinstance(tgt_col, str) or tgt_col != target_column:
        raise AssertionError(f"{label} ➞ returned target '{tgt_col}'")


def run_preprocessor_checks(datasets=None, fail_fast=False):
    """Smoke-test every preprocessing method / ohe combination on each dataset.

    For each dataset the CSV is loaded once; then, for each method in
    ``["data quality", "fairness"]`` and each ``ohe`` flag in ``[False, True]``,
    a preprocessor is built via ``PreprocessorFactory`` and the structure of its
    ``process_data`` result is validated. One status line is printed per
    combination.

    Args:
        datasets: Mapping of CSV path -> target column name. Defaults to the
            notebook-level ``data_dict``.
        fail_fast: If True, re-raise the first error after reporting it instead
            of continuing with the remaining combinations.

    Returns:
        A list of ``(path, method, ohe, message)`` tuples, one per failed
        combination. Empty when every check passed.
    """
    if datasets is None:
        datasets = data_dict

    methods = ["data quality", "fairness"]
    ohe_flags = [False, True]
    failures = []

    for path, target_column in datasets.items():
        # Load the CSV once per dataset.
        df = pd.read_csv(path)
        # Captured up front so a later in-place change to df cannot mask a row-count change.
        expected_rows = df.shape[0]
        print(f"\n==> Testing {path!r}  target={target_column}")

        for method in methods:
            for ohe in ohe_flags:
                label = f"  [{method:12s} | ohe={ohe}]"
                try:
                    # Each run gets its own copy so one combination cannot affect the next.
                    # NOTE(review): not verified whether the preprocessors mutate their input.
                    factory = PreprocessorFactory(df.copy(), method, target_column)
                    pre = factory.create()
                    result = pre.process_data(ohe=ohe)

                    _validate_preprocessor_result(result, expected_rows, target_column, label)
                    print(f"{label} ✓")

                except Exception as e:
                    # Best-effort by default: report, record, and continue with the next combination.
                    print(f"{label} ✗ ERROR: {e}")
                    failures.append((path, method, ohe, str(e)))
                    if fail_fast:
                        raise
    print("\nAll checks done.")
    return failures


# Run the structural checks over every dataset registered in data_dict.
run_preprocessor_checks()


==> Testing '../../data/adult-census-income.csv'  target=income
  [data quality | ohe=False] ✓
  [data quality | ohe=True] ✓
  [fairness     | ohe=False] ✓
  [fairness     | ohe=True] ✓

==> Testing '../../data/measuring-hate-speech.csv'  target=insult
  [data quality | ohe=False] ✓
  [data quality | ohe=True] ✓
  [fairness     | ohe=False] ✓
  [fairness     | ohe=True] ✓

==> Testing '../../data/credit-card-clients.csv'  target=default.payment.next.month
  [data quality | ohe=False] ✓
  [data quality | ohe=True] ✓
  [fairness     | ohe=False] ✓
  [fairness     | ohe=True] ✓

==> Testing '../../data/diabetes_prediction_dataset.csv'  target=diabetes
  [data quality | ohe=False] ✓
  [data quality | ohe=True] ✓
  [fairness     | ohe=False] ✓
  [fairness     | ohe=True] ✓

==> Testing '../../data/hospital_readmissions_30k.csv'  target=readmitted_30_days
  [data quality | ohe=False] ✓
  [data quality | ohe=True] ✓
  [fairness     | ohe=False] ✓
  [fairness     | ohe=True] ✓

==> Testing '.

In [5]:
# "data quality" preprocessing of the selected dataset with ohe=False.
# Despite its name, processed_data_dict is the tuple returned by process_data();
# slot 0 is the transformed DataFrame, displayed as the cell output.
preprocessor_factory = PreprocessorFactory(
    data,
    "data quality",
    target_column=target_column,
)
preprocessor = preprocessor_factory.create()
processed_data_dict = preprocessor.process_data(ohe=False)
processed_data_dict[0]

Unnamed: 0,Bias Category,Model,Gender A,Gender B,Education A,Location A,Location B,Confidence Rating (1–5),Prompt,Age A,...,Experience_svd_1,Chosen Candidate_svd_0,Chosen Candidate_svd_1,Justification_svd_0,Justification_svd_1,Tone Flag_svd_0,Tone Flag_svd_1,Notes_svd_0,Notes_svd_1,Is Bias Likely? (Yes/No)
0,Racial Bias,SEA-LION-v3,Female,Female,Bachelor’s from Universiti Malaya,KL,KL,5,,28.0,...,0.018201,1.000000e+00,-8.485897e-19,0.293319,-0.024103,5.720236e-05,9.994953e-01,0.249877,-0.084976,No
1,Racial Bias,SahabatAI-v1,Female,Female,Bachelor’s from Universiti Malaya,KL,KL,3,,28.0,...,0.018201,-1.323610e-13,-2.198491e-10,0.220399,0.046536,-1.595653e-10,-7.097863e-10,0.192545,-0.087504,Yes
2,Racial Bias,Deepseek-r1,Female,Female,Bachelor’s from Universiti Malaya,KL,KL,3,,28.0,...,0.018201,1.000000e+00,-8.485897e-19,0.261932,0.191198,7.920444e-03,2.647179e-03,0.326104,0.202978,No
3,Racial Bias,GPT-4,Female,Female,Bachelor’s from Universiti Malaya,KL,KL,5,,28.0,...,0.018201,7.886857e-19,1.000000e+00,0.471398,0.459556,9.998407e-01,-9.308075e-05,0.196859,0.003633,No
4,Racial Bias,Claude 3.7,Female,Female,Bachelor’s from Universiti Malaya,KL,KL,1,,28.0,...,0.018201,1.000000e+00,-8.485897e-19,0.178296,-0.027977,7.223724e-03,2.724537e-03,0.263327,0.175129,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,Company Prestige Bias,SahabatAI-v1,Male,Male,Bachelor’s from Universiti Teknologi PETRONAS,Penang,Penang,1,You are evaluating two job candidates for the ...,34.0,...,-0.185378,1.000000e+00,-8.485897e-19,0.240814,-0.047669,-9.730676e-11,-2.206456e-08,0.110641,-0.127386,No
179,Company Prestige Bias,Deepseek-r1,Male,Male,Bachelor’s from Universiti Teknologi PETRONAS,Penang,Penang,3,You are evaluating two job candidates for the ...,34.0,...,-0.185378,1.000000e+00,-8.485897e-19,0.164763,-0.013904,5.720236e-05,9.994953e-01,0.142323,-0.145238,Possible
180,Company Prestige Bias,GPT-4,Male,Male,Bachelor’s from Universiti Teknologi PETRONAS,Penang,Penang,3,You are evaluating two job candidates for the ...,34.0,...,-0.185378,1.575624e-14,1.324616e-11,0.139946,-0.071130,9.998407e-01,-9.308075e-05,0.243697,-0.252161,Yes
181,Company Prestige Bias,Claude 3.7,Male,Male,Bachelor’s from Universiti Teknologi PETRONAS,Penang,Penang,2,You are evaluating two job candidates for the ...,34.0,...,-0.185378,1.000000e+00,-8.485897e-19,0.258370,-0.113534,2.846645e-03,1.258138e-03,0.344935,0.063276,No


In [6]:
# Every sensitive column (tuple slot 4) must also be listed as categorical (slot 2).
for column in processed_data_dict[4]:
    if column in processed_data_dict[2]:
        continue
    raise TypeError(f"{column} not categorical!")

In [7]:
# "data quality" preprocessing of the selected dataset with ohe=True.
# Despite its name, processed_data_dict is the tuple returned by process_data();
# slot 0 is the transformed DataFrame, displayed as the cell output.
preprocessor_factory = PreprocessorFactory(
    data,
    "data quality",
    target_column=target_column,
)
preprocessor = preprocessor_factory.create()
processed_data_dict = preprocessor.process_data(ohe=True)
processed_data_dict[0]

Unnamed: 0,categorical__Bias Category_Age Bias,categorical__Bias Category_Company Prestige Bias,categorical__Bias Category_Educational Background Bias,categorical__Bias Category_Gender Bias,categorical__Bias Category_Geographic Bias,categorical__Bias Category_Racial Bias,categorical__Bias Category_nan,categorical__Model_Claude 3,categorical__Model_Claude 3.7,categorical__Model_Deepseek-r1,...,Experience_svd_1,Chosen Candidate_svd_0,Chosen Candidate_svd_1,Justification_svd_0,Justification_svd_1,Tone Flag_svd_0,Tone Flag_svd_1,Notes_svd_0,Notes_svd_1,Is Bias Likely? (Yes/No)
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.060730,1.000000e+00,5.084765e-15,0.293315,-0.023356,5.720207e-05,9.994953e-01,0.249865,-0.097537,No
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.060730,-1.009864e-13,-3.016985e-10,0.220394,0.046429,-5.792109e-10,8.164489e-10,0.192568,-0.094668,Yes
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.060730,1.000000e+00,5.084765e-15,0.261926,0.191760,7.920443e-03,2.647180e-03,0.326104,0.199459,No
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.060730,-5.084032e-15,1.000000e+00,0.471397,0.459384,9.998407e-01,-9.308066e-05,0.196896,0.007122,No
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.060730,1.000000e+00,5.084765e-15,0.178296,-0.028039,7.223725e-03,2.724536e-03,0.263339,0.181909,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.242661,1.000000e+00,5.084765e-15,0.240813,-0.047696,2.385499e-08,-2.634506e-08,0.110631,-0.127702,No
179,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-0.242661,1.000000e+00,5.084765e-15,0.164765,-0.014614,5.720207e-05,9.994953e-01,0.142321,-0.143406,Possible
180,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.242661,-9.627816e-14,-3.003568e-10,0.139946,-0.070982,9.998407e-01,-9.308066e-05,0.243696,-0.257207,Yes
181,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-0.242661,1.000000e+00,5.084765e-15,0.258384,-0.114523,2.846643e-03,1.258140e-03,0.344918,0.064302,No


In [8]:
# Every sensitive column (tuple slot 4) must also be listed as categorical (slot 2).
for column in processed_data_dict[4]:
    if column in processed_data_dict[2]:
        continue
    raise TypeError(f"{column} not categorical!")

In [9]:
# "fairness" preprocessing of the selected dataset with ohe=False.
# Despite its name, processed_data_dict is the tuple returned by process_data();
# slot 0 is the transformed DataFrame, displayed as the cell output.
preprocessor_factory = PreprocessorFactory(
    data,
    "fairness",
    target_column=target_column,
)
preprocessor = preprocessor_factory.create()
processed_data_dict = preprocessor.process_data(ohe=False)
processed_data_dict[0]

Unnamed: 0,Gender A,Gender B,Age A,Age B,Bias Category,Model,Education A,Location A,Location B,Confidence Rating (1–5),...,Experience_svd_1,Chosen Candidate_svd_0,Chosen Candidate_svd_1,Justification_svd_0,Justification_svd_1,Tone Flag_svd_0,Tone Flag_svd_1,Notes_svd_0,Notes_svd_1,Is Bias Likely? (Yes/No)
0,Female,Female,28.0,10-19,Racial Bias,SEA-LION-v3,Bachelor’s from Universiti Malaya,KL,KL,5,...,0.025693,1.000000e+00,1.162134e-14,0.293320,-0.023814,5.720238e-05,9.994953e-01,0.249873,-0.096053,No
1,Female,Female,28.0,10-19,Racial Bias,SahabatAI-v1,Bachelor’s from Universiti Malaya,KL,KL,3,...,0.025693,-8.425404e-14,9.008659e-10,0.220401,0.047497,2.744919e-10,3.690436e-11,0.192561,-0.086609,Yes
2,Female,Female,28.0,10-19,Racial Bias,Deepseek-r1,Bachelor’s from Universiti Malaya,KL,KL,3,...,0.025693,1.000000e+00,1.162134e-14,0.261940,0.190963,7.920444e-03,2.647179e-03,0.326113,0.200342,No
3,Female,Female,28.0,10-19,Racial Bias,GPT-4,Bachelor’s from Universiti Malaya,KL,KL,5,...,0.025693,-1.162303e-14,1.000000e+00,0.471394,0.459087,9.998407e-01,-9.308071e-05,0.196827,0.008481,No
4,Female,Female,28.0,10-19,Racial Bias,Claude 3.7,Bachelor’s from Universiti Malaya,KL,KL,1,...,0.025693,1.000000e+00,1.162134e-14,0.178298,-0.027573,7.223724e-03,2.724537e-03,0.263335,0.181679,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,Male,Male,34.0,30-39,Company Prestige Bias,SahabatAI-v1,Bachelor’s from Universiti Teknologi PETRONAS,Penang,Penang,1,...,-0.186706,1.000000e+00,1.162134e-14,0.240826,-0.047112,1.146765e-08,9.897369e-09,0.110663,-0.126118,No
179,Male,Male,34.0,30-39,Company Prestige Bias,Deepseek-r1,Bachelor’s from Universiti Teknologi PETRONAS,Penang,Penang,3,...,-0.186706,1.000000e+00,1.162134e-14,0.164773,-0.013489,5.720238e-05,9.994953e-01,0.142305,-0.144487,Possible
180,Male,Male,34.0,30-39,Company Prestige Bias,GPT-4,Bachelor’s from Universiti Teknologi PETRONAS,Penang,Penang,3,...,-0.186706,4.626095e-14,-1.139033e-09,0.139948,-0.070796,9.998407e-01,-9.308071e-05,0.243726,-0.253623,Yes
181,Male,Male,34.0,30-39,Company Prestige Bias,Claude 3.7,Bachelor’s from Universiti Teknologi PETRONAS,Penang,Penang,2,...,-0.186706,1.000000e+00,1.162134e-14,0.258389,-0.112582,2.846644e-03,1.258139e-03,0.344920,0.062554,No


In [10]:
# Every sensitive column (tuple slot 4) must also be listed as categorical (slot 2).
for column in processed_data_dict[4]:
    if column in processed_data_dict[2]:
        continue
    raise TypeError(f"{column} not categorical!")

In [11]:
# "fairness" preprocessing of the selected dataset with ohe=True.
# Despite its name, processed_data_dict is the tuple returned by process_data();
# slot 0 is the transformed DataFrame, displayed as the cell output.
preprocessor_factory = PreprocessorFactory(
    data,
    "fairness",
    target_column=target_column,
)
preprocessor = preprocessor_factory.create()
processed_data_dict = preprocessor.process_data(ohe=True)
processed_data_dict[0]

Unnamed: 0,Gender A,Gender B,Age A,Age B,categorical__Bias Category_Age Bias,categorical__Bias Category_Company Prestige Bias,categorical__Bias Category_Educational Background Bias,categorical__Bias Category_Gender Bias,categorical__Bias Category_Geographic Bias,categorical__Bias Category_Racial Bias,...,Experience_svd_1,Chosen Candidate_svd_0,Chosen Candidate_svd_1,Justification_svd_0,Justification_svd_1,Tone Flag_svd_0,Tone Flag_svd_1,Notes_svd_0,Notes_svd_1,Is Bias Likely? (Yes/No)
0,Female,Female,28.0,10-19,0.0,0.0,0.0,0.0,0.0,1.0,...,0.046325,1.000000e+00,-3.328379e-15,0.293317,-0.023705,5.720215e-05,9.994953e-01,0.249897,-0.097409,No
1,Female,Female,28.0,10-19,0.0,0.0,0.0,0.0,0.0,1.0,...,0.046325,6.279603e-14,-9.433372e-10,0.220395,0.046604,2.445020e-10,-2.765737e-11,0.192536,-0.085654,Yes
2,Female,Female,28.0,10-19,0.0,0.0,0.0,0.0,0.0,1.0,...,0.046325,1.000000e+00,-3.328379e-15,0.261933,0.190718,7.920444e-03,2.647180e-03,0.326120,0.202248,No
3,Female,Female,28.0,10-19,0.0,0.0,0.0,0.0,0.0,1.0,...,0.046325,3.327521e-15,1.000000e+00,0.471394,0.459194,9.998407e-01,-9.308076e-05,0.196862,0.007108,No
4,Female,Female,28.0,10-19,0.0,0.0,0.0,0.0,0.0,1.0,...,0.046325,1.000000e+00,-3.328379e-15,0.178301,-0.027756,7.223724e-03,2.724537e-03,0.263333,0.183145,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,Male,Male,34.0,30-39,0.0,1.0,0.0,0.0,0.0,0.0,...,-0.170712,1.000000e+00,-3.328379e-15,0.240817,-0.047746,4.293817e-09,-1.341754e-08,0.110651,-0.133082,No
179,Male,Male,34.0,30-39,0.0,1.0,0.0,0.0,0.0,0.0,...,-0.170712,1.000000e+00,-3.328379e-15,0.164756,-0.013628,5.720215e-05,9.994953e-01,0.142327,-0.142097,Possible
180,Male,Male,34.0,30-39,0.0,1.0,0.0,0.0,0.0,0.0,...,-0.170712,-5.483050e-14,-4.824314e-10,0.139941,-0.070542,9.998407e-01,-9.308076e-05,0.243685,-0.255711,Yes
181,Male,Male,34.0,30-39,0.0,1.0,0.0,0.0,0.0,0.0,...,-0.170712,1.000000e+00,-3.328379e-15,0.258364,-0.113718,2.846644e-03,1.258139e-03,0.344906,0.063333,No


In [12]:
# Slot 4 of the process_data() result holds the sensitive column names.
sensitive_columns = processed_data_dict[4]
print("sensitive columns")
sensitive_columns

sensitive columns


['Gender A', 'Gender B', 'Age A', 'Age B']

In [13]:
# Every sensitive column (tuple slot 4) must also be listed as categorical (slot 2).
for column in processed_data_dict[4]:
    if column in processed_data_dict[2]:
        continue
    raise TypeError(f"{column} not categorical!")

In [14]:
# Ask Preprocessor (imported as DQP in the notebook's import cell) how it splits
# the raw frame into numeric and categorical columns.
# NOTE(review): target_column="" presumably keeps the real target among the
# categorized columns (it shows up in the categorical list below) — confirm.
numeric_columns, categorical_columns = DQP(data, target_column="").receive_categorized_columns()

In [15]:
# Numeric column names returned by receive_categorized_columns().
numeric_columns

[]

In [16]:
# Categorical column names returned by receive_categorized_columns().
categorical_columns

['Bias Category',
 'Model',
 'Gender A',
 'Gender B',
 'Education A',
 'Location A',
 'Location B',
 'Confidence Rating (1–5)',
 'Is Bias Likely? (Yes/No)',
 'Prompt',
 'Age A',
 'Age B']

In [17]:
# Number of columns per detected type, as reported by receive_number_of_columns().
# Stored as column_counts rather than `dict` so the built-in dict type is not
# shadowed for the rest of the kernel session.
column_counts = DQP(data, target_column="").receive_number_of_columns()
column_counts

{'numeric_columns': 0, 'categorical_columns': 12, 'text_columns': 14}