In [1]:
import sys, os
# Derive the project root from the kernel's working directory. Per the
# layout notes that accompanied this cell, the notebook sits two levels below it:
#   one level up   → …/fairfluence/src
#   two levels up  → …/fairfluence
nb_dir = os.getcwd()
src_dir = os.path.abspath(os.path.join(nb_dir, os.pardir))
project_root = os.path.abspath(os.path.join(nb_dir, os.pardir, os.pardir))

# Put fairfluence/ at the very front of sys.path, unless it is already listed.
if project_root not in sys.path:
    sys.path.insert(0, project_root)
    
import pandas as pd
from preprocessing import PreprocessorFactory, Preprocessor
import pytest
from datasets_dict import DATASETS
from src.ingestion.ingestorFactory import IngestorFactory

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys, os
# Derive the project root from the kernel's working directory. Per the
# layout notes that accompanied this cell, the notebook sits two levels below it:
#   one level up   → …/fairfluence/src
#   two levels up  → …/fairfluence
nb_dir = os.getcwd()
src_dir = os.path.abspath(os.path.join(nb_dir, os.pardir))
project_root = os.path.abspath(os.path.join(nb_dir, os.pardir, os.pardir))

# Put fairfluence/ at the very front of sys.path, unless it is already listed.
if project_root not in sys.path:
    sys.path.insert(0, project_root)
    
import pandas as pd
from preprocessing import PreprocessorFactory, Preprocessor
import pytest
from datasets_dict import DATASETS
from src.ingestion.ingestorFactory import IngestorFactory

# Dataset links excluded from a default run (the same three links the loop
# has always skipped).
_DEFAULT_SKIP_LINKS = (
    "https://www.kaggle.com/datasets/aemreusta/brfss-2020-survey-data",
    "https://www.kaggle.com/datasets/fleshmetal/records-a-comprehensive-music-metadata-dataset",
    "https://www.kaggle.com/datasets/sudipde25/lifespan-calorie-burn-dynamics",
)


def _validate_preprocessor_output(result, df, target_column, ohe, label):
    """Validate one ``process_data`` result and return the target value counts.

    Checks, in order: the result is a 6-tuple; its first item is a DataFrame
    with the same number of rows as ``df``; the target column is present; no
    NaNs remain; the four column collections are lists; the returned target
    name equals ``target_column``; every column is numeric when ``ohe`` is
    truthy; and the target column holds exactly two distinct values.

    Args:
        result: Value returned by the preprocessor's ``process_data``.
        df: The frame that was handed to the preprocessor (row-count reference).
        target_column: Name of the expected target column.
        ohe: The ``ohe`` flag ``process_data`` was called with.
        label: Prefix used in every failure message.

    Returns:
        dict mapping each target value to its number of occurrences.

    Raises:
        AssertionError: On the first expectation that does not hold.
    """
    # Structure of the returned value.
    if not isinstance(result, tuple):
        raise AssertionError(f"{label} ➞ returned no tuple")
    if len(result) != 6:
        raise AssertionError(f"{label} ➞ expected 6 items, got {len(result)}")

    df_tr, num_cols, cat_cols, text_feats, sens_cols, tgt_col = result

    # DataFrame type and row count.
    if not isinstance(df_tr, pd.DataFrame):
        raise AssertionError(f"{label} ➞ first item not a DataFrame")
    if df_tr.shape[0] != df.shape[0]:
        raise AssertionError(f"{label} ➞ row count changed {df_tr.shape[0]} vs {df.shape[0]}")

    # Target column must survive preprocessing.
    if target_column not in df_tr.columns:
        raise AssertionError(f"{label} ➞ target column '{target_column}' missing")

    # No missing values may remain.
    if df_tr.isnull().values.any():
        nan_cols = df_tr.columns[df_tr.isnull().any()].tolist()
        raise AssertionError(f"{label} ➞ found NaNs in columns {nan_cols}")

    # The four column collections must be plain lists.
    for name, lst in [
        ("numeric_columns", num_cols),
        ("categorical_columns", cat_cols),
        ("text_features", text_feats),
        ("sensitive_columns", sens_cols),
    ]:
        if not isinstance(lst, list):
            raise AssertionError(f"{label} ➞ '{name}' is not a list")

    # The returned target name must echo the requested one.
    if not isinstance(tgt_col, str) or tgt_col != target_column:
        raise AssertionError(
            f"{label} ➞ returned target '{tgt_col}', expected '{target_column}'"
        )

    # With one-hot encoding requested, every column has to be numeric.
    if ohe:
        for col in df_tr.columns:
            if not pd.api.types.is_numeric_dtype(df_tr[col]):
                raise AssertionError(
                    f"{label} ➞ column '{col}' is not numeric (dtype={df_tr[col].dtype})"
                )

    # The target must be binary.
    vals = df_tr[target_column].unique()
    if len(vals) != 2:
        raise AssertionError(
            f"{label} ➞ target column must be binary, "
            f"but found values {vals}"
        )
    return df_tr[target_column].value_counts().to_dict()


def run_preprocessor_checks(methods=None, ohe_flags=None, skip_links=None):
    """Run every preprocessor against every registered dataset and print the outcome.

    For each entry of ``DATASETS`` that is not skipped, the dataset is loaded
    through ``IngestorFactory`` and then, for every (method, ohe) combination,
    a preprocessor is built via ``PreprocessorFactory`` and the result of its
    ``process_data`` is validated. Each combination prints either the target
    value counts followed by a ``✓`` line, or a ``✗ ERROR`` line.

    Args:
        methods: Preprocessing method names to exercise.
            Defaults to ``["data quality", "fairness"]``.
        ohe_flags: Values passed as ``ohe`` to ``process_data``.
            Defaults to ``[True]``; pass ``[False, True]`` to cover both paths.
        skip_links: Iterable of dataset links to leave out. Defaults to the
            three links in ``_DEFAULT_SKIP_LINKS``.

    Returns:
        None. All results are reported on stdout.

    Raises:
        Exceptions raised while building the ingestor or loading a dataset
        are not caught here and abort the run. Exceptions raised while
        preprocessing or validating are caught and reported per combination.
    """
    if methods is None:
        methods = ["data quality", "fairness"]
    if ohe_flags is None:
        ohe_flags = [True]
    skipped = frozenset(_DEFAULT_SKIP_LINKS if skip_links is None else skip_links)

    for data_link, info in DATASETS.items():
        if data_link in skipped:
            continue
        target_column = info["target_column"]
        nr_file = info["no_dataset"]

        # Load the dataset once; it is reused for every method/ohe combination.
        ingestor = IngestorFactory(data_link, nr_file, False).create()
        df = ingestor.load_data()
        print(f"\n==> Testing {data_link!r}  target={target_column}")

        for method in methods:
            for ohe in ohe_flags:
                label = f"  [{method:12s} | ohe={ohe}]"
                try:
                    pre = PreprocessorFactory(df, method, target_column).create()
                    result = pre.process_data(ohe=ohe)
                    counts = _validate_preprocessor_output(
                        result, df, target_column, ohe, label
                    )
                except AssertionError as failure:
                    # A check did not hold; the message already says which one.
                    print(f"{label} ✗ ERROR: {failure}")
                except Exception as e:
                    # Unexpected failure inside the preprocessor: include the
                    # exception type, since e.g. str(KeyError) is only the key.
                    print(f"{label} ✗ ERROR: {type(e).__name__}: {e}")
                else:
                    print(f"{label} → target value counts: {counts}")
                    print(f"{label} ✓")

    print("\nAll checks done.")


# Run the checks over every dataset in DATASETS; results are printed per
# dataset and per method/ohe combination in the cell output.
run_preprocessor_checks()

Creating CSV from Arrow format: 100%|██████████| 33/33 [00:00<00:00, 379.85ba/s]



==> Testing 'https://huggingface.co/datasets/scikit-learn/adult-census-income'  target=income
  [data quality | ohe=True] → target value counts: {1: 24720, 0: 7841}
  [data quality | ohe=True] ✓
  [fairness     | ohe=True] → target value counts: {1: 24720, 0: 7841}
  [fairness     | ohe=True] ✓
Dataset URL: https://www.kaggle.com/datasets/jayaantanaath/student-habits-vs-academic-performance
../../data/kaggle_temp/student_habits_performance.csv was deleted.

==> Testing 'https://www.kaggle.com/datasets/jayaantanaath/student-habits-vs-academic-performance'  target=exam_score
  [data quality | ohe=True] → target value counts: {0: 502, 1: 498}
  [data quality | ohe=True] ✓
  [fairness     | ohe=True] → target value counts: {0: 502, 1: 498}
  [fairness     | ohe=True] ✓
Dataset URL: https://www.kaggle.com/datasets/therohithanand/student-academic-performance-dataset
../../data/kaggle_temp/student_info.csv was deleted.

==> Testing 'https://www.kaggle.com/datasets/therohithanand/student-acad

Creating CSV from Arrow format: 100%|██████████| 100/100 [00:00<00:00, 550.48ba/s]



==> Testing 'https://huggingface.co/datasets/marianeft/diabetes_prediction_dataset'  target=diabetes
  [data quality | ohe=True] → target value counts: {1: 91500, 0: 8500}
  [data quality | ohe=True] ✓
  [fairness     | ohe=True] → target value counts: {1: 91500, 0: 8500}
  [fairness     | ohe=True] ✓


Creating CSV from Arrow format: 100%|██████████| 30/30 [00:00<00:00, 193.59ba/s]



==> Testing 'https://huggingface.co/datasets/scikit-learn/credit-card-clients'  target=default.payment.next.month
  [data quality | ohe=True] → target value counts: {1: 23364, 0: 6636}
  [data quality | ohe=True] ✓
  [fairness     | ohe=True] → target value counts: {1: 23364, 0: 6636}
  [fairness     | ohe=True] ✓


Creating CSV from Arrow format: 100%|██████████| 136/136 [00:02<00:00, 49.56ba/s]



==> Testing 'https://huggingface.co/datasets/ucberkeley-dlab/measuring-hate-speech'  target=hate_speech_score
  [data quality | ohe=True] → target value counts: {0: 67806, 1: 67750}
  [data quality | ohe=True] ✓
  [fairness     | ohe=True] → target value counts: {0: 67806, 1: 67750}
  [fairness     | ohe=True] ✓


Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 333.49ba/s]



==> Testing 'https://huggingface.co/datasets/Supa-AI/Reasoning_Patterns_AI_Hiring_Bias_SEA'  target=Is Bias Likely? (Yes/No)
  [data quality | ohe=True] → target value counts: {1: 129, 0: 54}
  [data quality | ohe=True] ✓
  [fairness     | ohe=True] → target value counts: {1: 129, 0: 54}
  [fairness     | ohe=True] ✓
Dataset URL: https://www.kaggle.com/datasets/siddharth0935/hospital-readmission-predictionsynthetic-dataset
../../data/kaggle_temp/hospital_readmissions_30k.csv was deleted.

==> Testing 'https://www.kaggle.com/datasets/siddharth0935/hospital-readmission-predictionsynthetic-dataset/data'  target=readmitted_30_days
  [data quality | ohe=True] → target value counts: {1: 26326, 0: 3674}
  [data quality | ohe=True] ✓
  [fairness     | ohe=True] → target value counts: {1: 26326, 0: 3674}
  [fairness     | ohe=True] ✓

==> Testing 'https://www.openml.org/search?type=data&sort=runs&status=active&id=37'  target=class
  [data quality | ohe=True] → target value counts: {1: 500, 0: 

Creating CSV from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 398.74ba/s]



==> Testing 'https://huggingface.co/datasets/Andyrasika/banking-marketing'  target=y
  [data quality | ohe=True] → target value counts: {1: 4000, 0: 521}
  [data quality | ohe=True] ✓
  [fairness     | ohe=True] → target value counts: {1: 4000, 0: 521}
  [fairness     | ohe=True] ✓
Dataset URL: https://www.kaggle.com/datasets/danofer/compass
../../data/kaggle_temp/compas-scores-raw.csv was deleted.

==> Testing 'https://www.kaggle.com/datasets/danofer/compass'  target=DecileScore
  [data quality | ohe=True] → target value counts: {0: 36194, 1: 24649}
  [data quality | ohe=True] ✓
  [fairness     | ohe=True] → target value counts: {0: 36194, 1: 24649}
  [fairness     | ohe=True] ✓
Dataset URL: https://www.kaggle.com/datasets/kkanda/communities%20and%20crime%20unnormalized%20data%20set
../../data/kaggle_temp/crimedata.csv was deleted.

==> Testing 'https://www.kaggle.com/datasets/kkanda/communities%20and%20crime%20unnormalized%20data%20set'  target=ViolentCrimesPerPop
  [data quality | 



  [fairness     | ohe=True] → target value counts: {1: 21237, 0: 1170}
  [fairness     | ohe=True] ✓
Dataset URL: https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset
../../data/kaggle_temp/healthcare-dataset-stroke-data.csv was deleted.

==> Testing 'https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset'  target=stroke
  [data quality | ohe=True] → target value counts: {1: 4861, 0: 249}
  [data quality | ohe=True] ✓
  [fairness     | ohe=True] → target value counts: {1: 4861, 0: 249}
  [fairness     | ohe=True] ✓
Dataset URL: https://www.kaggle.com/datasets/fedesoriano/hepatitis-c-dataset
../../data/kaggle_temp/HepatitisCdata.csv was deleted.

==> Testing 'https://www.kaggle.com/datasets/fedesoriano/hepatitis-c-dataset'  target=Category
  [data quality | ohe=True] → target value counts: {1: 533, 0: 82}
  [data quality | ohe=True] ✓
  [fairness     | ohe=True] → target value counts: {1: 533, 0: 82}
  [fairness     | ohe=True] ✓
Dataset URL: https://www.



Dataset URL: https://www.kaggle.com/datasets/brendan45774/test-file
../../data/kaggle_temp/tested.csv was deleted.

==> Testing 'https://www.kaggle.com/datasets/brendan45774/test-file'  target=Survived
  [data quality | ohe=True] → target value counts: {1: 266, 0: 152}
  [data quality | ohe=True] ✓
  [fairness     | ohe=True] → target value counts: {1: 266, 0: 152}
  [fairness     | ohe=True] ✓
Dataset URL: https://www.kaggle.com/datasets/ulrikthygepedersen/speed-dating
../../data/kaggle_temp/speeddating.csv was deleted.

==> Testing 'https://www.kaggle.com/datasets/ulrikthygepedersen/speed-dating/data'  target=match
  [data quality | ohe=True] → target value counts: {1: 6998, 0: 1380}
  [data quality | ohe=True] ✓
  [fairness     | ohe=True] → target value counts: {1: 6998, 0: 1380}
  [fairness     | ohe=True] ✓

==> Testing 'https://www.openml.org/search?type=data&sort=runs&status=active&id=451'  target=Leaving_Certificate
  [data quality | ohe=True] → target value counts: {1: 278, 0



Dataset URL: https://www.kaggle.com/datasets/muhamedyoussry/fertility-data-set
../../data/kaggle_temp/fertility.csv was deleted.

==> Testing 'https://www.kaggle.com/datasets/muhamedyoussry/fertility-data-set'  target=Diagnosis
  [data quality | ohe=True] → target value counts: {1: 88, 0: 12}
  [data quality | ohe=True] ✓
  [fairness     | ohe=True] → target value counts: {1: 88, 0: 12}
  [fairness     | ohe=True] ✓
Dataset URL: https://www.kaggle.com/datasets/kushalmanage/physics-teacher-jobs-linkedin
../../data/kaggle_temp/physics_teacher_jobs_linkedin.csv was deleted.

==> Testing 'https://www.kaggle.com/datasets/kushalmanage/physics-teacher-jobs-linkedin'  target=contractType
  [data quality | ohe=True] → target value counts: {1: 711, 0: 136}
  [data quality | ohe=True] ✓
  [fairness     | ohe=True] → target value counts: {1: 711, 0: 136}
  [fairness     | ohe=True] ✓
Dataset URL: https://www.kaggle.com/datasets/amosshehzad/public-vs-privat-schools-statistical-analysis
../../data/k

  df = pd.read_csv(final_csv)



==> Testing 'https://www.kaggle.com/datasets/threnjen/portland-housing-prices-sales-jul-2020-jul-2021'  target=price
  [data quality | ohe=True] → target value counts: {0: 12883, 1: 12848}
  [data quality | ohe=True] ✓
  [fairness     | ohe=True] → target value counts: {0: 12883, 1: 12848}
  [fairness     | ohe=True] ✓
Dataset URL: https://www.kaggle.com/datasets/janakpariyar/coursera-courses-uncleaned-dataset-to-practice
../../data/kaggle_temp/duplicate_deleted.csv was deleted.

==> Testing 'https://www.kaggle.com/datasets/janakpariyar/coursera-courses-uncleaned-dataset-to-practice'  target=course_rating
  [data quality | ohe=True] → target value counts: {0: 32130, 1: 18037}
  [data quality | ohe=True] ✓
  [fairness     | ohe=True] → target value counts: {0: 32130, 1: 18037}
  [fairness     | ohe=True] ✓

All checks done.


In [3]:
import pandas as pd
# Lift the column limit so wide DataFrames are rendered without truncation.
pd.options.display.max_columns = None

In [4]:
import sys, os
# Derive the project root from the kernel's working directory. Per the
# layout notes that accompanied this cell, the notebook sits two levels below it:
#   one level up   → …/fairfluence/src
#   two levels up  → …/fairfluence
nb_dir = os.getcwd()
src_dir = os.path.abspath(os.path.join(nb_dir, os.pardir))
project_root = os.path.abspath(os.path.join(nb_dir, os.pardir, os.pardir))

# Put fairfluence/ at the very front of sys.path, unless it is already listed.
if project_root not in sys.path:
    sys.path.insert(0, project_root)
    
import pandas as pd
from preprocessing import PreprocessorFactory, Preprocessor
import pytest
from datasets_dict import DATASETS
from src.ingestion.ingestorFactory import IngestorFactory

# Single-dataset walkthrough: load one Kaggle dataset, run the "fairness"
# preprocessor on it with ohe=True, and display the transformed frame.
data_link = "https://www.kaggle.com/datasets/yagizfiratt/turkish-superleague-players-salaries-20242025"
nr_file = 1
target_column = "Gross P/Y (EUR)"
# Build an ingestor for the link and load the dataset.
ingestor_factory = IngestorFactory(data_link, nr_file, False)
ingestor = ingestor_factory.create()
data = ingestor.load_data()
# NOTE(review): `dqp`, `numeric_columns` and `categorical_columns` are not read
# again anywhere in the visible notebook. Presumably left over from exploring
# the column categorisation — confirm these calls have no side effects on
# `data` before removing them.
dqp = Preprocessor(data, target_column="")
numeric_columns, categorical_columns = dqp.receive_categorized_columns()
preprocessor_factory = PreprocessorFactory(data, "fairness", target_column=target_column)
preprocessor = preprocessor_factory.create()
# NOTE(review): despite the `_dict` suffix this is indexed positionally below,
# and run_preprocessor_checks treats the result of process_data as a 6-tuple —
# presumably a tuple, not a dict; verify and consider renaming.
processed_data_dict = preprocessor.process_data(ohe=True)


# Show the first element of the result as the cell output (the processed frame).
processed_data_dict[0]

Dataset URL: https://www.kaggle.com/datasets/yagizfiratt/turkish-superleague-players-salaries-20242025
../../data/kaggle_temp/turkish_superleague_salaries.csv was deleted.


Unnamed: 0,Age_15–18,Age_19–20,Age_21–22,Age_23–24,Age_25–25,Age_26–27,Age_28–28,Age_29–30,Age_31–31,Age_32–37,Bonus P/Y (EUR)_<MISSING>,"Bonus P/Y (EUR)_€ 1,830,000","Bonus P/Y (EUR)_€ 170,000","Bonus P/Y (EUR)_€ 3,330,000","Bonus P/Y (EUR)_€ 550,000","Bonus P/Y (EUR)_€ 630,000","Bonus P/Y (EUR)_€ 830,000",Club_Adana Demirspor,Club_Alanyaspor,Club_Antalyaspor,Club_Basaksehir,Club_Besiktas,Club_Bodrum,Club_Caykur Rizespor,Club_Eyupspor,Club_Fenerbahce,Club_Galatasaray,Club_Gaziantep BB,Club_Goztepe,Club_Hatayspor,Club_Kasimpasa,Club_Kayserispor,Club_Konyaspor,Club_Samsunspor,Club_Sivasspor,Club_Trabzonspor,"Expiration_Jan 20, 2026","Expiration_Jun 30, 2025","Expiration_Jun 30, 2026","Expiration_Jun 30, 2027","Expiration_Jun 30, 2028","Expiration_Jun 30, 2029","Expiration_Jun 30, 2030","Expiration_Sep 30, 2028",Pos._D,Pos._F,Pos._K,Pos._M,Pos. Detail_AM,Pos. Detail_CB,Pos. Detail_CF,Pos. Detail_CM,Pos. Detail_DM,Pos. Detail_GK,Pos. Detail_LB,Pos. Detail_LW,Pos. Detail_RB,Pos. Detail_RW,Pos. Detail_SS,Status_Reserve,Status_Starter,Verified_No,Verified_Yes,Years Remaining_1,Years Remaining_2,Years Remaining_3,Years Remaining_4,Years Remaining_5,Years Remaining_6,Player_svd_0,Player_svd_1,Gross P/W (EUR)_svd_0,Gross P/W (EUR)_svd_1,Signed_svd_0,Signed_svd_1,Gross Remaining (EUR)_svd_0,Gross Remaining (EUR)_svd_1,Country_svd_0,Country_svd_1,Gross P/Y (EUR)
0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,0,0,0,-0.001539,0.001860,-0.000194,0.001369,0.930604,-0.236953,0.287211,-0.100154,8.265012e-12,3.994706e-05,0
1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0.002983,0.000516,-0.000194,0.001369,0.443932,0.321083,0.327434,-0.117344,1.417742e-10,4.853762e-04,0
2,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,-0.000217,-0.000146,-0.000194,0.001369,0.514736,-0.106213,0.288819,-0.101253,-5.334632e-16,-6.088216e-09,0
3,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0.000108,0.000035,-0.000194,0.001369,0.404777,-0.059106,0.288819,-0.101253,1.886891e-13,-2.349408e-07,0
4,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0.000137,0.000172,-0.000311,-0.000177,0.064961,0.331972,0.590551,0.806450,3.848888e-13,-3.808656e-08,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0.000135,0.000136,0.000036,-0.000086,0.073369,0.061184,0.159375,-0.056749,1.000000e+00,2.471980e-12,0
537,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0.000189,-0.000021,0.000036,-0.000086,0.930604,-0.236953,0.159375,-0.056749,1.000000e+00,2.471980e-12,0
538,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,-0.000104,-0.000519,0.000000,0.000000,0.055863,0.327029,0.000000,0.000000,6.951416e-11,1.364968e-05,1
539,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,-0.000139,0.000133,0.000000,0.000000,0.490537,-0.099894,0.000000,0.000000,4.771576e-11,1.861215e-04,1
