# In [None]:  (Jupyter notebook cell marker, kept as a comment so the file parses as Python)
import pandas as pd
import numpy as np
from pmlb import fetch_data
from tab_err.api import MidLevelConfig, mid_level
from tab_err import ErrorModel, error_mechanism, error_type
from tab_err.error_mechanism import ECAR, ENAR
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import classification_report, mean_squared_error, mean_absolute_error, root_mean_squared_error
from conformal_data_cleaning.cleaner.conformal import ConformalAutoGluonCleaner 
from sklearn.preprocessing import LabelEncoder
from autogluon.tabular import TabularDataset, TabularPredictor
import autogluon.eda.auto as auto
import autogluon.eda.analysis as eda
import autogluon.eda.visualization as viz



def run_exp(Dataframe, target_columns, condition_columns, categorical_columns, regression_columns, error_type, error_rate, frac, rows_numbers_xai, diagramm_xai_rows, confidence_level, time_limit, top_k, test_size, reuse_intermediate):
    """Run the corruption -> correction -> detection -> XAI experiment end to end.

    The function prints all reports, writes CSV artefacts below ``Data/`` and
    loads the detection predictors back from ``AutogluonModels/<target>-mask``.
    It returns ``None``.

    Args:
        Dataframe: Dataset name handed to ``pmlb.fetch_data`` (e.g. ``'adult'``).
        target_columns: Columns that are corrupted and afterwards cleaned /
            checked for errors (e.g. ``['age', 'occupation', 'fnlwgt']``).
        condition_columns: Columns paired position-wise with ``target_columns``.
            They are forwarded to ``create_errors`` and are also the "expected"
            top feature in the XAI report
            (e.g. ``['education-num', 'workclass', 'sex']``).
        categorical_columns: Columns cast to ``'category'`` after sampling.
        regression_columns: Columns cast to ``'int'`` after sampling; target
            columns listed here get MSE/MAE/RMSE instead of a classification
            report.
        error_type: Forwarded unchanged to ``create_errors``
            (e.g. ``error_type.CategorySwap()``).
        error_rate: Forwarded unchanged to ``create_errors``.
        frac: Fraction of the dataset rows sampled for the experiment.
        rows_numbers_xai: Maximum number of flagged test rows explained for the
            XAI report.
        diagramm_xai_rows: Maximum number of flagged test rows drawn as SHAP
            waterfall plots.
        confidence_level: ``confidence_level`` of ``ConformalAutoGluonCleaner``.
        time_limit: Value passed as ``"time_limit"`` in the AutoGluon fit
            parameters of every cleaner.
        top_k: Number of features with the largest absolute SHAP value that are
            counted per explained row.
        test_size: ``test_size`` of ``train_test_split``.
        reuse_intermediate: Forwarded to ``cleaner.transform`` in the
            correction step.
    """
    import os

    # Every CSV artefact goes to ./Data. Create it up front: pandas refuses to
    # write into a missing directory, which would otherwise abort the run at
    # the first export.
    os.makedirs("Data", exist_ok=True)

    data = fetch_data(Dataframe)
    data = data.sample(frac=frac, random_state=42)
    for cat in categorical_columns:
        data[cat] = data[cat].astype('category')
    for num in regression_columns:
        data[num] = data[num].astype('int')

    X = data.drop(target_columns, axis=1)
    y = data[target_columns]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    test_data = X_test.copy()
    test_data[target_columns] = y_test.copy()

    train_data = X_train.copy()
    train_data[target_columns] = y_train.copy()

    # Corrupt the test split; the clean y_test stays the ground truth for the
    # correction report.
    corrupted_data, error_mask = create_errors(test_data.copy(), target_columns, condition_columns, error_type, error_rate)
    corrupted_test_copy = corrupted_data.copy()  # used in the detection experiment

    # 0/1 error indicators of the test split, ground truth for detection.
    masks = error_mask[target_columns].astype(int)

    # Mask labels such as 'age-mask', used by the detection and XAI parts.
    # On the test copy they are left empty (NaN) so the detection cleaners
    # have to fill them in.
    mask_targets = [target + '-mask' for target in target_columns]
    for mask_col in mask_targets:
        corrupted_test_copy[mask_col] = np.nan

    export_files(corrupted_data, error_mask)

    ###### Cleaning data ######

    # Conformal data cleaning (CDC) based on Jäger & Bießmann:
    #   https://proceedings.mlr.press/v238/jager24a
    # Implementation taken from:
    #   1. https://github.com/se-jaeger/conformal-inference/tree/main/conformal_inference
    #   2. https://github.com/se-jaeger/conformal-data-cleaning/tree/main/conformal_data_cleaning
    cleaner = ConformalAutoGluonCleaner(confidence_level=confidence_level, seed=42)

    # During training one ML model is fitted per target column (all other
    # columns are its features) and its output is calibrated.
    # ci_calibration_size=0.1 reserves 10% of the training data for calibration.
    cleaner.fit(
        data=train_data,
        target_columns=target_columns,
        ci_calibration_size=0.1,
        # Author's note: the cleaner class defaults this to False; enabled here
        # to maximise predictive quality at the cost of speed.
        fit_weighted_ensemble=True,

        ### Bagging/Stacking settings ###
        # Author's note: enabled (class default is False) to improve precision
        # and reduce overfitting.
        auto_stack=True,
        num_bag_folds=5,
        num_stack_levels=1,

        ci_ag_predictor_params={"path_prefix": "AutogluonModels",
                                #"eval_metric": "pinball_loss",
                               },
        ci_ag_fit_params={"time_limit": time_limit,
                          "hyperparameter_tune_kwargs": {"num_trials": 5},
                         },
    )

    cleaned_data, cleaned_mask, prediction_sets = cleaner.transform(
        corrupted_data,
        reuse_intermediate=reuse_intermediate,
    )

    # Best-effort export: a failed write must not abort the experiment.
    try:
        cleaned_data.to_csv("Data/cleaned_data.csv", index=False)
        cleaned_mask.to_csv("Data/cleaned_mask.csv", index=False)
        print("Data has been saved successfully")
    except Exception as e:
        print(f"Error: {e}")

    ### Accuracy of the corrected values: compare the cleaned test data with
    ### the clean ground truth (y_test), column by column.
    print('####### Classification report for Correction #######')

    for col in target_columns:
        y_true = y_test[col]
        y_pred = cleaned_data[col]

        if col not in regression_columns:
            print("Classification report: ")
            print(col)
            print(classification_report(y_true, y_pred))

        else:
            print("Regression report: ")
            print(col)
            print(f"MSE: {mean_squared_error(y_true, y_pred)}")
            print(f"MAE: {mean_absolute_error(y_true,y_pred)}")
            print(f"RMSE: {root_mean_squared_error(y_true,y_pred)}")
            print('---------------------------')

    ## Detecting part ##
    print('####### Detection Part #######')

    # Corrupt the training split as well; its error mask becomes the label the
    # detection models learn to predict.
    corrupted_train, train_error_mask = create_errors(train_data.copy(), target_columns, condition_columns, error_type, error_rate)

    # Same best-effort policy as the other exports in this function.
    try:
        corrupted_train.to_csv('Data/corrupted_data_E.csv', index=False)
        train_error_mask.to_csv('Data/ground_truth_E.csv', index=False)
    except Exception as e:
        print(f"Error: {e}")

    mask_numeric_train = train_error_mask.astype(int)
    for target in target_columns:
        safe_col = target.replace('-', '_').replace(' ', '_')
        target_mask = mask_numeric_train[target]
        # NOTE(review): the mask is also published as the module-level global
        # error_mask_<column>. Nothing in this function reads it back; the
        # write is kept only for backward compatibility with code that may
        # rely on it.
        globals()[f'error_mask_{safe_col}'] = target_mask
        corrupted_train[target + '-mask'] = target_mask.values

    train_corrupted_data = corrupted_train
    cleaned_results = {}
    final_cleaned_data = corrupted_test_copy.copy()

    # One cleaner per mask column: it is fitted on the corrupted training data
    # (with known masks) and fills the empty mask column of the test copy.
    for mask in mask_targets:
        print(f"{mask}")
        cleaner = ConformalAutoGluonCleaner(confidence_level=confidence_level, seed=42)

        cleaning = cleaner.fit(
            data=train_corrupted_data,
            target_columns=[mask],
            ci_calibration_size=0.1,
            categorical_precision_threshold=0.7,
            numeric_error_percentile=0.95,
            ci_ag_fit_params={"time_limit": time_limit,
                              "hyperparameter_tune_kwargs": {"num_trials": 20},
                             },
        )

        detected_data, detected_mask, detected_sets = cleaning.transform(corrupted_test_copy)
        cleaned_results[mask] = {
            'cleaned_data': detected_data,
            'cleaned_mask': detected_mask,
            'Prediction_sets': detected_sets
        }

        if mask in detected_data.columns:
            final_cleaned_data[mask] = detected_data[mask]

        # Written after every mask so partial results survive a later failure.
        try:
            final_cleaned_data.to_csv("Data/cleaned_dataE.csv", index=False)
            print("Data has been saved successfully")
        except Exception as e:
            print(f"Error: {e}")

    ## Classification report for Detection ##
    print('####### Classification report for Detection #######')

    for target, label in zip(target_columns, mask_targets):
        model_path = "AutogluonModels/" + label
        model = TabularPredictor.load(model_path)
        preds = model.predict(corrupted_test_copy)
        y_true = masks[target]
        print(f"{label}")
        print(classification_report(y_true, preds))
        print('---------------------------------------------------------')

    ## XAI explaining diagram ##
    # source: https://auto.gluon.ai/dev/tutorials/eda/components/autogluon.eda.explain.html
    print('####### XAI Explaining Diagramm #######')

    train_corrupted_copy = train_corrupted_data.copy()
    test_copy = corrupted_test_copy.copy()

    for target in mask_targets:
        model_path = "AutogluonModels/" + target
        predictor = TabularPredictor.load(model_path)

        preds = predictor.predict(corrupted_test_copy)

        explain_train = train_corrupted_copy.drop(target, axis=1)
        explain_test = test_copy.drop(target, axis=1)

        # Only rows the detector flagged as erroneous (prediction == 1).
        rows_to_explain = explain_test[preds == 1][explain_train.columns].head(diagramm_xai_rows)
        print(f"{target}")

        auto.analyze(
            train_data=explain_train, model=predictor,
            anlz_facets=[
                eda.explain.ShapAnalysis(rows_to_explain),
            ],
            viz_facets=[
                viz.explain.ExplainWaterfallPlot(),
            ]
        )

    print('####### XAI Classification-report for condition columns #######')

    ## XAI classification report for condition columns ##
    # source: https://auto.gluon.ai/0.8.2/_modules/autogluon/eda/analysis/explain.html
    # For every explained row the top_k features by absolute SHAP value are
    # treated as "predictions" and the paired condition column as the label.

    reports = {}
    le = LabelEncoder()

    for target, cond in zip(mask_targets, condition_columns):
        y_true = []
        y_pred = []
        model_path = "AutogluonModels/" + target
        predictor = TabularPredictor.load(model_path)

        y_preds = predictor.predict(corrupted_test_copy)

        explain_train = train_corrupted_copy.drop(target, axis=1)
        explain_test = test_copy.drop(target, axis=1)

        rows = explain_test[y_preds == 1][explain_train.columns].head(rows_numbers_xai)

        shap_analysis = eda.explain.ShapAnalysis(rows)
        auto.analyze(
            train_data=explain_train,
            model=predictor,
            anlz_facets=[shap_analysis]
        )
        shap_results = shap_analysis.state.explain["shapley"]

        for result in shap_results:
            shap_values = result.shap_values
            feature_names = result.features.index.tolist()
            # Indices of the top_k largest |SHAP| values, most important first.
            top_indices = np.argsort(np.abs(shap_values))[-top_k:][::-1]
            top_features = [feature_names[i] for i in top_indices]

            for x in top_features:
                y_pred.append(x)
                y_true.append(cond)

        counts = Counter(y_pred)
        total = sum(counts.values())
        le.fit(y_true + y_pred)
        y_true_enc = le.transform(y_true)
        y_pred_enc = le.transform(y_pred)
        reports[target] = classification_report(y_true_enc, y_pred_enc, target_names=le.classes_, zero_division=0)
        print(f"{target}")
        print('---------------------------------------------------------')
        print(f"Top Features percentage for {target}")
        for feature, cnt in counts.items():
            print(f"{feature}: {cnt/total*100:.2f}%")
        print('---------------------------------------------------------')

    for target, report in reports.items():
        print(f"{target}")
        print(report)
        print('---------------------------------------------------------')

        

def create_errors(cleaned_data, target_columns, condition_columns, error_type, error_rate):
    """Corrupt the target columns of ``cleaned_data`` via tab_err's mid-level API.

    One ``ErrorModel`` is built per target column. It uses the
    ``error_mechanism.EAR`` mechanism conditioned on the column at the same
    position in ``condition_columns`` (fixed ``seed=42``), the shared
    ``error_type`` and that target's error rate.

    Args:
        cleaned_data: DataFrame to corrupt.
        target_columns: Columns that receive errors.
        condition_columns: Conditioning column for each target, paired by
            position with ``target_columns``.
        error_type: tab_err error type instance applied to every target.
        error_rate: Either one rate per target (sequence, paired by position)
            or a single number that is applied to every target.

    Returns:
        Tuple ``(corrupted_data, error_mask)`` exactly as returned by
        ``mid_level.create_errors``.

    Raises:
        ValueError: If ``target_columns``, ``condition_columns`` and the error
            rates do not all have the same length. Pairing them with a plain
            ``zip`` would silently leave the surplus targets uncorrupted.
    """
    from numbers import Real

    target_columns = list(target_columns)
    condition_columns = list(condition_columns)

    # Accept a scalar rate as shorthand for "same rate for every target".
    if isinstance(error_rate, Real):
        error_rates = [error_rate] * len(target_columns)
    else:
        error_rates = list(error_rate)

    if not (len(target_columns) == len(condition_columns) == len(error_rates)):
        raise ValueError(
            "target_columns, condition_columns and error_rate must have the same length, "
            f"got {len(target_columns)}, {len(condition_columns)} and {len(error_rates)}"
        )

    # Column name -> list of error models, the structure MidLevelConfig is given.
    all_error_models = {}
    for target, cond, err in zip(target_columns, condition_columns, error_rates):
        all_error_models[target] = [
            ErrorModel(
                error_mechanism=error_mechanism.EAR(condition_to_column=cond, seed=42),
                error_type=error_type,
                error_rate=err,
            )
        ]

    config = MidLevelConfig(all_error_models)
    corrupted_data, error_mask = mid_level.create_errors(data=cleaned_data, config=config)

    return corrupted_data, error_mask
   

def export_files(corrupted_data,error_mask):
    """Write the corrupted data and its error mask as CSV files below ``Data/``.

    Creates the ``Data`` directory when it is missing, then writes
    ``Data/corrupted_data.csv`` and ``Data/ground_truth.csv`` (both without the
    index column). The export is best-effort: file-system and value errors are
    reported on stdout instead of being raised.

    Args:
        corrupted_data: DataFrame holding the corrupted records.
        error_mask: DataFrame marking which cells were corrupted.
    """
    import os

    try:
        # pandas raises OSError when the parent directory does not exist, so
        # make sure it is there before writing.
        os.makedirs('Data', exist_ok=True)
        corrupted_data.to_csv('Data/corrupted_data.csv', index=False)
        error_mask.to_csv('Data/ground_truth.csv', index=False)
        print("Data are now corrupted")

    except (OSError, ValueError) as e:
        print(f'Something went wrong: {e}')



# Columns = ['age', 'occupation', 'fnlwgt','hours-per-week', 'sex', 'workclass', 'education', 'education-num'] 
# Three error types are available to choose from -> [error_type.Outlier(), error_type.CategorySwap(), error_type.MissingValue()]

if __name__ == "__main__":
    # Settings of the experiment on the PMLB 'adult' dataset. Targets,
    # condition columns and error rates are paired by position.
    experiment_settings = dict(
        Dataframe='adult',
        target_columns=['sex', 'education', 'workclass', 'occupation'],
        condition_columns=['relationship', 'education-num', 'age', 'hours-per-week'],
        # Targets cast to 'category'; required for error_type.CategorySwap().
        categorical_columns=['sex', 'education', 'workclass', 'occupation'],
        # No numeric targets in this run (a numeric candidate would be 'fnlwgt').
        regression_columns=[],
        # Author's note: apply the missing-value error type to a single column
        # at a time to avoid information loss and unstable performance.
        error_type=error_type.CategorySwap(),
        error_rate=[0.2, 0.3, 0.4, 0.5],
        # Fraction of the dataset rows used in the CDC experiment.
        frac=0.4,
        # Fraction of the sampled rows held out as test data.
        test_size=0.3,
        # Author's note: keep False when the error type is MissingValue; for
        # other error types True reuses values that were already predicted.
        reuse_intermediate=False,
        # Number of rows explained in the XAI report section.
        rows_numbers_xai=200,
        # Number of rows drawn as XAI waterfall diagrams.
        diagramm_xai_rows=5,
        confidence_level=0.99,
        time_limit=3600,
        top_k=2,
    )
    run_exp(**experiment_settings)
    
