# End-to-End Data Cleaning Pipeline with Raha and Baran (Minimal and Integrated)
We build an end-to-end data cleaning pipeline with our configuration-free error detection and correction systems, Raha and Baran.

In [1]:
import pandas
import IPython.display
import ipywidgets

import raha

## 1. Instantiating the Detection and Correction Classes
We first instantiate the `Detection` and `Correction` classes.

In [2]:
from raha import analysis_utilities
# app_1 is the error detector and app_2 is the error corrector.
app_1, app_2 = raha.Detection(), raha.Correction()

# How many tuples would you label?
# The corrector gets a budget of 0: the tuples labeled for detection are
# reused for correction later in this notebook.
app_1.LABELING_BUDGET, app_2.LABELING_BUDGET = 20, 0

# Would you like to see the logs?
app_1.VERBOSE = app_2.VERBOSE = True

## 2. Instantiating the Dataset
We next load and instantiate the dataset object.

In [3]:
# Dataset description: a name, the path of the dirty file, and the path of
# its clean version.
dataset_dictionary = {
    "name": "tax",
    "path": "datasets/tax/dirty.csv",
    "clean_path": "datasets/tax/clean.csv",
}

# Build the dataset object that every later step reads and updates.
d = app_1.initialize_dataset(dataset_dictionary)

# Preview the first rows of the loaded data.
d.dataframe.head()

Unnamed: 0,f_name,l_name,gender,area_code,phone,city,state,zip,marital_status,has_child,salary,rate,single_exemp,married_exemp,child_exemp
0,Pengyuan,Zendler,F,508,744-9007,SWAMPSCOTT,MA,1907,M,N,90000,5.3,0,7150,0
1,Nik,Tacic,M,702,517-7658,LAS VEGAS,NV,89140,M,N,90000,0.0,0,0,0
2,Hovav,Punter,M,501,304-9763,HASTY,AR,72640,S,N,50000,7.0,20,0,0
3,Xiangning,Vanneste,F,862,651-6469,BRIGANTINE,NJ,8203,M,Y,55000,1.9519792,0,2000,1500
4,Belen,Niccum,F,920,287-1889,FLORENCE,WI,54121,S,Y,85000,5.9232907,700,0,400


## 3. Generating Features and Clusters
Raha runs (all or the promising) error detection strategies on the dataset. This step could take a while because all the strategies should be run on the dataset. Raha then generates a feature vector for each data cell based on the output of error detection strategies. Raha next builds a hierarchical clustering model for our clustering-based sampling approach.

In [None]:
# Run the error detection strategies on the dataset (this step can take a while).
app_1.run_strategies(d)
# Generate a feature vector for each data cell from the strategies' outputs.
app_1.generate_features(d)
# Build the hierarchical clustering model used for clustering-based sampling.
app_1.build_clusters(d)

96774 cells are detected by ["PVD", ["city", "R"]].
1459 cells are detected by ["PVD", ["l_name", "Y"]].
199813 cells are detected by ["PVD", ["rate", "."]].
27473 cells are detected by ["PVD", ["city", "K"]].
67292 cells are detected by ["PVD", ["f_name", "o"]].
15351 cells are detected by ["PVD", ["single_exemp", "5"]].
106768 cells are detected by ["PVD", ["phone", "4"]].
106138 cells are detected by ["PVD", ["phone", "3"]].
38897 cells are detected by ["PVD", ["f_name", "l"]].
11455 cells are detected by ["PVD", ["married_exemp", "1"]].
2869 cells are detected by ["PVD", ["l_name", "Z"]].
88586 cells are detected by ["PVD", ["l_name", "r"]].
19884 cells are detected by ["PVD", ["salary", "7"]].
99587 cells are detected by ["PVD", ["f_name", "i"]].
5104 cells are detected by ["PVD", ["l_name", "N"]].
5029 cells are detected by ["PVD", ["f_name", "F"]].
12463 cells are detected by ["PVD", ["l_name", "C"]].
39791 cells are detected by ["PVD", ["f_name", "s"]].
200 cells are detected b

## 4. Interactive Tuple Sampling and Labeling
Raha then iteratively samples a tuple. We should label data cells of each sampled tuple.

In [None]:
def on_button_clicked(_):
    """Save the user's annotation of the currently sampled tuple.

    Reads the notebook globals ``d`` (the dataset object) and ``texts`` (one
    text widget per column). For every column, the widget value is stored as
    the correction, and the cell is flagged as an error (1) when that value
    differs from the value currently in the dataframe, otherwise 0. The
    tuple is then recorded as labeled.

    The single argument is the object passed by the click callback; it is
    not used.
    """
    row = d.sampled_tuple
    for column, text_box in enumerate(texts):
        position = (row, column)
        user_value = text_box.value
        # An edited value means the original cell content was dirty.
        is_error = 1 if d.dataframe.iloc[position] != user_value else 0
        d.labeled_cells[position] = [is_error, user_value]
    d.labeled_tuples[row] = 1

# Ask the detector for the next tuple to label; its row position is then
# available as d.sampled_tuple.
app_1.sample_tuple(d)
print("Fix the dirty cells in the following sampled tuple.")

# Show the sampled tuple as a one-row frame for reference.
sampled_row = d.dataframe.iloc[d.sampled_tuple, :]
sampled_tuple = pandas.DataFrame(data=[sampled_row], columns=d.dataframe.columns)
IPython.display.display(sampled_tuple)

# One text box per column, pre-filled with the current cell value so that
# only the dirty cells need to be edited.
texts = []
for j in range(d.dataframe.shape[1]):
    texts.append(ipywidgets.Text(value=d.dataframe.iloc[d.sampled_tuple, j]))

# The button stores the annotation through on_button_clicked.
button = ipywidgets.Button(description="Save the Annotation")
button.on_click(on_button_clicked)
output = ipywidgets.VBox(children=texts + [button])
IPython.display.display(output)

For the sake of time, we use the ground truth of the dataset to label tuples below.

In [None]:
%%capture
# Label sampled tuples from the ground truth until the labeling budget is met.
while len(d.labeled_tuples) < app_1.LABELING_BUDGET:
    # Without a ground truth nothing in this loop can add a label, so the
    # loop condition would never change and the cell would run forever.
    # Fail loudly instead.
    if not d.has_ground_truth:
        raise RuntimeError(
            "The dataset has no ground truth, so tuples cannot be labeled "
            "automatically; label the remaining tuples interactively instead."
        )
    app_1.sample_tuple(d)
    app_1.label_with_ground_truth(d)

## 5. Propagating User Labels and Predicting the Labels
Raha then propagates each user label through its cluster. Raha then trains and applies one classifier per data column to predict the label of the rest of data cells.

In [None]:
# Propagate each user label through its cluster.
app_1.propagate_labels(d)
# Train and apply one classifier per data column to label the remaining data cells.
app_1.predict_labels(d)

## 6. Initializing and Updating the Error Corrector Models
Baran initializes the error corrector models. Baran then iteratively samples a tuple. We should label the data cells of each sampled tuple. It then updates the models accordingly and generates a feature vector for each pair of a data error and a correction candidate. Finally, it trains and applies a classifier to each data column to predict the final correction of each data error. Since we already labeled tuples for Raha, we use the same labeled tuples and do not label new tuples here.

In [None]:
# Initialize the error corrector models and the dataset for the corrector.
app_2.initialize_models(d)
app_2.initialize_dataset(d)

# Replay the tuples that were already labeled for detection instead of
# sampling and labeling new ones.
for labeled_tuple in d.labeled_tuples:
    d.sampled_tuple = labeled_tuple
    # After each labeled tuple: update the models, regenerate the features,
    # and predict the corrections again.
    app_2.update_models(d)
    app_2.generate_features(d)
    app_2.predict_corrections(d)

## 7. Storing Results
Both Raha and Baran can also store the error detection/correction results.

In [None]:
# Uncomment the following lines to store the error detection/correction results.
#app_1.store_results(d)
#app_2.store_results(d)

## 8. Evaluating the Data Cleaning Task
We can finally evaluate our data cleaning task.

In [None]:
# The first three values of the evaluation are used as the detection
# precision/recall/F1, and the last three as the correction
# precision/recall/F1.
edp, edr, edf = d.get_data_cleaning_evaluation(d.detected_cells)[:3]
ecp, ecr, ecf = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]

# Build the summary table in one step from a list of records.
# DataFrame.append was removed in pandas 2.0, so it cannot be used to add the
# rows one by one; the resulting frame (columns, rows, index) is the same.
evaluation_records = [
    {
        "Task": "Error Detection (Raha)",
        "Precision": "{:.2f}".format(edp),
        "Recall": "{:.2f}".format(edr),
        "F1 Score": "{:.2f}".format(edf),
    },
    {
        "Task": "Error Correction (Baran)",
        "Precision": "{:.2f}".format(ecp),
        "Recall": "{:.2f}".format(ecr),
        "F1 Score": "{:.2f}".format(ecf),
    },
]
evaluation_df = pandas.DataFrame(
    evaluation_records, columns=["Task", "Precision", "Recall", "F1 Score"]
)
evaluation_df.head()

In [None]:
import importlib
# Reload analysis_utilities so that changes made to the module since it was
# first imported take effect without restarting the kernel.
importlib.reload(analysis_utilities)

In [None]:
# Actual errors of the dataset, passed to the analysis_utilities helpers in
# the following cells (presumably derived from the ground truth; confirm
# against the dataset class).
actual_errors = d.get_actual_errors_dictionary()

In [None]:
# Evaluate the detection results on the dataset against the actual errors
# (output format not visible here; see analysis_utilities).
analysis_utilities.detection_evaluation(d, actual_errors)

In [None]:
# Table used to analyze the corrections; it has a "confidence" column that
# the next cell reads.
correction_confidence_df = analysis_utilities.get_correction_confidence_df(d, actual_errors)

In [None]:
# Number of rows whose "confidence" value is below 0.98.
(correction_confidence_df["confidence"] < 0.98).sum()

In [None]:
# Presumably shows how the correction confidences are distributed; confirm
# against analysis_utilities.
analysis_utilities.correction_confidence_distributions(correction_confidence_df)

In [None]:
# The helper returns an object with a show() method (presumably a figure
# relating correction correctness to confidence; confirm against
# analysis_utilities).
f = analysis_utilities.correction_correctness_by_confidence(correction_confidence_df)
f.show()