# `clean_duplication()` String matching in a column

Standardize duplicate strings in a column, following OpenRefine's [clustering approach](https://github.com/OpenRefine/OpenRefine/wiki/Clustering-In-Depth).

In [None]:
def clean_duplication(
    df: Union[pd.DataFrame, dd.DataFrame],
    column: str
)

# Example

In [3]:
import pandas as pd
# Example frame: four spellings of "Mississippi" plus two similar city names.
df = pd.Series(
    ["Mississippi", "mississippi", "misssisipppi", "Misisipi", "Dayton", "Layton"],
    name="city",
).to_frame()
df

Unnamed: 0,city
0,Mississippi
1,mississippi
2,misssisipppi
3,Misisipi
4,Dayton
5,Layton


In [11]:
# Call clean_duplication on the "city" column of df.
clean_duplication(df, "city")
# live code export: the statement below is the text that the
# "Merge and recluster" button appends to the selected cell.
df["city"] = df["city"].replace(["mississippi", "misssisipppi", "Misisipi"], "Mississippi")

Box(children=(VBox(children=(HBox(children=(Label(value='_'), Label(value='_'), Label(value='_'), Label(value=…

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
# dataframe with cleaned string values
# (this cell's text is what the "Finish" button inserts below and executes)
df_clean

Unnamed: 0,city
0,Mississippi
1,Dayton
2,Layton
3,Bayton


In [10]:
import base64
from IPython.display import Javascript, display
from ipywidgets import Button, HBox, VBox, Label, Layout, Box, Dropdown, Checkbox, Text
import pandas as pd

def live_export_code(*args):
    """Append the exported ``replace`` statement to the selected notebook cell.

    The statement is base64-encoded on the Python side and decoded with
    ``atob`` in the browser, so its quotes never have to be escaped inside
    the JavaScript source.

    Args:
        *args: Ignored; present so the function can be used as a button
            click handler.
    """
    snippet = "\ndf[\"city\"] = df[\"city\"].replace([\"mississippi\", \"misssisipppi\", \"Misisipi\"], \"Mississippi\")"
    payload = base64.b64encode(snippet.encode()).decode()
    script = """
        var ind = IPython.notebook.get_selected_index();
        var cell = IPython.notebook.get_cell(ind);
        var text = cell.get_text();
        cell.set_text(text.concat(atob("{0}")));
    """.format(payload)
    display(Javascript(script))


def final_df(*args):
    """Insert a new code cell that shows ``df_clean`` and execute it.

    The cell text is base64-encoded on the Python side and decoded with
    ``atob`` in the browser before being written into the new cell.

    Args:
        *args: Ignored; present so the function can be used as a button
            click handler.
    """
    snippet = "# dataframe with cleaned string values\ndf_clean"
    payload = base64.b64encode(snippet.encode()).decode()
    script = """
        var code = IPython.notebook.insert_cell_below('code');
        code.set_text(atob("{0}"));
        code.execute();
    """.format(payload)
    display(Javascript(script))


def clean_duplication(*args):
    """Build the static mock-up of the duplicate-clustering widget.

    Every label, count and default value is hard-coded for the "city"
    example; nothing is computed from the arguments.

    Args:
        *args: Accepted and ignored, so the call can be written as
            ``clean_duplication(df, "city")``.

    Returns:
        An ipywidgets ``Box`` holding the method / keying-function
        dropdowns, two cluster sections and a footer with the
        "Merge and recluster" and "Finish" buttons.
    """
    # A single blank Label is repeated to space widgets horizontally.
    spacer = Label(" ")

    def gap(count):
        # `count` references to the same spacer widget.
        return [spacer] * count

    def member_row(text):
        # Row holding only a cluster value, indented to the value column.
        return HBox([*gap(78), Label(text)])

    def summary_row(distinct, total, text, merge_box, representative):
        # Row with the cluster counts, one value, its merge checkbox and
        # the editable representative value.
        return HBox([
            *gap(18),
            Label(distinct), *gap(31),
            Label(total), *gap(22),
            Label(text, layout=Layout(width='114px')), merge_box,
            *gap(8), representative
        ])

    box_layout = Layout(display='flex',
                        flex_flow='column',
                        align_items='stretch',
                        border='solid',
                        width='825px')

    # Top row: clustering method, keying function and the export toggle.
    method = Dropdown(
        options=['key collision', 'nearest neighbour'],
        description='Method:',
    )
    key = Label(" Keying function: ")
    key_func = Dropdown(
        options=['Fingerprint', 'ngram-fingerprint'],
        layout={'width': '150px'})
    export_code = Checkbox(value=True,
                           description='export code',
                           layout=Layout(width='165px'),
                           style={'description_width': 'initial'})

    # Editable representative value for each of the two clusters.
    repr_el = Text(value='Mississippi', layout=Layout(width='130px'))
    repr_el2 = Text(value='Bayton', layout=Layout(width='130px'))
    sel_all = Checkbox(description="Select all", layout=Layout(width='165px'))

    merge_and_recluster = Button(description="Merge and recluster")
    merge_and_recluster.on_click(live_export_code)

    # One "Merge?" checkbox per cluster.
    check = Checkbox(layout=Layout(width='165px'))
    check2 = Checkbox(layout=Layout(width='165px'))

    finish = Button(description="Finish")
    finish.on_click(final_df)

    dropds = HBox([method, *gap(10), key, key_func, *gap(10), export_code])
    blank = HBox(gap(100))
    headers = HBox([
        *gap(8),
        Label("Distinct values"), *gap(15),
        Label("Total values"), *gap(15),
        Label("Cluster values"), *gap(27),
        Label("Merge?"), *gap(20),
        Label("Representative value")
    ])
    # Horizontal rule drawn with underscores; the same row is reused
    # between sections.
    line = HBox([Label("_")] * 100)

    # First cluster: spellings of "Mississippi".
    vals_1 = member_row("Mississippi (20 rows)")
    vals_2 = summary_row("4", "24", "mississippi (2 rows)", check, repr_el)
    vals_3 = member_row("misssisipppi (1 row)")
    vals_4 = member_row("Misisipi (1 row)")

    # Second cluster: the "-ayton" names.
    vals_5 = member_row("Bayton (9 rows)")
    vals_6 = summary_row("3", "24", "Dayton (8 rows)", check2, repr_el2)
    # Plural "rows": the original label read "(7 row)".
    vals_7 = member_row("Layton (7 rows)")

    footer = HBox([sel_all, *gap(79), merge_and_recluster, *gap(10), finish])

    rows = VBox([
        line, dropds, blank, headers, line,
        vals_1, vals_2, vals_3, vals_4, line,
        vals_5, vals_6, vals_7, line, footer
    ])
    return Box(children=[rows], layout=box_layout)

# Hard-coded result frame that the "Finish" cell displays.
df_clean = pd.DataFrame(
    data=["Mississippi", "Dayton", "Layton", "Bayton"],
    columns=["city"],
)