# Notebook: Use past bad approaches to slightly improve the final results

Slight changes are made to the submission using:
- Past submissions: the confusion matrix obtained when submitting can be used to eliminate non-duplicates or to add pairs that are certain duplicates
- Manual checks of pairs: some pairs were manually labelled during the challenge and can be specified explicitly
- Past non-selected approaches that we believe can eliminate some non-duplicates from the submission

Please note that the modifications made to the submission in this notebook are minor: they were used to gain at most 0.01 in macro-F1.

## Imports

In [36]:
import os
import warnings

import pandas as pd
warnings.filterwarnings('ignore')

In [37]:
# Root folder of the project. The default below is specific to the original
# environment: either edit it, or set the DEDUPLICATION_PROJECT_PATH environment
# variable before starting the kernel. os.path.join(..., "") guarantees the
# trailing separator that the string concatenations further down rely on.
project_path = os.path.join(
    os.environ.get("DEDUPLICATION_PROJECT_PATH", "/home/onyxia/work/deduplication/"),
    ""
)

In [38]:
# Sub-folders, relative to project_path. Each one ends with a slash because the
# file names are appended by plain string concatenation.
path_07_model_output = "data/07_model_output/"
path_09_past_approaches = "data/09_past_approaches/"

## Data

Past approaches that were tested during the challenge: past submissions whose results can be re-used, attempts that were checked manually, or methods we believe can identify non-duplicates with a high level of confidence.

In [39]:
# Reference submission. The column names are supplied through `names=`, so the
# file is read as three columns: id1, id2 and type.
initial_approach = pd.read_csv(
    # project_path + path_07_model_output + 'best_duplicates.csv',
    project_path + path_09_past_approaches + 'best_duplicates.csv',
    names=['id1', 'id2', 'type'],
    lineterminator='\n')

# Working copy that the following cells rebind step by step; initial_approach
# itself is kept as loaded so it can be described again for comparison.
current_approach = initial_approach.copy()

In [40]:
# approach_with_ner = pd.read_csv(
#     # project_path + path_07_model_output + 'best_duplicates.csv',
#     project_path + path_09_past_approaches + 'best_duplicates_ner.csv',
#     names=['id1', 'id2', 'type'],
#     lineterminator='\n')

In [41]:
# Past approaches loaded under the `bad_approach_*` names. They all live in the
# same folder and share the same three-column layout, so the folder and the
# read_csv options are written once.
bad_approaches_dir = project_path + path_09_past_approaches
bad_csv_options = {'names': ['id1', 'id2', 'type'], 'lineterminator': '\n'}

bad_approach_AP_1 = pd.read_csv(bad_approaches_dir + 'approach_AP_1.csv', **bad_csv_options)
bad_approach_AP_6 = pd.read_csv(bad_approaches_dir + 'approach_AP_6.csv', **bad_csv_options)

bad_approach_FB_3 = pd.read_csv(bad_approaches_dir + 'approach_FB_3.csv', **bad_csv_options)
bad_approach_FB_6 = pd.read_csv(bad_approaches_dir + 'approach_FB_6.csv', **bad_csv_options)

bad_approach_BL_4 = pd.read_csv(bad_approaches_dir + 'approach_BL_4.csv', **bad_csv_options)
bad_approach_BL_5 = pd.read_csv(bad_approaches_dir + 'approach_BL_5.csv', **bad_csv_options)
bad_approach_BL_6 = pd.read_csv(bad_approaches_dir + 'approach_BL_6.csv', **bad_csv_options)

In [42]:
# Past approaches loaded under the `good_approach_*` names. They all live in
# the same folder and share the same three-column layout, so the folder and the
# read_csv options are written once.
good_approaches_dir = project_path + path_09_past_approaches
good_csv_options = {'names': ['id1', 'id2', 'type'], 'lineterminator': '\n'}

good_approach_AP_9 = pd.read_csv(good_approaches_dir + 'approach_AP_9.csv', **good_csv_options)

good_approach_BL_8 = pd.read_csv(good_approaches_dir + 'approach_BL_8.csv', **good_csv_options)
good_approach_BL_9 = pd.read_csv(good_approaches_dir + 'approach_BL_9.csv', **good_csv_options)
good_approach_BL_10 = pd.read_csv(good_approaches_dir + 'approach_BL_10.csv', **good_csv_options)

good_approach_AP_10 = pd.read_csv(good_approaches_dir + 'approach_AP_10.csv', **good_csv_options)

## Functions

In [43]:
def describe_duplicates(all_duplicates: pd.DataFrame) -> pd.DataFrame:
    """Count the rows carried by each duplicate type.

    Args:
        all_duplicates: Frame with a ``type`` column; every other column is
            counted (non-null values) within each type.

    Returns:
        One row per distinct ``type`` value, with ``type`` as a regular column
        followed by the per-column counts.
    """
    counts_by_type = all_duplicates.groupby('type').count()
    return counts_by_type.reset_index()

In [44]:
def remove_observations_from_bad_approach(
    current_approach: pd.DataFrame,
    bad_approach: pd.DataFrame,
    types_to_filter
) -> pd.DataFrame:
    """Drop the rows that a bad approach also produced with the same label.

    Args:
        current_approach: Frame with ``id1``, ``id2`` and ``type`` columns.
        bad_approach: Frame with the same three columns.
        types_to_filter: List-like of ``type`` values; only the rows of
            ``bad_approach`` carrying one of them are used for the removal.

    Returns:
        The rows of ``current_approach`` whose (id1, id2, type) triple is not
        among the selected rows of ``bad_approach``, restricted to the three
        key columns. The number of removed rows is printed.
    """
    key_columns = ['id1', 'id2', 'type']

    # Only the requested labels of the bad approach take part in the removal.
    rejected = bad_approach.loc[bad_approach['type'].isin(types_to_filter)]

    # Left join with an indicator column: rows flagged 'left_only' have no
    # counterpart in the rejected set and are therefore kept.
    flagged = pd.merge(
        current_approach,
        rejected,
        how='left',
        on=key_columns,
        indicator=True,
    )
    is_kept = flagged['_merge'] == 'left_only'
    new_current_approach = flagged.loc[is_kept, key_columns]

    print(f'After filtering, {len(current_approach) - len(new_current_approach)} rows removed')
    return new_current_approach

In [45]:
def remove_observations_from_df_non_duplicates(
    current_approach: pd.DataFrame,
    non_duplicates: pd.DataFrame
) -> pd.DataFrame:
    """Drop every pair that is listed in a frame of known non-duplicates.

    Args:
        current_approach: Frame with ``id1``, ``id2`` and ``type`` columns.
        non_duplicates: Frame with at least ``id1`` and ``id2`` columns; its
            other columns are ignored.

    Returns:
        The rows of ``current_approach`` whose (id1, id2) pair does not appear
        in ``non_duplicates``, whatever their ``type``. The number of removed
        rows is printed.
    """
    pair_columns = ['id1', 'id2']

    # Left join with an indicator column on the pair only: 'left_only' marks
    # the rows that have no match among the non-duplicates.
    flagged = pd.merge(
        current_approach,
        non_duplicates[pair_columns],
        how='left',
        on=pair_columns,
        indicator=True,
    )
    is_kept = flagged['_merge'] == 'left_only'
    new_current_approach = flagged.loc[is_kept, ['id1', 'id2', 'type']]

    print(f'After filtering, {len(current_approach) - len(new_current_approach)} rows removed')
    return new_current_approach

In [46]:
def add_specific_duplicates(
    current_approach: pd.DataFrame,
    past_approach: pd.DataFrame,
    types_to_add: list,
    new_type: str
) -> pd.DataFrame:
    """Inject pairs from a past approach into the current one under a given label.

    The rows of ``past_approach`` whose ``type`` is in ``types_to_add`` are
    relabelled ``new_type`` and combined with ``current_approach``. When a pair
    (id1, id2) exists on both sides, the relabelled row from the past approach
    wins.

    Args:
        current_approach: Frame with ``id1``, ``id2`` and ``type`` columns.
        past_approach: Frame with the same columns. It is not modified.
        types_to_add: ``type`` values of ``past_approach`` to bring in.
        new_type: Label given to every row that is brought in.

    Returns:
        The combined frame, with one row per (id1, id2) pair, sorted by
        ``id1`` then ``id2`` and with a fresh integer index.

    Three summary lines are printed. The last one is the net change in the
    number of rows labelled with one of ``types_to_add`` (before minus after),
    so it is negative when that count grew.
    """
    # `.assign` builds a new frame. The previous in-place column assignment on
    # a filtered slice triggered pandas' SettingWithCopyWarning.
    past_to_change = past_approach[
        past_approach["type"].isin(types_to_add)
    ].assign(type=new_type)
    print(f'At most {len(past_to_change)} observations modified or added')

    # The relabelled rows come first so that drop_duplicates (which keeps the
    # first occurrence by default) prefers them over the current rows.
    final_approach = pd.concat([past_to_change, current_approach]).drop_duplicates(
        subset=['id1', 'id2']
    ).sort_values(
        by=['id1', 'id2']
    ).reset_index(drop=True)

    print(f'{len(final_approach) - len(current_approach)} rows added')

    n_typed_before = len(current_approach[current_approach["type"].isin(types_to_add)])
    n_typed_after = len(final_approach[final_approach["type"].isin(types_to_add)])
    print(f'{n_typed_before - n_typed_after} of {types_to_add} have been changed')
    return final_approach

In [47]:
def mutualisation_with_good_approach(
    current_approach: pd.DataFrame,
    good_approach: pd.DataFrame,
    use_semantic: bool,
    use_temporal: bool,
    union_temporal: bool,
    reduce_partials: bool,
    replace_partials: bool
) -> pd.DataFrame:
    """Combine the current approach with a trusted past approach, type by type.

    Args:
        current_approach: Frame with ``id1``, ``id2`` and ``type`` columns.
        good_approach: Frame with the same columns.
        use_semantic: When True, the SEMANTIC rows of ``good_approach`` are
            added to the current SEMANTIC rows.
        use_temporal: When True, the TEMPORAL rows are combined with those of
            ``good_approach`` according to ``union_temporal``.
        union_temporal: Only read when ``use_temporal`` is True. True takes the
            union of both TEMPORAL sets, False keeps the rows present in both.
        reduce_partials: When True, a PARTIAL row is kept only if its pair is
            labelled SEMANTIC or PARTIAL in ``good_approach``.
        replace_partials: When True, the PARTIAL rows are taken from
            ``good_approach`` instead of ``current_approach``.

    Returns:
        The FULL, PARTIAL, SEMANTIC and TEMPORAL rows stacked in that order,
        reduced to one row per (id1, id2) pair (the first occurrence in that
        order is kept), sorted by ``id1`` then ``id2`` with a fresh index.
        Rows carrying any other ``type`` value are not part of the result.

    The printed figure is the input length minus the output length, so it is
    negative when the result is longer than ``current_approach``.
    """
    pair_columns = ['id1', 'id2']
    n_rows_in = len(current_approach)

    def rows_of_type(frame: pd.DataFrame, label: str) -> pd.DataFrame:
        # Rows of `frame` carrying exactly this label.
        return frame[frame['type'] == label]

    # FULL rows are always taken from the current approach as they are.
    full_rows = rows_of_type(current_approach, "FULL")

    # SEMANTIC: optionally enriched with the past approach's semantic rows.
    semantic_rows = rows_of_type(current_approach, "SEMANTIC")
    if use_semantic:
        semantic_rows = pd.concat(
            [semantic_rows, rows_of_type(good_approach, "SEMANTIC")]
        ).drop_duplicates()
    else:
        semantic_rows = semantic_rows.copy()

    # TEMPORAL: untouched, union with the past rows, or intersection with them.
    temporal_rows = rows_of_type(current_approach, "TEMPORAL")
    if not use_temporal:
        temporal_rows = temporal_rows.copy()
    elif union_temporal:
        temporal_rows = pd.concat(
            [temporal_rows, rows_of_type(good_approach, "TEMPORAL")]
        ).drop_duplicates()
    else:
        temporal_rows = temporal_rows.merge(
            rows_of_type(good_approach, "TEMPORAL"),
            how='inner',
            on=pair_columns + ['type'],
        )

    # PARTIAL: choose the source frame, then optionally keep only the pairs
    # that the past approach labels SEMANTIC or PARTIAL.
    partial_source = good_approach if replace_partials else current_approach
    partial_rows = rows_of_type(partial_source, "PARTIAL").copy()
    if reduce_partials:
        is_confirming = good_approach['type'].isin(["SEMANTIC", "PARTIAL"])
        confirming_pairs = good_approach.loc[is_confirming, pair_columns]
        partial_rows = partial_rows.merge(
            confirming_pairs,
            how='inner',
            on=pair_columns,
        )

    # The stacking order sets the priority when a pair has several labels.
    stacked = pd.concat([full_rows, partial_rows, semantic_rows, temporal_rows])
    one_row_per_pair = stacked.drop_duplicates(subset=pair_columns)
    final_approach = one_row_per_pair.sort_values(by=pair_columns).reset_index(drop=True)

    print(f'After filtering, {n_rows_in - len(final_approach)} rows removed')
    return final_approach

In [48]:
def aggregate_partials(
    list_approaches: list
) -> pd.DataFrame:
    """Gather the distinct PARTIAL rows found across several approaches.

    Args:
        list_approaches: Collection of frames, each with a ``type`` column
            (any iterable of DataFrames works, not only a list).

    Returns:
        The PARTIAL rows of all the frames stacked together, with fully
        identical rows removed. The original row indexes are preserved.

    Raises:
        ValueError: If ``list_approaches`` contains no frame at all.
    """
    list_partials = [
        approach[approach["type"] == 'PARTIAL'] for approach in list_approaches
    ]

    # pd.concat refuses an empty list with a generic message; fail early with
    # one that points at the actual cause.
    if not list_partials:
        raise ValueError("aggregate_partials needs at least one approach")

    final_partials = pd.concat(list_partials).drop_duplicates()
    len_partials = len(final_partials)

    print(f'There are {len_partials} sure non duplicates from partials analysis')
    return final_partials

## Applications

In [49]:
# Per-type row counts of the working copy before any adjustment is applied.
describe_duplicates(current_approach)

Unnamed: 0,type,id1,id2
0,FULL,63363,63363
1,PARTIAL,31032,31032
2,SEMANTIC,186753,186753
3,TEMPORAL,679772,679772


### Gather those that are for sure not duplicates

In [50]:
# Stack the distinct PARTIAL rows of five of the bad approaches.
non_duplicates_AP = aggregate_partials([
    bad_approach_AP_6,
    bad_approach_FB_3,
    bad_approach_BL_4,
    bad_approach_BL_5,
    bad_approach_BL_6
])

# Every gathered pair is relabelled "NON"; the frame is later used as a list of
# pairs to remove, matched on id1/id2 only.
non_duplicates_AP['type'] = "NON"

There are 3566 sure non duplicates from partials analysis


In [51]:
# Saved in the current working directory (not under project_path), without
# index or header row.
non_duplicates_AP.to_csv("non_duplicates_AP.csv", index=False, header=False)

In [52]:
# Not very reliable
# NOTE(review): this file is read from the current working directory and is not
# written anywhere in this notebook, so the cell fails if it is absent. The
# only use of non_duplicates_BL in the notebook is in a commented-out call.

non_duplicates_BL = pd.read_csv('non_duplicates_BL.csv',
    names=['id1', 'id2', 'type'],
    lineterminator='\n')

len(non_duplicates_BL)

8498

### Complete with good approaches

In [53]:
# current_approach = mutualisation_with_good_approach(
#     current_approach,
#     good_approach_BL_10,
#     use_semantic=False,
#     use_temporal=False,
#     union_temporal=False,
#     reduce_partials=False,
#     replace_partials=False
# )

# Bring in the PARTIAL pairs of good_approach_AP_10, still labelled PARTIAL.
# A pair already present under another label takes the PARTIAL label.
current_approach = add_specific_duplicates(
    current_approach,
    good_approach_AP_10,
    types_to_add=['PARTIAL'],
    new_type="PARTIAL"
)

At most 14000 observations modified or added
80 rows added
-2502 of ['PARTIAL'] have been changed


In [54]:
# Bring in the PARTIAL pairs of good_approach_BL_9, relabelled SEMANTIC.
# A pair already present takes the SEMANTIC label.
current_approach = add_specific_duplicates(
    current_approach,
    good_approach_BL_9,
    types_to_add=['PARTIAL'],
    new_type="SEMANTIC"
)

At most 8134 observations modified or added
3247 rows added
902 of ['PARTIAL'] have been changed


In [55]:
# Add the SEMANTIC rows of good_approach_AP_9, and keep a PARTIAL row only when
# AP_9 labels the same pair SEMANTIC or PARTIAL. The printed figure is
# "before minus after", so a negative value means the frame grew.
current_approach = mutualisation_with_good_approach(
    current_approach,
    good_approach_AP_9,
    use_semantic=True,
    use_temporal=False,
    union_temporal=False,
    reduce_partials=True,
    replace_partials=False
)

After filtering, -26737 rows removed


In [56]:
# Add the SEMANTIC rows of good_approach_BL_8; the other types are left as is.
current_approach = mutualisation_with_good_approach(
    current_approach,
    good_approach_BL_8,
    use_semantic=True,
    use_temporal=False,
    union_temporal=False,
    reduce_partials=False,
    replace_partials=False
)

After filtering, -1050 rows removed


In [57]:
# Keep a TEMPORAL row only when good_approach_BL_9 also labels the same pair
# TEMPORAL (use_temporal=True with union_temporal=False is an intersection).
current_approach = mutualisation_with_good_approach(
    current_approach,
    good_approach_BL_9,
    use_semantic=False,
    use_temporal=True,
    union_temporal=False,
    reduce_partials=False,
    replace_partials=False
)

After filtering, 237101 rows removed


In [58]:
# Per-type row counts after combining with the good approaches.
describe_duplicates(current_approach)

Unnamed: 0,type,id1,id2
0,FULL,63363,63363
1,PARTIAL,27109,27109
2,SEMANTIC,221790,221790
3,TEMPORAL,442671,442671


### Remove false duplicates

In [59]:
# NOT SURE TO KEEP YET

# Drop the rows labelled SEMANTIC whose (id1, id2, type) triple also appears
# among the SEMANTIC rows of bad_approach_AP_1.
current_approach = remove_observations_from_bad_approach(
    current_approach,
    bad_approach_AP_1,
    ['SEMANTIC']
)

# current_approach = remove_observations_from_bad_approach(
#     current_approach,
#     bad_approach_AP_1,
#     ['TEMPORAL']
# )

After filtering, 10289 rows removed


In [60]:
# Per-type row counts after removing the rows shared with bad_approach_AP_1.
describe_duplicates(current_approach)

Unnamed: 0,type,id1,id2
0,FULL,63363,63363
1,PARTIAL,27109,27109
2,SEMANTIC,211501,211501
3,TEMPORAL,442671,442671


### Remove non duplicates

In [61]:
# Drop every pair listed in non_duplicates_AP, whatever label it currently
# carries (the match is on id1/id2 only).
current_approach = remove_observations_from_df_non_duplicates(
    current_approach,
    non_duplicates_AP
)

After filtering, 849 rows removed


In [62]:
# current_approach = remove_observations_from_df_non_duplicates(
#     current_approach,
#     non_duplicates_BL
# )

## Check results

In [63]:
# Per-type row counts of the frame that is about to be written out.
describe_duplicates(current_approach)

Unnamed: 0,type,id1,id2
0,FULL,63363,63363
1,PARTIAL,27109,27109
2,SEMANTIC,211469,211469
3,TEMPORAL,441854,441854


In [69]:
# Written to the current working directory, without index or header row.
current_approach.to_csv("duplicates.csv", index=False, header=False)

## Final check

In [70]:
# Read back the file written above to check what was actually saved.
final_approach = pd.read_csv('duplicates.csv',
                               names=['id1', 'id2', 'type'],
                               lineterminator='\n')

In [71]:
# Per-type row counts of the re-read file.
describe_duplicates(final_approach)

Unnamed: 0,type,id1,id2
0,FULL,63363,63363
1,PARTIAL,27109,27109
2,SEMANTIC,211469,211469
3,TEMPORAL,441854,441854


## Experiments

In [67]:
# Per-type row counts of the submission as originally loaded, for comparison.
describe_duplicates(initial_approach)

Unnamed: 0,type,id1,id2
0,FULL,63363,63363
1,PARTIAL,31032,31032
2,SEMANTIC,186753,186753
3,TEMPORAL,679772,679772


In [68]:
# describe_duplicates(approach_with_ner)