# Use past bad submissions to filter out false duplicates

## Imports

In [46]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [47]:
# Root directory of the deduplication project.
project_path = "/home/onyxia/work/deduplication/"

# Data sub-directories. They are appended to ``project_path`` by plain string
# concatenation, hence the trailing slashes.
path_07_model_output = "data/07_model_output/"
path_09_past_submissions = "data/09_past_submissions/"

## Data

In [48]:
# Candidate duplicate pairs produced by the model. The file is read without a
# header row, so the three column names are supplied explicitly.
current_submission = pd.read_csv(
    project_path + path_07_model_output + "best_duplicates.csv",
    lineterminator="\n",
    names=["id1", "id2", "type"],
)

In [49]:
# Past submissions, all read with the same header-less three-column layout.
# Going by the variable names, submission 1 (AP) is treated as a bad reference
# and submissions 8 (FB) and 9 (AP) as good ones.
bad_submission_AP_1 = pd.read_csv(
    project_path + path_09_past_submissions + "submission_1_AP.csv",
    lineterminator="\n",
    names=["id1", "id2", "type"],
)

good_submission_FB_8 = pd.read_csv(
    project_path + path_09_past_submissions + "submission_8_FB.csv",
    lineterminator="\n",
    names=["id1", "id2", "type"],
)

good_submission_AP_9 = pd.read_csv(
    project_path + path_09_past_submissions + "submission_9_AP.csv",
    lineterminator="\n",
    names=["id1", "id2", "type"],
)

## Functions

In [50]:
def describe_duplicates(all_duplicates: pd.DataFrame) -> pd.DataFrame:
    """Count the non-null values of each column per duplicate type.

    Args:
        all_duplicates: Table of duplicate pairs; must contain a ``type``
            column.

    Returns:
        One row per distinct ``type`` value. ``type`` is an ordinary column
        and every other column holds its non-null count within that group.
    """
    counts_by_type = all_duplicates.groupby('type').count()
    # groupby puts ``type`` in the index; move it back to a regular column.
    return counts_by_type.reset_index()

In [51]:
def remove_observations_from_bad_submission(
    current_submission: pd.DataFrame,
    bad_submission: pd.DataFrame,
    types_to_filter) -> pd.DataFrame:
    """Drop from the current submission the rows found in a bad submission.

    Only the rows of ``bad_submission`` whose ``type`` is listed in
    ``types_to_filter`` are used. A row of ``current_submission`` is removed
    when an identical ``(id1, id2, type)`` triple exists among them.

    Args:
        current_submission: Pairs to clean, with columns ``id1``, ``id2``
            and ``type``.
        bad_submission: Reference pairs with the same three columns.
        types_to_filter: Collection of ``type`` values of ``bad_submission``
            that are allowed to trigger a removal.

    Returns:
        The surviving rows, restricted to ``id1``, ``id2`` and ``type``.
        The number of removed rows is printed as a side effect.
    """
    key_columns = ['id1', 'id2', 'type']
    rows_before = len(current_submission)

    selected_types = bad_submission['type'].isin(types_to_filter)
    rejected_pairs = bad_submission[selected_types]

    # Anti-join: the left merge tags every current row as 'both' (also in
    # the bad submission) or 'left_only' (absent from it).
    flagged = current_submission.merge(
        rejected_pairs,
        on=key_columns,
        how='left',
        indicator=True,
    )
    absent_from_bad = flagged['_merge'] == 'left_only'
    new_current_submission = flagged[absent_from_bad][key_columns]

    rows_after = len(new_current_submission)
    print(f'After filtering, {rows_before - rows_after} rows removed')
    return new_current_submission

In [52]:
def mutualisation_with_good_submission(
    current_submission: pd.DataFrame,
    good_submission: pd.DataFrame,
    replace_partials: bool = True
) -> pd.DataFrame:
    """Combine the current submission with a good past submission.

    Each duplicate type follows its own rule:

    * ``FULL``: rows of the current submission are kept unchanged.
    * ``SEMANTIC``: union of the current and past rows.
    * ``TEMPORAL``: only rows present in both submissions are kept.
    * ``PARTIAL``: when ``replace_partials`` is true, a current row is kept
      only if its ``(id1, id2)`` pair appears in the past submission as
      ``SEMANTIC`` or ``PARTIAL``; otherwise all current rows are kept.

    Args:
        current_submission: Pairs with columns ``id1``, ``id2``, ``type``.
        good_submission: Past pairs with the same three columns.
        replace_partials: Whether to restrict ``PARTIAL`` rows as described
            above.

    Returns:
        The combined rows with one row per ``(id1, id2)`` pair, sorted by
        ``id1`` then ``id2``, with a fresh index. When a pair occurs under
        several types, the first in the order FULL, PARTIAL, SEMANTIC,
        TEMPORAL wins. The row-count difference is printed as a side effect;
        it can be negative because the SEMANTIC union may add rows.
    """
    rows_before = len(current_submission)
    pair_columns = ['id1', 'id2']

    current_type = current_submission['type']
    past_type = good_submission['type']

    # FULL: taken from the current submission as is.
    full_rows = current_submission[current_type == "FULL"]

    # SEMANTIC: union of both submissions, exact duplicate rows dropped.
    semantic_rows = pd.concat([
        current_submission[current_type == "SEMANTIC"],
        good_submission[past_type == "SEMANTIC"],
    ]).drop_duplicates()

    # TEMPORAL: intersection of both submissions.
    temporal_rows = current_submission[current_type == "TEMPORAL"].merge(
        good_submission[past_type == "TEMPORAL"],
        how='inner',
        on=['id1', 'id2', 'type'],
    )

    # PARTIAL: optionally keep only the pairs the past submission supports.
    current_partial = current_submission[current_type == "PARTIAL"]
    if replace_partials:
        supporting_pairs = good_submission[
            past_type.isin(["SEMANTIC", "PARTIAL"])
        ][["id1", "id2"]]
        partial_rows = current_partial.merge(
            supporting_pairs,
            how='inner',
            on=pair_columns,
        )
    else:
        partial_rows = current_partial.copy()

    # Concatenation order fixes which type wins when a pair occurs twice.
    stacked = pd.concat([full_rows, partial_rows, semantic_rows, temporal_rows])
    unique_pairs = stacked.drop_duplicates(subset=pair_columns)
    final_submission = unique_pairs.sort_values(by=pair_columns).reset_index(drop=True)

    rows_after = len(final_submission)
    print(f'After filtering, {rows_before - rows_after} rows removed')
    return final_submission

## Applications

In [70]:
# Per-type row counts of the model output before any filtering.
describe_duplicates(current_submission)

Unnamed: 0,type,id1,id2
0,FULL,63363,63363
1,PARTIAL,48885,48885
2,SEMANTIC,199113,199113
3,TEMPORAL,720167,720167


In [69]:
# Step 1: drop every SEMANTIC, TEMPORAL or PARTIAL row that also appears,
# with the same type, in bad submission 1 (AP). FULL rows are not filtered.
current_submission_filter_AP1 = remove_observations_from_bad_submission(
    current_submission=current_submission,
    bad_submission=bad_submission_AP_1,
    types_to_filter=['SEMANTIC', 'TEMPORAL', 'PARTIAL'],
)

After filtering, 16168 rows removed


In [63]:
# Step 2: combine the filtered submission with good submission 8 (FB),
# restricting PARTIAL rows to the pairs that submission supports.
current_submission_filter_FB_8 = mutualisation_with_good_submission(
    current_submission=current_submission_filter_AP1,
    good_submission=good_submission_FB_8,
    replace_partials=True,
)

After filtering, 183585 rows removed


In [None]:
# Step 3: combine the result of the FB_8 pass with good submission 9 (AP).
# This pass must use `good_submission_AP_9`; passing `good_submission_FB_8`
# again would only repeat the previous step.
current_submission_filter_AP_9 = mutualisation_with_good_submission(
    current_submission_filter_FB_8,
    good_submission_AP_9,
    replace_partials=True
)

In [None]:
# Final filter: drop SEMANTIC rows that also appear in a bad submission.
# The input is the output of the last mutualisation pass, so the cells form
# one chain.
# NOTE(review): `bad_submission_AP_9` is not defined anywhere in this
# notebook (submission_9_AP.csv is only loaded as `good_submission_AP_9`).
# Confirm which frame is meant before running this cell.
final_submission = remove_observations_from_bad_submission(
    current_submission_filter_AP_9,
    bad_submission_AP_9,
    ['SEMANTIC']
)

In [64]:
# Per-type row counts of the final submission.
describe_duplicates(final_submission)

Unnamed: 0,type,id1,id2
0,FULL,63363,63363
1,PARTIAL,14910,14910
2,SEMANTIC,204381,204381
3,TEMPORAL,495273,495273


In [65]:
# Write the submission to the current working directory, without index or
# header row.
final_submission.to_csv("duplicates.csv", index=False, header=False)

## Final check

In [66]:
# Read the written file back. It has no header row, so the column names are
# supplied explicitly.
final_final_submission = pd.read_csv(
    'duplicates.csv',
    lineterminator='\n',
    names=['id1', 'id2', 'type'],
)

In [67]:
# Describe the frame re-read from duplicates.csv, not the in-memory one, so
# the check reflects what was actually written to disk.
describe_duplicates(final_final_submission)

Unnamed: 0,type,id1,id2
0,FULL,63363,63363
1,PARTIAL,14910,14910
2,SEMANTIC,204381,204381
3,TEMPORAL,495273,495273
