# Use past bad submissions to filter out false duplicates

## Imports

In [453]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [454]:
# Root directory of the project; the data sub-paths below are appended to it.
project_path = "/home/onyxia/work/deduplication/"
# Model-output folder (only referenced from a commented-out line in the cells shown here).
path_07_model_output = "data/07_model_output/"
# Folder holding the past-submission CSV files read by the loading cells.
path_09_past_submissions =  "data/09_past_submissions/"

## Data

In [455]:
# Baseline submission, read from the past-submissions folder.
# The file has no header row, so the three column names are supplied here.
# (The same file name under path_07_model_output is the alternative source.)
initial_submission = pd.read_csv(
    f"{project_path}{path_09_past_submissions}best_duplicates.csv",
    names=["id1", "id2", "type"],
    lineterminator="\n",
)

# Working copy that the cells below rebind step by step; the baseline stays intact.
current_submission = initial_submission.copy()

In [456]:
# Submission stored as 'best_duplicates_ner.csv' (headerless, three columns).
submission_with_ner = pd.read_csv(
    f"{project_path}{path_09_past_submissions}best_duplicates_ner.csv",
    names=["id1", "id2", "type"],
    lineterminator="\n",
)

In [457]:
# Submission stored as 'best_duplicates_large.csv' (headerless, three columns).
submission_large = pd.read_csv(
    f"{project_path}{path_09_past_submissions}best_duplicates_large.csv",
    names=["id1", "id2", "type"],
    lineterminator="\n",
)

In [458]:
# Past submissions named "bad": the cells below use them as sources of rows
# to filter out of the current submission. All files are headerless CSVs
# with the columns id1, id2, type.
bad_submission_AP_1 = pd.read_csv(
    f"{project_path}{path_09_past_submissions}submission_AP_1.csv",
    names=["id1", "id2", "type"],
    lineterminator="\n",
)

bad_submission_AP_6 = pd.read_csv(
    f"{project_path}{path_09_past_submissions}submission_AP_6.csv",
    names=["id1", "id2", "type"],
    lineterminator="\n",
)

bad_submission_FB_1 = pd.read_csv(
    f"{project_path}{path_09_past_submissions}submission_FB_1.csv",
    names=["id1", "id2", "type"],
    lineterminator="\n",
)

bad_submission_FB_3 = pd.read_csv(
    f"{project_path}{path_09_past_submissions}submission_FB_3.csv",
    names=["id1", "id2", "type"],
    lineterminator="\n",
)

bad_submission_FB_6 = pd.read_csv(
    f"{project_path}{path_09_past_submissions}submission_FB_6.csv",
    names=["id1", "id2", "type"],
    lineterminator="\n",
)

bad_submission_BYL_4 = pd.read_csv(
    f"{project_path}{path_09_past_submissions}submission_BYL_4.csv",
    names=["id1", "id2", "type"],
    lineterminator="\n",
)

bad_submission_BYL_5 = pd.read_csv(
    f"{project_path}{path_09_past_submissions}submission_BYL_5.csv",
    names=["id1", "id2", "type"],
    lineterminator="\n",
)

bad_submission_BYL_6 = pd.read_csv(
    f"{project_path}{path_09_past_submissions}submission_BYL_6.csv",
    names=["id1", "id2", "type"],
    lineterminator="\n",
)

In [459]:
# Past submissions named "good": the cells below merge parts of them into
# the current submission. Same headerless id1, id2, type layout as above.
good_submission_AP_9 = pd.read_csv(
    f"{project_path}{path_09_past_submissions}submission_AP_9.csv",
    names=["id1", "id2", "type"],
    lineterminator="\n",
)

good_submission_FB_8 = pd.read_csv(
    f"{project_path}{path_09_past_submissions}submission_FB_8.csv",
    names=["id1", "id2", "type"],
    lineterminator="\n",
)

good_submission_BYL_8 = pd.read_csv(
    f"{project_path}{path_09_past_submissions}submission_BYL_8.csv",
    names=["id1", "id2", "type"],
    lineterminator="\n",
)

good_submission_BYL_9 = pd.read_csv(
    f"{project_path}{path_09_past_submissions}submission_BYL_9.csv",
    names=["id1", "id2", "type"],
    lineterminator="\n",
)

good_submission_BYL_10 = pd.read_csv(
    f"{project_path}{path_09_past_submissions}submission_BYL_10.csv",
    names=["id1", "id2", "type"],
    lineterminator="\n",
)

## Functions

In [460]:
def describe_duplicates(all_duplicates: pd.DataFrame) -> pd.DataFrame:
    """Count the rows of each duplicate type.

    Args:
        all_duplicates: Frame with a ``type`` column; every other column
            is counted per type.

    Returns:
        One row per distinct ``type`` value, with ``type`` as a regular
        column followed by the non-null count of each remaining column.
    """
    counts_by_type = all_duplicates.groupby('type').count()
    # Move ``type`` out of the index so it shows up as a normal column.
    return counts_by_type.reset_index()

In [461]:
def remove_observations_from_bad_submission(
    current_submission: pd.DataFrame,
    bad_submission: pd.DataFrame,
    types_to_filter) -> pd.DataFrame:
    """Drop the rows that a bad submission shares with the current one.

    Only rows of ``bad_submission`` whose ``type`` is in
    ``types_to_filter`` are considered, and a row of
    ``current_submission`` is dropped only when ``id1``, ``id2`` and
    ``type`` all match one of them.

    Args:
        current_submission: Frame with ``id1``, ``id2`` and ``type`` columns.
        bad_submission: Frame with the same three columns.
        types_to_filter: Collection of ``type`` values to filter on.

    Returns:
        The surviving rows, restricted to ``id1``, ``id2`` and ``type``.
        The number of dropped rows is printed.
    """
    key_columns = ['id1', 'id2', 'type']

    targeted = bad_submission['type'].isin(types_to_filter)
    rows_to_drop = bad_submission[targeted]

    # Anti-join: the indicator column tells which left rows found no match.
    merged = pd.merge(
        current_submission,
        rows_to_drop,
        how='left',
        on=key_columns,
        indicator=True,
    )
    survivors = merged.loc[merged['_merge'] == 'left_only', key_columns]

    n_removed = len(current_submission) - len(survivors)
    print(f'After filtering, {n_removed} rows removed')
    return survivors

In [462]:
def remove_observations_from_df_non_duplicates(
    current_submission: pd.DataFrame,
    non_duplicates: pd.DataFrame) -> pd.DataFrame:
    """Drop every row whose (id1, id2) pair is listed as a non-duplicate.

    Matching ignores ``type``: only the ``id1`` and ``id2`` columns of
    ``non_duplicates`` are used.

    Args:
        current_submission: Frame with ``id1``, ``id2`` and ``type`` columns.
        non_duplicates: Frame with at least ``id1`` and ``id2`` columns.

    Returns:
        The surviving rows, restricted to ``id1``, ``id2`` and ``type``.
        The number of dropped rows is printed.
    """
    pair_keys = ['id1', 'id2']

    # Anti-join on the pair: keep the left rows that found no partner.
    flagged = pd.merge(
        current_submission,
        non_duplicates[pair_keys],
        how='left',
        on=pair_keys,
        indicator=True,
    )
    unmatched = flagged['_merge'] == 'left_only'
    survivors = flagged.loc[unmatched, ['id1', 'id2', 'type']]

    print(f'After filtering, {len(current_submission) - len(survivors)} rows removed')
    return survivors

In [463]:
def add_specific_duplicates(
    current_submission: pd.DataFrame,
    past_submission: pd.DataFrame,
    types_to_add: list,
    new_type: str
) -> pd.DataFrame:
    """Inject relabelled rows of a past submission into the current one.

    Rows of ``past_submission`` whose ``type`` is in ``types_to_add`` are
    relabelled to ``new_type`` and placed ahead of ``current_submission``.
    When an ``(id1, id2)`` pair exists on both sides, the relabelled past
    row is the one kept.

    Args:
        current_submission: Frame with ``id1``, ``id2`` and ``type`` columns.
        past_submission: Frame with the same columns; it is not modified.
        types_to_add: ``type`` values to pick from ``past_submission``.
        new_type: Label given to the picked rows.

    Returns:
        The combined frame, one row per ``(id1, id2)`` pair, sorted by
        ``id1`` then ``id2`` with a fresh 0..n-1 index. Three summary lines
        are printed along the way.
    """
    selected = past_submission["type"].isin(types_to_add)
    # ``assign`` builds a new frame; writing the column onto the filtered
    # slice directly is chained assignment and raises SettingWithCopyWarning.
    past_to_change = past_submission[selected].assign(type=new_type)
    print(f'At most {len(past_to_change)} observations modified or added')

    # Past rows come first so that drop_duplicates (keep='first') lets the
    # relabelled row win over the current one for a shared pair.
    final_submission = (
        pd.concat([past_to_change, current_submission])
        .drop_duplicates(subset=['id1', 'id2'])
        .sort_values(by=['id1', 'id2'])
        .reset_index(drop=True)
    )

    print(f'{len(final_submission) - len(current_submission)} rows added')

    n_before = len(current_submission[current_submission["type"].isin(types_to_add)])
    n_after = len(final_submission[final_submission["type"].isin(types_to_add)])
    print(
        f'{n_before - n_after} of {types_to_add} have been changed'
    )
    return final_submission

In [464]:
def mutualisation_with_good_submission(
    current_submission: pd.DataFrame,
    good_submission: pd.DataFrame,
    use_semantic: bool,
    use_temporal: bool,
    union_temporal: bool,
    reduce_partials: bool,
    replace_partials: bool
) -> pd.DataFrame:
    """Combine the current submission with a good one, type by type.

    Args:
        current_submission: Frame with ``id1``, ``id2`` and ``type`` columns.
        good_submission: Frame with the same columns.
        use_semantic: If true, SEMANTIC rows become the union of both
            submissions; otherwise the current SEMANTIC rows are kept.
        use_temporal: If true, TEMPORAL rows are combined with the good
            submission's according to ``union_temporal``; otherwise the
            current TEMPORAL rows are kept.
        union_temporal: Only read when ``use_temporal`` is true. True
            takes the union of TEMPORAL rows, false the intersection.
        reduce_partials: If true, PARTIAL rows are kept only when their
            ``(id1, id2)`` pair is SEMANTIC or PARTIAL in ``good_submission``.
        replace_partials: If true, PARTIAL rows come from
            ``good_submission`` instead of ``current_submission``.

    Returns:
        FULL, PARTIAL, SEMANTIC and TEMPORAL rows stacked in that order,
        reduced to one row per ``(id1, id2)`` pair (first occurrence
        wins), sorted by ``id1`` then ``id2`` with a fresh index. Rows of
        any other type are not carried over. The row-count difference is
        printed and is negative when the result has more rows than the input.
    """
    pair_keys = ['id1', 'id2']

    def rows_of(frame, label):
        # Rows of ``frame`` carrying exactly this type label.
        return frame[frame['type'] == label]

    n_initial = len(current_submission)

    # FULL rows always come from the current submission, untouched.
    full_part = rows_of(current_submission, "FULL")

    # SEMANTIC rows: optional union with the good submission.
    semantic_part = rows_of(current_submission, "SEMANTIC")
    if use_semantic:
        semantic_part = pd.concat(
            [semantic_part, rows_of(good_submission, "SEMANTIC")]
        ).drop_duplicates()
    else:
        semantic_part = semantic_part.copy()

    # TEMPORAL rows: untouched, union, or intersection.
    temporal_part = rows_of(current_submission, "TEMPORAL")
    if not use_temporal:
        temporal_part = temporal_part.copy()
    elif union_temporal:
        temporal_part = pd.concat(
            [temporal_part, rows_of(good_submission, "TEMPORAL")]
        ).drop_duplicates()
    else:
        temporal_part = temporal_part.merge(
            rows_of(good_submission, "TEMPORAL"),
            how='inner',
            on=pair_keys + ['type'],
        )

    # PARTIAL rows: pick the source, then optionally narrow it down.
    own_partial = rows_of(current_submission, "PARTIAL")
    good_partial = rows_of(good_submission, "PARTIAL")
    partial_part = (good_partial if replace_partials else own_partial).copy()

    if reduce_partials:
        backing_types = good_submission['type'].isin(["SEMANTIC", "PARTIAL"])
        backed_pairs = good_submission[backing_types][pair_keys]
        partial_part = partial_part.merge(backed_pairs, how='inner', on=pair_keys)

    # Stacking order sets the priority when a pair shows up under several types.
    stacked = pd.concat([full_part, partial_part, semantic_part, temporal_part])
    final_submission = stacked.drop_duplicates(subset=pair_keys)
    final_submission = final_submission.sort_values(by=pair_keys)
    final_submission = final_submission.reset_index(drop=True)

    print(f'After filtering, {n_initial - len(final_submission)} rows removed')
    return final_submission

In [465]:
def aggregate_partials(
    list_submissions: list
) -> pd.DataFrame:
    """Pool the PARTIAL rows of several submissions, without repeats.

    Args:
        list_submissions: Collection of frames, each with ``id1``, ``id2``
            and ``type`` columns.

    Returns:
        The PARTIAL rows of all the given frames with exact duplicate rows
        removed. The original row indices are kept. When no frame is
        given, an empty frame with the ``id1``, ``id2`` and ``type``
        columns is returned. The number of pooled rows is printed.
    """
    list_partials = [
        submission[submission["type"] == 'PARTIAL'] for submission in list_submissions
    ]

    if list_partials:
        final_partials = pd.concat(list_partials).drop_duplicates()
    else:
        # pd.concat raises ValueError on an empty list; hand back an empty
        # frame with the expected columns instead.
        final_partials = pd.DataFrame(columns=['id1', 'id2', 'type'])

    len_partials = len(final_partials)

    print(f'There are {len_partials} sure non duplicates from partials analysis')
    return final_partials

## Applications

In [466]:
describe_duplicates(current_submission)

Unnamed: 0,type,id1,id2
0,FULL,63363,63363
1,PARTIAL,31293,31293
2,SEMANTIC,186710,186710
3,TEMPORAL,678722,678722


### For Byl

In [467]:
# Pool the PARTIAL rows of the listed bad submissions (exact duplicate rows
# removed); the helper reports them as "sure non duplicates".
non_duplicates_AP = aggregate_partials([
    bad_submission_AP_6,
    bad_submission_FB_3,
    bad_submission_BYL_4,
    bad_submission_BYL_5,
    bad_submission_BYL_6
])

# Relabel every pooled row with the "NON" type.
non_duplicates_AP['type'] = "NON"

There are 3566 sure non duplicates from partials analysis


In [468]:
# Save the pooled non-duplicates to the working directory, without index or header row.
non_duplicates_AP.to_csv("non_duplicates_AP.csv", index=False, header=False)

In [469]:
# Load a second non-duplicates file from the working directory (headerless,
# same three columns).
# NOTE(review): in the cells visible here it is only used by a commented-out step.
non_duplicates_byl = pd.read_csv('non_duplicates_byl.csv',
    names=['id1', 'id2', 'type'],
    lineterminator='\n')

# Display the number of rows loaded.
len(non_duplicates_byl)

8498

### Complete with good submissions

In [470]:
# Take the PARTIAL rows from good_submission_BYL_10 instead of the current
# ones (replace_partials=True). The other types come from the current
# submission, then pairs are deduplicated on (id1, id2) with priority
# FULL, PARTIAL, SEMANTIC, TEMPORAL.
current_submission = mutualisation_with_good_submission(
    current_submission,
    good_submission_BYL_10,
    use_semantic=False,
    use_temporal=False,
    union_temporal=False,
    reduce_partials=False,
    replace_partials=True
)

After filtering, 18467 rows removed


In [471]:
describe_duplicates(current_submission)

Unnamed: 0,type,id1,id2
0,FULL,63363,63363
1,PARTIAL,15389,15389
2,SEMANTIC,184147,184147
3,TEMPORAL,678722,678722


In [472]:
# Take the PARTIAL pairs of good_submission_BYL_9, relabel them SEMANTIC and
# merge them in; for a pair present on both sides the relabelled row wins.
current_submission = add_specific_duplicates(
    current_submission,
    good_submission_BYL_9,
    types_to_add=['PARTIAL'],
    new_type="SEMANTIC"
)

At most 8134 observations modified or added
3677 rows added
612 of ['PARTIAL'] have been changed


In [473]:
# Union the SEMANTIC rows with those of good_submission_AP_9, and keep only
# the PARTIAL pairs that good_submission_AP_9 lists as SEMANTIC or PARTIAL.
current_submission = mutualisation_with_good_submission(
    current_submission,
    good_submission_AP_9,
    use_semantic=True,
    use_temporal=False,
    union_temporal=False,
    reduce_partials=True,
    replace_partials=False
)

After filtering, -19402 rows removed


In [474]:
# Union the SEMANTIC rows with those of good_submission_BYL_8; the other
# types are taken from the current submission.
current_submission = mutualisation_with_good_submission(
    current_submission,
    good_submission_BYL_8,
    use_semantic=True,
    use_temporal=False,
    union_temporal=False,
    reduce_partials=False,
    replace_partials=False
)

After filtering, -27086 rows removed


In [475]:
# Keep only the TEMPORAL rows also present in good_submission_BYL_9
# (use_temporal=True with union_temporal=False means intersection).
current_submission = mutualisation_with_good_submission(
    current_submission,
    good_submission_BYL_9,
    use_semantic=False,
    use_temporal=True,
    union_temporal=False,
    reduce_partials=False,
    replace_partials=False
)

After filtering, 236262 rows removed


In [476]:
describe_duplicates(current_submission)

Unnamed: 0,type,id1,id2
0,FULL,63363,63363
1,PARTIAL,14000,14000
2,SEMANTIC,235701,235701
3,TEMPORAL,442460,442460


### Remove false duplicates

In [477]:
# NOTE: it is not decided yet whether this step should be kept.
# Drop the rows that match a SEMANTIC row of bad_submission_AP_1 on
# id1, id2 and type.

current_submission = remove_observations_from_bad_submission(
    current_submission,
    bad_submission_AP_1,
    ['SEMANTIC']
)

# Disabled variant: the same filter applied to the TEMPORAL rows.
# current_submission = remove_observations_from_bad_submission(
#     current_submission,
#     bad_submission_AP_1,
#     ['TEMPORAL']
# )

After filtering, 11299 rows removed


In [478]:
# Drop the PARTIAL rows that also appear in bad_submission_AP_6.
current_submission = remove_observations_from_bad_submission(
    current_submission,
    bad_submission_AP_6,
    ['PARTIAL']
)

After filtering, 0 rows removed


In [479]:
# Disabled: filtering on the TEMPORAL rows of bad_submission_FB_1 removes
# more TP than FP (presumably true positives vs. false positives).
# current_submission = remove_observations_from_bad_submission(
#     current_submission,
#     bad_submission_FB_1,
#     ['TEMPORAL']
# )

# Drop the PARTIAL rows that also appear in bad_submission_FB_1.
current_submission = remove_observations_from_bad_submission(
    current_submission,
    bad_submission_FB_1,
    ['PARTIAL']
)

After filtering, 0 rows removed


In [480]:
# Drop the PARTIAL rows that also appear in bad_submission_FB_3.
current_submission = remove_observations_from_bad_submission(
    current_submission,
    bad_submission_FB_3,
    ['PARTIAL']
)

After filtering, 0 rows removed


In [481]:
# current_submission = remove_observations_from_bad_submission(
#     current_submission,
#     bad_submission_FB_6,
#     ['PARTIAL']
# )

In [482]:
# Drop the PARTIAL rows that also appear in bad_submission_BYL_4.
current_submission = remove_observations_from_bad_submission(
    current_submission,
    bad_submission_BYL_4,
    ['PARTIAL']
)

After filtering, 0 rows removed


In [483]:
# Drop the PARTIAL rows that also appear in bad_submission_BYL_5.
current_submission = remove_observations_from_bad_submission(
    current_submission,
    bad_submission_BYL_5,
    ['PARTIAL']
)

After filtering, 0 rows removed


In [484]:
# Drop the PARTIAL rows that also appear in bad_submission_BYL_6.
current_submission = remove_observations_from_bad_submission(
    current_submission,
    bad_submission_BYL_6,
    ['PARTIAL']
)

After filtering, 0 rows removed


In [485]:
describe_duplicates(current_submission)

Unnamed: 0,type,id1,id2
0,FULL,63363,63363
1,PARTIAL,14000,14000
2,SEMANTIC,224402,224402
3,TEMPORAL,442460,442460


### Remove non duplicates

In [486]:
# Drop every row whose (id1, id2) pair is listed in non_duplicates_AP,
# whatever its type.
current_submission = remove_observations_from_df_non_duplicates(
    current_submission,
    non_duplicates_AP
)

After filtering, 849 rows removed


In [487]:
# current_submission = remove_observations_from_df_non_duplicates(
#     current_submission,
#     non_duplicates_byl
# )

## Check results

In [488]:
describe_duplicates(current_submission)

Unnamed: 0,type,id1,id2
0,FULL,63363,63363
1,PARTIAL,14000,14000
2,SEMANTIC,224370,224370
3,TEMPORAL,441643,441643


In [489]:
# Write the final submission to the working directory, without index or header row.
current_submission.to_csv("duplicates.csv", index=False, header=False)

## Final check

In [490]:
# Read the written file back with the same three column names, so the next
# cell can describe what was actually saved.
final_submission = pd.read_csv('duplicates.csv',
                               names=['id1', 'id2', 'type'],
                               lineterminator='\n')

In [491]:
describe_duplicates(final_submission)

Unnamed: 0,type,id1,id2
0,FULL,63363,63363
1,PARTIAL,14000,14000
2,SEMANTIC,224370,224370
3,TEMPORAL,441643,441643


## Experiments

In [492]:
describe_duplicates(good_submission_BYL_10)

Unnamed: 0,type,id1,id2
0,FULL,63353,63353
1,PARTIAL,15389,15389
2,SEMANTIC,179088,179088
3,TEMPORAL,530120,530120


In [493]:
describe_duplicates(initial_submission)

Unnamed: 0,type,id1,id2
0,FULL,63363,63363
1,PARTIAL,31293,31293
2,SEMANTIC,186710,186710
3,TEMPORAL,678722,678722


In [494]:
# Experiment: remove from good_submission_BYL_10 the PARTIAL rows that are
# also in initial_submission; the PARTIAL rows left are those found only in
# good_submission_BYL_10. The commented argument is a leftover alternative.
diff_with_past = remove_observations_from_bad_submission(
    good_submission_BYL_10,
    initial_submission,
    # good_submission_BYL_10,
    ['PARTIAL']
)

# Per-type counts of what remains.
describe_duplicates(diff_with_past)

After filtering, 12223 rows removed


Unnamed: 0,type,id1,id2
0,FULL,63353,63353
1,PARTIAL,3166,3166
2,SEMANTIC,179088,179088
3,TEMPORAL,530120,530120
