In [1]:
from collections import defaultdict
import numpy as np
import pandas as pd

In [2]:
# Number of deep-network prediction files (predictions/dn_preds0.csv ... dn_preds{NUM_MODELS-1}.csv)
NUM_MODELS = 10
# Positive-class ratio of the training data (used to re-weight predictions for the test prior)
TRAIN_TARGET_MEAN = 0.37
# Positive-class ratio of the test data
TEST_TARGET_MEAN = 0.16
# Number of passes performed by each of the two neighbour-based post-processing steps
REPEAT = 2
# A test pair whose prediction is above this value is treated as a duplicate when gathering neighbours
DUP_THRESHOLD = 0.3
# A test pair whose prediction is below this value is treated as a non-duplicate when gathering neighbours
NOT_DUP_THRESHOLD = 0.1
# Largest amount a single prediction may move in one pass of the neighbour-based post-processing steps
MAX_UPDATE = 0.2
# Pairs sharing a common duplicate are raised towards this value: each pass adds
# min(MAX_UPDATE, half of the remaining gap); predictions already at or above it are left alone
DUP_UPPER_BOUND = 0.98
# Pairs sharing a common non-duplicate are lowered towards this value: each pass subtracts
# min(MAX_UPDATE, half of the remaining gap); predictions already at or below it are left alone
NOT_DUP_LOWER_BOUND = 0.01

In [3]:
# Load the question pairs. Later cells read `is_duplicate`, `question1` and `question2`
# from the training frame, and `test_id`, `question1` and `question2` from the test frame.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ("data/train.csv", "data/test.csv"))

In [4]:
# Equal-weight average of the NUM_MODELS deep-network predictions (one per cross-validation
# fold) and the single XGBoost prediction.
prediction_files = ["predictions/dn_preds" + str(i) + ".csv" for i in range(NUM_MODELS)]
prediction_files.append("predictions/xgb_preds.csv")

# The first file provides the frame (and any non-prediction columns) that the sum is accumulated into.
df = pd.read_csv(prediction_files[0])
for prediction_file in prediction_files[1:]:
    model_pred = pd.read_csv(prediction_file)["is_duplicate"]
    # Series addition aligns on the index, so a file with a different number of rows would
    # silently introduce NaNs instead of failing; make that an explicit error.
    if len(model_pred) != len(df):
        raise ValueError(
            "%s has %d rows but %s has %d" % (prediction_file, len(model_pred), prediction_files[0], len(df))
        )
    df["is_duplicate"] = df["is_duplicate"] + model_pred

# Divide by the number of files actually summed, so the average stays correct if NUM_MODELS changes.
df["is_duplicate"] /= len(prediction_files)

In [5]:
# Re-weight the predictions for the difference between the training and test positive-class
# ratios: p' = a*p / (a*p + b*(1 - p)), where `a` scales the positive class and `b` the negative class.
print("Adjusting predictions considering the different class inbalance ratio...")
a = TEST_TARGET_MEAN / TRAIN_TARGET_MEAN
b = (1 - TEST_TARGET_MEAN) / (1 - TRAIN_TARGET_MEAN)
# Vectorised arithmetic on the whole column; same element-wise formula as a per-row lambda,
# without a Python-level call for every prediction.
averaged_pred = df["is_duplicate"]
df["is_duplicate"] = a * averaged_pred / (a * averaged_pred + b * (1 - averaged_pred))

Adjusting predictions considering the different class inbalance ratio...


In [6]:
# Take a NumPy copy of the adjusted predictions; the post-processing cells below
# modify this array element by element.
test_label = np.asarray(df["is_duplicate"]).copy()

In [7]:
# Raise the predictions of question pairs that have common duplicates.
# For the underlying principle, see the Post-Processing part of our final report.
print("Updating the predictions of the pairs with common duplicates..")

# The duplicates implied by the training labels are the same on every pass, so gather them
# once here instead of re-scanning the training set inside the loop.
train_dup_neighbors = defaultdict(set)
for dup, q1, q2 in zip(df_train["is_duplicate"], df_train["question1"], df_train["question2"]):
    if dup:
        train_dup_neighbors[q1].add(q2)
        train_dup_neighbors[q2].add(q1)

test_q1 = df_test["question1"]
test_q2 = df_test["question2"]
no_neighbors = frozenset()  # shared stand-in for questions that have no recorded duplicate

for i in range(REPEAT):
    # Every pass starts from a private copy of the training neighbours; the sets are copied
    # because the test-set predictions below add to them.
    dup_neighbors = defaultdict(set, ((q, set(nbrs)) for q, nbrs in train_dup_neighbors.items()))

    # Add the duplicates implied by the current test-set predictions
    for dup, q1, q2 in zip(test_label, test_q1, test_q2):
        if dup > DUP_THRESHOLD:
            dup_neighbors[q1].add(q2)
            dup_neighbors[q2].add(q1)

    # A pair with at least one common duplicate should not be predicted lower than
    # DUP_UPPER_BOUND: move it up by half the remaining gap, capped at MAX_UPDATE.
    count = 0
    for index, (q1, q2) in enumerate(zip(test_q1, test_q2)):
        # .get() avoids inserting an empty set into the defaultdict for every unseen question,
        # and isdisjoint() stops at the first shared neighbour instead of building the intersection.
        q1_dups = dup_neighbors.get(q1, no_neighbors)
        q2_dups = dup_neighbors.get(q2, no_neighbors)
        if test_label[index] < DUP_UPPER_BOUND and not q1_dups.isdisjoint(q2_dups):
            update = min(MAX_UPDATE, (DUP_UPPER_BOUND - test_label[index]) / 2)
            test_label[index] += update
            count += 1

    print("Updated:", count)

Updating the predictions of the pairs with common duplicates..
Updated: 16463
Updated: 17298


In [8]:
# Lower the predictions of question pairs that have common non-duplicates.
# For the underlying principle, see the Post-Processing part of our final report.
print("Updating the predictions of the pairs with common non-duplicates..")

# The non-duplicates implied by the training labels are the same on every pass, so gather
# them once here instead of re-scanning the training set inside the loop.
train_not_dup_neighbors = defaultdict(set)
for dup, q1, q2 in zip(df_train["is_duplicate"], df_train["question1"], df_train["question2"]):
    if not dup:
        train_not_dup_neighbors[q1].add(q2)
        train_not_dup_neighbors[q2].add(q1)

test_q1 = df_test["question1"]
test_q2 = df_test["question2"]
no_neighbors = frozenset()  # shared stand-in for questions that have no recorded non-duplicate

for i in range(REPEAT):
    # Every pass starts from a private copy of the training neighbours; the sets are copied
    # because the test-set predictions below add to them.
    not_dup_neighbors = defaultdict(set, ((q, set(nbrs)) for q, nbrs in train_not_dup_neighbors.items()))

    # Add the non-duplicates implied by the current test-set predictions
    for dup, q1, q2 in zip(test_label, test_q1, test_q2):
        if dup < NOT_DUP_THRESHOLD:
            not_dup_neighbors[q1].add(q2)
            not_dup_neighbors[q2].add(q1)

    # A pair with at least one common non-duplicate should not be predicted higher than
    # NOT_DUP_LOWER_BOUND: move it down by half the remaining gap, capped at MAX_UPDATE.
    count = 0
    for index, (q1, q2) in enumerate(zip(test_q1, test_q2)):
        # .get() avoids inserting an empty set into the defaultdict for every unseen question,
        # and isdisjoint() stops at the first shared neighbour instead of building the intersection.
        q1_not_dups = not_dup_neighbors.get(q1, no_neighbors)
        q2_not_dups = not_dup_neighbors.get(q2, no_neighbors)
        if test_label[index] > NOT_DUP_LOWER_BOUND and not q1_not_dups.isdisjoint(q2_not_dups):
            update = min(MAX_UPDATE, (test_label[index] - NOT_DUP_LOWER_BOUND) / 2)
            test_label[index] -= update
            count += 1

    print("Updated:", count)


Updating the predictions of the pairs with common non-duplicates..
Updated: 4807
Updated: 5142


In [9]:
# Write the post-processed predictions to disk, with `test_id` as the first column
# and `is_duplicate` as the second.
submission = pd.DataFrame(
    {"test_id": df_test["test_id"], "is_duplicate": test_label},
    columns=["test_id", "is_duplicate"],
)
submission.to_csv("predictions/averaged_xgb_dn_preds_post.csv", index=False)