In [None]:
# The training and testing data should be POS tagged (e.g., mystem for Russian) and saved as word POS word POS ...
# as implemented in notebook rule_based_classification.ipynb

# This code uses predictions from individual models saved in files
# input files:
# CNN predictions in form of probabilities as .txt file: probability\tquestion, where probability - a prob. that a question is comp.
# BERT predictions in form of probabilities as .txt file: probability1\tprobability2\tquestion, where probability1 is a prob. that a question is not comp., probability2 - otherwise
# ML (log. regr) predictions as .txt file: lr_probabilities\tquestion

### Define helper functions

In [None]:
import json
import subprocess
import string
import re
from tqdm import tqdm_notebook as tqdm
import pandas as pd
import numpy as np
from os.path import join

### Helper functions to read probabilities
import glob
import os

def read_cnn_questions_probabilities(path_cnn='path_to_CNN_predictions', n_folds=10):
    """Read the CNN predictions of all cross-validation folds.

    Fold ``n`` is read from ``<path_cnn>/file<n>.txt`` (change the pattern to
    your file names). The first line of every file is a heading and is
    skipped; every other non-empty line has the form
    ``probability<TAB>question``.

    Parameters
    ----------
    path_cnn : str
        Directory that contains the per-fold prediction files.
    n_folds : int
        Number of files to read; the default matches 10-fold cross-validation,
        which produces a separate file for each split.

    Returns
    -------
    tuple(list of float, list of str)
        Probabilities and the corresponding questions, concatenated in fold order.

    Raises
    ------
    FileNotFoundError
        If no prediction file is found for one of the folds.
    """
    cnn_questions = []
    cnn_probabilities = []
    for n in range(n_folds):
        pattern = os.path.join(path_cnn, 'file' + str(n) + '.txt')  # change to your file names
        matches = glob.glob(pattern)
        if not matches:
            # Fail with the offending path instead of an IndexError on matches[0].
            raise FileNotFoundError('No CNN prediction file matches ' + pattern)
        with open(matches[0]) as f:
            content = f.readlines()
        for line in content[1:]:  # ignore heading in the input file
            line = line.strip('\n')
            if not line:
                # A blank (e.g. trailing) line carries no prediction.
                continue
            fields = line.split('\t')
            cnn_probabilities.append(float(fields[0]))
            cnn_questions.append(fields[1])
    return cnn_probabilities, cnn_questions

def read_bert_questions_probabilities(path_bert='path_to_BERT_predictions', n_folds=10):
    """Read the BERT predictions of all cross-validation folds.

    Fold ``n`` is read from ``<path_bert>/file<n>.txt`` (change the pattern to
    your file names). The files have no heading; every non-empty line is
    tab-separated, and the second field is used as the probability and the
    third field as the question.

    Parameters
    ----------
    path_bert : str
        Directory that contains the per-fold prediction files.
    n_folds : int
        Number of files to read (10 by default, one per split).

    Returns
    -------
    tuple(list of float, list of str)
        Probabilities and the corresponding questions, concatenated in fold order.

    Raises
    ------
    FileNotFoundError
        If no prediction file is found for one of the folds.
    """
    bert_questions = []
    bert_probabilities = []
    for n in range(n_folds):
        pattern = os.path.join(path_bert, 'file' + str(n) + '.txt')  # change to your file names
        matches = glob.glob(pattern)
        if not matches:
            # Fail with the offending path instead of an IndexError on matches[0].
            raise FileNotFoundError('No BERT prediction file matches ' + pattern)
        with open(matches[0]) as f:
            content = f.readlines()
        for line in content:  # no heading in the input file
            line = line.strip('\n')
            if not line:
                # A blank (e.g. trailing) line carries no prediction.
                continue
            fields = line.split('\t')
            bert_probabilities.append(float(fields[1]))
            bert_questions.append(fields[2])
    return bert_probabilities, bert_questions

def read_ml_questions_probabilities(path_ml='path_to_ML_predictions', n_folds=10):
    """Read the ML (logistic regression) predictions of all cross-validation folds.

    Fold ``n`` is read from ``<path_ml>/file<n>.txt`` (change the pattern to
    your file names). The first line of every file is a heading and is
    skipped; every other non-empty line has the form
    ``probability<TAB>question``.

    Parameters
    ----------
    path_ml : str
        Directory that contains the per-fold prediction files.
    n_folds : int
        Number of files to read (10 by default, one per split).

    Returns
    -------
    tuple(list of float, list of str)
        Probabilities and the corresponding questions, concatenated in fold order.

    Raises
    ------
    FileNotFoundError
        If no prediction file is found for one of the folds.
    """
    ml_questions = []
    lr_probabilities = []
    for n in range(n_folds):
        pattern = os.path.join(path_ml, 'file' + str(n) + '.txt')  # change to your file names
        matches = glob.glob(pattern)
        if not matches:
            # Fail with the offending path instead of an IndexError on matches[0].
            raise FileNotFoundError('No ML prediction file matches ' + pattern)
        with open(matches[0]) as f:
            content = f.readlines()
        for line in content[1:]:  # ignore heading in the input file
            line = line.strip('\n')
            if not line:
                # A blank (e.g. trailing) line carries no prediction.
                continue
            fields = line.split('\t')
            lr_probabilities.append(float(fields[0]))
            ml_questions.append(fields[1])
    return lr_probabilities, ml_questions


## Building Ensemble classifier on a training set

In [None]:
# Read the probabilities of all three classifiers into one dataframe

cnn_probabilities, cnn_questions = read_cnn_questions_probabilities()
bert_probabilities, bert_questions = read_bert_questions_probabilities()
# The log. regression predictions must be loaded here as well: lr_probabilities and
# ml_questions are used below and in the later cells, and are not defined anywhere else.
lr_probabilities, ml_questions = read_ml_questions_probabilities()

neural_predictions_df = pd.DataFrame({'cnn_probabilities':cnn_probabilities, 'bert_probabilities':bert_probabilities, 
                                      'lr_probabilities':lr_probabilities, 'bert_questions':bert_questions, 'cnn_questions':cnn_questions, 'ml_questions':ml_questions})

# all_predictions_df contains probabilities of all classifiers along with questions
# Columns: 'cnn_probabilities', 'bert_probabilities', 'lr_probabilities', 'bert_questions', 'cnn_questions', 'ml_questions'
# (a `global` statement is not needed at cell level: the name is already module-level)

all_predictions_df = neural_predictions_df

In [None]:
# Load the training split

# NOTE(review): pos, mst and strip_punct are not defined in the visible part of this notebook;
# presumably they come from rule_based_classification.ipynb (mystem example for Russian) - confirm.

dataset = 'train-binary.txt'
data = pd.read_csv(join('path_to_train_data', dataset), sep='\t')

raw_questions = data['question'].tolist()
questions_original = np.array(raw_questions)  # questions before POS tagging
# POS-tag the questions on the fly (skip this step if the training data is already POS tagged)
questions = np.array([pos(mst.process(strip_punct(raw_question))) for raw_question in raw_questions])
labels = np.array(data['comp'].tolist())

# Dataset produced after removing the questions classified by the rules that give predictions
# with precision 1 (see notebook rule_based_classification.ipynb); in our case, the first 7 rules
# from that notebook.
dataset = 'train-binary-afterpb.txt'

data_afterpb_df = pd.read_csv(join('path_to_train_data', dataset), sep='\t')

raw_questions_afterpb = data_afterpb_df['question'].tolist()
# POS-tag on the fly (skip this step if the training data is already POS tagged)
questions_afterpb = np.array([pos(mst.process(strip_punct(raw_question))) for raw_question in raw_questions_afterpb])

labels_afterpb = np.array(data_afterpb_df['comp'].tolist())


### Predictions based on the ML probabilities combined with rules

In [None]:
# Define helper functions

from collections import Counter
from sklearn.metrics import classification_report

# We add an 8th rule (for Russian) which recalls an extra 20% of the comparative questions,
# but with it the precision is no longer 1. The resulting false positives (FP) are then fixed using ML.

def predict_pattern(question):
    """Rule-based prediction for one question.

    Returns 1 as soon as one of the eight patterns below matches ``question``
    and 0 if none of them does. The patterns are regular expressions over
    Russian text; 'comp' and 'conj' are presumably POS tags that the tagging
    step inserted into the question - TODO confirm.
    """
    def found(pattern):
        return re.search(pattern, question) is not None

    # Rule 1
    if found('луч.{0,1}ше ') and not found('как '):
        return 1

    # Rule 2: 'comp' has to occur before ' или '. When ' или ' is absent, find()
    # returns -1 and the comparison is False, so a bare ' vs ' never triggers this rule.
    # NOTE(review): the dot in ' vs.' is unescaped and matches any character.
    if (found('comp')
            and found(' или | vs | vs.')
            and question.find('comp') < question.find(' или ')
            and not found('более comp или conj менее comp')):
        return 1

    # Rule 3
    if (found('как')
            and (found('правильно') or found('пишется | писать | написать'))
            and found(' или ')):
        return 1

    # Rule 4
    if (found('что ')
            and found(r'общего | сходст| схож')
            and found(' и | от | или | между | vs | vs. | versus ')):
        return 1

    # Rule 5
    if found('выб+рать|купить|взять') and found(r' или | между | vs | vs. | versus '):
        return 1

    # Rule 6
    if (found(' в ') and found(' сравнении ')) or (found(' по ') and found(' сравнению ')):
        return 1

    # Rule 7
    if found('преимуществ|недостаток') and found('перед | над | сравнен | vs | vs. | versus'):
        return 1

    # Rule 8
    if (found(' отлич| разница | различие | различ')
            and found(' и | от | или | между | vs | vs. | versus')
            and not found('что ')):
        return 1

    return 0

def binary_pred(prob, thre=0.5):
    """Turn a probability into a binary label: 1 if ``prob`` is at least ``thre``, else 0."""
    return 1 if prob >= thre else 0

def precision_recall(probabilities, true_labels=None, n_rule_positives=476):
    """Sweep the decision threshold and collect precision/recall of the comparative class.

    The threshold is swept in two parts: from 0.0 up to (excluding) 0.99 with a
    step of 0.001, and from 0.99 to 1.0 with a step of 0.00001, because the high
    probabilities are very sensitive to the step. For every threshold the binary
    predictions and the true labels are padded with ``n_rule_positives`` ones,
    which stand for the questions predicted as 1 by the rules.

    Parameters
    ----------
    probabilities : sequence of float
        Probability per question, aligned with ``true_labels``.
    true_labels : sequence of int, optional
        True labels; defaults to the module-level ``labels_afterpb``.
    n_rule_positives : int
        Number of ones appended to both the predictions and the labels.

    Returns
    -------
    tuple(list of float, list of float, list of str)
        Precision and recall of class '1' (rounded to 3 digits) and the
        thresholds (as strings), in sweep order.
    """
    if true_labels is None:
        true_labels = labels_afterpb
    probabilities = np.asarray(probabilities, dtype=float)

    threshold_grids = (
        np.arange(0.0, 0.99, 0.001),       # coarse step for the lower probabilities
        np.arange(0.99, 1.00001, 0.00001), # fine step for the high probabilities
    )

    # threshold (as string) -> padded predictions; a threshold that occurs in both
    # grids keeps its first position and takes the predictions of the second grid.
    predictions_by_threshold = {}
    for grid in threshold_grids:
        for pr_pos in grid:
            pr_pos = round(pr_pos, 5)
            predicted = (probabilities >= pr_pos).astype(int)
            predictions_by_threshold[str(pr_pos)] = np.pad(
                predicted, (0, n_rule_positives), 'constant', constant_values=1)

    labels_padded = np.pad(true_labels, (0, n_rule_positives), 'constant', constant_values=1)
    recall = []
    precision = []
    thresholds = []
    for threshold, predicted in predictions_by_threshold.items():
        # One report per threshold is enough: it contains both metrics.
        positive_class = classification_report(
            y_true=labels_padded, y_pred=predicted, output_dict=True)['1']
        precision.append(round(positive_class['precision'], 3))
        recall.append(round(positive_class['recall'], 3))
        thresholds.append(threshold)
    return precision, recall, thresholds

In [None]:
%%time
# Calculate precision, recall and the respective decision probability thresholds for the predictions
# of the individual models: log. regr., CNN, BERT

precision_lr, recall_lr, thresholds_lr = precision_recall(lr_probabilities)
precision_bert, recall_bert, thresholds_bert = precision_recall(bert_probabilities)
precision_cnn, recall_cnn, thresholds_cnn = precision_recall(cnn_probabilities)

In [None]:
%%time
# Build ensemble classifiers, where comb_no_bert would mean Ensemble-CNN, comb_no_cnn - Ensemble-BERT, and comb_predictions - Ensemble-combi 
# The calculations would be done in two parts for the lower classifier confedence probabilities with a enlarged step
# and for higher probabilites with a reduced step.

comb_no_bert = []
comb_no_cnn = []
comb_predictions = []
thresholds = []

for pr_pos in np.arange(0.0, 0.99, 0.001):
    comb_predictions_ = []
    comb_no_bert_ = []
    comb_no_cnn_ = []
    pr_pos = round(pr_pos,5)
    for item in list(zip(lr_probabilities, cnn_probabilities, bert_probabilities, ml_questions)):
        pred_lr = binary_pred(item[0], thre=0.45)                 # these thresholds correspond to values where the model
        pred_cnn = binary_pred(item[1], thre=0.995)               # achieves a maximal recall at a precision of 1
        pred_bert = binary_pred(item[2], thre=0.998)              # for a comparative questions class
        if pred_bert == 1: comb_predictions_.append(pred_bert)
        elif pred_cnn == 1: comb_predictions_.append(pred_cnn)
        elif pred_lr == 1: comb_predictions_.append(pred_lr)
        else:
            if binary_pred(item[1], thre=pr_pos) == binary_pred(item[2], thre=pr_pos): pred = binary_pred(item[2], thre=pr_pos)
            else: pred = predict_pattern(item[3])
            comb_predictions_.append(pred)

        if pred_cnn == 1: comb_no_bert_.append(pred_cnn)
        else:
            if binary_pred(item[0], thre=pr_pos) == binary_pred(item[1], thre=pr_pos): pred = binary_pred(item[1], thre=pr_pos)
            else: pred = predict_pattern(item[4])
            comb_no_bert_.append(pred)
            
        if pred_bert == 1: comb_no_cnn_.append(pred_bert)
        elif pred_lr == 1: comb_no_cnn_.append(pred_lr)
        else:
            if binary_pred(item[0], thre=pr_pos) == binary_pred(item[2], thre=pr_pos): pred = binary_pred(item[2], thre=pr_pos)
            else: pred = predict_pattern(item[4])
            comb_no_cnn_.append(pred)
            
    comb_no_bert_ = np.pad(comb_no_bert_, (0,476), 'constant', constant_values=1)         # padding by ones to reflect the number of questions 
    comb_no_cnn_ = np.pad(comb_no_cnn_, (0,476), 'constant', constant_values=1)           # classified as comparative by rules with precision 1
    comb_predictions_ = np.pad(comb_predictions_, (0,476), 'constant', constant_values=1)
    comb_predictions.append(comb_predictions_)
    comb_no_bert.append(comb_no_bert_)
    comb_no_cnn.append(comb_no_cnn_)
    thresholds.append(pr_pos)

def create_dict_low(predictions, thresholds):
    """Map every threshold, converted to a string key, to the predictions at the same position.

    ``predictions`` and ``thresholds`` are paired positionally; surplus entries
    of the longer sequence are ignored.
    """
    return {str(threshold): prediction
            for prediction, threshold in zip(predictions, thresholds)}

# Threshold (string key) -> padded predictions of each ensemble, for the lower threshold range
dicts_low_comb = create_dict_low(comb_predictions, thresholds)
dicts_low_no_bert = create_dict_low(comb_no_bert, thresholds)
dicts_low_no_cnn = create_dict_low(comb_no_cnn, thresholds)

In [None]:
%%time

# Follows the instructions above but for higher classifiers' confidence probabilities

comb_no_bert = []
comb_no_cnn = []
comb_predictions = []
thresholds = []

for pr_pos in np.arange(0.99, 1.00001, 0.00001):
    comb_predictions_ = []
    comb_no_bert_ = []
    comb_no_cnn_ = []
    pr_pos = round(pr_pos,5)
    for item in list(zip(lr_probabilities, cnn_probabilities, bert_probabilities, ml_questions)):
        pred_lr = binary_pred(item[0], thre=0.45)
        pred_cnn = binary_pred(item[1], thre=0.995)
        pred_bert = binary_pred(item[2], thre=0.998)
        if pred_bert == 1: comb_predictions_.append(pred_bert)
        elif pred_cnn == 1: comb_predictions_.append(pred_cnn)
        elif pred_lr == 1: comb_predictions_.append(pred_lr)
        else:
            if binary_pred(item[1], thre=pr_pos) == binary_pred(item[2], thre=pr_pos): pred = binary_pred(item[2], thre=pr_pos)
            else: pred = predict_pattern(item[4])
            comb_predictions_.append(pred)

        if pred_cnn == 1: comb_no_bert_.append(pred_cnn)
        else:
            if binary_pred(item[0], thre=pr_pos) == binary_pred(item[1], thre=pr_pos): pred = binary_pred(item[1], thre=pr_pos)
            else: pred = predict_pattern(item[3])
            comb_no_bert_.append(pred)
            
        if pred_bert == 1: comb_no_cnn_.append(pred_bert)
        elif pred_lr == 1: comb_no_cnn_.append(pred_lr)
        else:
            if binary_pred(item[0], thre=pr_pos) == binary_pred(item[2], thre=pr_pos): pred = binary_pred(item[2], thre=pr_pos)
            else: pred = predict_pattern(item[4])
            comb_no_cnn_.append(pred)
            
    comb_no_bert_ = np.pad(comb_no_bert_, (0,476), 'constant', constant_values=1)
    comb_no_cnn_ = np.pad(comb_no_cnn_, (0,476), 'constant', constant_values=1)
    majority_vote_pred_ = np.pad(majority_vote_pred_, (0,476), 'constant', constant_values=1)
    comb_predictions_ = np.pad(comb_predictions_, (0,476), 'constant', constant_values=1)
    comb_predictions.append(comb_predictions_)
    majority_vote_pred.append(majority_vote_pred_)
    comb_no_bert.append(comb_no_bert_)
    comb_no_cnn.append(comb_no_cnn_)
    thresholds.append(pr_pos)

def create_dict_hi(predictions, thresholds):
    """Build a dict from stringified thresholds to the predictions at the same position.

    The two sequences are paired positionally; surplus entries of the longer
    one are ignored.
    """
    threshold_keys = map(str, thresholds)
    return dict(zip(threshold_keys, predictions))

# Threshold (string key) -> padded predictions of each ensemble, for the higher threshold range
dicts_hi_comb = create_dict_hi(comb_predictions, thresholds)
dicts_hi_no_bert = create_dict_hi(comb_no_bert, thresholds)
dicts_hi_no_cnn = create_dict_hi(comb_no_cnn, thresholds)

In [None]:
%%time

# Merge dictionaries calculated above and extract precisions, recalls, f1's, and probability thresholds

def prec_rec_f1(dicts_low, dicts_hi, true_labels=None, n_rule_positives=476):
    """Merge two threshold->predictions dicts and score every threshold.

    ``dicts_hi`` is merged over ``dicts_low`` (on a shared key the value of
    ``dicts_hi`` is used). The predictions stored in the dicts are expected to
    be padded already; the true labels are padded here with
    ``n_rule_positives`` ones so that both have the same length.

    Parameters
    ----------
    dicts_low, dicts_hi : dict
        Map a threshold (string convertible to float) to an array of predictions.
    true_labels : sequence of int, optional
        True labels before padding; defaults to the module-level ``labels_afterpb``.
    n_rule_positives : int
        Number of ones appended to the true labels.

    Returns
    -------
    tuple of four lists
        Precision, recall and F1 of class '1' (rounded to 3 digits) and the
        thresholds as floats, in the order of the merged dict.
    """
    if true_labels is None:
        true_labels = labels_afterpb
    thresholds_comb = []
    dicts = {**dicts_low, **dicts_hi}
    labels_afterpb_padded = np.pad(true_labels, (0, n_rule_positives), 'constant', constant_values=1)
    recall = []
    precision = []
    f1 = []
    for key, predicted in dicts.items():
        # One report per threshold is enough: it contains all three metrics.
        positive_class = classification_report(
            y_true=labels_afterpb_padded, y_pred=predicted, output_dict=True)['1']
        precision.append(round(positive_class['precision'], 3))
        recall.append(round(positive_class['recall'], 3))
        f1.append(round(positive_class['f1-score'], 3))
        thresholds_comb.append(float(key))
    return precision, recall, f1, thresholds_comb

# Precision, recall, F1 and thresholds of class '1' for each of the three ensembles.
# NOTE: the first line rebinds `thresholds` (previously the list filled by the sweep loop).
precision, recall, f1, thresholds = prec_rec_f1(dicts_low_comb, dicts_hi_comb)
precision_no_bert, recall_no_bert, f1_no_bert, thresholds_no_bert = prec_rec_f1(dicts_low_no_bert, dicts_hi_no_bert)
precision_no_cnn, recall_no_cnn, f1_no_cnn, thresholds_no_cnn = prec_rec_f1(dicts_low_no_cnn, dicts_hi_no_cnn)

In [None]:
# Load the precision/recall values of the rule-based predictions, which were calculated and
# saved in rule_based_classification.ipynb

pb_df = pd.read_csv('../prrecall-pb.txt', sep='\t')
precision_pb, recall_pb = pb_df['precision_pb'], pb_df['recall_pb']

In [None]:
# Plot precision-recall curves for classification of the comparative question class

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

font = {'size' : 22}
plt.rc('font', **font)

fig = plt.figure(figsize=(25, 6))
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)

major_ticks = np.arange(0, 1.01, 0.05)
minor_ticks = np.arange(0, 1.01, 0.01)

ax1.set_xticks(major_ticks)
ax1.set_xticks(minor_ticks, minor=True)
ax1.set_yticks(major_ticks)
ax1.set_yticks(minor_ticks, minor=True)
ax2.set_xticks(major_ticks)
ax2.set_xticks(minor_ticks, minor=True)
ax2.set_yticks(major_ticks)
ax2.set_yticks(minor_ticks, minor=True)
ax2.label_outer()

axes = plt.gca()
ax1.set_xlim([0.39,0.71])
ax1.set_ylim([0.89,1.01])
ax2.set_xlim([0.49,0.81])
ax2.set_ylim([0.89,1.01])

ax1.grid(which='minor', alpha=0.2)
ax1.grid(which='major', alpha=0.5)
ax2.grid(which='minor', alpha=0.2)
ax2.grid(which='major', alpha=0.5)

ax1.plot(recall_pb, precision_pb, marker='o', label='Rule-based', linestyle='dashed', 
        linewidth=2, markersize=10)
ax1.plot(recall_cnn+[0.55], precision_cnn+[1.0], marker='d', label='CNN', linestyle='dashed', 
        linewidth=2, markersize=10, markevery = 0.025)
ax1.plot(recall_bert+[0.49], precision_bert+[1.0], marker='s', label='BERT', linestyle='dashed', 
        linewidth=2, markersize=10, markevery = 0.03)
ax1.plot(recall_lr+[0.55], precision_lr+[1.0], marker='v', label='Log. regression', linestyle='dashed', 
        linewidth=2, markersize=10, markevery = 0.025)


ax2.plot(recall+[0.60], precision+[1.0], marker='o', label='Ensemble-combi', linestyle='dashed', 
        linewidth=2, markersize=10, markevery = 0.03)
ax2.plot(recall_no_bert+[0.629], precision_no_bert+[1.0], marker='d', label='Ensemble-CNN', linestyle='dashed', 
        linewidth=2, markersize=10, markevery = 0.025)
ax2.plot(recall_no_cnn+[0.628], precision_no_cnn+[1.0], marker='s', label='Ensemble-BERT', linestyle='dashed', 
        linewidth=2, markersize=10, markevery = 0.025)

ax1.set(xlabel='Recall', ylabel='Precision')
ax2.set(xlabel='Recall')

ax1.legend(loc=3, ncol=1)
ax2.legend(loc=3, ncol=1)
plt.tight_layout()

plt.show()

## The same procedure can then be applied to the test set to evaluate the classification performance