In [None]:
# The training and testing data should be POS tagged (e.g., mystem for Russian) and saved as word POS word POS ...
# as implemented in notebook rule_based_classification.ipynb

# input files:
# CNN predictions in form of probabilities as .txt file: probability\tquestion, where probability - a prob. that a question is comp.
# BERT predictions in form of probabilities as .txt file: probability1\tprobability2\tquestion, where probability1 is a prob. that a question is not comp., probability2 - otherwise
# ML (SVM and log. regr) predictions as .txt file: svm_probabilities\tlr_probabilities\tquestion

### Define helper functions

In [None]:
import json
import subprocess
import string
import re
from tqdm import tqdm_notebook as tqdm
import pandas as pd
import numpy as np
from os.path import join

### Helper functions to read probabilities
import glob
import os

def read_cnn_questions_probabilities(path_cnn='path_to_CNN_predictions', n_folds=10, encoding=None):
    """Read the CNN probabilities and questions from the per-fold prediction files.

    One file per cross-validation split is expected, named ``file<n>.txt`` for
    ``n`` in ``range(n_folds)`` (change the pattern below to your file names).
    Each file starts with a heading line, followed by tab-separated lines
    ``probability<TAB>question``.

    Parameters
    ----------
    path_cnn : str
        Directory that holds the prediction files.
    n_folds : int
        Number of per-fold files to read (10 for 10-fold cross-validation).
    encoding : str or None
        Passed to ``open``; ``None`` keeps the platform default.

    Returns
    -------
    (list of float, list of str)
        Probabilities and questions, concatenated over the folds in fold order.

    Raises
    ------
    FileNotFoundError
        If no file matches the pattern for one of the folds.
    """
    cnn_probabilities = []
    cnn_questions = []
    for n in range(n_folds):
        pattern = os.path.join(path_cnn, 'file' + str(n) + '.txt')  # change to your file names
        matches = glob.glob(pattern)
        if not matches:
            raise FileNotFoundError('No CNN prediction file matches ' + pattern)
        with open(matches[0], encoding=encoding) as f:
            content = f.readlines()
        for line in content[1:]:  # ignore heading in the input file
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing empty line)
            fields = line.strip('\n').split('\t')
            cnn_probabilities.append(float(fields[0]))
            cnn_questions.append(fields[1])
    return cnn_probabilities, cnn_questions

def read_bert_questions_probabilities(path_bert='path_to_BERT_predictions', n_folds=10, encoding=None):
    """Read the BERT probabilities and questions from the per-fold prediction files.

    One file per cross-validation split is expected, named ``file<n>.txt`` for
    ``n`` in ``range(n_folds)``. The files have no heading line; every line is
    tab-separated and this function reads column 1 as the probability and
    column 2 as the question (column 0 is ignored). Per the file-format notes
    at the top of the notebook, column 1 is the probability that the question
    is comparative.

    Parameters
    ----------
    path_bert : str
        Directory that holds the prediction files.
    n_folds : int
        Number of per-fold files to read.
    encoding : str or None
        Passed to ``open``; ``None`` keeps the platform default.

    Returns
    -------
    (list of float, list of str)
        Probabilities and questions, concatenated over the folds in fold order.

    Raises
    ------
    FileNotFoundError
        If no file matches the pattern for one of the folds.
    """
    bert_probabilities = []
    bert_questions = []
    for n in range(n_folds):
        pattern = os.path.join(path_bert, 'file' + str(n) + '.txt')
        matches = glob.glob(pattern)
        if not matches:
            raise FileNotFoundError('No BERT prediction file matches ' + pattern)
        with open(matches[0], encoding=encoding) as f:
            content = f.readlines()
        for line in content:  # no heading in the input file
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing empty line)
            fields = line.strip('\n').split('\t')
            bert_probabilities.append(float(fields[1]))
            bert_questions.append(fields[2])
    return bert_probabilities, bert_questions

def read_ml_questions_probabilities(path_ml='path_to_ML_predictions', n_folds=10, encoding=None):
    """Read the SVM / logistic-regression probabilities and questions per fold.

    One file per cross-validation split is expected, named ``file<n>.txt`` for
    ``n`` in ``range(n_folds)``. Each file starts with a heading line, followed
    by tab-separated lines ``svm_probability<TAB>lr_probability<TAB>question``.

    Parameters
    ----------
    path_ml : str
        Directory that holds the prediction files.
    n_folds : int
        Number of per-fold files to read.
    encoding : str or None
        Passed to ``open``; ``None`` keeps the platform default.

    Returns
    -------
    (list of float, list of float, list of str)
        SVM probabilities, LR probabilities and questions, concatenated over
        the folds in fold order.

    Raises
    ------
    FileNotFoundError
        If no file matches the pattern for one of the folds.
    """
    svm_probabilities = []
    lr_probabilities = []
    ml_questions = []
    for n in range(n_folds):
        pattern = os.path.join(path_ml, 'file' + str(n) + '.txt')
        matches = glob.glob(pattern)
        if not matches:
            raise FileNotFoundError('No ML prediction file matches ' + pattern)
        with open(matches[0], encoding=encoding) as f:
            content = f.readlines()
        for line in content[1:]:  # ignore heading in the input file
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing empty line)
            fields = line.strip('\n').split('\t')
            svm_probabilities.append(float(fields[0]))
            lr_probabilities.append(float(fields[1]))
            ml_questions.append(fields[2])
    return svm_probabilities, lr_probabilities, ml_questions


## Building Ensemble classifier on a training set

In [None]:
# Reading probabilities in a dataframe

cnn_probabilities, cnn_questions = read_cnn_questions_probabilities()
bert_probabilities, bert_questions = read_bert_questions_probabilities()
svm_probabilities, lr_probabilities, ml_questions = read_ml_questions_probabilities()

# One row per question. pandas raises a ValueError here if the readers returned
# lists of different lengths.
# NOTE(review): rows are only meaningful if all three sets of files list the
# questions in the same order; the three *_questions columns are kept so this
# can be checked by eye — confirm.
neural_predictions_df = pd.DataFrame({'cnn_probabilities': cnn_probabilities,
                                      'bert_probabilities': bert_probabilities,
                                      'svm_probabilities': svm_probabilities,
                                      'lr_probabilities': lr_probabilities,
                                      'bert_questions': bert_questions,
                                      'cnn_questions': cnn_questions,
                                      'ml_questions': ml_questions})

# all_predictions_df contains probabilities of all classifiers along with questions
# Columns: 'cnn_probabilities', 'bert_probabilities', 'svm_probabilities', 'lr_probabilities', 'bert_questions', 'cnn_questions', 'ml_questions'
# (a module-level name is already global, so no `global` statement is needed)
all_predictions_df = neural_predictions_df

In [None]:
### Read data from train split

# NOTE(review): pos, mst and strip_punct are not defined in the cells visible
# here; presumably they come from rule_based_classification.ipynb (mystem-based
# POS tagging for Russian) — confirm they are in scope before running.

# --- full training split ---
dataset = 'train-binary.txt'
data = pd.read_csv(join('path_to_train_data', dataset), sep='\t')

# questions before POS tagging
questions_original = np.array(data['question'].tolist())
# POS tagging produced on the fly; skip this step if the training data was already POS tagged
questions = np.array([
    pos(mst.process(strip_punct(raw_question)))
    for raw_question in data['question'].tolist()
])
labels = np.array(data['comp'].tolist())

# --- training split after the rule-based pass ---
# Dataset produced after removing the questions classified by the rules that
# give predictions with precision 1 (see notebook rule_based_classification.ipynb).
# In our case, these are the first 7 rules from that notebook.
dataset = 'train-binary-afterpb.txt'
data_afterpb_df = pd.read_csv(join('path_to_train_data', dataset), sep='\t')

# POS tagging produced on the fly; skip this step if the training data was already POS tagged
questions_afterpb = np.array([
    pos(mst.process(strip_punct(raw_question)))
    for raw_question in data_afterpb_df['question'].tolist()
])

labels_afterpb = np.array(data_afterpb_df['comp'].tolist())


### Predictions based on the ML probabilities combined with rules

In [None]:
# define helper functions

from collections import Counter
from sklearn.metrics import classification_report

# We add the 8th rule (in Russian) which recalls extra 20% but makes precision not 1 any more.
# We will then fix FP using ML

def predict_pattern(question):
    """Classify a question with eight hand-written regex rules.

    The rules are tried in order; the function returns 1 as soon as one of
    them fires and 0 when none does. Some rules look for the literal tokens
    ``comp`` / ``conj`` (presumably POS tags inserted by the upstream tagging
    step — confirm).

    NOTE(review): in the patterns below the ``.`` of `` vs.`` is not escaped,
    so it matches any character, not only a literal dot.
    """
    def found(pattern):
        return re.search(pattern, question) is not None

    # Rule 1: "лучше " without "как "
    if found('луч.{0,1}ше ') and not found('как '):
        return 1

    # Rule 2: a "comp" token that occurs before " или ".
    # NOTE(review): when " или " is absent, find() returns -1 and the position
    # test is False, so the " vs " alternatives alone never satisfy this rule.
    if (found('comp')
            and found(' или | vs | vs.')
            and question.find('comp') < question.find(' или ')
            and not found('более comp или conj менее comp')):
        return 1

    # Rule 3: "как" + " или " + ("правильно" or a writing verb)
    if (found('как')
            and found(' или ')
            and (found('правильно') or found('пишется | писать | написать'))):
        return 1

    # Rule 4: "что " + similarity word + connective
    if (found('что ')
            and found(r'общего | сходст| схож')
            and found(' и | от | или | между | vs | vs. | versus ')):
        return 1

    # Rule 5: choose / buy / take + connective
    if (found('выб+рать|купить|взять')
            and found(r' или | между | vs | vs. | versus ')):
        return 1

    # Rule 6: " в ... сравнении " or " по ... сравнению "
    in_comparison = found(' в ') and found(' сравнении ')
    by_comparison = found(' по ') and found(' сравнению ')
    if in_comparison or by_comparison:
        return 1

    # Rule 7: advantage / drawback + connective
    if (found('преимуществ|недостаток')
            and found('перед | над | сравнен | vs | vs. | versus')):
        return 1

    # Rule 8: difference word + connective, but without "что "
    if (found(' отлич| разница | различие | различ')
            and found(' и | от | или | между | vs | vs. | versus')
            and not found('что ')):
        return 1

    return 0

def binary_pred(prob, thre=0.5):
    """Turn a probability into a hard label: 1 if ``prob >= thre``, otherwise 0."""
    return 1 if prob >= thre else 0

def precision_recall(probabilities, labels=None, n_rule_positives=476):
    """Precision and recall of the positive class over a grid of thresholds.

    For every threshold a question is predicted positive when its probability
    is ``>=`` the threshold. On top of these predictions, ``n_rule_positives``
    extra questions are counted as predicted positive AND truly positive: they
    stand for the questions already classified as 1 by the rules (this is what
    the original padding of predictions and labels with ones did).

    The threshold grid is coarse (step 0.001) on [0, 0.99) and fine
    (step 0.00001) on [0.99, 1.0], because the results are very sensitive to
    the threshold at high probabilities. Change the steps to change the
    resolution of the precision-recall curve.

    Precision and recall are computed directly from the counts
    (TP / predicted positives, TP / actual positives), which is the definition
    of the class-1 precision/recall; this avoids building a full sklearn
    report twice per threshold.

    Parameters
    ----------
    probabilities : sequence of float
        Probability of the positive class for each question.
    labels : sequence of int, optional
        True labels (1 = positive) aligned with ``probabilities``. Defaults to
        the module-level ``labels_afterpb``.
    n_rule_positives : int, optional
        Number of rule-classified positives to add to both the predictions and
        the labels (476 is the value hard-coded in the original code).

    Returns
    -------
    (list of float, list of float, list of str)
        Precision and recall rounded to 3 decimals, and the thresholds as
        strings, all in increasing threshold order.

    Raises
    ------
    ValueError
        If ``probabilities`` and ``labels`` differ in shape.
    """
    if labels is None:
        labels = labels_afterpb  # labels of the train split left after the rules

    probs = np.asarray(probabilities, dtype=float)
    is_positive = np.asarray(labels) == 1
    if probs.shape != is_positive.shape:
        raise ValueError('probabilities and labels must have the same shape, got '
                         + str(probs.shape) + ' and ' + str(is_positive.shape))

    # Ordered grid: string key -> rounded threshold. Rounding removes the float
    # noise of np.arange; keying by the string drops a threshold that both
    # ranges might produce.
    grid = {}
    for start, stop, step in ((0.0, 0.99, 0.001), (0.99, 1.00001, 0.00001)):
        for raw in np.arange(start, stop, step):
            thre = round(raw, 5)
            grid[str(thre)] = thre

    n_actual = int(is_positive.sum()) + n_rule_positives

    precision = []
    recall = []
    thresholds = []
    for key, thre in grid.items():
        predicted = probs >= thre
        n_predicted = int(predicted.sum()) + n_rule_positives
        n_hits = int((predicted & is_positive).sum()) + n_rule_positives
        # 0.0 on an empty denominator (only possible when n_rule_positives == 0)
        precision.append(round(n_hits / n_predicted, 3) if n_predicted else 0.0)
        recall.append(round(n_hits / n_actual, 3) if n_actual else 0.0)
        thresholds.append(key)
    return precision, recall, thresholds