#### Define helper functions

In [None]:
import json
import re
import string
import subprocess
from os.path import join

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from tqdm import tqdm_notebook as tqdm


class Mystem(object):
    """
    Wrapper around a long-running mystem subprocess used for POS-tagging.

    The subprocess is started lazily on the first call to ``process`` and is
    reused afterwards: every call writes one line of text to its stdin and
    reads one line of JSON back from its stdout.

    Parameters
    ----------
    path : str, optional
        Location of the mystem executable. The default ``./mystem`` expects
        the executable in the current working directory.
    """

    def __init__(self, path="./mystem"):
        self._path = path
        self._proc = None

    def _start(self):
        """Spawn mystem with JSON output and English grammeme names."""
        self._proc = subprocess.Popen(
            [self._path, "--format", "json", "-cgi", "--eng-gr"],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE)

    def _getProc(self):
        """Return the running subprocess, (re)starting it when necessary."""
        # poll() is not None once the child has exited; start a fresh one
        # instead of writing into a closed pipe.
        if self._proc is None or self._proc.poll() is not None:
            self._start()
        return self._proc

    def process(self, text):
        """
        Send *text* to mystem as a single line and return the parsed JSON reply.

        Parameters
        ----------
        text : str
            Text to analyse. Surrounding whitespace is stripped and embedded
            line breaks are replaced by a space.

        Returns
        -------
        The object decoded from the one JSON line mystem writes back.
        """
        p = self._getProc()
        # Exactly one reply line is read below, so the request must be exactly
        # one line too; an embedded line break would leave extra reply lines
        # in the pipe and shift the answers of all later calls.
        line = re.sub(r'[\r\n]+', ' ', text.strip())
        p.stdin.write(line.encode('utf8') + b'\n')
        p.stdin.flush()
        return json.loads(p.stdout.readline().decode('utf8'))


mst = Mystem()

def strip_punct(s):
    """
    Lowercases the text and removes punctuation.

    Every character that is not a Cyrillic letter, a Latin letter or a digit
    is turned into a space; runs of whitespace are then collapsed and the
    result is trimmed.
    """
    lowered = s.lower()
    cleaned = re.sub('[^А-Яа-яЁёA-Za-z0-9]', ' ', lowered)
    words = cleaned.split()
    return " ".join(words)

def pos(l):
    """
    Processes the JSON output of mystem and returns tokens with their POS tags.

    The result is a single lowercased string of the form
    "token tag token tag ...". Some POS tags are a single character, so if
    single-character tokens are to be removed, do it before POS-tagging.

    Tagging rules, per entry of the input list:

    * entries without an 'analysis' key are skipped;
    * entries whose analysis is empty (mystem returns an empty field for
      tokens that are not Russian) or carries no 'gr' string are tagged
      'latin';
    * if any 'gr' string of the entry contains 'comp' the tag is 'comp',
      otherwise if any contains 'supr' the tag is 'supr';
    * otherwise the tag is the leading part of the first 'gr' string, up to
      the first '=' or ','.

    Parameters
    ----------
    l : list of dict
        Decoded mystem output; each dict may hold 'text' and 'analysis'.

    Returns
    -------
    str
        Space-separated, lowercased "token tag" pairs.
    """
    tagged = []
    for token in l:
        if 'analysis' not in token:
            continue
        text = token.get('text', '')
        grammems = [a['gr'] for a in (token['analysis'] or []) if 'gr' in a]
        if not grammems:
            # No grammatical information at all: use the same fallback tag as
            # for un-analysed tokens rather than reusing the previous token's
            # grammemes.
            tag = 'latin'
        else:
            # Built per token, so nothing leaks over from the previous one.
            grammems_str = ' '.join(grammems).lower()
            if 'comp' in grammems_str:
                tag = 'comp'
            elif 'supr' in grammems_str:
                tag = 'supr'
            else:
                tag = grammems[0].split('=')[0].split(',')[0]
        tagged.append(text)
        tagged.append(tag)
    return ' '.join(tagged).lower()

In [None]:
# Shell escape: set the executable bit on the `mystem` file in the working directory.
!chmod +x mystem # could need changing the mode

#### Read data

In [None]:
# The input is a tab-separated file in the format question\tlabel\n.
# read_csv takes the first line as the header; the columns used below are
# `question` (the text) and `comp` (the label).
dataset = 'train.txt'  # specify the name of the input file
data = pd.read_csv(join('path', dataset), sep='\t')  # replace 'path' with the directory of the input file

# Extract the question column once and reuse it for both arrays.
raw_questions = data['question'].tolist()
questions_original = np.array(raw_questions)
# Punctuation is removed first, then mystem tags the cleaned text.
questions = np.array([pos(mst.process(strip_punct(question))) for question in raw_questions])
labels = np.array(data['comp'].tolist())

#### Rule-based classification

In [None]:
# Calculates and prints a classification report for each rule independently.
# The 15 rules are ordered as they occur in the publication.
#
# Every entry of `patterns` is the source text of one `if` statement over the
# POS-tagged question string. The loop at the bottom wraps a single entry into
# a `predict_str(corpus)` function (func1 + pattern + func2) and defines it
# with exec(), so each report reflects that one rule alone.
#
# NOTE(review): in ' vs. ' the dot is an unescaped regex metacharacter, so it
# matches any character after 'vs' - confirm whether a literal '.' was meant.
# NOTE(review): the second rule compares question.find('comp') with
# question.find(' или '); when only a 'vs'/'versus' alternative matched,
# find(' или ') is -1 and the comparison is False - confirm this is intended.

# Not appended to in this cell.
precision_pb = []
recall_pb = []

patterns = ['\t\tif (re.search(\'луч.{0,1}ше \', question) and not re.search(\'как \', question)): predictions.append(1)\n',
           '\t\tif (re.search(\'comp\', question) and re.search(\' или | vs | vs. | versus \', question) and question.find(\'comp\') < question.find(\' или \') and not re.search(\'более comp или conj менее comp\', question)): predictions.append(1)\n',
           '\t\tif (re.search(\'как\', question) and re.search(\'правильно\', question) and re.search(\' или \', question)) or (re.search(\'как\', question) and re.search(\'пишется | писать | написать\', question) and re.search(\' или \', question)): predictions.append(1)\n',
           '\t\tif (re.search(\'что \', question) and re.search(\'общего | сходст| схож\', question) and re.search(\' и | от | или | между | vs | vs. | versus \', question)): predictions.append(1)\n',
           '\t\tif (re.search(\'выб+рать|купить|взять\', question) and re.search(\' или | между | vs | vs. | versus \', question)): predictions.append(1)\n',
           '\t\tif ((re.search(\' в \', question)) and re.search(\' сравнении \', question)) or ((re.search(\' по \', question)) and re.search(\' сравнению \', question)): predictions.append(1)\n',
           '\t\tif (re.search(\'преимуществ|недостаток\', question) and re.search(\'перед | над | сравнен | vs | vs. | versus\', question)): predictions.append(1)\n',
           '\t\tif (re.search(\' отлич| разница | различ\', question) and re.search(\' и | от | или | между | vs | vs. | versus\', question) and not re.search(\'что \', question)): predictions.append(1)\n',
           '\t\tif (re.search(\'луч.{0,1}ше \', question)): predictions.append(1)\n',
           '\t\tif (re.search(\'comp\', question) and re.search(\'как.{2,2}\', question) and not re.search(\' или | vs | vs. | versus \', question) and not re.search(\'как \', question)): predictions.append(1)\n',
           '\t\tif (re.search(\' или \', question)): predictions.append(1)\n',
           '\t\tif (re.search(\'comp\', question)): predictions.append(1)\n',
           '\t\tif (re.search(\'comp\', question) and re.search(\'как.{2,2}\', question) and not re.search(\' или | vs | vs. | versus \', question)): predictions.append(1)\n',
           '\t\tif (re.search(\'supr\', question)): predictions.append(1)\n',
           '\t\tif (re.search(\' плюс\', question) and re.search(\' минус\', question)): predictions.append(1)\n']

# func1 is the function header and loop; func2 is the fall-through branch
# (predict 0) plus the return statement.
func1 = 'def predict_str(corpus):\n\tpredictions = []\n\tfor question in corpus:\n'
func2 = '\t\telse: predictions.append(0)\n\r\treturn predictions'

for pattern in patterns:
    func = func1 + pattern + func2
    # exec() (re)defines the module-level predict_str for the current rule.
    exec(func)
    predictions = predict_str(questions)
    print(pattern)
    print(classification_report(y_true=labels, y_pred=predictions, digits=4))

In [None]:
# Collects the data for a precision-recall curve of the comparative question
# class (labeled with 1 in a dataset).
#
# Every entry of `patterns` is the source text of one branch: the first is an
# `if`, all following ones are `elif`, so concatenating the first k entries
# yields one if/elif chain in which a question is predicted 1 as soon as any
# of the first k rules fires.
#
# NOTE(review): the second entry alternates over ' или | vs | vs. ' only,
# while the other entries also list ' versus ' - confirm this is intentional.
# NOTE(review): in ' vs. ' the dot is an unescaped regex metacharacter and
# matches any character - confirm whether a literal '.' was meant.

# One precision / recall value is appended per rule added to the chain.
precision_pb = []
recall_pb = []
patterns = ['\t\tif (re.search(\'луч.{0,1}ше \', question) and not re.search(\'как \', question)): predictions.append(1)\n',
           '\t\telif (re.search(\'comp\', question) and re.search(\' или | vs | vs. \', question) and question.find(\'comp\') < question.find(\' или \') and not re.search(\'более comp или conj менее comp\', question)): predictions.append(1)\n',
           '\t\telif (re.search(\'как\', question) and re.search(\'правильно\', question) and re.search(\' или \', question)) or (re.search(\'как\', question) and re.search(\'пишется | писать | написать\', question) and re.search(\' или \', question)): predictions.append(1)\n',
           '\t\telif (re.search(\'что \', question) and re.search(\'общего | сходст| схож\', question) and re.search(\' и | от | или | между | vs | vs. | versus \', question)): predictions.append(1)\n',
           '\t\telif (re.search(\'выб+рать|купить|взять\', question) and re.search(\' или | между | vs | vs. | versus \', question)): predictions.append(1)\n',
           '\t\telif ((re.search(\' в \', question)) and re.search(\' сравнении \', question)) or ((re.search(\' по \', question)) and re.search(\' сравнению \', question)): predictions.append(1)\n',
           '\t\telif (re.search(\'преимуществ|недостаток\', question) and re.search(\'перед | над | сравнен | vs | vs. | versus\', question)): predictions.append(1)\n',
           '\t\telif (re.search(\' отлич| разница | различ\', question) and re.search(\' и | от | или | между | vs | vs. | versus\', question) and not re.search(\'что \', question)): predictions.append(1)\n',
           '\t\telif (re.search(\'луч.{0,1}ше \', question)): predictions.append(1)\n',
           '\t\telif (re.search(\'comp\', question) and re.search(\'как.{2,2}\', question) and not re.search(\' или | vs | vs. | versus \', question) and not re.search(\'как \', question)): predictions.append(1)\n',
           '\t\telif (re.search(\' или \', question)): predictions.append(1)\n',
           '\t\telif (re.search(\'comp\', question)): predictions.append(1)\n',
           '\t\telif (re.search(\'comp\', question) and re.search(r\'как.{2,2}\', question) and not re.search(\' или | vs | vs. | versus \', question)): predictions.append(1)\n',
           '\t\telif (re.search(\'supr\', question)): predictions.append(1)\n',
           '\t\telif (re.search(\' плюс\', question) and re.search(\' минус\', question)): predictions.append(1)\n']

# func1 is the function header and loop; func2 is the fall-through branch
# (predict 0) plus the return statement.
func1 = 'def predict_str(corpus):\n\tpredictions = []\n\tfor question in corpus:\n'
func2 = '\t\telse: predictions.append(0)\n\r\treturn predictions'
for pattern in patterns:
    func = func1 + pattern + func2
    # Keep the rule in the header so the next iteration evaluates the chain of
    # all rules seen so far plus one more (cumulative precision / recall).
    func1 += pattern
    # exec() (re)defines the module-level predict_str for the current chain.
    exec(func)
    predictions = predict_str(questions)
    # Build the report once and read both metrics of the comparative
    # question class '1' from it.
    report = classification_report(y_true=labels, y_pred=predictions, output_dict=True)
    precision = report['1']['precision']
    recall = report['1']['recall']
    precision_pb.append(round(precision, 4))
    recall_pb.append(round(recall, 4))


# Save precision-recall results for rules

pb_df = pd.DataFrame({'precision_pb': precision_pb, 'recall_pb': recall_pb})
# pb_df.to_csv('../prrecall-pb.txt', sep='\t', index=False)

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Global font size for everything drawn below.
font = dict(size=18)
plt.rc('font', **font)

fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111)

# Major ticks every 0.05, minor ticks every 0.01, on both axes.
major_ticks = np.arange(0, 1.01, 0.05)
minor_ticks = np.arange(0, 1.01, 0.01)

for set_ticks in (ax.set_xticks, ax.set_yticks):
    set_ticks(major_ticks)
    set_ticks(minor_ticks, minor=True)

# Visible window: full recall range, precision from 0.55 upwards.
axes = plt.gca()
axes.set_xlim([0.0, 1.02])
axes.set_ylim([0.55, 1.02])

# Draw both grids, then make the minor one fainter than the major one.
ax.grid(which='both')
for which, alpha in (('minor', 0.2), ('major', 0.5)):
    ax.grid(which=which, alpha=alpha)

# One point per cumulative rule chain: recall on x, precision on y.
line_style = dict(marker='^', label='Pattern-based', linestyle='dashed',
                  linewidth=2, markersize=12)
ax.plot(recall_pb, precision_pb, **line_style)

ax.set_xlabel('Recall', fontsize=22)
ax.set_ylabel('Precision', fontsize=22)
# Legend is placed in a strip above the plotting area.
ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
          ncol=3, mode="expand", borderaxespad=0.)

plt.show()