In [None]:
!pip install simpletransformers
!pip install catboost

In [None]:
import catboost
import simpletransformers
import json
import pandas as pd
import logging
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.model_selection import train_test_split

In [None]:
def create_train_val_dataset(path):
    """Load a labelled JSON-lines file into a DataFrame.

    Every non-blank line of ``path`` must be a JSON object with the keys
    ``hypothesis``, ``premise``, ``genre``, ``verb`` and ``label``, plus either
    ``negation`` or ``no_negation`` (``negation`` wins when both are present);
    that value is stored in the ``negation`` column.

    Labels are encoded as integers: ``'entailment'`` -> 2, ``'neutral'`` -> 1,
    any other value -> 0.

    Args:
        path: Path of the ``.jsonl`` file to read.

    Returns:
        pandas.DataFrame with the columns ``hypothesis``, ``premise``,
        ``negation``, ``genre``, ``verb``, ``label`` and one row per record.
        A file without records yields an empty frame with the same columns.

    Raises:
        KeyError: if a record lacks one of the required keys.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    columns = ['hypothesis', 'premise', 'negation', 'genre', 'verb', 'label']
    label_ids = {'entailment': 2, 'neutral': 1}
    rows = []
    # Explicit UTF-8: the texts are Cyrillic and the platform default may differ.
    with open(path, 'r', encoding='utf-8') as json_file:
        for json_str in json_file:
            if not json_str.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            item = json.loads(json_str)
            label = label_ids.get(item['label'], 0)
            # Explicit key check instead of a bare except, so that unrelated
            # errors are no longer swallowed by the fallback.
            negation = item['negation'] if 'negation' in item else item['no_negation']
            rows.append([item['hypothesis'], item['premise'], negation,
                         item['genre'], item['verb'], label])
    # Passing columns to the constructor also works when there are no rows.
    return pd.DataFrame(rows, columns=columns)

In [None]:
def create_train_test_dataset(path):
    """Load an unlabelled JSON-lines file into a DataFrame.

    Every non-blank line of ``path`` must be a JSON object with the keys
    ``hypothesis``, ``premise``, ``genre`` and ``verb``, plus either
    ``negation`` or ``no_negation`` (``negation`` wins when both are present);
    that value is stored in the ``negation`` column.

    Args:
        path: Path of the ``.jsonl`` file to read.

    Returns:
        pandas.DataFrame with the columns ``hypothesis``, ``premise``,
        ``negation``, ``genre``, ``verb`` and one row per record. A file
        without records yields an empty frame with the same columns.

    Raises:
        KeyError: if a record lacks one of the required keys.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    columns = ['hypothesis', 'premise', 'negation', 'genre', 'verb']
    rows = []
    # Explicit UTF-8: the texts are Cyrillic and the platform default may differ.
    with open(path, 'r', encoding='utf-8') as json_file:
        for json_str in json_file:
            if not json_str.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            item = json.loads(json_str)
            # Explicit key check instead of a bare except, so that unrelated
            # errors are no longer swallowed by the fallback.
            negation = item['negation'] if 'negation' in item else item['no_negation']
            rows.append([item['hypothesis'], item['premise'], negation,
                         item['genre'], item['verb']])
    # Passing columns to the constructor also works when there are no rows.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Load the labelled training and validation sets (paths are relative to the
# current working directory).
train_dataset = create_train_val_dataset('train.jsonl')
val_dataset = create_train_val_dataset('val.jsonl')

In [None]:
# Split the training data 80/20 with a fixed seed. The cells below fine-tune the
# transformers on train_dataset1 and build the CatBoost training features from
# train_dataset2.
train_dataset1, train_dataset2 = train_test_split(train_dataset, test_size=0.2, random_state=655)

In [None]:
# Display the larger split; its index keeps the original (shuffled) row labels.
train_dataset1

Unnamed: 0,hypothesis,premise,negation,genre,verb,label
109,У меня были переговоры с президентом клуба.,Я ни слова не скажу по этой теме. Я не обязан ...,no_negation,interfax,рассказывать,2
31,Когда все по-разному и в индивидуальном порядк...,"Чтобы узнать, о чем жизнь, надо ее прожить. Но...",no_negation,fiction,думать,1
381,Я испытал странное чувство во время кратковрем...,На перроне он быстро обогнал нас и скрылся в т...,no_negation,fiction,понять,1
3,У зимы есть свой характер.,"Зима, наконец, показала свой характер.",no_negation,kp,показать,0
265,Вы покинули дом номер двадцать.,"Выгорела часть чердака. Мы вас не обвиняем, а ...",no_negation,fiction,сказать,2
...,...,...,...,...,...,...
301,"Я объяснял кузнецу, что мне нужно, и он понял ...","— Я пришёл к кузнецу, пряча сырое от слёз лиц...",negation,fiction,помнить,2
185,Ему нужно поставить точку.,Сначала он пил из бравады. Затем из ожесточени...,no_negation,fiction,чувствовать,0
393,Агентство стало службой одного окна для иностр...,Вообще это достаточно успешная международная п...,no_negation,interfax,сказать,1
49,Африканцы отстали в развитии от европейцев и а...,"— Между тем одного адского шума, по забавному ...",no_negation,fiction,думать,1


In [None]:
# simpletransformers sentence-pair input: hypothesis -> text_a, premise -> text_b,
# label -> labels. The row index of each source frame is kept as is.
train_dataset_for_simple1 = train_dataset1[['hypothesis', 'premise', 'label']].rename(
    columns={'hypothesis': 'text_a', 'premise': 'text_b', 'label': 'labels'})
train_dataset_for_simple2 = train_dataset2[['hypothesis', 'premise', 'label']].rename(
    columns={'hypothesis': 'text_a', 'premise': 'text_b', 'label': 'labels'})
val_dataset_for_simple = val_dataset[['hypothesis', 'premise', 'label']].rename(
    columns={'hypothesis': 'text_a', 'premise': 'text_b', 'label': 'labels'})

In [None]:
# INFO-level logging in general, but only warnings from the "transformers" logger.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Training options shared by both models; only the output directory differs.
shared_model_kwargs = dict(
    num_train_epochs=5,
    overwrite_output_dir=True,
    evaluate_during_training=True,
    evaluate_during_training_verbose=True,
    reprocess_input_data=True,
    train_batch_size=16,
    eval_batch_size=16,
    save_model_every_epoch=False,
    save_best_model=True,
    max_seq_length=64,
    use_multiprocessing=True,
    learning_rate=3e-5,
)
model_args1 = ClassificationArgs(output_dir='outputs1/', **shared_model_kwargs)
model_args2 = ClassificationArgs(output_dir='outputs2/', **shared_model_kwargs)

# Two 3-class classifiers: RuBERT and multilingual DistilBERT.
model1 = ClassificationModel(
    "bert", "DeepPavlov/rubert-base-cased", num_labels=3, args=model_args1)
model2 = ClassificationModel(
    "distilbert", "distilbert-base-multilingual-cased", num_labels=3, args=model_args2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly i

In [None]:
# Fine-tune model1 (RuBERT) on the 80% split; the validation set is evaluated
# during training (evaluate_during_training=True in model_args1).
model1.train_model(train_dataset_for_simple1, eval_df=val_dataset_for_simple)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/350 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_bert_64_3_3


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/22 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/220 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_64_3_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.04042604935382307, 'eval_loss': 1.156909018754959}


Running Epoch 1 of 5:   0%|          | 0/22 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/220 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_64_3_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.09284361673757059, 'eval_loss': 1.2307653171675546}


Running Epoch 2 of 5:   0%|          | 0/22 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/220 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_64_3_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.051282601529485744, 'eval_loss': 1.3524741189820426}


Running Epoch 3 of 5:   0%|          | 0/22 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/220 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_64_3_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.015519095524391154, 'eval_loss': 1.47980397939682}


Running Epoch 4 of 5:   0%|          | 0/22 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/220 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_64_3_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.06962120163591481, 'eval_loss': 1.4578158727713995}


RuntimeError: ignored

In [None]:
# Fine-tune model2 (multilingual DistilBERT) on the same 80% split, evaluated on
# the same validation set.
model2.train_model(train_dataset_for_simple1, eval_df=val_dataset_for_simple)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/350 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_distilbert_64_3_3


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/22 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/220 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_distilbert_64_3_3
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
INFO:simpletransformers.classification.classification_model:{'mcc': 0.0, 'eval_loss': 0.9959407108170646}


Running Epoch 1 of 5:   0%|          | 0/22 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/220 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_distilbert_64_3_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.037187581596206276, 'eval_loss': 0.9903837272099086}


Running Epoch 2 of 5:   0%|          | 0/22 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/220 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_distilbert_64_3_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.05615744352677571, 'eval_loss': 1.0139213332108088}


Running Epoch 3 of 5:   0%|          | 0/22 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/220 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_distilbert_64_3_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.06244540841091712, 'eval_loss': 1.048089223248618}


Running Epoch 4 of 5:   0%|          | 0/22 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/220 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_distilbert_64_3_3
INFO:simpletransformers.classification.classification_model:{'mcc': 0.06942890011877001, 'eval_loss': 1.0445493587425776}
INFO:simpletransformers.classification.classification_model: Training of distilbert model complete. Saved to outputs2/.


(110,
 {'eval_loss': [0.9959407108170646,
   0.9903837272099086,
   1.0139213332108088,
   1.048089223248618,
   1.0445493587425776],
  'global_step': [22, 44, 66, 88, 110],
  'mcc': [0.0,
   0.037187581596206276,
   0.05615744352677571,
   0.06244540841091712,
   0.06942890011877001],
  'train_loss': [0.8708114624023438,
   1.0475351810455322,
   0.9211092591285706,
   0.8389957547187805,
   0.816865086555481]})

In [None]:
def create_for_predict(dataset):
    """Turn a frame with ``text_a`` / ``text_b`` columns into a list of pairs.

    Args:
        dataset: DataFrame holding the columns ``text_a`` and ``text_b``.

    Returns:
        A list with one ``[text_a, text_b]`` list per row, in row order.
    """
    return [[first, second]
            for first, second in zip(dataset['text_a'], dataset['text_b'])]

In [None]:
# Collect the raw outputs (second value returned by predict) of both fine-tuned
# models on the 20% split and on the validation set; they become CatBoost
# features below. train_predictions / val_predictions are overwritten by every
# call and are not referenced again in the cells shown here.
train_predictions, train_raw_outputs1 = model1.predict(create_for_predict(train_dataset_for_simple2))
val_predictions, val_raw_outputs1 = model1.predict(create_for_predict(val_dataset_for_simple))
train_predictions, train_raw_outputs2 = model2.predict(create_for_predict(train_dataset_for_simple2))
val_predictions, val_raw_outputs2 = model2.predict(create_for_predict(val_dataset_for_simple))
# Additionally run model1 on the verb alone (single strings, not text pairs).
train_predictions, train_raw_outputs_verb1 = model1.predict(list(train_dataset2['verb'].values))
val_predictions, val_raw_outputs_verb1 = model1.predict(list(val_dataset['verb'].values))

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/220 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/220 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/220 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

In [None]:
def fit_model(X_train, X_test, y_train, y_test, catboost_params=None, verbose=1):
    """Train a CatBoost classifier on the stacked feature frame.

    Both frames must contain the categorical columns ``negation`` and ``genre``
    and the text columns ``verb``, ``hypothesises`` and ``premises``.

    Args:
        X_train: Training features.
        X_test: Features used as the evaluation set during fitting.
        y_train: Training labels.
        y_test: Evaluation labels.
        catboost_params: Optional mapping of CatBoost parameters that overrides
            or extends the defaults defined below.
        verbose: Passed through to ``CatBoostClassifier.fit``.

    Returns:
        The fitted ``catboost.CatBoostClassifier``.
    """
    cat_features = ['negation', 'genre']
    text_features = ['verb', 'hypothesises', 'premises']
    # Both pools are labelled with the training frame's column names.
    feature_names = list(X_train)

    learn_pool = catboost.Pool(
        X_train,
        y_train,
        cat_features=cat_features,
        text_features=text_features,
        feature_names=feature_names,
    )
    test_pool = catboost.Pool(
        X_test,
        y_test,
        cat_features=cat_features,
        text_features=text_features,
        feature_names=feature_names,
    )

    params = {
        'iterations': 500,
        'learning_rate': 0.05,
        'depth': 9,
        'eval_metric': 'Accuracy',
        'bagging_temperature': 1,
        # Text-feature processing: two tokenizers, a unigram dictionary and a
        # bag-of-words feature calcer.
        'tokenizers': [
            {
                'tokenizer_id': 'Space',
                'delimiter': ' ',
                'separator_type': 'ByDelimiter',
            },
            {
                'tokenizer_id': 'Sense',
                'separator_type': 'BySense',
            },
        ],
        'feature_calcers': ['BoW:top_tokens_count=500'],
        'dictionaries': [{
            'dictionary_id': 'Unigram',
            'max_dictionary_size': '10000',
            'gram_count': '1',
        }],
    }
    # None (rather than a shared mutable {} default) means "no overrides".
    if catboost_params:
        params.update(catboost_params)

    model = catboost.CatBoostClassifier(**params)
    model.fit(learn_pool, eval_set=test_pool, verbose=verbose)
    return model

In [None]:
def same_words(s1, s2):
    """Count the matching word pairs between two strings.

    Both strings are split on whitespace; every token is lower-cased and has its
    ``.`` and ``,`` characters removed. Each (token of s1, token of s2) pair
    that is equal after this normalisation counts once, so repeated words are
    counted for every pairing.

    Args:
        s1: First text.
        s2: Second text.

    Returns:
        The number of equal token pairs as an int.
    """
    def _normalise(token):
        return token.replace('.', '').replace(',', '').lower()

    left_tokens = [_normalise(token) for token in s1.split()]
    right_tokens = [_normalise(token) for token in s2.split()]
    return sum(1 for left in left_tokens for right in right_tokens if left == right)

In [None]:
def create_features(hypothesises, premises):
    """Build the hand-crafted feature frame for hypothesis/premise pairs.

    The inputs are paired by position. They are materialised as lists first, so
    a pandas Series with a non-default index is paired correctly instead of
    being looked up by label.

    Args:
        hypothesises: Sequence of hypothesis strings.
        premises: Sequence of premise strings, at least as long as
            ``hypothesises``.

    Returns:
        pandas.DataFrame with one row per hypothesis and the columns:
        the two raw texts, their whitespace-token counts, their character
        counts, the number of ``'.'``-separated pieces, the ``same_words``
        pair count, and the number of occurrences of the Cyrillic letters
        ``а`` and ``и`` in each text. Empty input yields an empty frame with
        the same columns.

    Raises:
        IndexError: if ``premises`` is shorter than ``hypothesises``.
    """
    columns = ['hypothesises', 'premises', 'hypothesises_words_len', 'premises_words_len', 'hypothesises_chars_len', 'premises_chars_len', 'hypothesises_sents_len', 'premises_sents_len', 'same_words', 'hypothesises_count_a', 'premises_count_a', 'hypothesises_count_i', 'premises_count_i']
    hypothesis_texts = list(hypothesises)
    premise_texts = list(premises)

    rows = []
    for position, hypothesis in enumerate(hypothesis_texts):
        premise = premise_texts[position]
        rows.append([
            hypothesis,
            premise,
            len(hypothesis.split()),
            len(premise.split()),
            len(hypothesis),
            len(premise),
            len(hypothesis.split('.')),
            len(premise.split('.')),
            same_words(hypothesis, premise),
            hypothesis.count('а'),
            premise.count('а'),
            hypothesis.count('и'),
            premise.count('и'),
        ])
    # Passing columns to the constructor also works when there are no rows.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Sanity check: feature row for the first example of the 20% split.
create_features(train_dataset2['hypothesis'].values, train_dataset2['premise'].values).head(1)

Unnamed: 0,hypothesises,premises,hypothesises_words_len,premises_words_len,hypothesises_chars_len,premises_chars_len,hypothesises_sents_len,premises_sents_len,same_words,hypothesises_count_a,premises_count_a,hypothesises_count_i,premises_count_i
0,ТРЦ «Галактика» в Барнауле строили с нарушениями.,Итоги проверки государственного строительного ...,7,22,49,176,2,2,8,6,13,5,9


In [None]:
# The matching source row, for comparison with the feature row above.
train_dataset2.head(1)

Unnamed: 0,hypothesis,premise,negation,genre,verb,label
154,ТРЦ «Галактика» в Барнауле строили с нарушениями.,Итоги проверки государственного строительного ...,no_negation,kp,показать,2


In [None]:
# Raw transformer outputs for the 20% split, as frames with a default 0..n-1 index.
train_raw_outputs_df1 = pd.DataFrame(train_raw_outputs1)
train_raw_outputs_df2 = pd.DataFrame(train_raw_outputs2)
train_raw_outputs_df3 = pd.DataFrame(train_raw_outputs_verb1)

# train_dataset2 still carries the shuffled index from train_test_split, so its
# columns are rebuilt from plain lists to line up positionally with the frames
# above when concatenated column-wise.
X_train = pd.concat([train_raw_outputs_df1, train_raw_outputs_df2, train_raw_outputs_df3, pd.DataFrame(list(train_dataset2['negation'].values)), pd.DataFrame(list(train_dataset2['genre'].values)), pd.DataFrame(list(train_dataset2['verb'].values)), create_features(train_dataset2['hypothesis'].values, train_dataset2['premise'].values)], axis=1)
val_raw_outputs_df1 = pd.DataFrame(val_raw_outputs1)
val_raw_outputs_df2 = pd.DataFrame(val_raw_outputs2)
val_raw_outputs_df3 = pd.DataFrame(val_raw_outputs_verb1)

# val_dataset was built directly from a list of rows, so its default index
# already matches and its columns are used as they are.
X_val = pd.concat([val_raw_outputs_df1, val_raw_outputs_df2, val_raw_outputs_df3, val_dataset['negation'], val_dataset['genre'], val_dataset['verb'], create_features(val_dataset['hypothesis'].values, val_dataset['premise'].values)], axis=1)

In [None]:
# Column names in concat order: three raw outputs each from model1, model2 and
# model1-on-verb, then the metadata columns, then the create_features columns.
columns = ['rubert1', 'rubert2', 'rubert3', 'distilbert1', 'distilbert2', 'distilbert3', 'bert_verb1', 'bert_verb2', 'bert_verb3', 'negation', 'genre', 'verb', 'hypothesises', 'premises', 'hypothesises_words_len', 'premises_words_len', 'hypothesises_chars_len', 'premises_chars_len', 'hypothesises_sents_len', 'premises_sents_len', 'same_words', 'hypothesises_count_a', 'premises_count_a', 'hypothesises_count_i', 'premises_count_i']
X_train.columns = columns 
X_val.columns = columns

In [None]:
# Inspect the first assembled training row.
X_train.head(1)

Unnamed: 0,rubert1,rubert2,rubert3,distilbert1,distilbert2,distilbert3,bert_verb1,bert_verb2,bert_verb3,negation,genre,verb,hypothesises,premises,hypothesises_words_len,premises_words_len,hypothesises_chars_len,premises_chars_len,hypothesises_sents_len,premises_sents_len,same_words,hypothesises_count_a,premises_count_a,hypothesises_count_i,premises_count_i
0,-1.693359,0.712402,0.924316,-0.867188,-0.122437,0.820801,0.078613,0.255615,-0.195679,no_negation,kp,показать,ТРЦ «Галактика» в Барнауле строили с нарушениями.,Итоги проверки государственного строительного ...,7,22,49,176,2,2,8,6,13,5,9


In [None]:
# Train the CatBoost stacker on the 20% split, with the validation set as the
# evaluation set.
cat_model = fit_model(X_train, X_val, train_dataset2['label'].values, val_dataset['label'].values)

0:	learn: 0.6250000	test: 0.4590909	best: 0.4590909 (0)	total: 35.2ms	remaining: 17.6s
1:	learn: 0.6477273	test: 0.4772727	best: 0.4772727 (1)	total: 170ms	remaining: 42.3s
2:	learn: 0.6704545	test: 0.4500000	best: 0.4772727 (1)	total: 308ms	remaining: 51s
3:	learn: 0.6818182	test: 0.4909091	best: 0.4909091 (3)	total: 442ms	remaining: 54.8s
4:	learn: 0.7386364	test: 0.4954545	best: 0.4954545 (4)	total: 582ms	remaining: 57.6s
5:	learn: 0.7613636	test: 0.4772727	best: 0.4954545 (4)	total: 717ms	remaining: 59s
6:	learn: 0.6704545	test: 0.4954545	best: 0.4954545 (4)	total: 718ms	remaining: 50.5s
7:	learn: 0.6477273	test: 0.5090909	best: 0.5090909 (7)	total: 851ms	remaining: 52.4s
8:	learn: 0.7386364	test: 0.4909091	best: 0.5090909 (7)	total: 989ms	remaining: 53.9s
9:	learn: 0.7159091	test: 0.4863636	best: 0.5090909 (7)	total: 1.13s	remaining: 55.5s
10:	learn: 0.7045455	test: 0.4954545	best: 0.5090909 (7)	total: 1.26s	remaining: 56.3s
11:	learn: 0.7159091	test: 0.4909091	best: 0.5090909 (7)

KeyboardInterrupt: ignored

In [None]:
# Load the unlabelled test set.
# NOTE(review): this is an absolute path, unlike the relative 'train.jsonl' and
# 'val.jsonl' used earlier — presumably a Colab working directory; confirm.
test_dataset = create_train_test_dataset('/content/test.jsonl')

In [None]:
# Same sentence-pair layout as for training, without a labels column.
test_dataset_for_simple = test_dataset[['hypothesis', 'premise']].rename(
    columns={'hypothesis': 'text_a', 'premise': 'text_b'})

In [None]:
# Raw outputs of both models on the test pairs, plus model1 on the verb alone;
# these mirror the features built for training above.
test_predictions, test_raw_outputs1 = model1.predict(create_for_predict(test_dataset_for_simple))
test_predictions, test_raw_outputs2 = model2.predict(create_for_predict(test_dataset_for_simple))
# NOTE(review): the predicted labels of this test-set call are stored in
# train_predictions, overwriting the earlier value — looks like a copy-paste
# leftover; confirm nothing relies on it.
train_predictions, test_raw_outputs_verb1 = model1.predict(list(test_dataset['verb'].values))

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/438 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/438 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/438 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

In [None]:
# Raw transformer outputs for the test set, as frames with a default 0..n-1 index.
test_raw_outputs_df1 = pd.DataFrame(test_raw_outputs1)
test_raw_outputs_df2 = pd.DataFrame(test_raw_outputs2)
test_raw_outputs_df3 = pd.DataFrame(test_raw_outputs_verb1)

# test_dataset was built from a list of rows, so its default index lines up with
# the frames above. Unlike the train/val cells, create_features receives Series
# here rather than .values.
X_test1 = pd.concat([test_raw_outputs_df1, test_raw_outputs_df2, test_raw_outputs_df3, test_dataset['negation'], test_dataset['genre'], test_dataset['verb'], create_features(test_dataset['hypothesis'], test_dataset['premise'])], axis=1)

In [None]:
# Apply the same column names as X_train / X_val.
X_test1.columns = columns

In [None]:
# Inspect the first assembled test row.
X_test1.head(1)

Unnamed: 0,rubert1,rubert2,rubert3,distilbert1,distilbert2,distilbert3,bert_verb1,bert_verb2,bert_verb3,negation,genre,verb,hypothesises,premises,hypothesises_words_len,premises_words_len,hypothesises_chars_len,premises_chars_len,hypothesises_sents_len,premises_sents_len,hypothesises_count_a,premises_count_a
0,-1.290039,0.330811,1.073242,-1.254883,0.381592,0.521973,0.149536,0.248291,-0.255127,no_negation,kp,рассказать,Мужчину раньше уже судили.,"Мужчина уже был ранее судим, рассказали «Комсо...",4,11,26,91,2,2,0,0


In [None]:
# Predict class ids for the test set with the trained CatBoost model.
preds = cat_model.predict(X_test1)

In [None]:
# Translate predicted class ids back into label names (2 -> entailment,
# 1 -> neutral, anything else -> contradiction). 'idx' is the row position
# within preds.
output = []
for idx, pred in enumerate(preds):
    if pred == 2:
        label = 'entailment'
    elif pred == 1:
        label = 'neutral'
    else:
        label = 'contradiction'
    output.append({'idx': idx, 'label': label})

In [None]:
# Spot-check the first submission record.
output[0]

{'idx': 0, 'label': 'entailment'}

In [None]:
# Write the submission as JSON lines: one serialised record per line.
with open('rcd_submit3.json', 'w') as json_file:
    json_file.writelines(json.dumps(record) + "\n" for record in output)