In [1]:
import json
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from tqdm import trange

from pprint import pprint

from sklearn.feature_extraction.text import TfidfVectorizer

import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    BertModel, RobertaModel,
    BertPreTrainedModel,
    AdamW, get_linear_schedule_with_warmup,
    TrainingArguments,
)
from datasets import (
    Dataset,
    load_from_disk,
    concatenate_datasets,
)

from typing import List

In [2]:
# 난수 고정
def set_seed(random_seed):
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)  # if use multi-GPU
    random.seed(random_seed)
    np.random.seed(random_seed)
    
set_seed(42) # magic number :)

In [3]:
print ("PyTorch version:[%s]."%(torch.__version__))
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print ("device:[%s]."%(device))

PyTorch version:[1.7.1].
device:[cuda:0].


In [4]:
dataset = load_from_disk('/opt/ml/data/train_dataset')
train_dataset = dataset['train']

## Bi-Encoder

In [5]:
class BertEncoder(BertPreTrainedModel):
    def __init__(self, config):
        super(BertEncoder, self).__init__(config)

        self.bert = BertModel(config)
        self.init_weights()
      
    def forward(
            self,
            input_ids, 
            attention_mask=None,
            token_type_ids=None
        ): 

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        
        pooled_output = outputs[1]
        return pooled_output

In [6]:
with open('/opt/ml/data/wikipedia_documents.json', "r", encoding="utf-8") as f:
    wiki = json.load(f)

corpus = list(
    dict.fromkeys([v["text"] for v in wiki.values()])
)  # set 은 매번 순서가 바뀌므로

In [7]:
model_checkpoint = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [8]:
import pickle
with open('/opt/ml/custom/passage_embedding_special_customsample_augmentation_20.bin', 'rb') as file :
    p_embs = pickle.load(file)
p_embs = p_embs

q_encoder = torch.load('/opt/ml/custom/q_encoder_special_customsample_augmentation_20.pt')

In [9]:
def get_relavant_doc(queries, q_encoder, p_embs, k=1) :
    with torch.no_grad() :
        q_encoder.eval()
        q_seqs_val = tokenizer(queries, padding='max_length',truncation=True,return_tensors='pt').to(device)
        q_emb = q_encoder(**q_seqs_val).to('cpu')
    dot_prod_scores = torch.mm(q_emb, p_embs.T)
    sort_result = torch.sort(dot_prod_scores, dim=1, descending=True)

    scores, ranks = sort_result[0], sort_result[1]

    result_scores = []
    result_indices = []
    for i in range(len(ranks)) :
        result_scores.append(scores[i].tolist()[:k])
        result_indices.append(ranks[i].tolist()[:k])
    
    return result_scores, result_indices

In [10]:
doc_scores, doc_indices = get_relavant_doc(dataset['validation']['question'], q_encoder, p_embs, k = 500)

In [24]:
# del q_encder
# del p_embs
# torch.cuda.empty_cache()

In [11]:
# Bi-Encoder Retrieval 정확도 출력
a_total = []
for idx, example in enumerate(
        tqdm(dataset['validation'], desc="Dense retrieval: ")
    ):
        tmp = {
            # Query와 해당 id를 반환합니다.
            "question": example["question"],
            "id": example["id"],
            # Retrieve한 Passage의 id, context를 반환합니다.
            "context_id": doc_indices[idx],
            "context": " ".join(  # 기존에는 ' '.join()
                [corpus[pid] for pid in doc_indices[idx]]
            ),
        }
        if "context" in example.keys() and "answers" in example.keys():
            # validation 데이터를 사용하면 ground_truth context와 answer도 반환합니다.
            tmp["original_context"] = example["context"]
            tmp["answers"] = example["answers"]
        a_total.append(tmp)

b_cqas_50 = pd.DataFrame(a_total)
correct_length = []
for i in range(len(b_cqas_50)) :
    if b_cqas_50['original_context'][i] in b_cqas_50['context'][i] :
        correct_length.append(i)
print(len(correct_length) / len(dataset['validation']))
# 출력결과 0.916666

HBox(children=(FloatProgress(value=0.0, description='Dense retrieval: ', max=240.0, style=ProgressStyle(descri…


0.9541666666666667


## Cross_Encoder

In [12]:
model_checkpoint = "klue/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [13]:
from transformers import AutoModel, RobertaPreTrainedModel, RobertaModel

class RoBertaEncoder(RobertaPreTrainedModel):
    def __init__(self, config):
        super(RoBertaEncoder, self).__init__(config)

        self.roberta = RobertaModel(config)
        self.init_weights()
        classifier_dropout=(
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = torch.nn.Dropout(classifier_dropout)
        self.linear = torch.nn.Linear(config.hidden_size, 1)
      
    def forward(
            self,
            input_ids, 
            attention_mask=None,
            #token_type_ids=None
        ): 

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            #token_type_ids=token_type_ids
        )
        
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        output = self.linear(pooled_output)
        return output

In [14]:
c_encoder = torch.load('/opt/ml/custom/c_roberta_encoder_e40_b16.pt')

In [None]:
# Validation Data의 경우 약 40분 소요
question_data = dataset['validation']['question']
with torch.no_grad() : 
    c_encoder.eval()

    result_scores = []
    result_indices = []
    for i in tqdm(range(len(question_data))) :
        question = question_data[i]
        question_score = []
        for indice in tqdm(doc_indices[i]) :
            passage = corpus[indice]
            tokenized_examples = tokenizer(
                question,
                passage,
                truncation="only_second",
                max_length=512,
                stride=128,
                return_overflowing_tokens=True,
                return_offsets_mapping=True,
                return_token_type_ids=False,  # roberta모델을 사용할 경우 False, bert를 사용할 경우 True로 표기해야합니다.
                padding="max_length",
                return_tensors='pt'
            )
            score = 0
            for i in range(len(tokenized_examples['input_ids'])) :
                c_input = {
                    'input_ids' : torch.tensor(tokenized_examples['input_ids'][i].unsqueeze(dim=0)).to('cuda'),
                    'attention_mask' : torch.tensor(tokenized_examples['attention_mask'][i].unsqueeze(dim=0)).to('cuda'),
                    # 'token_type_ids' : torch.tensor(tokenized_examples['token_type_ids'][i].unsqueeze(dim=0)).to('cuda')
                }
                tmp_score = c_encoder(**c_input).to('cpu')
                score += tmp_score
            score = score / len(tokenized_examples['input_ids'])
            question_score.append(score)
        sort_result = torch.sort(torch.tensor(question_score), descending=True)
        scores, index_list = sort_result[0], sort_result[1]

        result_scores.append(scores.tolist())
        result_indices.append(index_list.tolist())        

### result_scores & indices 저장 및 불러오기

In [20]:
import csv
with open('valid_bi_ce_indices_roberta.csv', 'w', newline='') as f: 
    writer = csv.writer(f)
    writer.writerow(result_indices)
with open('valid_bi_ce_scores_roberta.csv', 'w', newline='') as f: 
    writer = csv.writer(f)
    writer.writerow(result_scores)


In [15]:
import csv
with open('valid_bi_ce_indices_roberta.csv', 'r', encoding='utf-8') as f:
    rdr = csv.reader(f)
    for i, line in enumerate(rdr) :
        if i == 0 :
            check_indices = line
with open('valid_bi_ce_scores_roberta.csv', 'r', encoding='utf-8') as f:
    rdr = csv.reader(f)
    for i, line in enumerate(rdr) :
        if i == 0 :
            check_scores = line

In [16]:
result_indices = []
result_scores = []
for i in range(len(check_indices)) :
    result_indices.append(eval(check_indices[i]))
    result_scores.append(eval(check_scores[i]))

### 점수비교

In [17]:
final_indices = []
final_scores = []
for i in range(len(doc_indices)) :
    t_list = [doc_indices[i][result_indices[i][k]] for k in range(100)]
    s_list = result_scores[i][:100]
    final_indices.append(t_list)
    final_scores.append(s_list)

In [18]:
# Cross Encoder 정확도 출력
total = []
for idx, example in enumerate(
        tqdm(dataset['validation'], desc="Dense retrieval: ")
    ):
        tmp = {
            # Query와 해당 id를 반환합니다.
            "question": example["question"],
            "id": example["id"],
            # Retrieve한 Passage의 id, context를 반환합니다.
            "context_id": final_indices[idx],
            "context": " ".join(  # 기존에는 ' '.join()
                [corpus[pid] for pid in final_indices[idx]]
            ),
        }
        if "context" in example.keys() and "answers" in example.keys():
            # validation 데이터를 사용하면 ground_truth context와 answer도 반환합니다.
            tmp["original_context"] = example["context"]
            tmp["answers"] = example["answers"]
        total.append(tmp)

cqas_50 = pd.DataFrame(total)

HBox(children=(FloatProgress(value=0.0, description='Dense retrieval: ', max=240.0, style=ProgressStyle(descri…




In [19]:
# 출력결과 0.8708
correct_length = []
for i in range(len(cqas_50)) :
    if cqas_50['original_context'][i] in cqas_50['context'][i] :
        correct_length.append(i)
print(len(correct_length) / len(dataset['validation']))

0.95


In [66]:
cqas_50.to_csv('valid_roberta_augmentation_be_ce_t5.csv', index = False)

### Only Elastic

In [20]:
data = pd.read_csv('/opt/ml/custom/new_modified_elastic_top100_val.csv')

In [21]:
with open('/opt/ml/data/wikipedia_documents.json', "r", encoding="utf-8") as f:
    wiki = json.load(f)

omit_corpus = list(
    dict.fromkeys([v["text"] for v in wiki.values()])
)  # set 은 매번 순서가 바뀌므로

corpus = []
for v in wiki.values() :
    corpus.append(v['text'])

In [108]:
elastic_indices = []
elastic_scores = []
for i in tqdm(range(len(data))) :
    tt = eval(data['candidate_ids'][i])
    tmp_indices = []
    tmp_scores = []
    for j in range(4) :# range(len(tt)) :
        original_index = omit_corpus.index(corpus[int(tt[j][0])])
        tmp_indices.append(original_index)
        tmp_scores.append(tt[j][1])
    elastic_indices.append(tmp_indices)
    elastic_scores.append(tmp_scores)

HBox(children=(FloatProgress(value=0.0, max=240.0), HTML(value='')))




In [109]:
# Elastic
total = []
for idx, example in enumerate(
        tqdm(dataset['validation'], desc="Dense retrieval: ")
    ):
        tmp = {
            # Query와 해당 id를 반환합니다.
            "question": example["question"],
            "id": example["id"],
            # Retrieve한 Passage의 id, context를 반환합니다.
            "context_id": elastic_indices[idx],
            "context": " ".join(  # 기존에는 ' '.join()
                [omit_corpus[pid] for pid in elastic_indices[idx]]
            ),
        }
        if "context" in example.keys() and "answers" in example.keys():
            # validation 데이터를 사용하면 ground_truth context와 answer도 반환합니다.
            tmp["original_context"] = example["context"]
            tmp["answers"] = example["answers"]
        total.append(tmp)

elasitc_data = pd.DataFrame(total)

HBox(children=(FloatProgress(value=0.0, description='Dense retrieval: ', max=240.0, style=ProgressStyle(descri…




In [110]:
correct_length = []
for i in range(len(elasitc_data)) :
    if elasitc_data['original_context'][i] in elasitc_data['context'][i] :
        correct_length.append(i)
print(len(correct_length) / len(dataset['validation']))

0.8458333333333333


In [25]:
elasitc_data.to_csv('new_special_elastic_t4.csv', index = False)

### Elastic with CrossEncoder

In [28]:
data = pd.read_csv('/opt/ml/custom/es100valid_with_score.csv')

In [29]:
with open('/opt/ml/data/wikipedia_documents.json', "r", encoding="utf-8") as f:
    wiki = json.load(f)

omit_corpus = list(
    dict.fromkeys([v["text"] for v in wiki.values()])
)  # set 은 매번 순서가 바뀌므로

corpus = []
for v in wiki.values() :
    corpus.append(v['text'])

In [30]:
elastic_indices = []
elastic_scores = []
for i in tqdm(range(len(data))) :
    tt = eval(data['candidate_ids'][i])
    tmp_indices = []
    tmp_scores = []
    for j in range(len(tt)) :
        original_index = omit_corpus.index(corpus[tt[j][0]])
        tmp_indices.append(original_index)
        tmp_scores.append(tt[j][1])
    elastic_indices.append(tmp_indices)
    elastic_scores.append(tmp_scores)

HBox(children=(FloatProgress(value=0.0, max=240.0), HTML(value='')))




In [31]:
class BertEncoder(BertPreTrainedModel):
    def __init__(self, config):
        super(BertEncoder, self).__init__(config)

        self.bert = BertModel(config)
        self.init_weights()
        classifier_dropout=(
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = torch.nn.Dropout(classifier_dropout)
        self.linear = torch.nn.Linear(config.hidden_size, 1)
      
    def forward(
            self,
            input_ids, 
            attention_mask=None,
            token_type_ids=None
        ): 

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        output = self.linear(pooled_output)
        return output

In [33]:
c_encoder = torch.load('/opt/ml/custom/c_encoder_e40_b16.pt')
model_checkpoint = "klue/bert-base"
# 혹시 위에서 사용한 encoder가 있다면 주석처리 후 진행해주세요 (CUDA ...)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
question_data = dataset['validation']['question']
with torch.no_grad() : 
    c_encoder.eval()

    elastic_result_scores = []
    elastic_result_indices = []
    for i in tqdm(range(len(question_data))) :
        question = question_data[i]
        question_score = []
        for indice in elastic_indices[i] :
            passage = omit_corpus[indice]
            tokenized_examples = tokenizer(
                question,
                passage,
                truncation="only_second",
                max_length=512,
                stride=128,
                return_overflowing_tokens=True,
                return_offsets_mapping=True,
                #return_token_type_ids=False,  # roberta모델을 사용할 경우 False, bert를 사용할 경우 True로 표기해야합니다.
                padding="max_length",
                return_tensors='pt'
            )
            score = 0
            for i in range(len(tokenized_examples['input_ids'])) :
                c_input = {
                    'input_ids' : torch.tensor(tokenized_examples['input_ids'][i].unsqueeze(dim=0)).to('cuda'),
                    'attention_mask' : torch.tensor(tokenized_examples['attention_mask'][i].unsqueeze(dim=0)).to('cuda'),
                    'token_type_ids' : torch.tensor(tokenized_examples['token_type_ids'][i].unsqueeze(dim=0)).to('cuda')
                }
                tmp_score = c_encoder(**c_input).to('cpu')
                score += tmp_score
            score = score / len(tokenized_examples['input_ids'])
            question_score.append(score)
        sort_result = torch.sort(torch.tensor(question_score), descending=True)
        scores, index_list = sort_result[0], sort_result[1]

        elastic_result_scores.append(scores.tolist())
        elastic_result_indices.append(index_list.tolist())        

In [38]:
elastic_final_indices = []
for i in range(len(elastic_indices)) :
    t_list = [elastic_indices[i][elastic_result_indices[i][k]] for k in range(100)]
    elastic_final_indices.append(t_list)

In [39]:
total = []
for idx, example in enumerate(
        tqdm(dataset['validation'], desc="Dense retrieval: ")
    ):
        tmp = {
            # Query와 해당 id를 반환합니다.
            "question": example["question"],
            "id": example["id"],
            # Retrieve한 Passage의 id, context를 반환합니다.
            "context_id": elastic_final_indices[idx],
            "context": " ".join(  # 기존에는 ' '.join()
                [omit_corpus[pid] for pid in elastic_final_indices[idx]]
            ),
        }
        if "context" in example.keys() and "answers" in example.keys():
            # validation 데이터를 사용하면 ground_truth context와 answer도 반환합니다.
            tmp["original_context"] = example["context"]
            tmp["answers"] = example["answers"]
        total.append(tmp)

cqas_50 = pd.DataFrame(total)

HBox(children=(FloatProgress(value=0.0, description='Dense retrieval: ', max=240.0, style=ProgressStyle(descri…




In [40]:
correct_length = []
for i in range(len(cqas_50)) :
    if cqas_50['original_context'][i] in cqas_50['context'][i] :
        correct_length.append(i)
print(len(correct_length) / len(dataset['validation']))

0.9708333333333333


### Ensemble original

In [31]:
i = 0
print('elastic indices:', elastic_indices[i][:10])
print('bi-cross encoder', final_indices[i][:10])

elastic indices: [4459, 5294, 45213, 45585, 42795, 11263, 16268, 38008, 27888, 27943]
bi-cross encoder [5294, 4459, 18273, 40586, 29340, 9728, 18465, 16268, 8986, 56287]


In [32]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
scaler = MinMaxScaler()
scale_ce = []
scale_elastic = []
for ce, elastic in tqdm(zip(final_scores, elastic_scores)) :
    temp_ce = np.array(ce).reshape(-1, 1)
    temp_el = np.array(elastic).reshape(-1, 1)
    scale_ce.append(scaler.fit_transform(temp_ce).reshape(-1).tolist())
    scale_elastic.append(scaler.fit_transform(temp_el).reshape(-1).tolist())

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [103]:
real_scores = []
real_indices = []
for i in range(len(final_scores)) :
    doc = {}
    for j in range(len(final_scores[0])) :
        doc[final_indices[i][j]] = final_scores[i][j] * 2
    
    for j in range(len(elastic_scores[i])) :
        if elastic_indices[i][j] in doc.keys() :
            doc[elastic_indices[i][j]] += elastic_scores[i][j]
        else :
            doc[elastic_indices[i][j]] = elastic_scores[i][j]
    doc = dict(sorted(doc.items(), key = lambda x: x[1], reverse = True))
    real_scores.append(list(doc.values()))
    real_indices.append(list(doc.keys()))

In [102]:
real_scores = []
real_indices = []
for i in range(len(scale_ce)) :
    doc = {}
    for j in range(len(scale_ce[0])) :
        doc[final_indices[i][j]] = scale_ce[i][j]
    
    for j in range(len(scale_elastic[i])) :
        if elastic_indices[i][j] in doc.keys() :
            doc[elastic_indices[i][j]] += scale_elastic[i][j]
        else :
            doc[elastic_indices[i][j]] = scale_elastic[i][j]
    doc = dict(sorted(doc.items(), key = lambda x: x[1], reverse = True))
    real_scores.append(list(doc.values()))
    real_indices.append(list(doc.keys()))

In [104]:
i = 0
print('elastic indices:', elastic_indices[i][:10])
print('bi-cross encoder', final_indices[i][:10])
print('ensemble:', real_indices[i][:10])


elastic indices: [4459, 5294, 45213, 45585, 42795, 11263, 16268, 38008, 27888, 27943]
bi-cross encoder [5294, 4459, 18273, 40586, 29340, 9728, 18465, 16268, 8986, 56287]
ensemble: [5294, 4459, 16268, 18273, 5694, 29340, 1277, 22010, 20964, 11831]


In [105]:
the_end_indices = []
for i in range(len(real_indices)) :
    t_list = [real_indices[i][k] for k in range(5)]
    the_end_indices.append(t_list)

In [106]:
final = []
for idx, example in enumerate(
        tqdm(dataset['validation'], desc="Dense retrieval: ")
    ):
        tmp = {
            # Query와 해당 id를 반환합니다.
            "question": example["question"],
            "id": example["id"],
            # Retrieve한 Passage의 id, context를 반환합니다.
            "context_id": the_end_indices[idx],
            "context": " ".join(  # 기존에는 ' '.join()
                [omit_corpus[pid] for pid in the_end_indices[idx]]
            ),
        }
        if "context" in example.keys() and "answers" in example.keys():
            # validation 데이터를 사용하면 ground_truth context와 answer도 반환합니다.
            tmp["original_context"] = example["context"]
            tmp["answers"] = example["answers"]
        final.append(tmp)

cqas_50 = pd.DataFrame(final)    

HBox(children=(FloatProgress(value=0.0, description='Dense retrieval: ', max=240.0, style=ProgressStyle(descri…




In [107]:
correct_length = []
for i in range(len(cqas_50)) :
    if cqas_50['original_context'][i] in cqas_50['context'][i] :
        correct_length.append(i)
print(len(correct_length) / len(dataset['validation']))

0.8916666666666667


In [90]:
cqas_50.to_csv('valid_roberta_dpr_ce_special_new_ela_ensemble_t2.csv', index = False)

### Ensemble Special

In [49]:
i = 0
print('elastic indices:', elastic_indices[i][:10])
print('elastic final indices:', elastic_final_indices[i][:10])
print('bi-cross encoder', final_indices[i][:10])

elastic indices: [14109, 14110, 4459, 5294, 45213, 45585, 42795, 11263, 16268, 38008]
elastic final indices: [5294, 4459, 18273, 35047, 20964, 16268, 11263, 11831, 1277, 29340]
bi-cross encoder [5294, 4459, 18273, 9728, 20964, 35270, 40128, 16268, 22010, 8986]


In [50]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
scaler = MinMaxScaler()
scale_ce = []
scale_elastic = []
for ce, elastic in tqdm(zip(final_scores, elastic_final_indices)) :
    temp_ce = np.array(ce).reshape(-1, 1)
    temp_el = np.array(elastic).reshape(-1, 1)
    scale_ce.append(scaler.fit_transform(temp_ce).reshape(-1).tolist())
    scale_elastic.append(scaler.fit_transform(temp_el).reshape(-1).tolist())

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [67]:
real_scores = []
real_indices = []
for i in range(len(scale_ce)) :
    doc = {}
    for j in range(len(scale_ce[0])) :
        doc[final_indices[i][j]] = scale_ce[i][j] * 3
    
    for j in range(len(scale_elastic[i])) :
        if elastic_indices[i][j] in doc.keys() :
            doc[elastic_indices[i][j]] += scale_elastic[i][j]
        else :
            doc[elastic_indices[i][j]] = scale_elastic[i][j]
    doc = dict(sorted(doc.items(), key = lambda x: x[1], reverse = True))
    real_scores.append(list(doc.values()))
    real_indices.append(list(doc.keys()))

In [68]:
the_end_indices = []
for i in range(len(real_indices)) :
    t_list = [real_indices[i][k] for k in range(4)]
    the_end_indices.append(t_list)

In [69]:
final = []
for idx, example in enumerate(
        tqdm(dataset['validation'], desc="Dense retrieval: ")
    ):
        tmp = {
            # Query와 해당 id를 반환합니다.
            "question": example["question"],
            "id": example["id"],
            # Retrieve한 Passage의 id, context를 반환합니다.
            "context_id": the_end_indices[idx],
            "context": " ".join(  # 기존에는 ' '.join()
                [omit_corpus[pid] for pid in the_end_indices[idx]]
            ),
        }
        if "context" in example.keys() and "answers" in example.keys():
            # validation 데이터를 사용하면 ground_truth context와 answer도 반환합니다.
            tmp["original_context"] = example["context"]
            tmp["answers"] = example["answers"]
        final.append(tmp)

cqas_50 = pd.DataFrame(final)    

HBox(children=(FloatProgress(value=0.0, description='Dense retrieval: ', max=240.0, style=ProgressStyle(descri…




In [70]:
correct_length = []
for i in range(len(cqas_50)) :
    if cqas_50['original_context'][i] in cqas_50['context'][i] :
        correct_length.append(i)
print(len(correct_length) / len(dataset['validation']))

0.8625
