In [None]:
# !python -m pip install elasticsearch
# !wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
# !tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
# !chown -R daemon:daemon elasticsearch-7.9.2

In [None]:
# ! /opt/ml/code/elasticsearch-7.9.2/bin/elasticsearch-plugin install analysis-nori

In [None]:
import os
import time
import urllib.request
from subprocess import Popen, DEVNULL, STDOUT

# Launch the bundled Elasticsearch node as a background child process.
# Output is sent to DEVNULL instead of a PIPE: nothing in this notebook reads
# the pipe, and an unread pipe eventually fills up and blocks the server.
es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],
                  stdout=DEVNULL, stderr=STDOUT,
                  preexec_fn=lambda: os.setuid(1)  # as daemon
                  )

# Wait until ES actually answers over HTTP rather than sleeping a fixed 30 s:
# a fixed sleep is either too short on a slow machine or wasted on a fast one.
ES_URL = 'http://localhost:9200'
ES_STARTUP_TIMEOUT = 120  # seconds before giving up
es_deadline = time.monotonic() + ES_STARTUP_TIMEOUT
while True:
    # Fail fast if the server process died instead of polling until the timeout.
    if es_server.poll() is not None:
        raise RuntimeError(
            f'Elasticsearch exited during startup (exit code {es_server.returncode})'
        )
    try:
        with urllib.request.urlopen(ES_URL, timeout=2):
            break  # got an HTTP response: the node is up
    except OSError:  # URLError / connection refused / timeout while ES is booting
        if time.monotonic() >= es_deadline:
            raise TimeoutError(
                f'Elasticsearch did not respond on {ES_URL} within {ES_STARTUP_TIMEOUT}s'
            )
        time.sleep(1)

In [None]:
from elasticsearch import Elasticsearch

# Low-level client for the local node; the address is named so it is easy to spot and change.
ES_HOST = 'localhost:9200'
es = Elasticsearch(ES_HOST)
es

In [None]:
# Calls the client's info endpoint and displays the response; a quick check that the node is reachable.
es.info()

In [None]:
# Inspect the current 'document' index, if there is one.
# On a fresh node the index does not exist yet (it is created further down), so a
# 404 is ignored here: the cell then displays the error body instead of raising
# and stopping a Restart & Run All.
es.indices.get(index='document', ignore=[404])

In [None]:
import json
import pandas as pd

# Load the Wikipedia corpus into a DataFrame (the loaded JSON is transposed after conversion).
# The encoding is passed explicitly: JSON files are UTF-8, and relying on the
# platform default can fail or garble non-ASCII text on non-UTF-8 locales.
with open('/opt/ml/data/wikipedia_documents.json', 'r', encoding='utf-8') as f:
    wiki_data = pd.DataFrame(json.load(f)).transpose()

In [None]:
# Drop the "document" index so a later cell can (presumably) recreate it from scratch.
# ignore=[400, 404] stops the client from raising on those HTTP statuses,
# e.g. when the index does not exist yet.
es.indices.delete(index="document", ignore=[400, 404])
es

In [None]:
from tqdm import tqdm
# Index settings and field mappings for the "document" index:
# a nori-based (Korean) custom analyzer plus BM25 similarity on `title` and `text`.
mapping = {
    'settings': {
        'analysis': {
            'tokenizer': {
                # `decompound_mode` is an option of nori_tokenizer, not of a
                # `custom` analyzer, so it must live on a tokenizer definition
                # to take effect.
                'my_nori_tokenizer': {
                    'type': 'nori_tokenizer',
                    'decompound_mode': 'mixed',
                },
            },
            'analyzer': {
                'my_analyzer': {
                    'type': 'custom',
                    'tokenizer': 'my_nori_tokenizer',
                    # NOTE(review): `stopwords` is not a documented option of a
                    # `custom` analyzer, so this is likely a no-op; kept as-is.
                    # Confirm whether a `nori_part_of_speech` filter was intended.
                    'stopwords': '_korean_',
                    'filter': [
                        'lowercase',
                        'my_shingle_f',
                        'nori_readingform',
                        'nori_number',
                    ],
                },
            },
            'filter': {
                'my_shingle_f': {
                    'type': 'shingle',
                },
            },
        },
        'similarity': {
            'my_similarity': {
                'type': 'BM25',
            },
        },
    },
    'mappings': {
        'properties': {
            'title': {
                'type': 'text',
                'analyzer': 'my_analyzer',
                'similarity': 'my_similarity',
            },
            'text': {
                'type': 'text',
                'analyzer': 'my_analyzer',
                'similarity': 'my_similarity',
            },
        },
    },
}

In [None]:
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore

In [None]:
# from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
import json

# Document store bound to the "document" index, configured with the custom mapping.
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document", custom_mapping=mapping)

# Explicit UTF-8: JSON is UTF-8 and the platform default encoding may not be.
with open('/opt/ml/data/wikipedia_documents.json', 'r', encoding='utf-8') as f:
    wiki = json.load(f)

# dict.fromkeys drops duplicate passages while keeping first-seen order.
contexts = list(dict.fromkeys(v['text'] for v in wiki.values()))

# One {'text': ...} record per unique passage, then index them all.
dicts = [{'text': context} for context in tqdm(contexts)]
document_store.write_documents(dicts)

In [None]:
# Display the document store object to confirm it was created.
document_store

In [None]:
# Show only the first few records; displaying the whole list would dump the
# entire corpus into the cell output.
dicts[:5]

In [None]:
# Retrieval-only pipeline (no reader): questions go straight to the
# Elasticsearch-backed retriever.
from haystack.pipeline import DocumentSearchPipeline
from haystack.retriever import ElasticsearchRetriever

retriever = ElasticsearchRetriever(document_store)
pipe = DocumentSearchPipeline(retriever)

In [None]:
# #reader(FARM of Transformer) +retriever
# from haystack.reader.farm import FARMReader
# from haystack.reader.transformers import TransformersReader
# reader = TransformersReader(model_name_or_path="/opt/ml/code/models/koelectra_test",max_seq_len=300, doc_stride=120)

# from haystack.pipeline import ExtractiveQAPipeline
# pipe = ExtractiveQAPipeline(reader,retriever)

In [None]:
from datasets import load_from_disk

# Load the saved dataset and keep only its 'validation' split.
testset = load_from_disk('/opt/ml/data/test_dataset')['validation']

In [None]:
# Smoke test: run the first question through the pipeline (top 30 from the
# retriever) and look at the first five returned documents.
question = testset[0]['question']
retriever_params = {"retriever": {"top_k": 30}}
prediction = pipe.run(query=question, params=retriever_params)
prediction['documents'][:5]

In [None]:
# # Collect the top-1 document content in a dictionary and save it as JSON
# result={}
# #for i in range(10): #len(testset['question'])
# for example in testset:
#     question=example["question"]
#     prediction = pipe.run(query=question, params={"retriever": {"top_k": 10}})
#     #from haystack.utils import print_answers
#     if prediction['d'][0]['probability']>0.99:
#         result[example["id"]]=prediction['answers'][0]['answer']
#     else:
#         result[example["id"]]=None

# with open('elastic_reader.json', "w") as writer:
#     writer.write(json.dumps(result, indent=4, ensure_ascii=False) + "\n")

In [None]:
# question = testset[0]['question']
# top_k_docs = pipe.run(question, params={"retriever": {"top_k": 10}})

In [None]:
# # Save the top-1 score per question as JSON
# save_score={}

# for idx, example in enumerate(tqdm(testset, desc="elasticsearch: ")):
#     # relev_doc_ids = [el for i, el in enumerate(self.ids) if i in doc_indices[idx]]
#     question=example["question"]
#     top_k_docs = pipe.run(question, params={"retriever": {"top_k": 10}})

#     query = {
#         'query':{
#             'bool':{
#                 'must':[
#                           {'match':{'text':question}}
#                 ],
#                 'should':[
#                           {'match':{'text':question}}
#                 ]
#             }
#         }
#     }
#     doc = es.search(index='document',body=query,size=30)['hits']['hits']

    
#     save_score[example['id']]=doc[0]['_score']
    
# with open('top1_score.json', "w") as writer:
#     writer.write(json.dumps(save_score, indent=4, ensure_ascii=False) + "\n")

In [None]:
# Start with two separate empty accumulators: raw hit lists and the queries sent.
docs, quries = [], []

In [None]:
# Peek at the first test example to see which fields it carries.
testset[0]

In [None]:
# Build one record per test question: the top retrieved passages concatenated
# into a single context. Records accumulate in `total` (presumably to be turned
# into a DataFrame and pickled later; nothing is saved in this cell).
import pandas as pd

SEARCH_SIZE = 30    # hits requested from Elasticsearch per question
CONTEXT_TOP_N = 10  # leading hits concatenated into the context

total = []   # one output record per question
docs = []    # raw hit lists, parallel to `total`
quries = []  # query bodies sent to Elasticsearch, parallel to `total`
for example in tqdm(testset, desc="elasticsearch: "):
    question = example["question"]
    # The per-question `pipe.run(...)` call that used to sit here was removed:
    # its result was never used and it doubled the number of ES queries.

    query = {
        'query': {
            'match': {
                'text': question
            }
        }
    }
    quries.append(query)

    doc = es.search(index='document', body=query, size=SEARCH_SIZE)['hits']['hits']
    docs.append(doc)

    # Slice rather than index a fixed range: a query can return fewer than
    # CONTEXT_TOP_N hits (or none), which previously raised IndexError.
    # Passages are joined with no separator, as before.
    cc = ''.join(hit['_source']['text'] for hit in doc[:CONTEXT_TOP_N])

    tmp = {
        "question": question,
        "id": example['id'],
        "context_id": doc[0]['_id'] if doc else None,  # id of the top hit; None when nothing matched
        "context": cc,  # concatenated text of the leading hits
    }

    # Carry the gold context and answers along when the example has them.
    if 'context' in example and 'answers' in example:
        tmp["original_context"] = example['context']  # original document
        tmp["answers"] = example['answers']           # original answer
    total.append(tmp)