In [2]:
# FAISSによるDocument Store
from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http, print_answers
from haystack.nodes import FARMReader, TransformersReader
from haystack.document_stores import FAISSDocumentStore

document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# サンプルデータでドキュメントを作成
doc_dir = "data/tutorial6"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt6.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
document_store.write_documents(docs)

Writing Documents: 10000it [00:02, 3458.35it/s]            


In [4]:
# EmbeddingによるRetriever
from haystack.nodes import EmbeddingRetriever

retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
    model_format="sentence_transformers",
)

document_store.update_embeddings(retriever) # 埋め込み表現の更新

Downloading: 100%|██████████| 737/737 [00:00<00:00, 296kB/s]
Downloading: 100%|██████████| 190/190 [00:00<00:00, 58.8kB/s]
Downloading: 100%|██████████| 8.65k/8.65k [00:00<00:00, 2.46MB/s]
Downloading: 100%|██████████| 571/571 [00:00<00:00, 266kB/s]
Downloading: 100%|██████████| 116/116 [00:00<00:00, 45.6kB/s]
Downloading: 100%|██████████| 25.5k/25.5k [00:00<00:00, 6.65MB/s]
Downloading: 100%|██████████| 438M/438M [00:05<00:00, 73.6MB/s] 
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 20.7kB/s]
Downloading: 100%|██████████| 239/239 [00:00<00:00, 112kB/s]
Downloading: 100%|██████████| 466k/466k [00:00<00:00, 569kB/s]  
Downloading: 100%|██████████| 363/363 [00:00<00:00, 166kB/s]
Downloading: 100%|██████████| 13.9k/13.9k [00:00<00:00, 89.4kB/s]
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 351kB/s]  
Downloading: 100%|██████████| 229/229 [00:00<00:00, 107kB/s]
Batches: 100%|██████████| 74/74 [00:13<00:00,  5.39it/s]docs/s]
Documents Processed: 10000 docs [00:15, 625.19 doc

In [5]:
# FARMReader
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [6]:
# パイプラインの作成
from haystack.pipelines import ExtractiveQAPipeline

pipe = ExtractiveQAPipeline(reader, retriever)

In [7]:
# パイプラインによる質問のサンプル実行
prediction = pipe.run(
    query="Who created the Dothraki vocabulary?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
)
print_answers(prediction, details="minimum")

Batches: 100%|██████████| 1/1 [00:00<00:00, 21.44it/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 45.60 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 89.55 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 93.84 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 60.26 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 99.95 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 93.55 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 77.05 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 99.59 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 99.33 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 99.09 Batches/s]


Query: Who created the Dothraki vocabulary?
Answers:
[   {   'answer': 'David J. Peterson',
        'context': 'orld. The language was developed for the TV series by the '
                   'linguist David J. Peterson, working off the Dothraki words '
                   "and phrases in Martin's novels.\n"
                   ','},
    {   'answer': 'David J. Peterson',
        'context': '\n'
                   '===Valyrian===\n'
                   'David J. Peterson, who created the Dothraki language for '
                   'the first season of the show, was entrusted by the '
                   'producers to design a new '},
    {   'answer': 'David J. Peterson',
        'context': "age for ''Game of Thrones''\n"
                   'The Dothraki vocabulary was created by David J. Peterson '
                   'well in advance of the adaptation. HBO hired the Language '
                   'Creatio'},
    {   'answer': 'Dwight Schrute',
        'context': '\'s Ancestry" from the Unit


