In [None]:
pip install haystack-ai accelerate "sentence-transformers>=3.0.0" "datasets>=2.6.1"



Knowing you’re using this tutorial helps us decide where to invest our efforts to build a better product, but you can always opt out by commenting out the following line. See [Telemetry](https://docs.haystack.deepset.ai/docs/enabling-telemetry) for more details.

## Load data into the `DocumentStore`

Before you can use this data in the extractive pipeline, you'll use an indexing pipeline to fetch it, process it, and load it into the document store.


The data has already been cleaned and preprocessed, so turning it into Haystack `Documents` is fairly straightforward.

Using an `InMemoryDocumentStore` here keeps things simple. However, this general approach would work with [any document store that Haystack 2.0 supports](https://docs.haystack.deepset.ai/docs/document-store).

The `SentenceTransformersDocumentEmbedder` transforms each `Document` into a vector. Here we've used [`sentence-transformers/multi-qa-mpnet-base-dot-v1`](https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-v1). You can substitute any embedding model you like, as long as you use the same one in your extractive pipeline.

Lastly, the `DocumentWriter` writes the vectorized documents to the `DocumentStore`.


In [None]:
import pandas as pd

In [None]:
#from datasets import load_dataset
from haystack import Document
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.readers import ExtractiveReader
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter

In [None]:
# Mount Google Drive so that files under "My Drive" are readable from the Colab filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the prepared venue table on the mounted Drive (Colab-specific absolute path).
file_path = '/content/drive/My Drive/df_for_LLM_v2.csv'

In [None]:
# The CSV was written with its index, which otherwise comes back as a junk
# "Unnamed: 0" column; read that first column as the index instead.
df_for_LLM = pd.read_csv(file_path, index_col=0)

In [None]:
# Preview only the first rows; the full table contains contact details that should not be dumped into the output.
df_for_LLM.head()

Unnamed: 0,name,property_tag_old_2,property_email,property_hours_of_operation,property_location,property_description,property_phone_number,combined_text
0,San Carlos,Hardware Store,mmora@depositosancarlos.com,"Monday - Friday 7:00 AM- 5:00 PM, Saturday 7:0...",Playa del Coco,Deposito San Carlos is a largest building mate...,+506 2670-1211,"San Carlos, Hardware Store, mmora@depositosanc..."
1,Angel's salon,Beauty Bar,myrnaangel@hotmail.com,"Monday 9:00 AM - 6:00 PM, Tuesday - Saturday 9...",Playa del Coco,Angel's Salon is a professional hair and beaut...,no information available,"Angel's salon, Beauty Bar, myrnaangel@hotmail...."
2,La Casa del Plastico,Department Store,info@lacasadelplastico.com,"Monday - Saturday 8:00 AM - 6:00 PM, Sunday C...",Playa del Coco,La Casa del Plastico specializes in the sale o...,+506 2670-0700,"La Casa del Plastico, Department Store, info@l..."
3,EPA Solarium,"Department Store, Hardware Store",contacto@xr.epa.biz,"Monday - Friday 9:00 AM - 5:00 PM, Saturday 9:...",Liberia,EPA Solarium - Hardware and Home Goods Store o...,no information available,"EPA Solarium, Department Store, Hardware Store..."
4,Policia Upala,"Police Department, Information Services",no information available,"Monday - Friday 8:00 AM - 4:00 PM, Saturday - ...",Canalete,Policia Upala - Police Department and Informat...,+506 2470-0235,"Policia Upala, Police Department, Information ..."
...,...,...,...,...,...,...,...,...
143,Coopeguanacaste,Utilities,servicioalcliente@coopeguanacaste.com,"Monday - Saturday 7:00 AM - 5:00 PM,\nSunday ...",Playa del Coco,Coopeguanacaste R.L. is a cooperative in Guana...,+506 2681-4700,"Coopeguanacaste, Utilities, servicioalcliente@..."
144,Liberia International Airport,Airport,\ninfo@guanacasteairport.com,Monday - Sunday 6:00 AM - 12:00 AM,Liberia,Daniel Oduber Quirós International Airport (LI...,+506 2666-9600,"Liberia International Airport, Airport, \ninfo..."
145,Playa del Coco,Beaches,no information available,no information available,Playa del Coco,"Playas del Coco is a vibrant hub of activity, ...",no information available,"Playa del Coco, Beaches, no information availa..."
146,Mega Super,Supermarket,no information available,Monday - Saturday 7:00 AM - 10:00 PM Sunday 7:...,Playa del Coco,"Mega Super offers a wide range of products, fr...",+506 2670-1239,"Mega Super, Supermarket, no information availa..."


In [None]:
# # Example 1: content holds only the venue description
# documents = [
#     Document(
#         content=f"{row['property_description']}",
#         meta={
#             "name": row['name'],
#             "tags": row['property_tag_old_2'],
#             "email": row['property_email'],
#             "location": row['property_location'],
#             "phone_number": row['property_phone_number'],
#             "hours_of_operation": row['property_hours_of_operation']
#         }
#     )
#     for _, row in df_for_LLM.iterrows()
# ]

In [None]:
# # Example 2: content also includes the hours of operation
# documents = [
#     Document(
#         content=f"{row['property_description']}\nHours: {row['property_hours_of_operation']}",
#         meta={
#             "name": row['name'],
#             "tags": row['property_tag_old_2'],
#             "email": row['property_email'],
#             "location": row['property_location'],
#             "phone_number": row['property_phone_number']
#         }
#     )
#     for _, row in df_for_LLM.iterrows()
# ]

In [None]:
# Example 3: content is `combined_text`, i.e. the values of all columns merged into one string.
# No metadata is attached; each Document carries only that text.
documents = [
    Document(content=f"{combined}")
    for combined in df_for_LLM['combined_text']
]

In [None]:
# # Example 4: content is combined_text, and meta is attached as well
# documents = [
#     Document(
#         content=f"{row['combined_text']}",
#         meta={
#             "name": row['name'],
#             "tags": row['property_tag_old_2'],
#             "email": row['property_email'],
#             "location": row['property_location'],
#             "phone_number": row['property_phone_number'],
#             "hours_of_operation": row['property_hours_of_operation']
#         }
#     )
#     for _, row in df_for_LLM.iterrows()
# ]

In [None]:
# Show a small sample instead of dumping every Document into the cell output.
documents[:5]

[Document(id=15d5fa5cb856487f11d2bf5c60d218b65b97056a8485c246e7313d05bd7afe8e, content: 'San Carlos, Hardware Store, mmora@depositosancarlos.com, Monday - Friday 7:00 AM- 5:00 PM, Saturday ...'),
 Document(id=f2a36e72ab889a4a0d9fc4d9e4f513226cbba05cf774689c596ae541f9488c11, content: 'Angel's salon, Beauty Bar, myrnaangel@hotmail.com, Monday 9:00 AM - 6:00 PM, Tuesday - Saturday 9:00...'),
 Document(id=df5708ebb664a1d8728a25452a763a5736931ac8776a5eaba13d68e2c434b604, content: 'La Casa del Plastico, Department Store, info@lacasadelplastico.com, Monday - Saturday 8:00 AM - 6:00...'),
 Document(id=652bb2e573623ca2933923f5322e29e26601f617f0ceb673e0374bc1adaaa266, content: 'EPA Solarium, Department Store, Hardware Store, contacto@xr.epa.biz, Monday - Friday 9:00 AM - 5:00 ...'),
 Document(id=f28c9df328974b5c719ed0b76c1146c1a33913d0470d3ce70dc6e08960592fc4, content: 'Policia Upala, Police Department, Information Services, no information available, Monday - Friday 8:...'),
 Document(id=5f0e74a

In [None]:
# Embedding model shared by the indexing pipeline (here) and the query embedder (further down).
# NOTE(review): in this notebook `model` is only ever passed to the SentenceTransformers embedders,
# while `ExtractiveReader()` is created without a model argument. Switching `model` therefore changes
# retrieval only, not the reader. The deepset/*-squad2 and table-QA names below look like reader/QA
# checkpoints rather than embedding models — confirm before drawing conclusions from them.
model = "sentence-transformers/multi-qa-mpnet-base-dot-v1"

#model = "deepset/roberta-base-squad2"

# Wanted to try a model built for table-QA tasks, using Example 1 (description-only content) for the docs.
# Not suitable: it only accepts a pandas DataFrame as input.
#model="google/tapas-base-finetuned-wtq"


# Apparently all table-QA models work this way: this one also needs a pandas DataFrame as input.
#model="microsoft/tapex-base"


#model="deepset/roberta-large-squad2"

#model="deepset/deberta-v3-large-squad2"

#model="deepset/deberta-v3-base-squad2"

# Case-sensitive variant.
#model="deepset/bert-base-cased-squad2"

# Next, models from the sentence-transformers library.

#model = "sentence-transformers/multi-qa-distilbert-dot-v1"

#model = "sentence-transformers/multi-qa-MiniLM-L6-dot-v1"

#model = "sentence-transformers/msmarco-bert-base-dot-v5"

#model = "sentence-transformers/msmarco-distilbert-base-tas-b"

#model = "sentence-transformers/msmarco-MiniLM-L12-cos-v5"

# A fresh in-memory store is created on every run of this cell, so re-running does not accumulate documents.
document_store = InMemoryDocumentStore()

indexing_pipeline = Pipeline()

# embedder -> writer: each Document is embedded and then written to the store.
indexing_pipeline.add_component(instance=SentenceTransformersDocumentEmbedder(model=model), name="embedder")
indexing_pipeline.add_component(instance=DocumentWriter(document_store=document_store), name="writer")
indexing_pipeline.connect("embedder.documents", "writer.documents")

# Input is routed to the embedder; the cell output reports how many documents were written.
indexing_pipeline.run({"documents": documents})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

{'writer': {'documents_written': 148}}

## Build an Extractive QA Pipeline

Your extractive QA pipeline will consist of three components: an embedder, retriever, and reader.

- The `SentenceTransformersTextEmbedder` turns a query into a vector, using the same embedding model defined above.

- Vector search allows the retriever to efficiently return relevant documents from the document store. Retrievers are tightly coupled with document stores; thus, you'll use an `InMemoryEmbeddingRetriever` to go with the `InMemoryDocumentStore`.

- The `ExtractiveReader` returns answers to that query, as well as their location in the source document, and a confidence score.


In [None]:
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.readers import ExtractiveReader
from haystack.components.embedders import SentenceTransformersTextEmbedder

In [None]:
# Query-time pipeline: embed the question, retrieve by vector similarity, then extract answer spans.
extractive_qa_pipeline = Pipeline()

# The query embedder uses the same `model` as the indexing pipeline.
extractive_qa_pipeline.add_component(name="embedder", instance=SentenceTransformersTextEmbedder(model=model))

# Retriever bound to the store that was filled by the indexing pipeline.
retriever = InMemoryEmbeddingRetriever(document_store=document_store)
extractive_qa_pipeline.add_component(name="retriever", instance=retriever)

# Reader is created with library defaults and warmed up before being registered.
reader = ExtractiveReader()
reader.warm_up()
extractive_qa_pipeline.add_component(name="reader", instance=reader)

# embedder -> retriever -> reader
extractive_qa_pipeline.connect("embedder.embedding", "retriever.query_embedding")
extractive_qa_pipeline.connect("retriever.documents", "reader.documents")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7b9e20b5cc40>
🚅 Components
  - embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - reader: ExtractiveReader
🛤️ Connections
  - embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> reader.documents (List[Document])

In [None]:
# The same question text feeds the embedder (for retrieval) and the reader (for span extraction);
# the top 3 results are kept at both stages.
query = "What are the opening hours of the restaurant Zi Lounge?"
extractive_qa_pipeline.run(
    data={
        "embedder": {"text": query},
        "retriever": {"top_k": 3},
        "reader": {"query": query, "top_k": 3},
    }
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'reader': {'answers': [ExtractedAnswer(query='What are the opening hours of the restaurant Zi Lounge?', score=0.8432546854019165, data='11:00 AM - 2:30 AM', document=Document(id=e8a323a4533b246c32c4f3b40c1c583c7684cad1b757abb5e4bd8b211c473ac4, content: 'Zi Lounge, Restaurant, Bar, International, Seafood, Pizza, Pasta, Meat, Steak, Salads, Cocktails, Al...', score: 0.6435337060392761), context=None, document_offset=ExtractedAnswer.Span(start=142, end=160), context_offset=None, meta={}),
   ExtractedAnswer(query='What are the opening hours of the restaurant Zi Lounge?', score=0.8176264762878418, data='Monday - Sunday', document=Document(id=e8a323a4533b246c32c4f3b40c1c583c7684cad1b757abb5e4bd8b211c473ac4, content: 'Zi Lounge, Restaurant, Bar, International, Seafood, Pizza, Pasta, Meat, Steak, Salads, Cocktails, Al...', score: 0.6435337060392761), context=None, document_offset=ExtractedAnswer.Span(start=126, end=141), context_offset=None, meta={}),
   ExtractedAnswer(query='What are the o

In [None]:
# Variant that asks for days and hours together, to see whether the reader returns the full schedule.
query = "What are the opening days and hours of the restaurant Zi Lounge?"
extractive_qa_pipeline.run(
    data={
        "embedder": {"text": query},
        "retriever": {"top_k": 3},
        "reader": {"query": query, "top_k": 3},
    }
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'reader': {'answers': [ExtractedAnswer(query='What are the opening days and hours of the restaurant Zi Lounge?', score=0.8397794961929321, data='Monday - Sunday', document=Document(id=e8a323a4533b246c32c4f3b40c1c583c7684cad1b757abb5e4bd8b211c473ac4, content: 'Zi Lounge, Restaurant, Bar, International, Seafood, Pizza, Pasta, Meat, Steak, Salads, Cocktails, Al...', score: 0.6269520563996132), context=None, document_offset=ExtractedAnswer.Span(start=126, end=141), context_offset=None, meta={}),
   ExtractedAnswer(query='What are the opening days and hours of the restaurant Zi Lounge?', score=0.791502058506012, data='11:00 AM - 2:30 AM', document=Document(id=e8a323a4533b246c32c4f3b40c1c583c7684cad1b757abb5e4bd8b211c473ac4, content: 'Zi Lounge, Restaurant, Bar, International, Seafood, Pizza, Pasta, Meat, Steak, Salads, Cocktails, Al...', score: 0.6269520563996132), context=None, document_offset=ExtractedAnswer.Span(start=142, end=160), context_offset=None, meta={}),
   ExtractedAnswer(quer

In [None]:
# Loosely phrased variant of the same question.
query = "When is open the restaurant Zi Lounge?"
extractive_qa_pipeline.run(
    data={
        "embedder": {"text": query},
        "retriever": {"top_k": 3},
        "reader": {"query": query, "top_k": 3},
    }
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'reader': {'answers': [ExtractedAnswer(query='When is open the restaurant Zi Lounge?', score=0.7865237593650818, data='Monday - Sunday', document=Document(id=e8a323a4533b246c32c4f3b40c1c583c7684cad1b757abb5e4bd8b211c473ac4, content: 'Zi Lounge, Restaurant, Bar, International, Seafood, Pizza, Pasta, Meat, Steak, Salads, Cocktails, Al...', score: 0.6357605786058824), context=None, document_offset=ExtractedAnswer.Span(start=126, end=141), context_offset=None, meta={}),
   ExtractedAnswer(query='When is open the restaurant Zi Lounge?', score=0.729029655456543, data='Monday - Sunday', document=Document(id=0259c633d15088de015f32391e35593c62914f2ccdb89c3a878ba00cdd7887aa, content: 'Aqua Sport, Restaurant, Bar, Latin, Vegetarian, Cocktails, Pizza, Seafood, Fast Food, infoaquarestau...', score: 0.4430885420666822), context=None, document_offset=ExtractedAnswer.Span(start=117, end=132), context_offset=None, meta={}),
   ExtractedAnswer(query='When is open the restaurant Zi Lounge?', score=0.693

Try extracting some answers. Если подавать только description в качестве content, а остальное в качестве meta, алгоритм работает не очень хорошо: заведения определяются неправильно.

В ходе тестов выявлено, что лучше всего подавать combined_text, это столбец, содержащий все данные со столбцов в одном.

Не идеально. Сейчас заведение определяется правильно, но режим работы почему-то дробится на дни недели и часы работы.

Если в контенте combined text, а также присутствует meta, результат аналогичный.

При использовании модели Roberta версий Large и Base качество ответов неуд (не те заведения). Если в Roberta подавать только описание, без остальных столбцов, то результат работы совсем плачевный.

Deberta Large и Base версии ответы неуд (не те заведения).

Лучший результат у базовой модели multi-qa-mpnet-base-dot-v1. Попадает в заведения, но отвечает слишком буквально.

Работа bert-base-cased-squad2 также неуд.

Разницы в ответе между multi-qa-mpnet-base-dot-v1 и multi-qa-distilbert-dot-v1 и multi-qa-MiniLM-L6-dot-v1 нет.

Если использовать sentence-transformers/msmarco-bert-base-dot-v5, то есть аналогичную (но побольше) модель берта, обученного на другом датасете, результат тот же. При использовании sentence-transformers/msmarco-distilbert-base-tas-b и msmarco-MiniLM-L12-cos-v5 результат тот же.

По сути, хорошо матчит по названию sentence трансформеры. А отвечает Deberta (но не те заведения). Возможно стоит использовать 2 модели. Отложим пока и попробуем другие варианты по тюнингу алгоритма.



В частности использовать Part-of-speech tagging (POS-tagging) — это процесс, используемый в обработке естественного языка (NLP), который заключается в автоматическом определении и обозначении частей речи для каждого слова в тексте. Части речи включают такие категории, как существительные, глаголы, прилагательные, наречия и другие. POS-тегинг помогает понять синтаксическую структуру текста и часто используется в более сложных задачах NLP, таких как синтаксический разбор, анализ тональности и извлечение информации.

Процесс POS-тегинга обычно включает следующие этапы:

1. **Токенизация**: Разделение текста на отдельные слова или токены.

2. **Тегирование**: Каждому токену присваивается соответствующая часть речи. Это может быть сделано с помощью предварительно обученных моделей, которые используют вероятностные методы, такие как скрытые марковские модели (HMM), или современные подходы, такие как нейронные сети и трансформеры.

3. **Контекстное определение**: Поскольку некоторые слова могут принадлежать к нескольким частям речи в зависимости от контекста (например, слово «run» может быть как существительным, так и глаголом), алгоритмы POS-тегинга учитывают ближайшее окружение слова.

POS-тегинг важен для более глубокого понимания текста и является основой для многих других приложений в области обработки естественного языка.

In [None]:
import spacy
# Small English spaCy pipeline; `nlp` is the shared entry point used by the POS-tagging,
# NER and dependency-parsing code further down.
nlp = spacy.load("en_core_web_sm")

In [None]:
def pos_tagging(text):
    """Return a list of ``(token_text, pos)`` pairs, one per token that ``nlp`` yields for ``text``.

    ``pos`` is the token's ``pos_`` attribute.
    """
    tagged = []
    for tok in nlp(text):
        tagged.append((tok.text, tok.pos_))
    return tagged

def create_document(row):
    """Build a Document from one table row.

    The row's ``combined_text`` becomes the content, and the POS tags computed
    from that same text are stored under ``meta['pos_tags']``.
    """
    text = row['combined_text']
    return Document(content=text, meta={'pos_tags': pos_tagging(text)})

# One Document per table row, built by `create_document` (content plus POS tags in meta).
documents = list(
    map(create_document, (record for _, record in df_for_LLM.iterrows()))
)

In [None]:
# Show a small sample only: every Document now carries a long `pos_tags` list,
# so displaying the whole list floods the cell output.
documents[:5]

[Document(id=e79d09a1ec1d9c3a8f4f558b9590369d0e0f0e3e5bd3c9705e33725fbec9f19f, content: 'San Carlos, Hardware Store, mmora@depositosancarlos.com, Monday - Friday 7:00 AM- 5:00 PM, Saturday ...', meta: {'pos_tags': [('San', 'PROPN'), ('Carlos', 'PROPN'), (',', 'PUNCT'), ('Hardware', 'PROPN'), ('Store', 'PROPN'), (',', 'PUNCT'), ('mmora@depositosancarlos.com', 'X'), (',', 'PUNCT'), ('Monday', 'PROPN'), ('-', 'PUNCT'), ('Friday', 'PROPN'), ('7:00', 'NUM'), ('AM-', 'PROPN'), ('5:00', 'NUM'), ('PM', 'NOUN'), (',', 'PUNCT'), ('Saturday', 'PROPN'), ('7:00', 'NUM'), ('AM-', 'PROPN'), ('3:00', 'NUM'), ('PM', 'NOUN'), (',', 'PUNCT'), ('Sunday', 'PROPN'), (' ', 'SPACE'), ('Сlosed', 'VERB'), (',', 'PUNCT'), ('Playa', 'PROPN'), ('del', 'PROPN'), ('Coco', 'PROPN'), (',', 'PUNCT'), ('Deposito', 'PROPN'), ('San', 'PROPN'), ('Carlos', 'PROPN'), ('is', 'AUX'), ('a', 'DET'), ('largest', 'ADJ'), ('building', 'NOUN'), ('materials', 'NOUN'), ('store', 'NOUN'), ('with', 'ADP'), ('a', 'DET'), ('strong', 'ADJ'

In [None]:
# Rebuild the index from scratch with whatever `documents` currently holds
# (the POS-annotated list when the notebook is run top to bottom).
model = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
document_store = InMemoryDocumentStore()

indexing_pipeline = Pipeline()
indexing_pipeline.add_component(name="embedder", instance=SentenceTransformersDocumentEmbedder(model=model))
indexing_pipeline.add_component(name="writer", instance=DocumentWriter(document_store=document_store))
indexing_pipeline.connect("embedder.documents", "writer.documents")
indexing_pipeline.run(data={"documents": documents})

# Fresh retriever bound to the new store, plus a warmed-up reader created with library defaults.
retriever = InMemoryEmbeddingRetriever(document_store=document_store)
reader = ExtractiveReader()
reader.warm_up()

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
# Create the pipeline used for answer extraction
extractive_qa_pipeline = Pipeline()

In [None]:
# New component intended to make use of POS tagging
class PosTaggingReader(ExtractiveReader):
    """ExtractiveReader with an additional POS-tag based ``extract_answers`` helper.

    NOTE(review): nothing in this notebook calls ``extract_answers``, and the
    pipeline outputs recorded below are regular ``ExtractedAnswer`` objects with
    the same scores as the plain reader, so the inherited ``run`` appears to be
    what actually executes. Call ``extract_answers`` explicitly (or hook it into
    ``run``) if it is meant to influence the results.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def extract_answers(self, query, documents):
        """Collect document nouns that also occur in the query.

        Args:
            query: Question text; matching is a plain substring test (``token in query``).
            documents: Documents whose ``meta['pos_tags']`` holds ``(token_text, pos)``
                pairs such as those produced by ``pos_tagging``.

        Returns:
            List of matching token strings in document/token order (duplicates are kept).
        """
        answers = []
        for doc in documents:
            # Documents indexed without POS metadata simply contribute nothing.
            pos_tags = doc.meta.get('pos_tags', [])
            for token, pos in pos_tags:
                # The stored tags come from spaCy's coarse `token.pos_` ('NOUN', 'PROPN', ...).
                # The Penn Treebank tag 'NN' belongs to `token.tag_` and never appears here,
                # so the original comparison could not match anything.
                if pos == 'NOUN' and token in query:
                    answers.append(token)
        return answers

# Register the components: query embedder, retriever, and the POS-tagging reader under the name "reader".
extractive_qa_pipeline.add_component(name="embedder", instance=SentenceTransformersTextEmbedder(model=model))
extractive_qa_pipeline.add_component(name="retriever", instance=retriever)

pos_tagging_reader = PosTaggingReader()
extractive_qa_pipeline.add_component(name="reader", instance=pos_tagging_reader)

# Wire the components together: embedder -> retriever -> reader
extractive_qa_pipeline.connect("embedder.embedding", "retriever.query_embedding")
extractive_qa_pipeline.connect("retriever.documents", "reader.documents")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7e724e502530>
🚅 Components
  - embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - reader: PosTaggingReader
🛤️ Connections
  - embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> reader.documents (List[Document])

In [None]:
# Re-ask the opening-hours question against the pipeline that uses the POS-tagging reader.
query = "What are the opening hours of the restaurant Zi Lounge?"
extractive_qa_pipeline.run(
    data={
        "embedder": {"text": query},
        "retriever": {"top_k": 3},
        "reader": {"query": query, "top_k": 3},
    }
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'reader': {'answers': [ExtractedAnswer(query='What are the opening hours of the restaurant Zi Lounge?', score=0.843254804611206, data='11:00 AM - 2:30 AM', document=Document(id=cd115b04dbac72c7c7b41f1d8310fa2ec93bb3fb2704c6d18a3829da87338b58, content: 'Zi Lounge, Restaurant, Bar, International, Seafood, Pizza, Pasta, Meat, Steak, Salads, Cocktails, Al...', meta: {'pos_tags': [('Zi', 'PROPN'), ('Lounge', 'PROPN'), (',', 'PUNCT'), ('Restaurant', 'PROPN'), (',', 'PUNCT'), ('Bar', 'PROPN'), (',', 'PUNCT'), ('International', 'PROPN'), (',', 'PUNCT'), ('Seafood', 'PROPN'), (',', 'PUNCT'), ('Pizza', 'PROPN'), (',', 'PUNCT'), ('Pasta', 'PROPN'), (',', 'PUNCT'), ('Meat', 'NOUN'), (',', 'PUNCT'), ('Steak', 'PROPN'), (',', 'PUNCT'), ('Salads', 'PROPN'), (',', 'PUNCT'), ('Cocktails', 'PROPN'), (',', 'PUNCT'), ('Alcohol', 'PROPN'), (',', 'PUNCT'), ('info@zilounge.com', 'X'), (',', 'PUNCT'), ('Monday', 'PROPN'), ('-', 'PUNCT'), ('Sunday', 'PROPN'), ('11:00', 'NUM'), ('AM', 'PROPN'), ('-', 'PUNCT'),

In [None]:
# Days-and-hours variant against the pipeline that uses the POS-tagging reader.
query = "What are the opening days and hours of the restaurant Zi Lounge?"
extractive_qa_pipeline.run(
    data={
        "embedder": {"text": query},
        "retriever": {"top_k": 3},
        "reader": {"query": query, "top_k": 3},
    }
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'reader': {'answers': [ExtractedAnswer(query='What are the opening days and hours of the restaurant Zi Lounge?', score=0.8397794961929321, data='Monday - Sunday', document=Document(id=cd115b04dbac72c7c7b41f1d8310fa2ec93bb3fb2704c6d18a3829da87338b58, content: 'Zi Lounge, Restaurant, Bar, International, Seafood, Pizza, Pasta, Meat, Steak, Salads, Cocktails, Al...', meta: {'pos_tags': [('Zi', 'PROPN'), ('Lounge', 'PROPN'), (',', 'PUNCT'), ('Restaurant', 'PROPN'), (',', 'PUNCT'), ('Bar', 'PROPN'), (',', 'PUNCT'), ('International', 'PROPN'), (',', 'PUNCT'), ('Seafood', 'PROPN'), (',', 'PUNCT'), ('Pizza', 'PROPN'), (',', 'PUNCT'), ('Pasta', 'PROPN'), (',', 'PUNCT'), ('Meat', 'NOUN'), (',', 'PUNCT'), ('Steak', 'PROPN'), (',', 'PUNCT'), ('Salads', 'PROPN'), (',', 'PUNCT'), ('Cocktails', 'PROPN'), (',', 'PUNCT'), ('Alcohol', 'PROPN'), (',', 'PUNCT'), ('info@zilounge.com', 'X'), (',', 'PUNCT'), ('Monday', 'PROPN'), ('-', 'PUNCT'), ('Sunday', 'PROPN'), ('11:00', 'NUM'), ('AM', 'PROPN'), ('-', 'P

In [None]:
# Loosely phrased variant against the pipeline that uses the POS-tagging reader.
query = "When is open the restaurant Zi Lounge?"
extractive_qa_pipeline.run(
    data={
        "embedder": {"text": query},
        "retriever": {"top_k": 3},
        "reader": {"query": query, "top_k": 3},
    }
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'reader': {'answers': [ExtractedAnswer(query='When is open the restaurant Zi Lounge?', score=0.786523699760437, data='Monday - Sunday', document=Document(id=cd115b04dbac72c7c7b41f1d8310fa2ec93bb3fb2704c6d18a3829da87338b58, content: 'Zi Lounge, Restaurant, Bar, International, Seafood, Pizza, Pasta, Meat, Steak, Salads, Cocktails, Al...', meta: {'pos_tags': [('Zi', 'PROPN'), ('Lounge', 'PROPN'), (',', 'PUNCT'), ('Restaurant', 'PROPN'), (',', 'PUNCT'), ('Bar', 'PROPN'), (',', 'PUNCT'), ('International', 'PROPN'), (',', 'PUNCT'), ('Seafood', 'PROPN'), (',', 'PUNCT'), ('Pizza', 'PROPN'), (',', 'PUNCT'), ('Pasta', 'PROPN'), (',', 'PUNCT'), ('Meat', 'NOUN'), (',', 'PUNCT'), ('Steak', 'PROPN'), (',', 'PUNCT'), ('Salads', 'PROPN'), (',', 'PUNCT'), ('Cocktails', 'PROPN'), (',', 'PUNCT'), ('Alcohol', 'PROPN'), (',', 'PUNCT'), ('info@zilounge.com', 'X'), (',', 'PUNCT'), ('Monday', 'PROPN'), ('-', 'PUNCT'), ('Sunday', 'PROPN'), ('11:00', 'NUM'), ('AM', 'PROPN'), ('-', 'PUNCT'), ('2:30', 'NUM'), ('

Использование Part-of-speech tagging (POS-tagging) не оправдало ожиданий.

Попробуем использовать Named Entity Recognition (NER).

Named Entity Recognition (NER) — это задача в области обработки естественного языка (NLP), которая заключается в идентификации и классификации именованных сущностей в тексте. Именованными сущностями могут быть имена собственные, такие как имена людей, названия организаций, географические названия, даты, временные интервалы, денежные суммы и другие специфические категории, которые имеют конкретное значение в контексте.

Основные аспекты NER:

1. **Идентификация**: Процесс выделения последовательностей символов, которые представляют собой именованные сущности. Например, в предложении "Илон Маск основал SpaceX в 2002 году" NER-система должна идентифицировать "Илон Маск", "SpaceX" и "2002" как потенциальные именованные сущности.

2. **Классификация**: После идентификации система классифицирует каждую сущность в одну из предопределённых категорий. В приведённом примере "Илон Маск" может быть классифицирован как "PERSON" (человек), "SpaceX" как "ORGANIZATION" (организация), а "2002" как "DATE" (дата).

3. **Контекстуальный анализ**: NER-системы должны учитывать контекст, поскольку некоторые слова и фразы могут быть омонимами или иметь разные значения в зависимости от окружения. Например, "Apple" может означать как технологическую компанию, так и фрукт.

NER используется в различных приложениях, таких как извлечение информации, автоматическое аннотирование текстов, улучшение поиска по ключевым словам и анализ текстовых данных в реальном времени. Современные системы NER часто строятся с использованием машинного обучения и глубокого обучения, что позволяет им достигать высокой точности в сложных и многозначных текстах.


Следует отметить, что en_core_web_sm базовая модель в spacy. В нашем случае используется и в POS-tagging и NER. Бывают сложнее.

In [None]:
# Retriever over whichever `document_store` is currently live in the kernel.
# NOTE(review): the outputs recorded in this section show documents without `pos_tags` meta,
# so the store in use was presumably built from the plain documents — confirm the execution order.
retriever = InMemoryEmbeddingRetriever(document_store=document_store)

In [None]:
# Create the pipeline used for answer extraction
extractive_qa_pipeline = Pipeline()

In [None]:
class NERReader(ExtractiveReader):
    """ExtractiveReader with an additional named-entity based ``extract_answers`` helper.

    NOTE(review): nothing in this notebook calls ``extract_answers``; the recorded
    pipeline outputs are regular ``ExtractedAnswer`` objects, so the inherited
    ``run`` appears to be what actually executes — confirm.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def extract_answers(self, query, documents):
        """Return the text of every entity found in the documents that is a substring of ``query``.

        Each document's ``content`` is parsed with ``nlp``; results keep
        document/entity order and may contain duplicates.
        """
        return [
            entity.text
            for document in documents
            for entity in nlp(document.content).ents
            if entity.text in query
        ]


In [None]:
# Register the components (the reader first, as before), then the query embedder and the retriever.
ner_reader = NERReader()
extractive_qa_pipeline.add_component(name="reader", instance=ner_reader)
extractive_qa_pipeline.add_component(name="embedder", instance=SentenceTransformersTextEmbedder(model=model))
extractive_qa_pipeline.add_component(name="retriever", instance=retriever)

# Wire the components together: embedder -> retriever -> reader
extractive_qa_pipeline.connect("embedder.embedding", "retriever.query_embedding")
extractive_qa_pipeline.connect("retriever.documents", "reader.documents")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7b6380613430>
🚅 Components
  - reader: NERReader
  - embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
🛤️ Connections
  - embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> reader.documents (List[Document])

In [None]:
# Re-ask the opening-hours question against the pipeline that uses the NER reader.
query = "What are the opening hours of the restaurant Zi Lounge?"
extractive_qa_pipeline.run(
    data={
        "embedder": {"text": query},
        "retriever": {"top_k": 3},
        "reader": {"query": query, "top_k": 3},
    }
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'reader': {'answers': [ExtractedAnswer(query='What are the opening hours of the restaurant Zi Lounge?', score=0.843254804611206, data='11:00 AM - 2:30 AM', document=Document(id=e8a323a4533b246c32c4f3b40c1c583c7684cad1b757abb5e4bd8b211c473ac4, content: 'Zi Lounge, Restaurant, Bar, International, Seafood, Pizza, Pasta, Meat, Steak, Salads, Cocktails, Al...', score: 26.02480504900634), context=None, document_offset=ExtractedAnswer.Span(start=142, end=160), context_offset=None, meta={}),
   ExtractedAnswer(query='What are the opening hours of the restaurant Zi Lounge?', score=0.8176265358924866, data='Monday - Sunday', document=Document(id=e8a323a4533b246c32c4f3b40c1c583c7684cad1b757abb5e4bd8b211c473ac4, content: 'Zi Lounge, Restaurant, Bar, International, Seafood, Pizza, Pasta, Meat, Steak, Salads, Cocktails, Al...', score: 26.02480504900634), context=None, document_offset=ExtractedAnswer.Span(start=126, end=141), context_offset=None, meta={}),
   ExtractedAnswer(query='What are the open

In [None]:
# Days-and-hours variant against the pipeline that uses the NER reader.
query = "What are the opening days and hours of the restaurant Zi Lounge?"
extractive_qa_pipeline.run(
    data={
        "embedder": {"text": query},
        "retriever": {"top_k": 3},
        "reader": {"query": query, "top_k": 3},
    }
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'reader': {'answers': [ExtractedAnswer(query='What are the opening days and hours of the restaurant Zi Lounge?', score=0.8397794961929321, data='Monday - Sunday', document=Document(id=e8a323a4533b246c32c4f3b40c1c583c7684cad1b757abb5e4bd8b211c473ac4, content: 'Zi Lounge, Restaurant, Bar, International, Seafood, Pizza, Pasta, Meat, Steak, Salads, Cocktails, Al...', score: 26.470380126861734), context=None, document_offset=ExtractedAnswer.Span(start=126, end=141), context_offset=None, meta={}),
   ExtractedAnswer(query='What are the opening days and hours of the restaurant Zi Lounge?', score=0.8241560459136963, data='Monday - Saturday', document=Document(id=829cb8c526265bc03477b109eb89acda01db27d62b51c8fa586ca3c63ec56afd, content: 'Zarpe, Bar, Cocktails, Alcohol, LASENORA@ZARPE.BAR, Monday - Saturday 4:00 PM - 12:00 AM, Sunday - C...', score: 21.329905508721964), context=None, document_offset=ExtractedAnswer.Span(start=52, end=69), context_offset=None, meta={}),
   ExtractedAnswer(query=

In [None]:
# Loosely phrased variant against the pipeline that uses the NER reader.
query = "When is open the restaurant Zi Lounge?"
extractive_qa_pipeline.run(
    data={
        "embedder": {"text": query},
        "retriever": {"top_k": 3},
        "reader": {"query": query, "top_k": 3},
    }
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'reader': {'answers': [ExtractedAnswer(query='When is open the restaurant Zi Lounge?', score=0.786523699760437, data='Monday - Sunday', document=Document(id=e8a323a4533b246c32c4f3b40c1c583c7684cad1b757abb5e4bd8b211c473ac4, content: 'Zi Lounge, Restaurant, Bar, International, Seafood, Pizza, Pasta, Meat, Steak, Salads, Cocktails, Al...', score: 27.195980742193726), context=None, document_offset=ExtractedAnswer.Span(start=126, end=141), context_offset=None, meta={}),
   ExtractedAnswer(query='When is open the restaurant Zi Lounge?', score=0.7416019439697266, data='Monday - Saturday', document=Document(id=829cb8c526265bc03477b109eb89acda01db27d62b51c8fa586ca3c63ec56afd, content: 'Zarpe, Bar, Cocktails, Alcohol, LASENORA@ZARPE.BAR, Monday - Saturday 4:00 PM - 12:00 AM, Sunday - C...', score: 21.518277971272383), context=None, document_offset=ExtractedAnswer.Span(start=52, end=69), context_offset=None, meta={}),
   ExtractedAnswer(query='When is open the restaurant Zi Lounge?', score=0.687

Применение NER также не оправдало ожиданий. Можно конечно упороться в поиск навороченных spacy моделей, но мне кажется, в нашем случае нужно использовать другие подходы.

Попробуем Dependency Parsing - это способ анализа синтаксической структуры предложения, который позволяет нам определить зависимости между словами в предложении.

Попробуем собрать гибрид из того, о чем говорили ранее. Ретривер + Deberta в качестве Reader.

In [None]:
# Retriever over whichever `document_store` is currently live in the kernel.
# NOTE(review): the output recorded for this section shows documents without `pos_tags` meta,
# so the store in use was presumably built from the plain documents — confirm the execution order.
retriever = InMemoryEmbeddingRetriever(document_store=document_store)

In [None]:
# Create the pipeline used for answer extraction
extractive_qa_pipeline = Pipeline()

In [None]:
class DependencyParsingReader(ExtractiveReader):
    """ExtractiveReader subclass that adds a dependency-parsing helper.

    ``extract_answers`` collects the grammatical subjects (``nsubj``) attached
    to the ``ROOT`` token of each parsed document.

    NOTE(review): nothing visible in this notebook calls ``extract_answers``.
    The pipeline cells only invoke the component through ``Pipeline.run``, so
    this helper presumably does not influence the pipeline's answers. Confirm,
    and wire it in explicitly if dependency parsing is meant to take effect.
    """

    def __init__(self, **kwargs):
        # Forward every keyword argument unchanged to ExtractiveReader.
        super().__init__(**kwargs)

    def extract_answers(self, query, documents):
        """Return the text of every ``nsubj`` child of a ``ROOT`` token.

        Args:
            query: Not used by this method; kept so the signature is unchanged.
            documents: Iterable of objects exposing a ``content`` attribute.

        Returns:
            A flat list of subject strings, in document order and then in
            token order within each document.
        """
        answers = []
        for doc in documents:
            text = doc.content
            if not text:
                # Empty or missing content yields no tokens; skip it rather
                # than hand a non-string value to the parser.
                continue
            # `nlp` is expected to exist at notebook level — presumably the
            # spaCy pipeline loaded for the earlier NER experiment; confirm.
            parsed = nlp(text)
            answers.extend(
                child.text
                for token in parsed
                if token.dep_ == "ROOT"
                for child in token.children
                if child.dep_ == "nsubj"
            )
        return answers

In [None]:
# NOTE(review): no model is passed here, so the reader uses whatever default
# ExtractiveReader defines — confirm this is the intended (DeBERTa) checkpoint.
dependency_parsing_reader = DependencyParsingReader()

# Register the components: reader, query embedder, retriever (in that order).
extractive_qa_pipeline.add_component(
    instance=dependency_parsing_reader,
    name="reader",
)
extractive_qa_pipeline.add_component(
    instance=SentenceTransformersTextEmbedder(model=model),
    name="embedder",
)
extractive_qa_pipeline.add_component(
    instance=retriever,
    name="retriever",
)

# Wire the components: query embedding -> retriever, retrieved documents -> reader.
extractive_qa_pipeline.connect(
    "embedder.embedding",
    "retriever.query_embedding",
)
extractive_qa_pipeline.connect(
    "retriever.documents",
    "reader.documents",
)

<haystack.core.pipeline.pipeline.Pipeline object at 0x7b63800e6fe0>
🚅 Components
  - reader: DependencyParsingReader
  - embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
🛤️ Connections
  - embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> reader.documents (List[Document])

In [None]:
# Ask for the opening hours of Zi Lounge through the embedding-based pipeline.
query = "What are the opening hours of the restaurant Zi Lounge?"
extractive_qa_pipeline.run(
    data={
        # The embedder turns the question text into the query vector.
        "embedder": {"text": query},
        # The retriever is asked for 3 documents.
        "retriever": {"top_k": 3},
        # The reader gets the same question and is asked for 3 answers.
        "reader": {"query": query, "top_k": 3},
    }
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'reader': {'answers': [ExtractedAnswer(query='What are the opening hours of the restaurant Zi Lounge?', score=0.843254804611206, data='11:00 AM - 2:30 AM', document=Document(id=e8a323a4533b246c32c4f3b40c1c583c7684cad1b757abb5e4bd8b211c473ac4, content: 'Zi Lounge, Restaurant, Bar, International, Seafood, Pizza, Pasta, Meat, Steak, Salads, Cocktails, Al...', score: 26.02480504900634), context=None, document_offset=ExtractedAnswer.Span(start=142, end=160), context_offset=None, meta={}),
   ExtractedAnswer(query='What are the opening hours of the restaurant Zi Lounge?', score=0.8176265358924866, data='Monday - Sunday', document=Document(id=e8a323a4533b246c32c4f3b40c1c583c7684cad1b757abb5e4bd8b211c473ac4, content: 'Zi Lounge, Restaurant, Bar, International, Seafood, Pizza, Pasta, Meat, Steak, Salads, Cocktails, Al...', score: 26.02480504900634), context=None, document_offset=ExtractedAnswer.Span(start=126, end=141), context_offset=None, meta={}),
   ExtractedAnswer(query='What are the open

In [None]:
# Same question, now asking explicitly for both days and hours.
query = "What are the opening days and hours of the restaurant Zi Lounge?"
extractive_qa_pipeline.run(
    data={
        # The embedder turns the question text into the query vector.
        "embedder": {"text": query},
        # The retriever is asked for 3 documents.
        "retriever": {"top_k": 3},
        # The reader gets the same question and is asked for 3 answers.
        "reader": {"query": query, "top_k": 3},
    }
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'reader': {'answers': [ExtractedAnswer(query='What are the opening days and hours of the restaurant Zi Lounge?', score=0.8397794961929321, data='Monday - Sunday', document=Document(id=e8a323a4533b246c32c4f3b40c1c583c7684cad1b757abb5e4bd8b211c473ac4, content: 'Zi Lounge, Restaurant, Bar, International, Seafood, Pizza, Pasta, Meat, Steak, Salads, Cocktails, Al...', score: 26.470380126861734), context=None, document_offset=ExtractedAnswer.Span(start=126, end=141), context_offset=None, meta={}),
   ExtractedAnswer(query='What are the opening days and hours of the restaurant Zi Lounge?', score=0.8241560459136963, data='Monday - Saturday', document=Document(id=829cb8c526265bc03477b109eb89acda01db27d62b51c8fa586ca3c63ec56afd, content: 'Zarpe, Bar, Cocktails, Alcohol, LASENORA@ZARPE.BAR, Monday - Saturday 4:00 PM - 12:00 AM, Sunday - C...', score: 21.329905508721964), context=None, document_offset=ExtractedAnswer.Span(start=52, end=69), context_offset=None, meta={}),
   ExtractedAnswer(query=

In [None]:
# Ask the embedding-based pipeline when Zi Lounge is open.
query = "When is open the restaurant Zi Lounge?"
extractive_qa_pipeline.run(
    data={
        # The embedder turns the question text into the query vector.
        "embedder": {"text": query},
        # The retriever is asked for 3 documents.
        "retriever": {"top_k": 3},
        # The reader gets the same question and is asked for 3 answers.
        "reader": {"query": query, "top_k": 3},
    }
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'reader': {'answers': [ExtractedAnswer(query='When is open the restaurant Zi Lounge?', score=0.786523699760437, data='Monday - Sunday', document=Document(id=e8a323a4533b246c32c4f3b40c1c583c7684cad1b757abb5e4bd8b211c473ac4, content: 'Zi Lounge, Restaurant, Bar, International, Seafood, Pizza, Pasta, Meat, Steak, Salads, Cocktails, Al...', score: 27.195980742193726), context=None, document_offset=ExtractedAnswer.Span(start=126, end=141), context_offset=None, meta={}),
   ExtractedAnswer(query='When is open the restaurant Zi Lounge?', score=0.7416019439697266, data='Monday - Saturday', document=Document(id=829cb8c526265bc03477b109eb89acda01db27d62b51c8fa586ca3c63ec56afd, content: 'Zarpe, Bar, Cocktails, Alcohol, LASENORA@ZARPE.BAR, Monday - Saturday 4:00 PM - 12:00 AM, Sunday - C...', score: 21.518277971272383), context=None, document_offset=ExtractedAnswer.Span(start=52, end=69), context_offset=None, meta={}),
   ExtractedAnswer(query='When is open the restaurant Zi Lounge?', score=0.687

Dependency Parsing также не оправдал ожиданий.

Мы также можем попробовать использовать более сложные алгоритмы извлечения ответов, такие как Span-based QA или Sequence-to-sequence QA.

Смысла использовать Span-based QA нет, так как алгоритм хорошо работает, только если ответ напрямую присутствует в тексте.

In [None]:
# pip install farm-haystack

In [None]:
# from haystack.nodes import T5Reader
# from transformers import T5Tokenizer, T5ForConditionalGeneration

In [None]:
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever

In [None]:
# Create a fresh in-memory document store.
document_store = InMemoryDocumentStore()

# Load the prepared `documents` into the store.
# No retriever is built in this cell: the BM25 retriever that qa_pipeline uses
# is created in a later cell, and an embedding retriever assigned here would be
# overwritten before it was ever used.
# The trailing ';' keeps the cell output empty.
document_store.write_documents(documents);

In [None]:
# Start a new, empty pipeline; its retriever and reader are added in the following cells.
qa_pipeline = Pipeline()

In [None]:
# Rebind `retriever` to a BM25 retriever over `document_store`
# and register it in qa_pipeline under the name "retriever".
retriever = InMemoryBM25Retriever(document_store=document_store)
qa_pipeline.add_component(instance=retriever, name="retriever")

In [None]:
# Register an ExtractiveReader configured with the "deepset/deberta-v3-base-squad2" model as "reader".
qa_pipeline.add_component(instance=ExtractiveReader(model="deepset/deberta-v3-base-squad2"), name="reader")

In [None]:
# qa_pipeline.add_component(instance=InMemoryBM25Retriever(document_store=document_store), name="retriever")
# qa_pipeline.add_component(instance=ExtractiveReader(model="deepset/deberta-v3-base-squad2"), name="reader")

In [None]:
# Connect the retriever to the reader; the pipeline summary printed by this cell
# shows the resulting edge as retriever.documents -> reader.documents.
qa_pipeline.connect("retriever", "reader")

<haystack.core.pipeline.pipeline.Pipeline object at 0x780b17860640>
🚅 Components
  - retriever: InMemoryBM25Retriever
  - reader: ExtractiveReader
🛤️ Connections
  - retriever.documents -> reader.documents (List[Document])

In [None]:
# Ask for the opening hours of Zi Lounge through the BM25 pipeline.
query = "What are the opening hours of the restaurant Zi Lounge?"
qa_pipeline.run(
    data={
        # The BM25 retriever takes the raw question text and is asked for 3 documents.
        "retriever": {"query": query, "top_k": 3},
        # The reader gets the same question and is asked for 3 answers.
        "reader": {"query": query, "top_k": 3},
    }
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


{'reader': {'answers': [ExtractedAnswer(query='What are the opening hours of the restaurant Zi Lounge?', score=0.8549867868423462, data=' Monday - Sunday 11:00 AM - 2:30 AM', document=Document(id=e8a323a4533b246c32c4f3b40c1c583c7684cad1b757abb5e4bd8b211c473ac4, content: 'Zi Lounge, Restaurant, Bar, International, Seafood, Pizza, Pasta, Meat, Steak, Salads, Cocktails, Al...', score: 25.75866793740871), context=None, document_offset=ExtractedAnswer.Span(start=125, end=160), context_offset=None, meta={}),
   ExtractedAnswer(query='What are the opening hours of the restaurant Zi Lounge?', score=0.6113113164901733, data=' Monday - Saturday 4:00 PM - 12:00 AM, Sunday - Closed', document=Document(id=829cb8c526265bc03477b109eb89acda01db27d62b51c8fa586ca3c63ec56afd, content: 'Zarpe, Bar, Cocktails, Alcohol, LASENORA@ZARPE.BAR, Monday - Saturday 4:00 PM - 12:00 AM, Sunday - C...', score: 20.38204179149279), context=None, document_offset=ExtractedAnswer.Span(start=51, end=105), context_offset=Non

In [None]:
# Same question, now asking explicitly for both days and hours.
query = "What are the opening days and hours of the restaurant Zi Lounge?"
qa_pipeline.run(
    data={
        # The BM25 retriever takes the raw question text and is asked for 3 documents.
        "retriever": {"query": query, "top_k": 3},
        # The reader gets the same question and is asked for 3 answers.
        "reader": {"query": query, "top_k": 3},
    }
)

{'reader': {'answers': [ExtractedAnswer(query='What are the opening days and hours of the restaurant Zi Lounge?', score=0.8552565574645996, data=' Monday - Sunday 11:00 AM - 2:30 AM', document=Document(id=e8a323a4533b246c32c4f3b40c1c583c7684cad1b757abb5e4bd8b211c473ac4, content: 'Zi Lounge, Restaurant, Bar, International, Seafood, Pizza, Pasta, Meat, Steak, Salads, Cocktails, Al...', score: 25.88915923886433), context=None, document_offset=ExtractedAnswer.Span(start=125, end=160), context_offset=None, meta={}),
   ExtractedAnswer(query='What are the opening days and hours of the restaurant Zi Lounge?', score=0.6460022926330566, data=' Monday - Saturday 4:00 PM - 12:00 AM, Sunday - Closed', document=Document(id=829cb8c526265bc03477b109eb89acda01db27d62b51c8fa586ca3c63ec56afd, content: 'Zarpe, Bar, Cocktails, Alcohol, LASENORA@ZARPE.BAR, Monday - Saturday 4:00 PM - 12:00 AM, Sunday - C...', score: 20.497650393518462), context=None, document_offset=ExtractedAnswer.Span(start=51, end=105),

In [None]:
# Ask the BM25 pipeline when Zi Lounge is open.
query = "When is open the restaurant Zi Lounge?"
qa_pipeline.run(
    data={
        # The BM25 retriever takes the raw question text and is asked for 3 documents.
        "retriever": {"query": query, "top_k": 3},
        # The reader gets the same question and is asked for 3 answers.
        "reader": {"query": query, "top_k": 3},
    }
)

{'reader': {'answers': [ExtractedAnswer(query='When is open the restaurant Zi Lounge?', score=0.757442057132721, data=' Monday - Sunday 11:00 AM - 2:30 AM', document=Document(id=e8a323a4533b246c32c4f3b40c1c583c7684cad1b757abb5e4bd8b211c473ac4, content: 'Zi Lounge, Restaurant, Bar, International, Seafood, Pizza, Pasta, Meat, Steak, Salads, Cocktails, Al...', score: 19.801461834952015), context=None, document_offset=ExtractedAnswer.Span(start=125, end=160), context_offset=None, meta={}),
   ExtractedAnswer(query='When is open the restaurant Zi Lounge?', score=0.48066219687461853, data=' Monday - Saturday 4:00 PM - 12:00 AM, Sunday - Closed', document=Document(id=829cb8c526265bc03477b109eb89acda01db27d62b51c8fa586ca3c63ec56afd, content: 'Zarpe, Bar, Cocktails, Alcohol, LASENORA@ZARPE.BAR, Monday - Saturday 4:00 PM - 12:00 AM, Sunday - C...', score: 16.35872539077387), context=None, document_offset=ExtractedAnswer.Span(start=51, end=105), context_offset=None, meta={}),
   ExtractedAnswer(q