In [38]:
! pip install elasticsearch==7.17.7 huggingface-hub==0.11.0 transformers==4.21.2 datasets

Installing collected packages: elasticsearch
Successfully installed elasticsearch-7.17.7


In [39]:
from datasets import get_dataset_config_names
import transformers
import torch

In [40]:
# List the configurations the SubjQA dataset is published under; the bare
# name on the last line displays them as the cell output.
domains = get_dataset_config_names("subjqa")
domains


['books', 'electronics', 'grocery', 'movies', 'restaurants', 'tripadvisor']

In [41]:
# Record the installed PyTorch version next to the results.
print("Pytorch version: {} ".format(torch.__version__))

Pytorch version: 2.0.1+cu118 


In [42]:
# Check whether PyTorch can see a CUDA device in this environment.
torch.cuda.is_available()

True

In [43]:
from datasets import load_dataset
# Load only the "electronics" configuration of SubjQA; the bare name on the
# last line displays the resulting DatasetDict with its splits.
subjqa = load_dataset("subjqa", name="electronics")
subjqa




  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp', 'q_reviews_id', 'question_subj_level', 'ques_subj_score', 'is_ques_subjective', 'review_id', 'id', 'title', 'context', 'question', 'answers'],
        num_rows: 1295
    })
    test: Dataset({
        features: ['domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp', 'q_reviews_id', 'question_subj_level', 'ques_subj_score', 'is_ques_subjective', 'review_id', 'id', 'title', 'context', 'question', 'answers'],
        num_rows: 358
    })
    validation: Dataset({
        features: ['domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp', 'q_reviews_id', 'question_subj_level', 'ques_subj_score', 'is_ques_subjective', 'review_id', 'id', 'title', 'context', 'question', 'answers'],
        num_rows: 255
    })
})

In [44]:

# Show one training example: its question, the review text, and the answers.
# Index the row first and then pick the fields: `dataset[i][column]` reads a
# single row, whereas `dataset[column][i]` materializes the whole column just
# to read one element (and the original did that three times).
sample = subjqa['train'][1]
separator = "###########################################################################"

print(sample['question'])
print(separator)
print(sample['context'])
print(separator)
print(sample['answers'])

Is this music song have a goo bass?
###########################################################################
To anyone who hasn't tried all the various types of headphones, it is important to remember exactly what these are: cheap portable on-ear headphones. They give a totally different sound then in-ears or closed design phones, but for what they are I would say they're good. I currently own six pairs of phones, from stock apple earbuds to Sennheiser HD 518s. Gave my Portapros a run on both my computer's sound card and mp3 player, using 256 kbps mp3s or better. The clarity is good and they're very lightweight. The folding design is simple but effective. The look is certainly retro and unique, although I didn't find it as comfortable as many have claimed. Earpads are *very* thin and made my ears sore after 30 minutes of listening, although this can be remedied to a point by adjusting the "comfort zone" feature (tightening the temple pads while loosening the ear pads). The cord seem

In [45]:
import pandas as pd

# Flatten the dataset's nested features, then convert every split to a pandas
# DataFrame keyed by split name.
flat_splits = subjqa.flatten()
dfs = {}
for split, dset in flat_splits.items():
    dfs[split] = dset.to_pandas()

# Report how many distinct ids each split contains.
for split, df in dfs.items():
    n_unique_ids = df["id"].nunique()
    print(f"number of question in {split}: {n_unique_ids}")


number of question in train: 1295
number of question in test: 358
number of question in validation: 255


### Build a QA system

In [None]:
# ## download elasticsearch
url = "https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz"

!wget -nc -q {url}
# # unpack it with the tar shell command
!tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz

In [None]:
# run elastic search as background process

import os
from subprocess import Popen , PIPE , STDOUT

!chown -R AIMastery:AIMastery elasticsearch-7.9.2

es_server = Popen(args = ['elasticsearch-7.9.2/bin/elasticsearch'] ,
                 stdout = PIPE ,
                 stderr = STDOUT)

! sleep 30

In [None]:
# Test the connection: a running node answers on port 9200 with a JSON
# description of itself (name, cluster, version).

!curl -X GET "localhost:9200/?pretty"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
{
  "name" : "abdelmageed-virtual-machine",
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "5hYii2vhQVmski_ILFOqJQ",
  "version" : {
    "number" : "7.9.2",
    "build_flavor" : "default",
    "build_type" : "tar",
    "build_hash" : "d34da0ea4a966c4e49417f2da2f244e3e97b4e6e",
    "build_date" : "2020-09-23T00:45:33.626720Z",
    "build_snapshot" : false,
    "lucene_version" : "8.6.2",
    "minimum_wire_compatibility_version" : "6.8.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


In [None]:


# pip install farm-haystack==1.10.0

from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore

# Ask the store to return document embeddings so a dense retriever can use
# them later.
document_store = ElasticsearchDocumentStore(return_embedding=True)

for split, df in dfs.items():
    # Keep one row per distinct review text within the split.
    unique_reviews = df.drop_duplicates(subset="context")

    docs = []
    for _, row in unique_reviews.iterrows():
        docs.append({
            "content": row["context"],
            "meta": {
                "item_id": row["title"],
                "question_id": row["id"],
                "split": split,
            },
        })

    document_store.write_documents(docs, index="document")

print(f'loaded {document_store.get_document_count()} documents')

loaded 1615 documents


In [None]:
from haystack.nodes.retriever.sparse import ElasticsearchRetriever


# Sparse retriever that runs its queries against the Elasticsearch document store.
es_retriever = ElasticsearchRetriever(document_store = document_store)




In [None]:
# Query the sparse retriever, restricted to reviews of one product that come
# from the training split.
item_id = "B0074BW614"
query = "is it good for reading"

retrieval_filters = {"item_id": [item_id], "split": ["train"]}
retrieved_docs = es_retriever.retrieve(query=query, top_k=3, filters=retrieval_filters)

# print() shows the short form of the best-scoring document
print(retrieved_docs[0])


<Document: id=252e83e25d52df7311d597dc89eef9f6, content='This is a gift to myself.  I have been a kindle user for 4 years and this is my third one.  I never  ...'>


In [None]:
# Display the top hit in full (content, score, metadata) instead of the
# truncated form that print() gives.
retrieved_docs[0]

<Document: {'content': 'This is a gift to myself.  I have been a kindle user for 4 years and this is my third one.  I never thought I would want a fire for I mainly use it for book reading.  I decided to try the fire for when I travel I take my laptop, my phone and my iPod classic.  I love my iPod but watching movies on the plane with it can be challenging because it is so small. Laptops battery life is not as good as the Kindle.  So the Fire combines for me what I needed all three to do. So far so good.', 'content_type': 'text', 'score': 0.6857824513476455, 'meta': {'item_id': 'B0074BW614', 'question_id': '868e311275e26dbafe5af70774a300f3', 'split': 'train'}, 'embedding': None, 'id': '252e83e25d52df7311d597dc89eef9f6'}>

In [None]:
# part 2 the reader

from haystack.nodes import FARMReader

# Reader configuration: checkpoint plus the sequence-length and stride
# settings handed to FARMReader.
model_ckpt = "deepset/minilm-uncased-squad2"
max_seq_length = 384
doc_stride = 128

reader_kwargs = dict(
    model_name_or_path=model_ckpt,
    progress_bar=False,
    max_seq_len=max_seq_length,
    doc_stride=doc_stride,
    return_no_answer=True,  # an empty "no answer" candidate may appear in the results
)
reader = FARMReader(**reader_kwargs)


In [None]:
# Run the reader alone on the text of the top retrieved document.
top_doc_text = retrieved_docs[0].content
reader_output = reader.predict_on_texts(question=query, texts=[top_doc_text], top_k=3)
print(reader_output)


{'query': 'is it good for reading', 'no_ans_gap': 10.394830226898193, 'answers': [<Answer {'answer': '', 'type': 'extractive', 'score': 0.5389688395815545, 'context': None, 'offsets_in_document': [{'start': 0, 'end': 0}], 'offsets_in_context': [{'start': 0, 'end': 0}], 'document_id': None, 'meta': {}}>, <Answer {'answer': 'I mainly use it for book reading', 'type': 'extractive', 'score': 0.36830073595046997, 'context': ' is my third one.  I never thought I would want a fire for I mainly use it for book reading.  I decided to try the fire for when I travel I take my la', 'offsets_in_document': [{'start': 132, 'end': 164}], 'offsets_in_context': [{'start': 59, 'end': 91}], 'document_id': '252e83e25d52df7311d597dc89eef9f6', 'meta': {}}>]}


In [None]:

from haystack.pipelines import ExtractiveQAPipeline

# Combine the reader with the Elasticsearch (sparse) retriever into one QA pipeline.
pipe = ExtractiveQAPipeline(reader, es_retriever)


In [None]:

# Run the retriever + reader pipeline on `query` and print the best answers.
n_answers = 3

# Ask the reader for exactly as many answers as are displayed below, so the
# two numbers cannot drift apart.
# NOTE(review): unlike the earlier es_retriever.retrieve(...) call, no
# item_id/split filters are passed here, so answers may come from reviews of
# other products. Confirm this is intended.
preds = pipe.run(query=query,
                 params={"Retriever": {"top_k": 3}, "Reader": {"top_k": n_answers}})

print(f"Question: {preds['query']} \n" )

# Slice rather than index with range(n_answers): if the pipeline returns fewer
# answers than requested, preds['answers'][idx] would raise IndexError.
for idx, answer in enumerate(preds['answers'][:n_answers]):
    print(f"Answer {idx+1}: {answer.answer}" )
    print(f"Review snippet: ...{answer.context}..." )
    print(" \n\n" )

Question: is it good for reading 

Answer 1: This is a good tablet for reading books
Review snippet: ... I expect of a tablet.The text is very clear and sharp. This is a good tablet for reading books, which is more than I expected.There is the one small ...
 


Answer 2: it is great for reading books when no light is available
Review snippet: ...ecoming addicted to hers! Our son LOVES it and it is great for reading books when no light is available. Amazing sound but I suggest good headphones t...
 


Answer 3: 
Review snippet: ...None...
 




In [None]:
from haystack.nodes.retriever.dense import DensePassageRetriever

# Dense retriever that uses separate DPR checkpoints for encoding queries
# and passages.
dpr_retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    embed_title=False,
)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.


In [None]:

# Have the document store update its document embeddings using the DPR retriever.
document_store.update_embeddings(retriever=dpr_retriever)


In [None]:

from haystack.nodes import FARMReader

# Build the reader again with the same checkpoint and arguments as the
# earlier reader cell, so this part can be run without it.
model_ckpt = "deepset/minilm-uncased-squad2"
max_seq_length = 384
doc_stride = 128

reader_settings = {
    "model_name_or_path": model_ckpt,
    "progress_bar": False,
    "max_seq_len": max_seq_length,
    "doc_stride": doc_stride,
    "return_no_answer": True,
}
reader = FARMReader(**reader_settings)

In [None]:
from haystack.pipelines import ExtractiveQAPipeline

# Same QA pipeline as before, but with the dense (DPR) retriever in front of
# the reader.
pipe = ExtractiveQAPipeline(reader, dpr_retriever)

n_answers = 3

# Ask the reader for exactly as many answers as are displayed below, so the
# two numbers cannot drift apart.
preds = pipe.run(query=query,
                 params={"Retriever": {"top_k": 3}, "Reader": {"top_k": n_answers}})

print(f"Question: {preds['query']} \n" )

# Slice rather than index with range(n_answers): if the pipeline returns fewer
# answers than requested, preds['answers'][idx] would raise IndexError.
for idx, answer in enumerate(preds['answers'][:n_answers]):
    print(f"Answer {idx+1}: {answer.answer}" )
    print(f"Review snippet: ...{answer.context}..." )
    print(" \n\n" )

Question: is it good for reading 

Answer 1: ibooks
Review snippet: ... this case. The Ipad part removes easily, for playing games, or reading ibooks. All Ipad Control buttons and camera are not obstructed by the case. I ...
 


Answer 2: 
Review snippet: ...None...
 


Answer 3: It works really well
Review snippet: ...had good and bad Amazon reviews. I took a chance and bought one. It works really well.  Keys are understandably smaller than a Standard Keyboard size,...
 


