In [1]:
from crapstack.document_stores.faiss import FAISSDocumentStore


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
document_store = FAISSDocumentStore(embedding_dim=128, faiss_index_factory_str="Flat")


In [2]:
import pandas as pd

data = pd.read_csv('c:/users/kchun/desktop/notebooks/patents_v6.csv')
data

Unnamed: 0,patent_id,mainclass_id,year,patent_text
0,3930300,257,1976,Junction field effect transistor A junction f...
1,3930303,257,1976,Method manufacturing compact thermoelectric mo...
2,3930304,257,1976,Method apparatus selective burnout trimming in...
3,3930305,257,1976,Method manufacturing integrated circuits A met...
4,3930306,257,1976,Process attaching lead member semiconductor de...
...,...,...,...,...
7814692,8496534,470,2013,Group taps prepared hole cutting tools In FIG ...
7814693,8534022,470,2013,Twisted threaded reinforcing bar Techniques re...
7814694,8550755,470,2013,Tap driver rigidsynchronous tapping Disclosed ...
7814695,8602696,470,2013,Form tap plurality lobes A form tap tapping ar...


In [3]:
from crapstack.haystack.schema import Document

documents = [Document(content=txt) for txt in data['patent_text'].sample(frac=.0005)]
document_store.write_documents(documents=documents)

Writing Documents: 10000it [00:05, 1802.50it/s]            


In [5]:
from crapstack.haystack.nodes.retriever import DensePassageRetriever
from crapstack.haystack.nodes.answer_generator.transformers import Seq2SeqGenerator
from crapstack.haystack.pipelines.standard_pipelines import GenerativeQAPipeline
# from transformers import T5ForConditionalGeneration
# import torch

retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="vblagoje/dpr-question_encoder-single-lfqa-wiki",
    passage_embedding_model="vblagoje/dpr-ctx_encoder-single-lfqa-wiki",
)

generator = Seq2SeqGenerator(model_name_or_path="vblagoje/bart_lfqa")
# generator = T5ForConditionalGeneration.from_pretrained("google/flan-ul2", torch_dtype=torch.bfloat16, device_map="auto")                                                                 

pipe = GenerativeQAPipeline(generator, retriever)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.
ML logging didn't work: name 'tracker' is not defined


In [6]:
document_store.update_embeddings(retriever)

Documents Processed: 10000 docs [15:08, 11.01 docs/s]           


In [7]:
prediction = pipe.run(
    query="what is machine learning?"
)

prediction

{'query': 'what is machine learning?',
 'answers': [<Answer {'answer': "Machine learning is the process of training a neural network to recognize patterns in data. For example, let's say you want to train a computer to recognize faces in pictures. You have a bunch of pictures of faces, and you want the computer to be able to recognize the faces. You give the computer a picture of a face, and it looks at the pictures and tries to figure out what the face looks like. You then give it another picture of the same face and it does the same thing, and so on. Machine learning is a way of teaching a computer how to recognise faces.", 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_ids': ['d0cf790b5e9880fb4caa647b7ff55fd6', 'c614b1b24441ba6ae044a23fd40a4681', '9f194f13ccb811c3c34cd662bf9f262d', 'e71aeadb00daaa25d07c3d506c7cf9d4', 'cc42c1df9b4afa058ecfb69c2715b9be', 'c2ef8a6a81a76b9b84de55da638d2d84', 'a8a8e51de24d39f9ae17d

In [1]:
from crapstack.document_stores.faiss import FAISSDocumentStore
from crapstack.haystack.nodes.retriever import DensePassageRetriever
from crapstack.haystack.nodes.answer_generator import RAGenerator


# Initialize FAISS document store.
# Set `return_embedding` to `True`, so generator doesn't have to perform re-embedding
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat", return_embedding=True)

# Initialize DPR Retriever to encode documents, encode question and query documents
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=True,
    embed_title=True,
)

# Initialize RAG Generator
generator = RAGenerator(
    model_name_or_path="facebook/rag-token-nq",
    use_gpu=True,
    top_k=1,
    max_length=200,
    min_length=2,
    embed_title=True,
    num_beams=2,
)


  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 7.09kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 935kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.41MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 493/493 [00:00<00:00, 61.5kB/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [01:56<00:00, 3.76MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 3.46kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 568kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 

In [4]:
# Delete existing documents in documents store
document_store.delete_documents()

# Write documents to document store
document_store.write_documents(documents)

# Add documents embeddings to index
document_store.update_embeddings(retriever=retriever)


Writing Documents: 10000it [00:06, 1539.28it/s]            
Updating Embedding:   0%|          | 0/3903 [16:52<?, ? docs/s]


KeyboardInterrupt: 