In [None]:
# !pip install git+https://github.com/deepset-ai/haystack.git
# !pip install 'ray[default]'

In [14]:
import json
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from glob import glob
from tqdm.notebook import tqdm
from sklearn.manifold import TSNE
from haystack import Pipeline
from haystack.utils import (
    clean_wiki_text, 
    convert_files_to_dicts, 
    fetch_archive_from_http, 
    print_answers, 
    print_documents, 
)
from haystack.nodes import (
    PreProcessor, 
    FARMReader, 
    TransformersReader, 
    DensePassageRetriever, 
    TransformersQueryClassifier,
)
from haystack.document_stores import InMemoryDocumentStore, FAISSDocumentStore

Generating new fontManager, this may take some time...
Failed to extract font properties from /System/Library/Fonts/Apple Color Emoji.ttc: In FT2Font: Could not set the fontsize (error code 0x17)
Failed to extract font properties from /System/Library/Fonts/Supplemental/NISC18030.ttf: In FT2Font: Could not set the fontsize (error code 0x17)
Failed to extract font properties from /System/Library/Fonts/LastResort.otf: tuple indices must be integers or slices, not str


In [2]:
files = glob("data/fbmetaverse/*.json")

docs = []
for f in tqdm(files[:10]):
    with open(f, 'rb') as fp:
        data = fp.read()
        docs.append(json.loads(data))

  0%|          | 0/10 [00:00<?, ?it/s]

In [4]:
# load and clean documents with processor
processor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=200,
    split_respect_sentence_boundary=True,
    split_overlap=0
)
documents = processor.process(docs)

# create document store
document_store = FAISSDocumentStore(
    similarity='dot_product',
    return_embedding=True,
    faiss_index_factory_str="Flat"
)
document_store.write_documents(documents, duplicate_documents='overwrite')

# load retriever
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base"
)

# save/load document store
document_store.update_embeddings(retriever)
document_store.save("facebook-meta.faiss")
document_store = FAISSDocumentStore.load("facebook-meta.faiss")

  0%|          | 0/10 [00:00<?, ?docs/s]One or more sentence found with word count higher than the split length.
100%|██████████| 10/10 [00:00<00:00, 378.65docs/s]
Using devices: CPU
Number of GPUs: 0
Lock 140569170383632 acquired on /Users/nicholaslincoln/.cache/huggingface/transformers/4ad08b5f983c1384baaf257d8edf51a7a3961fd8c75a1778ac604e3c0b564dd9.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Lock 140569170383632 released on /Users/nicholaslincoln/.cache/huggingface/transformers/4ad08b5f983c1384baaf257d8edf51a7a3961fd8c75a1778ac604e3c0b564dd9.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock
Lock 140569170706976 acquired on /Users/nicholaslincoln/.cache/huggingface/transformers/b305bc9085b3d0ce33551c251b75c11b6c6df1d4d51e5d3439d01cf4bb1abc9d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4.lock


Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Lock 140569170706976 released on /Users/nicholaslincoln/.cache/huggingface/transformers/b305bc9085b3d0ce33551c251b75c11b6c6df1d4d51e5d3439d01cf4bb1abc9d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4.lock
Lock 140569169403472 acquired on /Users/nicholaslincoln/.cache/huggingface/transformers/d5b5f07ee846d5baa7142e121b6ee77d11ac68bd5d4541faab38a1ea76c2954a.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79.lock


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Lock 140569169403472 released on /Users/nicholaslincoln/.cache/huggingface/transformers/d5b5f07ee846d5baa7142e121b6ee77d11ac68bd5d4541faab38a1ea76c2954a.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79.lock
Lock 140569170286048 acquired on /Users/nicholaslincoln/.cache/huggingface/transformers/52774638a790c9ebc5ce11005b260f79cd4cc389abdab9eaa31e8f09d15b4f46.13b559f49587470ab6d85a7dde13174670a0b61c1b942d1489c96023f5d03772.lock


Downloading:   0%|          | 0.00/493 [00:00<?, ?B/s]

Lock 140569170286048 released on /Users/nicholaslincoln/.cache/huggingface/transformers/52774638a790c9ebc5ce11005b260f79cd4cc389abdab9eaa31e8f09d15b4f46.13b559f49587470ab6d85a7dde13174670a0b61c1b942d1489c96023f5d03772.lock
Lock 140569163550624 acquired on /Users/nicholaslincoln/.cache/huggingface/transformers/41dac75f5df9070331cb0e4bf318c9fdeaef38d9ffd8ca80993c7db830d0c674.446ee898f4788c3ee90f8e7ee5a50281905f509e698f76dc0b583eb74ef973bd.lock


Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Lock 140569163550624 released on /Users/nicholaslincoln/.cache/huggingface/transformers/41dac75f5df9070331cb0e4bf318c9fdeaef38d9ffd8ca80993c7db830d0c674.446ee898f4788c3ee90f8e7ee5a50281905f509e698f76dc0b583eb74ef973bd.lock
Lock 140569170782048 acquired on /Users/nicholaslincoln/.cache/huggingface/transformers/deacb2c219c1bfe83909173f286b60d7cbfd37fc73dc8de723805ca82cabd183.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Lock 140569170782048 released on /Users/nicholaslincoln/.cache/huggingface/transformers/deacb2c219c1bfe83909173f286b60d7cbfd37fc73dc8de723805ca82cabd183.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock
Lock 140569170814528 acquired on /Users/nicholaslincoln/.cache/huggingface/transformers/9a42d18175a45f8dcfd587d7056cbe397e0fe49828bcc543bc3f5b4d2862f7e5.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4.lock


Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Lock 140569170814528 released on /Users/nicholaslincoln/.cache/huggingface/transformers/9a42d18175a45f8dcfd587d7056cbe397e0fe49828bcc543bc3f5b4d2862f7e5.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4.lock
Lock 140569170300304 acquired on /Users/nicholaslincoln/.cache/huggingface/transformers/70b0d7ed89bb3511a323f99b7cfa4a3e0c35754fda6a3ac74c3458ca8ffb5764.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79.lock


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Lock 140569170300304 released on /Users/nicholaslincoln/.cache/huggingface/transformers/70b0d7ed89bb3511a323f99b7cfa4a3e0c35754fda6a3ac74c3458ca8ffb5764.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79.lock
Lock 140569170901312 acquired on /Users/nicholaslincoln/.cache/huggingface/transformers/f31ea67434695abc6c4fbe109214416d8b48a44f2fe5a0617e7faa3d6a4f8d05.be8dbf4cc0650b9c5997b3b3bc47d0d6c20749c3871e9285d3b624cd75dd9ee6.lock


Downloading:   0%|          | 0.00/492 [00:00<?, ?B/s]

Lock 140569170901312 released on /Users/nicholaslincoln/.cache/huggingface/transformers/f31ea67434695abc6c4fbe109214416d8b48a44f2fe5a0617e7faa3d6a4f8d05.be8dbf4cc0650b9c5997b3b3bc47d0d6c20749c3871e9285d3b624cd75dd9ee6.lock
Lock 140568816676976 acquired on /Users/nicholaslincoln/.cache/huggingface/transformers/2623d56adfe8cc7bf9275b0c620a0e271ee4004c335173bde56310dc8ea99d4f.714228ba33c6248205269978fd6d0ca0ef96508cbd4a11d894882e71d45fad7c.lock


Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Lock 140568816676976 released on /Users/nicholaslincoln/.cache/huggingface/transformers/2623d56adfe8cc7bf9275b0c620a0e271ee4004c335173bde56310dc8ea99d4f.714228ba33c6248205269978fd6d0ca0ef96508cbd4a11d894882e71d45fad7c.lock
Updating embeddings for 59 docs...
Updating Embedding:   0%|          | 0/59 [00:00<?, ? docs/s]

Create embeddings:   0%|          | 0/64 [00:00<?, ? Docs/s]

Documents Processed: 10000 docs [00:35, 285.44 docs/s]        


In [5]:
# create reader from specified model
model = "deepset/roberta-base-squad2"
reader = FARMReader(model, use_gpu=True)

Using devices: CPU
Number of GPUs: 0
Using devices: CPU
Number of GPUs: 0
Lock 140569488125856 acquired on /Users/nicholaslincoln/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673.lock


Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Lock 140569488125856 released on /Users/nicholaslincoln/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673.lock
Lock 140569170234912 acquired on /Users/nicholaslincoln/.cache/huggingface/transformers/eac3273a8097dda671e3bea1db32c616e74f36a306c65b4858171c98d6db83e9.084aa7284f3a51fa1c8f0641aa04c47d366fbd18711f29d0a995693cfdbc9c9e.lock


Downloading:   0%|          | 0.00/496M [00:00<?, ?B/s]

Lock 140569170234912 released on /Users/nicholaslincoln/.cache/huggingface/transformers/eac3273a8097dda671e3bea1db32c616e74f36a306c65b4858171c98d6db83e9.084aa7284f3a51fa1c8f0641aa04c47d366fbd18711f29d0a995693cfdbc9c9e.lock
Some weights of the model checkpoint at deepset/roberta-base-squad2 were not used when initializing RobertaModel: ['qa_outputs.weight', 'qa_outputs.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['roberta.pooler.dense.bias',

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Lock 140567826999952 released on /Users/nicholaslincoln/.cache/huggingface/transformers/81c80edb4c6cefa5cae64ccfdb34b3b309ecaf60da99da7cd1c17e24a5d36eb5.647b4548b6d9ea817e82e7a9231a320231a1c9ea24053cc9e758f3fe68216f05.lock
Lock 140567826998704 acquired on /Users/nicholaslincoln/.cache/huggingface/transformers/b87d46371731376b11768b7839b1a5938a4f77d6bd2d9b683f167df0026af432.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock


Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Lock 140567826998704 released on /Users/nicholaslincoln/.cache/huggingface/transformers/b87d46371731376b11768b7839b1a5938a4f77d6bd2d9b683f167df0026af432.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
Lock 140567792121984 acquired on /Users/nicholaslincoln/.cache/huggingface/transformers/c9d2c178fac8d40234baa1833a3b1903d393729bf93ea34da247c07db24900d0.cb2244924ab24d706b02fd7fcedaea4531566537687a539ebb94db511fd122a0.lock


Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

Lock 140567792121984 released on /Users/nicholaslincoln/.cache/huggingface/transformers/c9d2c178fac8d40234baa1833a3b1903d393729bf93ea34da247c07db24900d0.cb2244924ab24d706b02fd7fcedaea4531566537687a539ebb94db511fd122a0.lock
Lock 140567792121984 acquired on /Users/nicholaslincoln/.cache/huggingface/transformers/e8a600814b69e3ee74bb4a7398cc6fef9812475010f16a6c9f151b2c2772b089.451739a2f3b82c3375da0dfc6af295bedc4567373b171f514dd09a4cc4b31513.lock


Downloading:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Lock 140567792121984 released on /Users/nicholaslincoln/.cache/huggingface/transformers/e8a600814b69e3ee74bb4a7398cc6fef9812475010f16a6c9f151b2c2772b089.451739a2f3b82c3375da0dfc6af295bedc4567373b171f514dd09a4cc4b31513.lock
Failed to log params: Changing param values is not allowed. Param with key='processor' was already logged with value='TextSimilarityProcessor' for run ID='5a046582f81344a7b400bdc50b025df0'. Attempted logging new value 'SquadProcessor'.
ML Logging is turned off. No parameters, metrics or artifacts will be logged to MLFlow.
Using devices: CPU
Number of GPUs: 0
Got ya 7 parallel workers to do inference ...
 0    0    0    0    0    0    0 
/w\  /w\  /w\  /w\  /w\  /w\  /w\
/'\  / \  /'\  /'\  / \  / \  /'\
            


In [8]:
# classifier for pipeline input
question_classifier = TransformersQueryClassifier(
    model_name_or_path="shahrukhx01/bert-mini-finetune-question-detection"
)

Using devices: CPU
Number of GPUs: 0
Lock 140569170194496 acquired on /Users/nicholaslincoln/.cache/huggingface/transformers/4d234001a83eae968081ec17d51595e697e54a0db0fa1d5bb419e042cf14b24b.d76a0b1a82684bce82554efb4cbe093f287a29945e4298543a2561d1036510cf.lock


Downloading:   0%|          | 0.00/619 [00:00<?, ?B/s]

Lock 140569170194496 released on /Users/nicholaslincoln/.cache/huggingface/transformers/4d234001a83eae968081ec17d51595e697e54a0db0fa1d5bb419e042cf14b24b.d76a0b1a82684bce82554efb4cbe093f287a29945e4298543a2561d1036510cf.lock
Lock 140567826893120 acquired on /Users/nicholaslincoln/.cache/huggingface/transformers/e8abcef969219de89fbd9e9cf9a2b945bbe043b01854dc92f697180da4c7bd6f.40ad410a2c6387d007645e40a171b1568fcda3cafd4008a4032aa4b5e5b6e69a.lock


Downloading:   0%|          | 0.00/44.7M [00:00<?, ?B/s]

Lock 140567826893120 released on /Users/nicholaslincoln/.cache/huggingface/transformers/e8abcef969219de89fbd9e9cf9a2b945bbe043b01854dc92f697180da4c7bd6f.40ad410a2c6387d007645e40a171b1568fcda3cafd4008a4032aa4b5e5b6e69a.lock
Lock 140567123820208 acquired on /Users/nicholaslincoln/.cache/huggingface/transformers/7afed1f7c22a9b2f2a4751a50d00591c62f8d4ce4fca06dcd7c881ce0aae62fc.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Lock 140567123820208 released on /Users/nicholaslincoln/.cache/huggingface/transformers/7afed1f7c22a9b2f2a4751a50d00591c62f8d4ce4fca06dcd7c881ce0aae62fc.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock
Lock 140567123817568 acquired on /Users/nicholaslincoln/.cache/huggingface/transformers/5a5b2544837a891c31ea5e5af99f9a855590d01d7417f962eb35cf1d03ba166c.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock


Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Lock 140567123817568 released on /Users/nicholaslincoln/.cache/huggingface/transformers/5a5b2544837a891c31ea5e5af99f9a855590d01d7417f962eb35cf1d03ba166c.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
Lock 140567826894608 acquired on /Users/nicholaslincoln/.cache/huggingface/transformers/91b4cc2d434f81bed4fd4ad831991ff8ad4f840d7d22a5578ae0f6d2d897ea78.ddf44c24d87fa9a8d4c3d7ae53c574f2a5391a17dcef25d20f9a64d70ee350c5.lock


Downloading:   0%|          | 0.00/334 [00:00<?, ?B/s]

Lock 140567826894608 released on /Users/nicholaslincoln/.cache/huggingface/transformers/91b4cc2d434f81bed4fd4ad831991ff8ad4f840d7d22a5578ae0f6d2d897ea78.ddf44c24d87fa9a8d4c3d7ae53c574f2a5391a17dcef25d20f9a64d70ee350c5.lock


In [6]:
def determine_input_classification(input_text):
  """
  If the question classifier classification does not match the return_type, 
  then adjust the pipeline so that the results can be obtained.  The results 
  depend solely on the classification in this scenario, meaning your 
  return_type will be ignored so that results can be returned.  If this 
  adjustment were not made, errors caused by misclassification could prevent 
  results from showing at all.

  :param input_text: (str) the question or key words to search

  :return: (str) classification output
  """
  prediction = question_classifier.run(query=input_text)
  if prediction[1] == "output_1":
    category = "question"
  else:
    category = "keywords"
  return category


def build_pipeline(input_text, return_type="answer"):
  """
  Builds Haystack pipeline

  :param input_text: (str) the question or key words to search
  :param return_type: (str) either "answer" to return Q&A style answers and 
    their contexts, or "document" to return whole documents

  :return: Haystack pipeline object
  """
  category = determine_input_classification(input_text=input_text)

  pipe = Pipeline()
  pipe.add_node(
      component=question_classifier, 
      name="QueryClassifier", 
      inputs=["Query"]
  )

  if category == "question":
    query_classifier_output = "output_1"
  else:
    query_classifier_output = "output_2"

  pipe.add_node(
      component=retriever, 
      name="ESRetriever", 
      inputs=[f"QueryClassifier.{query_classifier_output}"]
  )

  if return_type == "answer":
    pipe.add_node(
        component=reader, 
        name="QAReader", 
        inputs=["ESRetriever"]
    )

  return pipe


def view_results(prediction):
  """
  Prints result from Haystack query

  :param prediction: (Haystack pipeline output) the object containing the 
    output of a pipeline
  """
  # try will only work if return_type was "answer" and the query was 
  #   correctly classified
  try:
    print_answers(prediction, details="minimal")
  except:
    for doc in prediction["documents"]:
      print(doc.content)


In [9]:
# Example of key word search Q&A
query_string = "zuckerberg position"
return_type = "answer"
pipe = build_pipeline(input_text=query_string, return_type=return_type)
prediction = pipe.run(query=query_string)
view_results(prediction=prediction)

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.31 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.62 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.51 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.60 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.60 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.29s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.62 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.19s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.64 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.70 Batches/s]
print_answers received details='minimal', which was not understood. Valid values are 'minimum', 'medium', and 'all'. Using 'all'.



Query: zuckerberg position
Answers:
[   <Answer {'answer': 'CTO', 'type': 'extractive', 'score': 0.637283205986023, 'context': 'd. Zuckerberg echoed this in a statement on Facebook\'s blog. "As our next CTO, Boz will continue leading Facebook Reality Labs and overseeing our work', 'offsets_in_document': [{'start': 190, 'end': 193}], 'offsets_in_context': [{'start': 74, 'end': 77}], 'document_id': '4d05d9b41b8672163ca4408b18f99ec5', 'meta': {'publisher': 'Business Insider', 'title': 'How Andrew Bosworth, Mark Zuckerberg’s old teaching assistant at Harvard, rose to become his key lieutenant in building the metaverse', 'publish_date': '2021-11-07 10:15:00', 'mediatype': 'News', '_split_id': '6', 'vector_id': '17'}}>,
    <Answer {'answer': 'key lieutenants', 'type': 'extractive', 'score': 0.5706248432397842, 'context': 'n at the company since 2006 and has become one of Mark Zuckerberg\'s key lieutenants. Andrew Bosworth, also known as "Boz" inside the company formerly ', 'offsets_in_docu

In [41]:
# Example of key word search document retrieval
query_string = "what is mark zuckerberg's vision of the metaverse"
return_type = "document"
pipe = build_pipeline(input_text=query_string, return_type=return_type)
prediction = pipe.run(query=query_string, params={"ESRetriever": {"top_k": 100}})
view_results(prediction=prediction)

The term “metaverse” seems to be everywhere. Facebook is hiring thousands of engineers in Europe to work on it, while video game companies are outlining their long-term visions for what some consider the next big thing online. The metaverse, which could spring up again when Facebook releases earnings Monday, is the latest buzzword to capture the tech industry’s imagination. It could be the future, or it could be the latest grandiose vision by Facebook CEO Mark Zuckerberg that doesn’t turn out as expected or isn’t widely adopted for years — if at all. Plus, many have concerns about a new online world tied to a social media giant that could get access to even more personal data and is accused of failing to stop harmful content. Here’s what this online world is all about: WHAT IS THE METAVERSE? Think of it as the internet brought to life, or at least rendered in 3D. Zuckerberg has described it as a “virtual environment” you can go inside of — instead of just looking at on a screen.
Essent

In [77]:
# embed documents to show how similar they are
documents = {idx: doc.content for idx, doc in enumerate(prediction["documents"])}
embeddings = np.array([doc.embedding for doc in prediction["documents"]])
scores = {idx: doc.score for idx, doc in enumerate(prediction["documents"])}
tsne = TSNE(n_components=2, perplexity=3.9, init='random', random_state=14, n_jobs=-1)
embeddings_reduced = tsne.fit_transform(embeddings)
graph_data = pd.DataFrame({
    'x': embeddings_reduced[:,0], 
    'y': embeddings_reduced[:,1], 
    'doc': documents.values(), 
    'score': scores.values()
})

In [78]:
# wrap long text lines
graph_data.doc = graph_data.doc.str.wrap(50)
graph_data.doc = graph_data.doc.apply(lambda x: x.replace('\n', '<br>'))

In [79]:
fig = px.scatter(
    graph_data, 
    x='x', y='y', 
    color="score", 
    hover_data=['doc'], 
    title="TSNE Plot of Retrieved Documents Colored by Relevancy Score"
)
fig.show()