This implementation follows the book *Hands-On Large Language Models* by Jay Alammar and Maarten Grootendorst.

In [None]:
!pip install cohere
!pip install faiss-cpu # vector db

In [4]:
import os

import cohere
import numpy as np
import pandas as pd
from tqdm import tqdm

In [5]:
# Read the Cohere API key from the CO_API_KEY environment variable so the secret
# is never stored in the notebook. Falls back to an empty string (the previous
# placeholder value) when the variable is not set.
api_key = os.environ.get("CO_API_KEY", "")

In [6]:
# Cohere client shared by the embedding calls in the rest of the notebook
co = cohere.Client(api_key=api_key)

In [7]:
# Source corpus: the first section of the Wikipedia article on Interstellar.
# The citation markers from the article (e.g. "[4]", "[5]") are kept in the text as-is.
text = """
Interstellar is a 2014 epic science fiction film co-written,
directed, and produced by Christopher Nolan.
It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain,
Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine.
Set in a dystopian future where humanity is struggling to
survive, the film follows a group of astronauts who travel
through a wormhole near Saturn in search of a new home for
mankind.
Brothers Christopher and Jonathan Nolan wrote the screenplay,
which had its origins in a script Jonathan developed in 2007.
Caltech theoretical physicist and 2017 Nobel laureate in
Physics[4] Kip Thorne was an executive producer, acted as a
scientific consultant, and wrote a tie-in book, The Science of
Interstellar.
Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in
the Panavision anamorphic format and IMAX 70 mm.
Principal photography began in late 2013 and took place in
Alberta, Iceland, and Los Angeles.
Interstellar uses extensive practical and miniature effects and
the company Double Negative created additional digital effects.
Interstellar premiered on October 26, 2014, in Los Angeles.
In the United States, it was first released on film stock,
expanding to venues using digital projectors.
The film had a worldwide gross over $677 million (and $773
million with subsequent re-releases), making it the tenth-highest
grossing film of 2014.
It received acclaim for its performances, direction, screenplay,
musical score, visual effects, ambition, themes, and emotional
weight.
It has also received praise from many astronomers for its
scientific accuracy and portrayal of theoretical astrophysics.
Since its premiere, Interstellar gained a cult following,[5] and
now is regarded by many sci-fi experts as one of the best
science-fiction films of all time.
Interstellar was nominated for five awards at the 87th Academy
Awards, winning Best Visual Effects, and received numerous other
accolades"""

In [8]:
# Naive sentence segmentation: cut the raw text at every period.
# The pieces still carry their surrounding spaces/newlines at this point.
texts = text.split(".")
texts[:2]

['\nInterstellar is a 2014 epic science fiction film co-written,\ndirected, and produced by Christopher Nolan',
 ' \nIt stars Matthew McConaughey, Anne Hathaway, Jessica Chastain,\nBill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine']

In [9]:
# Trim surrounding spaces and newlines from every chunk, and drop chunks that end
# up empty (e.g. the piece after a trailing period): an empty string is not a
# sentence and should not be embedded or indexed.
texts = [chunk for chunk in (t.strip(' \n') for t in texts) if chunk]

In [10]:
# Embed every sentence as a search document and stack the vectors into a 2-D array
response = co.embed(texts=texts, input_type="search_document").embeddings
embeds = np.array(response)
print(embeds.shape)

(15, 4096)


## Building the Search Index

In [18]:
import faiss
import numpy as np
import pandas as pd

In [23]:
# Flat L2 index sized to the embedding dimension (second axis of `embeds`)
dims = embeds.shape[1]
index = faiss.IndexFlatL2(dims)
print(index.is_trained)
# The vectors are cast to float32 before being added to the index
index.add(embeds.astype(np.float32))

True


# Dense Retrieval

## Search the Index

In [26]:
def search(query, no_results=3):
  """Return the indexed sentences closest to `query` in embedding space.

  The query is embedded with the Cohere client `co`, looked up in the FAISS
  `index`, and the hits are mapped back to the sentences in `texts`.

  Args:
    query: Natural-language search string.
    no_results: Maximum number of neighbours to return. Capped at the number
      of vectors stored in the index; values <= 0 yield an empty result.

  Returns:
    A DataFrame with one row per hit, in the order returned by the index:
    `texts` (the matching sentence) and `distance` (the distance reported by
    the index for that sentence).
  """
  query_embeds = co.embed(texts=[query], input_type="search_query").embeddings
  query_embeds = np.float32(query_embeds)

  # FAISS pads missing neighbours with id -1, which numpy indexing would
  # silently turn into "the last sentence". Never ask for more hits than are
  # indexed, and drop any -1 ids that still come back.
  k = min(no_results, index.ntotal)
  if k > 0:
    distances, similar_items_indices = index.search(query_embeds, k=k)
    found = similar_items_indices[0] >= 0
    hit_ids = similar_items_indices[0][found]
    hit_distances = distances[0][found]
  else:
    hit_ids = np.array([], dtype=np.int64)
    hit_distances = np.array([], dtype=np.float32)

  texts_np = np.array(texts)
  results = pd.DataFrame(data={'texts': texts_np[hit_ids], 'distance': hit_distances})
  print(f"Query:'{query}'\nNearest neighbors:")

  return results

In [38]:
# Dense retrieval for a question whose wording overlaps with the source sentence
query = "who wrote the screenplay"
results = search(query=query)
results

Query:'who wrote the screenplay'
Nearest neighbors:


Unnamed: 0,texts,distance
0,Brothers Christopher and Jonathan Nolan wrote ...,7989.950684
1,Interstellar is a 2014 epic science fiction fi...,10071.662109
2,"It stars Matthew McConaughey, Anne Hathaway, J...",10095.088867


In [47]:
# Dense retrieval for a paraphrased question ("precise" vs. the text's "accuracy")
query1 = "how precise was the science"
results1 = search(query=query1)
results1

Query:'how precise was the science'
Nearest neighbors:


Unnamed: 0,texts,distance
0,It has also received praise from many astronom...,11216.210938
1,Interstellar uses extensive practical and mini...,12409.666016
2,Caltech theoretical physicist and 2017 Nobel l...,12459.5


In [39]:
# Dense retrieval for the box-office question
query2 = "what was worldwide gross"
results2 = search(query=query2)
results2

Query:'what was worldwide gross'
Nearest neighbors:


Unnamed: 0,texts,distance
0,The film had a worldwide gross over $677 milli...,7834.939453
1,"In the United States, it was first released on...",12043.701172
2,"Interstellar premiered on October 26, 2014, in...",12448.242188


In [40]:
# Full text of the top hit (first row, first column). A single positional
# (row, column) lookup replaces the chained `.iloc[0][0]`, whose second `[0]`
# is an integer key on a label-indexed row and triggers a pandas deprecation warning.
results2.iloc[0, 0]

  results2.iloc[0][0]


'The film had a worldwide gross over $677 million (and $773\nmillion with subsequent re-releases), making it the tenth-highest\ngrossing film of 2014'

## Keyword Search

Keyword search is not always effective: the most relevant passages often do not contain the exact keywords used in the query.

In [42]:
!pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [43]:
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string

In [44]:
def bm25_tokenizer(text):
  """Turn `text` into the list of tokens used for BM25 indexing and querying.

  The text is lower-cased and split on whitespace; each piece then has its
  leading and trailing punctuation stripped. Pieces that end up empty, or that
  are English stop words, are left out.
  """
  candidates = (word.strip(string.punctuation) for word in text.lower().split())
  return [
    word for word in candidates
    if word and word not in _stop_words.ENGLISH_STOP_WORDS
  ]

In [45]:
# Tokenize every passage (with a progress bar), then build the BM25 index
# over the resulting token lists.
tokenized_corpus = [bm25_tokenizer(passage) for passage in tqdm(texts)]
bm25 = BM25Okapi(tokenized_corpus)

100%|██████████| 15/15 [00:00<00:00, 21006.53it/s]


In [48]:
def keyword_search(query, top_k=3, num_candidates=15):
  """Print the best BM25 (lexical) matches for `query`.

  Args:
    query: Search string; tokenized with `bm25_tokenizer`.
    top_k: Number of hits to print.
    num_candidates: Size of the candidate pool taken from the BM25 scores
      before sorting. Capped at the number of scored passages; values <= 0
      produce no hits.

  Returns:
    None. The query, a header line, and one "score<TAB>passage" line per hit
    are printed; newlines inside a passage are replaced by spaces.
  """
  print("Input question:", query)
  ##### BM25 search (lexical search) #####
  bm25_scores = bm25.get_scores(bm25_tokenizer(query))

  # np.argpartition raises ValueError when asked for more elements than exist,
  # so never request more candidates than there are scored passages.
  num_candidates = min(num_candidates, len(bm25_scores))
  if num_candidates > 0:
    top_n = np.argpartition(bm25_scores, -num_candidates)[-num_candidates:]
  else:
    top_n = []
  bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
  bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

  # The header reports the requested top_k instead of a hard-coded "3"
  print(f"Top-{top_k} lexical search (BM25) hits")
  for hit in bm25_hits[0:top_k]:
    print("\t{:.3f}\t{}".format(hit['score'],
                                texts[hit['corpus_id']].replace("\n", " ")))

In [49]:
# Lexical search for the paraphrased question used in the dense-retrieval section
keyword_search(query="how precise was the science")

Input question: how precise was the science
Top-3 lexical search (BM25) hits
	1.789	Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan
	1.373	Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar
	0.000	It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine


In [50]:
# Lexical search for a question that shares its keywords with the source sentence
keyword_search(query="who wrote the screenplay")

Input question: who wrote the screenplay
Top-3 lexical search (BM25) hits
	3.577	Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007
	1.663	It received acclaim for its performances, direction, screenplay, musical score, visual effects, ambition, themes, and emotional weight
	1.373	Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar


In [51]:
# An off-topic question still gets "top" hits: the search always prints the
# best-ranked passages, even when nothing in the corpus is relevant.
keyword_search(query="What is the mass of the moon?")

Input question: What is the mass of the moon?
Top-3 lexical search (BM25) hits
	0.000	Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan
	0.000	It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine
	0.000	Set in a dystopian future where humanity is struggling to survive, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for mankind
