# Semantic Search

In [1]:
''' 
Chapter 2: Launching an Application with Proprietary MOdels
    Overview of Proprietary Models
    Introduction to OpenAI + Embeddings / GPT3 / ChatGPT
    Introduction to Vector Databases
    Building a Neural/Semantic information retrieval system with vector databases, BERT & GPT3
'''

' \nChapter 2: Launching an Application with Proprietary MOdels\n    Overview of Proprietary Models\n    Introduction to OpenAI + Embeddings / GPT3 / ChatGPT\n    Introduction to Vector Databases\n    Building a Neural/Semantic information retrieval system with vector databases, BERT & GPT3\n'

In [2]:
# import libraries
from openai import OpenAI
from datetime import datetime, timezone
import hashlib
import re
import os
from tqdm.notebook import tqdm
import numpy as np

import logging
import keyring

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

## Vector Database

In [3]:
pinecone_key = keyring.get_password('pinecone', 'ahn283')
openai_api_key = keyring.get_password('openai', 'key_for_windows')

client = OpenAI(
    api_key=openai_api_key
)

INDEX_NAME = 'semantic-search'
NAMESPACE = 'default'
ENGINE = 'text-embedding-ada-002'

In [4]:
# from pinecone import Pinecone       # !pip install pinecone-client
# from pinecone import ServerlessSpec
# https://docs.pinecone.io/guides/get-started/quickstart

from pinecone import Pinecone 
from pinecone import ServerlessSpec, PodSpec

pc = Pinecone(
    api_key=pinecone_key, 
    # environment='us-west1-gcp'
)

In [5]:
# helper functions to get lists of embeddings from the OpenAI API
def get_embeddings(texts, engine=ENGINE):
    response = client.embeddings.create(
        input=texts,
        model=engine
    )
    
    return [d.embedding for d in list(response.data)]

def get_embedding(text):
    return get_embeddings([text])[0]


len(get_embedding('hi')), len(get_embeddings(['hi','hello']))

(1536, 2)

In [6]:
np.array(get_embedding('hi')).shape, np.array(get_embeddings(['hi', 'hello'])).shape

((1536,), (2, 1536))

In [7]:
if not INDEX_NAME in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,         # the name of the index
        dimension=1536,     # The dimensionality of the vectors
        metric='cosine',    # The similarity metric to use when searching the index
        # pod_type = 'p1'     # the type of Pinecone pod  # deprecated
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
        # spec=PodSpec(
        #     environment='us-west1-gcp',
        #     pod_type='p1.x1',
        #     pods=1
        # )
    )
    
# store the index as a variable
index = pc.Index(INDEX_NAME)      # resource already exists

In [8]:
def my_hash(s):
    # return the WD5 hash of the input string as a hexadecimal string
    return hashlib.md5(s.encode()).hexdigest()

my_hash('I love to hash it.')

'27c3e2ecf64fd7081f80bc0837dfbeb8'

In [9]:
def prepare_for_pinecone(texts, engine=ENGINE):
    # get the current UTC date and time
    now = datetime.now(timezone.utc)
    
    # generate vector embeddings for each string in the input list, using the specified engine
    embeddings = get_embeddings(texts, engine=engine)
    
    # create tuples of (hash, embedding, metadata) for each input string and its corresponding vector embedding
    # The my_hash() function is used to generate a unique hash for each string, and the datetime.utcnow() function is used to generate the current UTC date and time
    return [
        (
            my_hash(text),      # A unique 1D for each string, generated using the my_hash() function
            embedding,          # the vector embedding of the string
            dict(text=text, date_upload=now)    # a dictionary of metadata, including the original text and the current UTC date and time
        )
        for text, embedding in zip(texts, embeddings)
    ]

In [10]:
texts = ['hi']
prepare_for_pinecone(texts)[0]

('49f68a5c8493ec2c0bf489821c21fc3b',
 [-0.030913319438695908,
  -0.020414210855960846,
  -0.019505759701132774,
  -0.04178878664970398,
  -0.024813713505864143,
  0.024307576939463615,
  -0.0179743692278862,
  -0.017701834440231323,
  -0.0065019200555980206,
  -0.015910886228084564,
  0.025890879333019257,
  -0.006949656642973423,
  -0.01790948025882244,
  -0.011848808266222477,
  0.011465960182249546,
  0.01648191176354885,
  0.038751959800720215,
  0.0005187098286114633,
  0.03221110627055168,
  -0.008701670914888382,
  -0.019635537639260292,
  -0.0049056401476264,
  -0.009298654273152351,
  -0.014327583834528923,
  -0.022867031395435333,
  0.002483642427250743,
  0.010051371529698372,
  -0.01176445186138153,
  0.0026069325394928455,
  -0.026020657271146774,
  0.014535229653120041,
  0.0006987779634073377,
  -0.035767048597335815,
  -0.014963500201702118,
  -0.009486833587288857,
  -0.024748824536800385,
  0.006988590583205223,
  -0.02111501805484295,
  0.01918131299316883,
  -0.0056

In [11]:
_id, embedding, metadata = prepare_for_pinecone(texts)[0]
print('ID: ', _id, '\nLEN: ', len(embedding), '\nMETA:', metadata)

ID:  49f68a5c8493ec2c0bf489821c21fc3b 
LEN:  1536 
META: {'text': 'hi', 'date_upload': datetime.datetime(2024, 8, 12, 9, 23, 10, 822889, tzinfo=datetime.timezone.utc)}


In [12]:
def upload_texts_to_pinecone(texts, namespace=NAMESPACE, batch_size=None, show_progress_bar=False):
    # call the prepare_for_pinecone function to prepare input texts for indexing
    total_upserted = 0
    if not batch_size:
        batch_size = len(texts)
    
    _range = range(0, len(texts), batch_size)
    for i in tqdm(_range) if show_progress_bar else _range:
        batch = texts[i : i + batch_size]
        prepared_texts = prepare_for_pinecone(batch)
        
        # use the upsert() method of the index object to upload the prepared texts to Pinecone
        total_upserted += index.upsert(
            prepared_texts,
            namespace=namespace
        )['upserted_count']
        
    return total_upserted

# call the upload_texts_to_pinecone() function with the input texts
upload_texts_to_pinecone(texts)

1

In [13]:
def query_from_pinecone(query, top_k=3):
    # get embedding from THE SAME embedder as the documents
    query_embedding = get_embeddings(query, engine=ENGINE)
    
    return index.query(
        vector=query_embedding,
        top_k=top_k,
        namespace=NAMESPACE,
        include_metadata=True   # gets the metadata (dates, text, etc)
    ).get('matches')
    
query_from_pinecone('hello')

[{'id': '49f68a5c8493ec2c0bf489821c21fc3b',
  'metadata': {'date_upload': '2024-08-11T15:12:11.606184+00:00', 'text': 'hi'},
  'score': 0.932166,
  'values': []},
 {'id': '093601540a641d12a6f734a9fa624ce5',
  'metadata': {'date_uploaded': '2024-07-20T00:07:10.001249+00:00',
               'text': "Alexander Graham Bell originally suggested 'ahoy-hoy' "
                       'be adopted as the standard greeting when answering a '
                       "telephone, before 'hello' (suggested by Thomas Edison) "
                       'became common.'},
  'score': 0.780908227,
  'values': []},
 {'id': '9588c26cecaaf486eae14858827a6699',
  'metadata': {'date_uploaded': '2024-07-20T00:06:57.790296+00:00',
               'text': 'The Abbott family -- wife Evelyn, husband Lee, '
                       'congenitally deaf daughter Regan, and sons Marcus and '
                       'Beau -- silently scavenge for supplies in a deserted '
                       'town. While out in the open, the f

In [14]:
texts

['hi']

In [15]:
import hashlib

def delete_texts_from_pinecone(texts, namespace=NAMESPACE):
    # compute the hash (id) for each text
    hashes = [hashlib.md5(text.encode()).hexdigest() for text in texts]
    
    # the ids parameter is used to specify the list of IDs (hashes) to delete.
    return index.delete(ids=hashes, namespace=namespace)

# delete our text
delete_texts_from_pinecone('hello')

{}

In [16]:
# importing the tiktoken library
import tiktoken

# initialize a tokenizer for the 'cl100k_base' model
tokenizer = tiktoken.get_encoding('cl100k_base')

# using the tokenizer to encode the text 'hey there' 
# the resulting output is a list of integers representing the encoded text
# this is the input format required for embedding using the 'ada-002' model
tokenizer.encode('hey there')

[36661, 1070]

## Document chunking

In [17]:
# function to split the text into chunks of a maximum of tokens, inspired by OpenAI
def overlapping_chunks(text, max_tokens=500, overlapping_factor=5):
    '''
    max_tokens : tokens we want per chunk
    overlapping_factor : number of sentences to start each chink with that overlaps with the previous chunk
    '''
    
    # split the text using punctuation
    sentences = re.split(r'[.?!]', text)
    
    # get the number of tokens for each sentence
    n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]
    
    chunks, tokens_so_far, chunk = [], 0, []
    
    # Loop through the sentences and tokens joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):
        
        # if the number of tokens so far plus number of tokens in the current sentence is greater 
        # than the max number of tokens, then add the chunk to the list of chinks and reset
        # the chunk and tokens so far
        if tokens_so_far + token > max_tokens:
            chunks.append(". ".join(chunk) + ".")
            if overlapping_factor > 0:
                chunk = chunk[-overlapping_factor:]
                tokens_so_far = sum([len(tokenizer.encode(c)) for c in chunk])
            else:
                chunk = []
                tokens_so_far = 0
        
        
        # if the number of tokens in the current sentence is greater than the max number of
        # tokens, go to the next sentence
        if token > max_tokens:
            continue
        
        # Otherwise, add the sentence to the chunk and add the number of tokens to the total
        chunk.append(sentence)
        tokens_so_far += token + 1
    
    if chunk:
        chunks.append(". ".join(chunk) + ".")
        
    return chunks

In [18]:
import PyPDF2

# open the PDF file in read-binary mode
with open('./data/pds2.pdf', 'rb') as file:
    
    # create a PDF reader object
    reader = PyPDF2.PdfReader(file)
    
    # initialize an empty string to hold the text
    principles_of_ds = ''
    # Loop throught each page in the PDF file
    for page in tqdm(reader.pages):
        text = page.extract_text()
        principles_of_ds += '\n\n' + text[text.find(' ]')+2:]
    
# print the final string containing all the text from the PDF file
principles_of_ds = principles_of_ds.strip()

print(len(principles_of_ds))

  0%|          | 0/428 [00:00<?, ?it/s]

575490


In [19]:
from urllib.request import urlopen

# a textbook about insects
text = urlopen('https://www.gutenberg.org/cache/epub/10834/pg10834.txt').read().decode()

In [20]:
split = overlapping_chunks(principles_of_ds, overlapping_factor=0)
avg_length = sum([len(tokenizer.encode(t)) for t in split]) / len(split)
print(f'non-overlapping chunking approach has {len(split)} documents with average length {avg_length:.1f} tokens')

non-overlapping chunking approach has 286 documents with average length 474.1 tokens


In [21]:
split = overlapping_chunks(principles_of_ds)
avg_length = sum([len(tokenizer.encode(t)) for t in split]) / len(split)
print(f'overlapping chunking approach has {len(split)} documents with average length')

overlapping chunking approach has 392 documents with average length


### Custom Delimiters

In [22]:
# importing the Counter and re libraries
from collections import Counter
import re

# find all occurances of one or more spaces in 'principle_of_ds'
matches = re.findall(r'[\s]{1,}', principles_of_ds)

# The 10 most frequent spaces that occur in the document
most_common_spaces = Counter(matches).most_common(10)

# print the most common spaces and their frequencies
print(most_common_spaces)

[(' ', 82259), ('\n', 9220), ('  ', 1592), ('\n\n', 333), ('\n   ', 250), ('\n\n\n', 82), ('\n    ', 73), ('\n ', 46), (' \n', 39), ('     ', 34)]


In [23]:
# only keep documents of at least 100 characters split by a custom delimiter
split = list(filter(lambda x: len(x) > 50, principles_of_ds.split('\n\n')))

avg_length = sum([len(tokenizer.encode(t)) for t in split]) / len(split)
print(f'custom delimiter approach has {len(split)} documents with average length {avg_length:.1f} tokens') 

custom delimiter approach has 426 documents with average length 316.3 tokens


In [24]:
embeddings = None
for s in tqdm(range(0, len(split), 100)):
    if embeddings is None:
        embeddings = np.array(get_embeddings(split[s:s+100], engine=ENGINE))
    else:
        embeddings = np.vstack([embeddings, np.array(get_embeddings(split[s:s+100], engine=ENGINE))])

  0%|          | 0/5 [00:00<?, ?it/s]

### Clustering chunking

In [25]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# assume you have a list of text embeddings called 'embeddings' 
# first, compute the cosine similarity between all pairs of embeddings
cosine_sim_matrix = cosine_similarity(embeddings)

# instantiate the AgglomerativeClustering model
agg_clustering = AgglomerativeClustering(
    n_clusters=None,            # the algorithm will determine the optimal number of clusters based on the data
    distance_threshold=0.1,     # clusters will be formed untill all pairwise distances between clusters are greater than 0.1
    # affinity='precomputed',   # we are providing a precimputed distance matrix (1 - similarity matrix) as input
    metric='precomputed',       
    linkage='complete'          # form clusters by iteratively merging the smallest clusters based on the maximum distance between their components
)

# fit the model to the cosine distnace matrix (1 - similarity matrix)
agg_clustering.fit(1 - cosine_sim_matrix)

# get the cluster labels for each embedding
cluster_labels = agg_clustering.labels_

# print the number of embeddings in each cluster
unique_labels, counts = np.unique(cluster_labels, return_counts=True)
for label, count in zip(unique_labels, counts):
    print(f'Cluster {label}: {count} embeddings')

Cluster 0: 2 embeddings
Cluster 1: 3 embeddings
Cluster 2: 2 embeddings
Cluster 3: 2 embeddings
Cluster 4: 2 embeddings
Cluster 5: 2 embeddings
Cluster 6: 4 embeddings
Cluster 7: 2 embeddings
Cluster 8: 2 embeddings
Cluster 9: 2 embeddings
Cluster 10: 2 embeddings
Cluster 11: 2 embeddings
Cluster 12: 2 embeddings
Cluster 13: 2 embeddings
Cluster 14: 2 embeddings
Cluster 15: 2 embeddings
Cluster 16: 2 embeddings
Cluster 17: 2 embeddings
Cluster 18: 2 embeddings
Cluster 19: 2 embeddings
Cluster 20: 2 embeddings
Cluster 21: 2 embeddings
Cluster 22: 2 embeddings
Cluster 23: 2 embeddings
Cluster 24: 2 embeddings
Cluster 25: 2 embeddings
Cluster 26: 2 embeddings
Cluster 27: 2 embeddings
Cluster 28: 3 embeddings
Cluster 29: 2 embeddings
Cluster 30: 2 embeddings
Cluster 31: 2 embeddings
Cluster 32: 2 embeddings
Cluster 33: 2 embeddings
Cluster 34: 1 embeddings
Cluster 35: 2 embeddings
Cluster 36: 2 embeddings
Cluster 37: 2 embeddings
Cluster 38: 2 embeddings
Cluster 39: 2 embeddings
Cluster 40

## Re-ranking and Retrieved Results

In [26]:
pruned_documents = []
for _label, count in zip(unique_labels, counts):
    pruned_documents.append('\n\n'.join([text for text, label in zip(split, cluster_labels) if label == _label]))
avg_length = sum([len(tokenizer.encode(t)) for t in pruned_documents]) / len(pruned_documents)
print(f'Our pruning apporach has {len(pruned_documents)} documents with average length {avg_length} tokens')

Our pruning apporach has 340 documents with average length 396.3705882352941 tokens


In [27]:
print(pruned_documents[0])


How to Sound Like a Data
Scientist
No matter which industry you work in —IT, fashion, food, or finance —there is no doubt
that data affects your life and work. At some point this week, you will either have or hear a
conversation about data. News outlets are covering more and more stories about data leaks,
cybercrimes, and how data can give us a glimpse into our lives. But why now? What makes
this era such a hotbed of data-related industries?
In the nineteenth century, the world was in the grip of the I ndustrial Age . Mankind was
exploring its place in the industrial world, working with giant mechanical inventions.
Captains of industry, such as Henry Ford, recognized that using these machines could open
major market opportunities, enabling industries to achieve previously unimaginable
profits. Of course, the Industrial Age had its pros and cons. While mass production placed
goods in the hands of more consumers, our battle with pollution also began at around this
time.
By the twentie t

In [28]:
upload_texts_to_pinecone(pruned_documents, batch_size=128)

340

In [29]:
query = 'How do z scores work?'

results_from_pinecone = query_from_pinecone(query=query, top_k=5)
for result_from_pinecone in results_from_pinecone:
    print(f"{result_from_pinecone['id']}\t{result_from_pinecone['score']:.2f}\t{result_from_pinecone['metadata']['text'][:50]}")

28c9f36450ba7f7a78fcf29f2e4e9909	0.84	Z-scores are an effective way to standardize  data
2a8a0bb77991e8040e16f9a02b82703b	0.84	Let's begin by learning a very  important value  i
268922260e7eabb5ed6170d07ec4585a	0.83	The preceding code gives us this graph:
Now, our d
7da124bb8c927e0c392cf75d4bfb2c76	0.82	
Basic Statistics
This chapter will focus on the s
e1d5467e3658041aa16113edf0d3ebec	0.81	
This is no coincidence! When we standardize the d


"""

This example computes the score between a query and all possible sentences in a corpus using a Cross-Encoder for semantic textual similarity (STS). It outputs then the most similar sentences for the given query.

"""

In [34]:
from sentence_transformers.cross_encoder import CrossEncoder
import numpy as np
from torch import nn

# pre-trained cross encoder
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')



In [41]:
def get_results_from_pinecone(query, top_k=3, re_rank=False, verbose=True):
    
    results_from_pinecone = query_from_pinecone(query, top_k=top_k)
    if not results_from_pinecone:
        return []
    
    if verbose:
        print("Query: ", query)
        
    final_results = []
    
    if re_rank:
        if verbose:
            print("Document ID (Hash)\t\tRetrieval Score\tCE Score\tText")
        
        sentence_combinations = [[query, result_from_pinecone['metadata']['text']] for result_from_pinecone in results_from_pinecone]
        
        # compute the similarity scores for these combinations
        similarity_scores = cross_encoder.predict(sentence_combinations, activation_fct=nn.Sigmoid())
        
        # sort the scores in decreasing order
        sim_scores_argsort = reversed(np.argsort(similarity_scores))
        
        # print the scores
        for idx in sim_scores_argsort:
            result_from_pinecone = results_from_pinecone[idx]
            final_results.append(result_from_pinecone)
            if verbose:
                print(f"{result_from_pinecone['id']}\t{result_from_pinecone['score']:.2f}\t{similarity_scores[idx]:.2f}\t{result_from_pinecone['metadata']['text'][:50]}")
        return final_results
    
    if verbose:
        print('Document ID (Hash)\t\tRetrieval Score\tText')
    for result_from_pinecone in results_from_pinecone:
        final_results.append(result_from_pinecone)
        if verbose:
            print(f"{result_from_pinecone['id']}\t{result_from_pinecone['score']:.2f}\t{result_from_pinecone['metadata']['text'][:50]}")
            
    return final_results

In [42]:
final_results = get_results_from_pinecone(query, top_k=3, re_rank=True)

Query:  How do z scores work?
Document ID (Hash)		Retrieval Score	CE Score	Text


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2a8a0bb77991e8040e16f9a02b82703b	0.84	0.99	Let's begin by learning a very  important value  i
28c9f36450ba7f7a78fcf29f2e4e9909	0.84	0.99	Z-scores are an effective way to standardize  data
268922260e7eabb5ed6170d07ec4585a	0.83	0.68	The preceding code gives us this graph:
Now, our d


In [43]:
final_results = get_results_from_pinecone(query, top_k=10, re_rank=True)

Query:  How do z scores work?
Document ID (Hash)		Retrieval Score	CE Score	Text


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2a8a0bb77991e8040e16f9a02b82703b	0.84	0.99	Let's begin by learning a very  important value  i
28c9f36450ba7f7a78fcf29f2e4e9909	0.84	0.99	Z-scores are an effective way to standardize  data
e1d5467e3658041aa16113edf0d3ebec	0.81	0.97	
This is no coincidence! When we standardize the d
7da124bb8c927e0c392cf75d4bfb2c76	0.82	0.93	
Basic Statistics
This chapter will focus on the s
268922260e7eabb5ed6170d07ec4585a	0.83	0.68	The preceding code gives us this graph:
Now, our d
75cf0bcd29070579356e3c50e9440d94	0.80	0.01	The following is the probability distribution of o
e9ec4bdefba2f80157ffce04812a1cf4	0.80	0.00	
Let's look at each of the elements in this formul
f90c995652f29ac9c2a4ca94f0e6809c	0.81	0.00	
We can think of this problem like as follows:
The
5bf0b775d7e41ee065e79ef1d2ac9f2c	0.80	0.00	The empirical rule
Recall that a normal distributi
0fdfad6a76d2dafdcc309cf732e744fe	0.79	0.00	First, take a minute and convince yourself that yo


In [44]:
delete_texts_from_pinecone(pruned_documents)

{}

## BoolQ

In [45]:
from datasets import load_dataset
from evaluate import load

dataset = load_dataset("boolq")

In [46]:
dataset['validation'][0]

{'question': 'does ethanol take more energy make that produces',
 'answer': False,
 'passage': "All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested''). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily produced, thus numerous such reports have been generated that are contradictory. For instance, a sep

In [47]:
from random import sample

query = sample(dataset['validation']['question'], 1)[0]
print(query)
final_results = get_results_from_pinecone(query, top_k=3, re_rank=True)

is a garbanzo bean and a chickpea the same thing
Query:  is a garbanzo bean and a chickpea the same thing
Document ID (Hash)		Retrieval Score	CE Score	Text


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

c3180d42f8e2598e9247dac0a5cac23f	0.86	1.00	The chickpea or chick pea (Cicer arietinum) is a l
cf784f1c35b6e06c62e49be2fa08320f	0.85	1.00	The chickpea or chick pea (Cicer arietinum) is a l
d2e2c2b3d5c70978fb6b9afc0a33c9ac	0.78	0.00	Cultivated cowpeas are known by the common names b


In [48]:
q_to_hash = {data['question']: my_hash(data['passage']) for data in dataset['validation']}
q_to_hash[query]

'cf784f1c35b6e06c62e49be2fa08320f'

In [49]:
# Let's test the performance re-ranking against 1000 of our validation datapoints
# Note we could not use Pinecone here to speed things up
# but it's also a good time to test latency of the pipeline with Pinecone
val_sample = dataset['validation']#[:1000]

In [50]:
logger.setLevel(logging.CRITICAL)

predictions = []

# Note we will keep top_k the same so latency from Pinecone is consistent
# and the only major time difference will be in the re-ranking

for question in tqdm(val_sample['question']):
    retrieved_hash = get_results_from_pinecone(question, top_k=1, re_rank=False, verbose=False)[0]['id']
    correct_hash = q_to_hash[question]
    predictions.append(retrieved_hash == correct_hash)
    
accuracy = sum(predictions)/len(predictions)
print(f'Accuracy without re-ranking: {accuracy}')

  0%|          | 0/3270 [00:00<?, ?it/s]

Accuracy without re-ranking: 0.8486238532110092


In [51]:
logger.setLevel(logging.CRITICAL)

predictions = []

# Note we will keep top_k the same as latency from Pinecone is consistent
# and the only major time difference will be in the re-ranking

for question in tqdm(val_sample['question']):
    retrieved_hash = get_results_from_pinecone(question, top_k=3, re_rank=True, verbose=False)[0]['id']
    correct_hash = q_to_hash[question]
    predictions.append(retrieved_hash == correct_hash)
    
accuracy = sum(predictions) / len(predictions)
print(f'Accuracy with re-ranking: {accuracy}')

  0%|          | 0/3270 [00:00<?, ?it/s]

Accuracy with re-ranking: 0.8422018348623853


In [52]:
# note the time differences between with and without re-ranking

In [56]:
def eval_ranking(query, cross_encoder, top_k=3):
    results_from_pinecone = query_from_pinecone(query, top_k=top_k)
    sentence_combinations = [[query, result_from_pinecone['metadata']['text']] for result_from_pinecone in results_from_pinecone]
    similarity_scores = cross_encoder.predict(sentence_combinations)
    sim_scores_argsort = list(reversed(np.argsort(similarity_scores)))
    re_ranked_final_result = results_from_pinecone[sim_scores_argsort[0]]
    return results_from_pinecone[0]['id'], re_ranked_final_result['id']

In [57]:
# trying another pre-trained cross encoder
# sentence-transformers/mullti-qu-mpnet-base-cos-v1
newer_cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')



In [58]:
i = 0
print_every = 50
predictions = []
for question in tqdm(val_sample['question']):
    retrieved_hash, reranked_hash = eval_ranking(question, newer_cross_encoder, top_k=3)
    correct_hash = q_to_hash[question]
    predictions.append((retrieved_hash == correct_hash, reranked_hash == correct_hash))
    i += 1
    if i % print_every == 0:
        print(f'Step {i}')
        raw_accuracy = sum([p[0] for p in predictions]) / len(predictions)
        reranked_accuracy = sum([p[1] for p in predictions]) / len(predictions)
        
        print(f'Accuracy without re-ranking: {raw_accuracy}')
        print(f'Accuracy with re-ranking: {reranked_accuracy}')

  0%|          | 0/3270 [00:00<?, ?it/s]

Step 50
Accuracy without re-ranking: 0.88
Accuracy with re-ranking: 0.84
Step 100
Accuracy without re-ranking: 0.85
Accuracy with re-ranking: 0.85
Step 150
Accuracy without re-ranking: 0.86
Accuracy with re-ranking: 0.8466666666666667
Step 200
Accuracy without re-ranking: 0.865
Accuracy with re-ranking: 0.845
Step 250
Accuracy without re-ranking: 0.872
Accuracy with re-ranking: 0.84
Step 300
Accuracy without re-ranking: 0.85
Accuracy with re-ranking: 0.8433333333333334
Step 350
Accuracy without re-ranking: 0.8628571428571429
Accuracy with re-ranking: 0.8485714285714285
Step 400
Accuracy without re-ranking: 0.865
Accuracy with re-ranking: 0.8475
Step 450
Accuracy without re-ranking: 0.86
Accuracy with re-ranking: 0.8422222222222222
Step 500
Accuracy without re-ranking: 0.852
Accuracy with re-ranking: 0.84
Step 550
Accuracy without re-ranking: 0.84
Accuracy with re-ranking: 0.84
Step 600
Accuracy without re-ranking: 0.8366666666666667
Accuracy with re-ranking: 0.8333333333333334
Step 650

In [59]:
raw_accuracy = sum([p[0] for p in predictions]) / len(predictions)
reranked_accuracy = sum([p[1] for p in predictions]) / len(predictions)

print(f'Using cross-encoder: {newer_cross_encoder.config._name_or_path}')
print(f'Accuracy without re-ranking: {raw_accuracy}')
print(f'Accuracy with re-ranking: {reranked_accuracy}')

Using cross-encoder: cross-encoder/ms-marco-MiniLM-L-12-v2
Accuracy without re-ranking: 0.8495412844036697
Accuracy with re-ranking: 0.8422018348623853


## Fine-tuning re-ranker

In [60]:
# https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/ms_marco/train_cross-encoder_scratch.py

In [61]:
dataset['train'][1]

{'question': 'do good samaritan laws protect those who help at an accident',
 'answer': True,
 'passage': "Good Samaritan laws offer legal protection to people who give reasonable assistance to those who are, or who they believe to be, injured, ill, in peril, or otherwise incapacitated. The protection is intended to reduce bystanders' hesitation to assist, for fear of being sued or prosecuted for unintentional injury or wrongful death. An example of such a law in common-law areas of Canada: a good Samaritan doctrine is a legal principle that prevents a rescuer who has voluntarily helped a victim in distress from being successfully sued for wrongdoing. Its purpose is to keep people from being reluctant to help a stranger in need for fear of legal repercussions should they make some mistake in treatment. By contrast, a duty to rescue law requires people to offer assistance and holds those who fail to do so liable."}

In [62]:
from sentence_transformers import InputExample, losses, evaluation
from torch.utils.data import DataLoader
from random import shuffle

shuffled_training_passages = dataset['train']['passage'].copy()
shuffle(shuffled_training_passages)

train_samples = [
    InputExample(texts=[d['question'], d['passage']], label=1) for d in dataset['train']
]

# add some negative samples
train_samples += [
    InputExample(texts=[d['question'] ,shuffled_training_passages[i]], label=0) for i, d in enumerate(dataset['train'])
]

shuffle(train_samples)

In [63]:
len(train_samples)

18854

In [67]:
# model = CrossEncoder('cross-encoder/ms-macro-MiniLM-L-12-v2', num_labels=1)
model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2', num_labels=1)



In [68]:
train_samples[0].__dict__

{'guid': '',
 'texts': ['was there a group called the five heartbeats',
  "On May 17, 2010, the Hawaii Five-O remake was picked up by CBS, which scheduled it for Monday nights in the 10--11 p.m. time slot. The news was good for the state of Hawaii, which hoped that the remake would pump new life into the economy. Production of the remainder of the first season started in June 2010. On June 24, 2010, the producers announced that it would use the warehouse at the former Honolulu Advertiser building as the official soundstage studio for the series starting in July 2010. Exteriors representing Five-0 headquarters in the series are located at the Ali'iolani Hale in Honolulu, directly across the street from Iolani Palace, which represented Five-O headquarters in the original series."],
 'label': 0}

In [69]:
model.predict(train_samples[0].texts, activation_fct=nn.Sigmoid())

6.5349435e-05

In [71]:
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator, CEBinaryClassificationEvaluator
import math
import torch
from random import sample

logger.setLevel(logging.DEBUG)          # just to get some logs
num_epochs = 2
model_save_path = './fine_tuned_ir_cross_encoder'

# train_samples = sample(train_samples, 1000)
# int(len(train_samples)*.8)
train_dataloader = DataLoader(train_samples[:int(len(train_samples)*.8)], shuffle=True, batch_size=32)

# An evaluator for training performance
evaluator = CEBinaryClassificationEvaluator.from_input_examples(train_samples[-int(len(train_samples)*.8):], name='test')

# rule of thumb for warmup step
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
print(f"Warmup-steps: {warmup_steps}")

Warmup-steps: 95


In [82]:
# # #### Load model and eval on test set
# print(evaluator(model))
from sentence_transformers.SentenceTransformer import SentenceTransformer
from torch import nn

criterion = nn.CrossEntropyLoss()

# train the model
model.fit(
    train_dataloader=train_dataloader,
    # loss_fct=losses.nn.CrossEntropyLoss(),
    # loss_fct=losses.SoftmaxLoss(model=SentenceTransformer, sentence_embedding_dimension=2, num_labels=2, loss_fct=nn.CrossEntropyLoss()),
    loss_fct=criterion,
    activation_fct=nn.Sigmoid(),
    evaluator=evaluator,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
    use_amp=True
)

# #### Load model and eval on test set
# print(evaluator(model))



Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/472 [00:00<?, ?it/s]

In [None]:
# run the more fine tuned version on open source as well to match??
# depend if it does better here

In [None]:
finetuned = CrossEncoder(model_save_path)

print(finetuned.predict(['hello', 'hi'], activation_fct=nn.Sigmoid()))
print(finetuned.predict(['hellp', 'hi'], activation_fct=nn.Identity()))