# Retrieve & Re-Rank Demo on Column Y from Education Excel


You can input a query or a question. The script then uses semantic search
to find relevant passages in a sample of papers from our Education Excel.

For semantic search, we use `SentenceTransformer('multi-qa-mpnet-base-dot-v1')` to retrieve passages that potentially answer the input query.

Next, we use a more powerful CrossEncoder (`cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')`) that
scores the query and all retrieved passages for their relevancy. The cross-encoder further boosts the performance,
especially when you search over a corpus on which the bi-encoder was not trained.


In [None]:
# Mount the user's Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): the data paths used later are relative ('information_retrieval/...'),
# so presumably the working directory is changed to a Drive folder elsewhere — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -U sentence-transformers rank_bm25

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m80.7 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m76.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl

In [None]:
import json
import gzip
import os
import torch
import nltk
import nltk

from collections import Counter
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, CrossEncoder, util

# Warn early when no CUDA device is visible to torch.
gpu_is_missing = not torch.cuda.is_available()
if gpu_is_missing:
    print("Warning: No GPU found. Please add GPU to your notebook")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Experiments on whole data

In [None]:
# Load the sampled papers. Despite the .jsonl extension, the file is parsed as a
# single JSON document (json.load), not line by line.
data_filepath = 'information_retrieval/focused_sample_paragraphs.jsonl'

with open(data_filepath, 'r', encoding='utf8') as papers_file:
    papers = json.load(papers_file)

# Report the sample size: number of papers, then the paragraph count per paper.
print(len(papers))
for paper in papers:
    # Sentence-level splitting is disabled; the pre-split paragraphs are used as-is.
    #paper['sent_tokenized_txt'] = sent_tokenize(paper['pdf_txt'])
    print("Passages for paper ", paper['paper_id'], ':', len(paper['paragraphs']))

6
Passages for paper  #2598 : 104
Passages for paper  #17247 : 194
Passages for paper  #17284 : 69
Passages for paper  #17755 : 1339
Passages for paper  #17192 : 190
Passages for paper  #17725 : 73


In [None]:

#Filter which sentences we choose to encode

import re
# Vocabulary that hints at a "target population" style passage.
general_keywords = [
    "beneficiaries", "beneficiary", "service", "user", "participants", "eligible",
    "population", "eligibility", "criteria", "cohort", "client", "target",
    "intervention", "identified", "enrolled", "attended", "sample",
]

# Currency symbols and names that hint at financial passages.
currency_keywords = [
    '$', 'USD', 'dollar', 'pound', 'euro', '£', 'gbp', '€', '₹', 'rupee',
    'franc', 'sterling', 'dinar', 'dirham', 'yen',
]


def filter(sentence):
    """Return 1 if ``sentence`` contains any general keyword, otherwise 0.

    The check is a case-sensitive substring match against ``general_keywords``.
    Note that this name shadows the builtin ``filter`` for the rest of the file.
    """
    return int(any(keyword in sentence for keyword in general_keywords))

# Keep only paragraphs with at least ten whitespace-separated words.
# (Additional currency-keyword / digit conditions were tried and are disabled.)
for paper in papers:
    long_enough = [text for text in paper['paragraphs'] if len(text.split()) >= 10]
    # Fall back to a single placeholder passage when nothing survives the filter —
    # presumably so later cells never receive an empty passage list.
    paper['filtered_paragraphs'] = long_enough if long_enough else ['nothing']
    print('Filtered passages for {}: {}'.format(paper['paper_id'], len(paper['filtered_paragraphs'])))

  

Filtered passages for #2598: 97
Filtered passages for #17247: 188
Filtered passages for #17284: 68
Filtered passages for #17755: 1265
Filtered passages for #17192: 184
Filtered passages for #17725: 71


File #17271 does not contain the full text of the paper.

In [None]:
# Bi-encoder for the semantic-search stage. The smaller
# 'multi-qa-MiniLM-L6-cos-v1' checkpoint is the alternative noted previously.
bi_encoder = SentenceTransformer('multi-qa-mpnet-base-dot-v1')
bi_encoder.max_seq_length = 512     # Truncate long passages to 512 tokens
top_k = 30                          # Number of passages the bi-encoder retrieves per query

# Cross-encoder that re-ranks the top_k retrieved passages to improve result quality.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Encode every paper's filtered passages into the vector space, keyed by paper id.
# This is the slow step; its duration depends on the available GPU.
all_corpus_embeddings = {
    paper['paper_id']: bi_encoder.encode(
        paper['filtered_paragraphs'],
        convert_to_tensor=True,
        show_progress_bar=True,
    )
    for paper in papers
}


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# We also compare the results to lexical search (keyword search). Here, we use 
# the BM25 algorithm which is implemented in the rank_bm25 package.

from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np


# We lower case our text and remove stop-words from indexing
def bm25_tokenizer(text):
    """Tokenise ``text`` for BM25 indexing.

    The text is lower-cased and split on whitespace; each token has leading and
    trailing punctuation stripped. Empty tokens and English stop-words are dropped.
    """
    stripped_tokens = (word.strip(string.punctuation) for word in text.lower().split())
    return [
        word for word in stripped_tokens
        if word and word not in _stop_words.ENGLISH_STOP_WORDS
    ]


# Tokenise every filtered passage, grouped by paper id:
#   tokenized_corpus[paper_id] -> list of token lists, one per passage.
tokenized_corpus = {}
for paper in papers:
    tokenized_corpus[paper['paper_id']] = [
        bm25_tokenizer(passage) for passage in tqdm(paper['filtered_paragraphs'])
    ]

# BM25Okapi expects a list of tokenised documents. Passing the dict itself would
# iterate its keys and index the characters of the paper ids, so the per-paper
# token lists are flattened into one corpus instead.
# Corpus ids of this index are positions in the flattened list (papers in dict
# order); for a search restricted to one paper build
# BM25Okapi(tokenized_corpus[paper_id]) instead.
bm25 = BM25Okapi([
    passage_tokens
    for paper_tokens in tokenized_corpus.values()
    for passage_tokens in paper_tokens
])

  0%|          | 0/97 [00:00<?, ?it/s]

  0%|          | 0/188 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/1265 [00:00<?, ?it/s]

  0%|          | 0/184 [00:00<?, ?it/s]

  0%|          | 0/71 [00:00<?, ?it/s]

In [None]:
# This function will search all passages that answer the query
def search(query,passages,corpus_embeddings,n):
    """Return the ``n`` passages that best answer ``query``.

    The query is embedded with the module-level ``bi_encoder`` and the
    module-level ``top_k`` nearest passages are retrieved from
    ``corpus_embeddings``. Those candidates are then re-scored with the
    module-level ``cross_encoder`` and returned in descending cross-encoder order.

    Args:
        query: Question or keyword string.
        passages: Sequence of passage strings. It must be the sequence that
            ``corpus_embeddings`` was computed from, in the same order, because
            hits are mapped back to text by position.
        corpus_embeddings: Embeddings of ``passages`` as produced by
            ``bi_encoder.encode``.
        n: Maximum number of passages to return.

    Returns:
        A list of at most ``n`` passage strings, best match first. The list is
        empty when retrieval yields no hits.
    """
    # Alternative rankings that were tried (kept for reference):
    #   * BM25 (lexical): score bm25_tokenizer(query) with bm25.get_scores, take the
    #     top hits via np.argpartition and return passages[hit['corpus_id']].
    #   * Bi-encoder only: sort hits by hit['score'] instead of hit['cross-score'].

    ##### Semantic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    # Follow the device of the corpus embeddings instead of forcing CUDA, so the
    # function also works on a CPU-only runtime.
    target_device = getattr(corpus_embeddings, 'device', None)
    if target_device is not None:
        question_embedding = question_embedding.to(target_device)
    # NOTE(review): semantic_search uses its default score function here; the
    # bi-encoder checkpoint name suggests a dot-product model — confirm whether
    # score_function=util.dot_score is intended.
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first (and only) query
    if not hits:
        return []

    ##### Re-Ranking #####
    # Score all retrieved passages with the cross-encoder
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross-score'] = cross_score

    # Output the best hits from the re-ranker
    hits = sorted(hits, key=lambda hit: hit['cross-score'], reverse=True)
    return [passages[hit['corpus_id']] for hit in hits[0:n]]




In [None]:
from nltk.sem.drt import DrtFunctionVariableExpression
import copy

# Query sets, one per extraction field. Each mixes natural-language questions,
# one long keyword string and the individual keywords.

# Target population
questions_tp = [
    "What is the target population?",
    "Who are the intended beneficiaries of the service?",
    "Who does the service try to help?",
    "Who was eligible for inclusion in the intervention?",
    "target population beneficiaries service users participants eligible population eligibility criteria cohort clients",
    "target population",
    "beneficiaries",
    "service users",
    "participants",
    "eligible population",
    "eligibility criteria",
    "cohort",
    "clients",
]

# Study design
questions_sd = [
    'What is the study design?',
    'What is the research method?',
    'How was data collected and analysed?',
    'study design method methodology data collection research design',
    'study design',
    'method',
    'methodology',
    'data collection',
    'research design',
]

# Financial detail and costs
questions_fd = [
    'What are the costs of the contract?',
    'How much is paid for outcomes?',
    'What are the outcomes payments?',
    'What is the total contract value?',
    'What is the price per outcome?',
    'outcomes payment price contract value contract cap rate card incentive payment costs savings',
    'outcomes payment',
    'price',
    'contract value',
    'contract cap',
    'rate card',
    'incentive payment',
    'costs',
    'savings',
]

# Personal-level outcomes
questions_plo = [
    'What outcomes were achieved?',
    'What impact was achieved?',
    'What were the results of the intervention?',
    'What was the impact of the intervention?',
    'Were the contracted outcomes achieved?',
    'results outcomes achieved impact',
    'results',
    'outcomes achieved',
    'impact',
]

# (field label, query list) pairs in the order the fields appear in the results.
# The "(filtered)" financial bucket reuses the financial query list.
FIELD_QUERY_SETS = [
    ('Study Design', questions_sd),
    ('Target Population', questions_tp),
    ('Financial detail and costs', questions_fd),
    ('Personal-level outcomes', questions_plo),
    ('Financial detail and costs(filtered)', questions_fd),
]

# all_IR_results[paper_id][field][query] -> list of ranked passages (empty for now).
all_IR_results = {}
for paper in papers:
    key = paper['paper_id']
    # Empty template: every query of every field maps to an empty result list.
    IR_results = {
        field: {question: [] for question in field_questions}
        for field, field_questions in FIELD_QUERY_SETS
    }
    # Deep copy so that each paper owns independent result lists.
    all_IR_results[key] = copy.deepcopy(IR_results)




In [None]:


# Run every query of every field against each paper and store the ranked passages.
for paper in papers:
    key = paper['paper_id']
    print('Begin experiment for key ', key)
    passages = paper['filtered_paragraphs']
    corpus_embeddings = all_corpus_embeddings[key]

    # Disabled experiments, kept for reference (they derive the query from the
    # paper's own vocabulary instead of the fixed question lists):
    #
    # # Find most frequent unigrams and use them as query
    # unigrams = []
    # passages = paper['sent_tokenized_txt']
    # for passage in passages:
    #   unigrams.extend(word for word in word_tokenize(passage) if not word in stopwords.words())
    #
    # frequent_unigrams = []
    # for unigram in Counter(unigrams).most_common(1000):
    #   if len(unigram[0])>2:
    #     frequent_unigrams.append(unigram[0])
    #
    # query = set(frequent_unigrams).intersection(set(general_keywords))
    # query = ' '.join(list(query))
    #
    # query = ''
    # frequency_dict = {}
    # paper['pdf_txt'] = ' '.join(paper['filtered_paragraphs'])
    # for keyword in general_keywords:
    #   frequency_dict[keyword] = paper['pdf_txt'].count(keyword)
    #   #if keyword in paper['pdf_txt']:
    #   #  query = query + ' ' + keyword
    #
    # print(frequency_dict)
    # for keyword in frequency_dict.keys():
    #   if frequency_dict[keyword]>0:
    #     query = query + ' ' + keyword

    #print('Target population (Gold Standard): ',paper['target_population'])

    # IR_results is the template left over from the initialisation cell; only its
    # field and query keys are read here. Results go into all_IR_results.
    for field, queries_for_field in IR_results.items():
        for query in queries_for_field:
            print('Query: ', query)
            ranked_passages = search(query, passages, corpus_embeddings, 20)
            for i, result in enumerate(ranked_passages, start=1):
                print('Top ',i, ' : ', result.replace('\n',' '))
                # Store the passage with all whitespace runs collapsed to single spaces.
                all_IR_results[key][field][query].append(' '.join(result.split()))
            print("\n\n")



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Top  19  :   Systemic change: presence of a new, sustainable public funding stream Performance metrics: breadth of metrics South Carolina Yes—Medicaid reimbursement Utah Yes—state income tax Broad—four metrics Narrow—one metric Cost structure: comparison of public investment to investor profit Maximizes public investment— reinvests success payments Maximizes investor profit —overestimates impact Social equity: coverage of services to Yes—rural coverage vulnerable clientele Yes—low-income coverage Chicago No—funding ends after SIB Intermediate—three metrics Maximizes investor profit—overpays investors Yes—low-income coverage politically conservative states with high child poverty rates. They are difficult contexts in which to push any expansion of ECE services.
Top  20  :   South Carolina Nurse–Family Partnership Pay for Success Project funders supporting Nurse–Family Partnership (NFP) Originating from a consortium of in G

Save the ranked candidates to the Google Drive folder.

In [None]:
# Persist all ranked candidates as one JSON document (a single object keyed by
# paper id, not line-delimited despite the .jsonl extension).
with open('information_retrieval/IR_results.jsonl','w') as results_file:
    json.dump(all_IR_results, results_file)

In [None]:
# Single-paper experiment: list which general keywords are frequent in one paper,
# then run one keyword query through the retrieve & re-rank pipeline.
key = '#17340'
print('Begin experiment for key ', key)

# Resolve the paper by id rather than relying on a `paper` variable left over
# from an earlier loop, which may refer to a different paper.
paper = next((candidate for candidate in papers if candidate['paper_id'] == key), None)
if paper is None:
    raise KeyError('Paper {} is not among the loaded papers'.format(key))

# Per-paper lexical index (only needed when the BM25 ranking is enabled).
bm25 = BM25Okapi(tokenized_corpus[key])

# search() maps hit positions in the embeddings back to `passages`, so both
# must come from the same list: the embeddings were computed from
# paper['filtered_paragraphs'].
passages = paper['filtered_paragraphs']
corpus_embeddings = all_corpus_embeddings[key]

# Build the stop-word lookup once; calling stopwords.words() for every token is
# wasteful and a list membership test is linear.
stop_words = set(stopwords.words())
unigrams = [
    word
    for passage in passages
    for word in word_tokenize(passage)
    if word not in stop_words
]

# (keyword, count) pairs for the general keywords among the 1000 most common unigrams.
frequent_unigrams = [
    entry for entry in Counter(unigrams).most_common(1000)
    if entry[0] in general_keywords
]

print(frequent_unigrams)

query = 'cohort'

# The gold-standard field may be missing from a record; fall back to a marker
# instead of aborting the experiment.
print('Target population (Gold Standard): ', paper.get('target_population', 'n/a'))
print('Query: ', query)
for i, result in enumerate(search(query, passages, corpus_embeddings, 50), start=1):
    print('Top ',i, ' : ', result)
print("\n\n")

Begin experiment for key  #17340
[('service', 65), ('intervention', 65), ('cohort', 17), ('target', 15), ('population', 12), ('identified', 10), ('intended', 6), ('participants', 6), ('attended', 3)]
Target population (Gold Standard):  3,500 additional preschoolers (grouped into five cohorts) in two school districts—Park City and Granite. 2,620 Chicago public school children 
Query:  cohort
Top  1  :  Participants will be served sequentially in two cohorts of 1,000 individuals each, with each cohort defined as a phase of the project.
Top  2  :  Further analysis showed an 8.39% reduction in reoffending rates within the cohort, which is insufficient to trigger repayment for the first cohort (minimum 10% reduction required).
Top  3  :  Peterborough Social Impact Bond: Final report on cohort 1 analysis.
Top  4  :  Peterborough Social Impact Bond: Final report on cohort 1 analysis.
Top  5  :  The results showed an 8.39% reduction in reoffending rates within the cohort, which was insufficien