In [None]:
import os
import re
import pandas as pd
from extract import extract_relevant_content
# import torch
# torch.cuda.empty_cache()

In [None]:
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os
import torch
from pygaggle.rerank.base import Query, Text
from pygaggle.rerank.transformer import MonoT5
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration

# Re-ranking with MonoT5 is slow without a GPU, so warn as early as possible.
if not torch.cuda.is_available():
    print("Warning: No GPU found. Please add GPU to your notebook")

# NOTE(review): `device` is not referenced by any cell visible here; the model
# below is placed on the default CUDA device, not on cuda:2. Kept in case other
# cells read it — confirm and remove or wire it through.
device="cuda:2"

# Use the default CUDA device when one is present and fall back to the CPU
# otherwise, so this cell no longer crashes right after printing the warning.
# NOTE(review): presumably this mirrors the device MonoT5 selects for its own
# model when none is passed — confirm against the installed pygaggle version.
model_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the MonoT5 re-ranker, then swap in the t5-base tokenizer and the
# locally fine-tuned checkpoint in place of the downloaded MS MARCO weights.
cross_encoder = MonoT5('castorini/monot5-base-msmarco', use_amp=True)
cross_encoder.tokenizer = MonoT5.get_tokenizer('t5-base', batch_size=32)
cross_encoder.model = T5ForConditionalGeneration.from_pretrained("../inPars/data/jff/20230210-021357").to(model_device)

In [None]:
# First-stage retrieval is lexical (keyword) search: we use the BM25 algorithm
# from the rank_bm25 package, and its candidates are then re-ranked by the cross-encoder.

from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np


# We lower case our text and remove stop-words from indexing
def bm25_tokenizer(text):
    """Turn ``text`` into the list of tokens used for BM25 indexing and querying.

    The text is lower-cased and split on whitespace; each piece has leading and
    trailing punctuation stripped. Pieces that end up empty, or that are English
    stop words, are dropped. Token order is preserved.
    """
    stripped_words = (word.strip(string.punctuation) for word in text.lower().split())
    return [
        word
        for word in stripped_words
        if word and word not in _stop_words.ENGLISH_STOP_WORDS
    ]

In [None]:
# Queries sent to the retrieval pipeline. Each theme is expressed as natural
# language questions, one combined keyword string, and the individual keywords.
questions = [
    # --- Study design / methodology ---
    "what is the study design?",
    "what is the research method?",
    "how was data collected and analysed? ",
    "study design; method; methodology; data collection; research design",
    "study design",
    "method",
    "methodology",
    "data collection",
    "research design",
    # --- Target population ---
    "what is the target population?",
    "who are the intended beneficiaries of the service?",
    "who does the service try to help?",
    "who was eligible for inclusion in the intervention?",
    "target population; beneficiaries; service users; participants; eligible population; eligibility criteria; cohort; clients",
    "target population",
    "beneficiaries",
    "service users",
    "participants",
    "eligible population",
    "eligibility criteria",
    "cohort",
    "clients",
    # --- Costs and outcome payments ---
    "what are the costs of the contract?",
    "how much is paid for outcomes?",
    "what are the outcomes payments?",
    "what is the total contract value?",
    "what is the price per outcome?",
    "outcomes payment; price; contract value; contract cap; rate card; incentive payment; costs; savings",
    "outcomes payment",
    "price",
    "contract value",
    "contract cap",
    "rate card",
    "incentive payment",
    "costs",
    "savings",
    # --- Results / impact ---
    "what outcomes were achieved?",
    # NOTE(review): the ';' before '?' below looks like a typo; it is left
    # unchanged because this string is also used as the key of the results.
    "what impact was achieved;?",
    "what were the results of the intervention?",
    "what was the impact of the intervention?",
    "were the contracted outcomes achieved?",
    "results; outcomes achieved; impact",
    "results",
    "outcomes achieved",
    "impact",
]
# questions = ['research; consultation; rating; document; level',
#             'vulnerable; patient; household; unemployed; living',
#             'usd; million; fund; outcome; payment',
#             'school; girl; student;']

In [None]:
path = "./data/samples"


def search(query, num_candidates=500, top_k=30):
    """Retrieve and re-rank passages of the current document for one query.

    Reads the module-level ``passages`` (passage strings of the document being
    processed) and ``bm25`` (the BM25 index built over them), so it always
    searches whichever document the main loop set up most recently.

    Steps:
      1. Score every passage with BM25 and keep the ``num_candidates`` best.
      2. Re-rank those candidates with ``cross_encoder``.
      3. Keep the first ``top_k`` distinct passages, print them, and append
         them (joined by newlines, labelled "Top N: ") to ``qa[query]``.

    Args:
        query: The question / keyword string. Must already be a key of ``qa``.
        num_candidates: Maximum number of BM25 hits passed to the re-ranker.
        top_k: Maximum number of distinct passages recorded for the query.
    """
    print("Input question:", query)

    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    # A full descending sort yields the true best candidates and also works
    # when the document has fewer passages than `num_candidates`. The previous
    # argpartition(kth=-5)[-500:] only guaranteed the top 5 were in place and
    # raised for documents with fewer than 5 passages.
    candidate_ids = np.argsort(bm25_scores)[::-1][:num_candidates]

    ##### Re-Ranking #####
    # Score all retrieved passages with the cross_encoder; `rerank` is expected
    # to return them ordered from most to least relevant.
    texts = [Text(passages[idx], {'docid': int(idx)}, 0) for idx in candidate_ids]
    reranked = cross_encoder.rerank(Query(query), texts)

    ans = []
    seen = set()  # raw passage texts already reported, for de-duplication
    for hit in reranked:
        if len(ans) == top_k:
            break
        result = hit.text.replace("\n", " ")
        # Compare against the raw text: `ans` holds "Top N: "-prefixed strings,
        # so a membership test on `ans` could never detect a duplicate.
        if result in seen:
            continue
        seen.add(result)
        ans.append("Top " + str(len(ans) + 1) + ": " + result)
        print("\t{:.3f}\t{}".format(hit.score, result))
    qa[query].append("\n".join(ans))


# Extract the passages of every file in `path`; one list of passages per file,
# in the order returned by os.listdir.
articles = []
for f in os.listdir(path):
    file_path = os.path.join(path, f)
    print(file_path)
    passages = extract_relevant_content(file_path)
    # Collapse whitespace runs and drop empty / extremely long passages.
    passages = [re.sub(r"\s+", " ", passage) for passage in passages if 0 < len(passage.split()) <= 100000]
    # Alternative filter for financial targets: keep only short passages that
    # contain a currency amount, a thousands-separated number, or a k/mil/% figure.
    # passages = [re.sub(r"\s+", " ", passage) for passage in passages if len(passage.split()) <= 300 and len(passage.split()) > 0 and (re.search(r"[£$]\d+", passage) or re.search(r"\d,\d\d\d", passage) or re.search(r"\d+(k|mil|%)", passage))]
    articles.append(passages)

# qa[question] collects one answer string per document, in `articles` order.
qa = {query: [] for query in questions}

for passages in articles:
    if not passages:
        # BM25Okapi cannot be built from an empty corpus. Record an empty
        # answer so every question still has exactly one entry per document.
        for question in questions:
            qa[question].append("")
        continue

    tokenized_corpus = [bm25_tokenizer(passage) for passage in tqdm(passages)]
    bm25 = BM25Okapi(tokenized_corpus)

    for question in questions:
        search(query = question)

In [None]:
# Labels for the processed documents, one per entry in each qa[question] list.
# NOTE(review): these are hard-coded and must be in the same order in which the
# files were read from the samples directory — confirm against the paths
# printed during extraction.
document_labels = [
    "#17247 - Brookings 2017.pdf",
    "#2598 - Lee 2020.pdf",
    "#17755 - Ecorys 2019.pdf",
    "#17284 - Warner 2018.pdf",
    "#17725 - IDinsight 2018.pdf",
    "#17192 - Education 2016 (1).pdf",
]

# Build the frame with documents as rows, then transpose so that each row is a
# question and each column is a document.
df = pd.DataFrame(data=qa, index=document_labels).T
print(df)
df.to_excel('qa_top30_inPars.xlsx')

In [None]:
# search(query = "what is the scale of intervention?")

In [None]:
# for passages in articles:
#     for passage in passages:
#         if len(passage.split())>512:
#             print(len(passage.split()))

In [None]:
# Spot-check: show the answers collected for one question (the list holds one
# newline-joined "Top N: ..." string per processed document).
qa['what is the study design?']