# *Setup Python Environment and Libraries*

In [None]:
# Install BM25 (rank-bm25 supplies BM25Okapi) and NLTK (tokenizer and stop-word list)
!pip install rank-bm25 nltk

In [None]:
# Install the latest master of Haystack
# NOTE(review): this is an unpinned install of a moving branch, so a fresh run may
# pull a different Haystack than the one this notebook was written against. The
# import cell below relies on module paths such as `haystack.retriever.dense`;
# presumably those only exist in older releases -- confirm and pin a compatible
# version or commit.
!pip install git+https://github.com/deepset-ai/haystack.git

Collecting git+https://github.com/deepset-ai/haystack.git
  Cloning https://github.com/deepset-ai/haystack.git to /tmp/pip-req-build-5332jljx
  Running command git clone -q https://github.com/deepset-ai/haystack.git /tmp/pip-req-build-5332jljx
Collecting farm==0.8.0
  Downloading farm-0.8.0-py3-none-any.whl (204 kB)
[K     |████████████████████████████████| 204 kB 5.3 MB/s 
[?25hCollecting fastapi
  Downloading fastapi-0.68.1-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.1 MB/s 
[?25hCollecting uvicorn
  Downloading uvicorn-0.15.0-py3-none-any.whl (54 kB)
[K     |████████████████████████████████| 54 kB 2.9 MB/s 
[?25hCollecting gunicorn
  Downloading gunicorn-20.1.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 8.4 MB/s 
Collecting elasticsearch<=7.10,>=7.7
  Downloading elasticsearch-7.10.0-py2.py3-none-any.whl (321 kB)
[K     |████████████████████████████████| 321 kB 53.0 MB/s 
[?25hCollecting elastic-apm
  Downloading 

In [None]:
# Import libraries 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path, PurePath
import requests
from requests.exceptions import HTTPError, ConnectionError
from ipywidgets import interact
import ipywidgets as widgets
from rank_bm25 import BM25Okapi
import nltk
from nltk.corpus import stopwords
nltk.download("punkt")
nltk.download("stopwords")
import re
import os
import glob
import sys
from tqdm import tqdm
import json
import math 
from google.colab import files

# Import DPR stuff
from haystack.retriever.dense import DensePassageRetriever
from haystack.preprocessor.utils import fetch_archive_from_http
from haystack.document_store.memory import InMemoryDocumentStore
from haystack.generator.transformers import RAGenerator
from haystack.document_store.faiss import FAISSDocumentStore



KeyboardInterrupt: ignored

# BM25 Hard Negative Selection

The following section cleans the synthetic data and finds hard negatives for each synthetic QA pair to use for DPR fine-tuning 

In [None]:
# Setup functions for cleaning text and stop word dropping
# NLTK's English stop-word list, de-duplicated; tokenize() tests membership against it.
english_stopwords = list(set(stopwords.words('english')))

def strip_characters(text):
    """Remove punctuation from ``text`` and turn slashes into spaces.

    The characters ``( ) : , ; . ’ ” “ ? % > <`` and the ASCII apostrophe are
    deleted outright; every ``/`` becomes a single space so that slash-joined
    words (e.g. ``and/or``) remain separate tokens.

    Args:
        text: String to clean.

    Returns:
        The cleaned string.
    """
    # Raw string + character class: matches exactly the characters the old
    # alternation did, without invalid escapes like "\(" in a normal literal.
    t = re.sub(r"[():,;.’”“?%><]", '', text)
    t = re.sub('/', ' ', t)
    t = t.replace("'", '')
    return t

def clean(text):
    """Lower-case ``text`` and strip punctuation via ``strip_characters``."""
    return strip_characters(text.lower())

def tokenize(text):
    """Split ``text`` into a de-duplicated list of content words.

    A token from ``nltk.word_tokenize`` is kept only when it is longer than one
    character, is not in ``english_stopwords``, is not numeric and is purely
    alphabetic. Duplicates are removed through a set, so the order of the
    returned list is not defined.
    """
    kept = []
    for token in nltk.word_tokenize(text):
        if len(token) <= 1:
            continue
        if token in english_stopwords:
            continue
        if token.isnumeric() or not token.isalpha():
            continue
        kept.append(token)
    return list(set(kept))

def preprocess(text):
    """Clean ``text`` and return its token list (``clean`` followed by ``tokenize``)."""
    return tokenize(clean(text))

The following cells download the data needed for selecting hard negatives.

Each cell fetches a CSV file from a Google Drive link:

1. QA.csv (synthetic QA generated by Shamane)
2. covid_data_full.csv (CORD19 data processed from the BM25 notebook) <- Currently not used because the pipeline is still being tested, so the rest of the code uses QA.csv
3. covid_dump.csv (tab-separated CORD19 titles and contexts, read further down as the BM25 knowledge base)

In [None]:
# Download QA.csv (generated by Shamane)
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1kKZSBpgDwRCvaMR9y9caEs1wSdbFKRzs' -O QA.csv

In [None]:
# Download Covid data full
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1zjw7U1bufzIU1j8HaW7NvkGNn9myYiDb' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1zjw7U1bufzIU1j8HaW7NvkGNn9myYiDb" -O covid_data_full.csv && rm -rf /tmp/cookies.txt

In [None]:
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1ZeOqiN4duXO0IO_TMQHpiAar3AJUhziR' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1ZeOqiN4duXO0IO_TMQHpiAar3AJUhziR" -O covid_dump.csv && rm -rf /tmp/cookies.txt


In [None]:
# NOT CURRENTLY USED 
# corona_df = pd.read_csv("covid_data_full.csv")
# print(corona_df.head())

In [None]:
# Create hard negatives using BM25 using this data
# `qa` holds the synthetic QA pairs; later cells read its question, answer,
# context and title columns.
qa = pd.read_csv("QA.csv")
print(qa.head())

                                                                          question  ...                                                                            title
0       What is an example of a virus that initiates sg mRNA synthesis internally?  ...  Sequence requirements for RNA strand transfer during nidovirus discontinuous...
1  What has been suggested to explain the cotranscriptional fusion of noncontig...  ...  Sequence requirements for RNA strand transfer during nidovirus discontinuous...
2          What is not signi®cantly affected by leader TRS and body TRS mutations?  ...  Sequence requirements for RNA strand transfer during nidovirus discontinuous...
3                         What has the crucial role in nidovirus sg RNA synthesis?  ...  Sequence requirements for RNA strand transfer during nidovirus discontinuous...
4  What does the single leader TRS mutation at all six Nidovirus discontinuous ...  ...  Sequence requirements for RNA strand transfer during nidovirus dis

The above sets up the python environment and loads in the data.
covid_data_full.csv is the processed output of the BM25 notebook, i.e. the processed AllenAI CORD19 data.

The QA.csv is the data from Shamane.

Currently, the covid_data_full.csv is too large to process fully so start with QA.csv

Approach for creating hard negatives 

1. Use BM25 to retrieve a list of passages for each synthetic question. The corpus of passages should not include the context of that synthetic question. Ideally we keep only the top-k passages that don't contain the answer. On its own this is a bit naive, so we can go further:
    1. For each of the top-k passages selected using BM25 we want to generate an answer
    2. Do similarity to synthetic answer 
    3. Further subset the top-k passages based on the lowest similarity score 
2. Now, for each synthetic QA pair, we have $\{q_i, a_i \mid p_i^+, p_{i,1}^-, \dots, p_{i,m}^-\}$, where $m$ is the number of hard negative passages selected via BM25.

DPR format

```python
[
    {
        "question": "....",
        "answers": ["...", "...", "..."],
        "positive_ctxs": [{
            "title": "...",
            "text": "...."
        }],
        "negative_ctxs": ["..."],
        "hard_negative_ctxs": ["..."]
    },
    ...
]
```

In [None]:
# Tokenize every QA.csv context into the BM25 corpus.
# Missing contexts become "" first; the result is a one-column frame named "context".
qa_contexts = qa["context"].fillna("")
BM25Corpus = qa_contexts.apply(preprocess).to_frame()

In [None]:
# Knowledge base built from the CORD-19 dump (tab-separated; columns renamed to title/context).
# NOTE: this rebinds BM25Corpus, replacing the QA.csv-based corpus from the previous cell.
covid_dump = pd.read_csv(
    "covid_dump.csv",
    sep="\t",
    header=0,
    names=["title", "context"],
)
covid_contexts = covid_dump["context"].fillna("")
BM25Corpus = covid_contexts.apply(preprocess).to_frame()

In [None]:
 # Create the BM25 object
 # Indexes whichever BM25Corpus was assigned last; when the cells run top to
 # bottom that is the covid_dump corpus, so BM25 document positions refer to
 # covid_dump rows.
 BM25 = BM25Okapi(BM25Corpus.context.tolist())

In [None]:
qa

Unnamed: 0,question,answer,context,title
0,What is an example of a virus that initiates sg mRNA synthesis internally?,brome mosaic virus,"Some viruses, such as brome mosaic virus, initiate sg mRNA synthesis interna...",Sequence requirements for RNA strand transfer during nidovirus discontinuous...
1,What has been suggested to explain the cotranscriptional fusion of noncontig...,Various models,Various models have been put forward to explain the cotranscriptional fusion...,Sequence requirements for RNA strand transfer during nidovirus discontinuous...
2,What is not signi®cantly affected by leader TRS and body TRS mutations?,EAV genome replication,EAV genome replication is not signi®cantly affected by leader TRS and body T...,Sequence requirements for RNA strand transfer during nidovirus discontinuous...
3,What has the crucial role in nidovirus sg RNA synthesis?,base pairing between the sense leader TRS and the antisense body TRS,"Recently, we have established the pivotal role of an interaction between sen...",Sequence requirements for RNA strand transfer during nidovirus discontinuous...
4,What does the single leader TRS mutation at all six Nidovirus discontinuous ...,RNA7 synthesis,"In contrast to our ®ndings with the body TRS mutants, we did not obtain lead...",Sequence requirements for RNA strand transfer during nidovirus discontinuous...
...,...,...,...,...
87326,How many isolates had the gene encoding OXA-23 carbapenemase?,Twenty-six,No. of COVID-19 admissions met multidrug-resistant CRAB criteria. Thirty iso...,Increase in Hospital-Acquired Carbapenem-Resistant Acinetobacter baumannii I...
87327,What did environmental services disinfect with bleach?,common areas and high-touch surfaces of ICUs,"In early May, hospital A's IPC leadership advised physicians, unit managers,...",Increase in Hospital-Acquired Carbapenem-Resistant Acinetobacter baumannii I...
87328,What did NJDOH investigate?,the cluster,"In collaboration with hospital A, NJDOH investigated the cluster, including ...",Increase in Hospital-Acquired Carbapenem-Resistant Acinetobacter baumannii I...
87329,What led to deviations in IPC practices?,Strategies to preserve continuity of care,A New Jersey hospital reported a cluster of 34 CRAB cases that peaked during...,Increase in Hospital-Acquired Carbapenem-Resistant Acinetobacter baumannii I...


In [None]:
topK = 10 # Get top 10 best matching documents as the hard negatives
qa_FN_DPR_format = [] # The DPR format above is json like

# BM25 scores are positional: docScores[i] belongs to row i of the frame the BM25
# corpus was tokenized from. In notebook order BM25Corpus is rebuilt from
# covid_dump right before BM25 is created, so the top-k positions must be looked
# up in covid_dump. Looking them up in qa returns unrelated rows (or raises
# IndexError when the knowledge base has more rows than qa).
hard_negative_pool = covid_dump
assert len(hard_negative_pool) == BM25.corpus_size, (
    "BM25 was built from a different corpus than hard_negative_pool; "
    "point hard_negative_pool at the frame that BM25Corpus was derived from"
)

# Iterate each synthetic QA pair in QA.csv and convert it into the DPR data format above
for _, row in tqdm(qa.iterrows(), total=len(qa)):
  query_tokens = preprocess(row["question"])
  docScores = BM25.get_scores(query_tokens)
  # Take one spare candidate so that dropping the question's own context still leaves topK
  idxTopKDocs = np.argsort(docScores)[::-1][:topK + 1] # Reverse order and get best ones
  candidates = hard_negative_pool.iloc[idxTopKDocs]
  # A passage identical to the positive context must never be labelled as a negative
  hard_negatives = candidates[candidates["context"] != row["context"]].head(topK)

  positive_ctxs = {"title": row["title"], "text": row["context"]}
  hard_negative_ctxs = [{"title": neg["title"], "text": neg["context"]} for _, neg in hard_negatives.iterrows()]
  qa_FN_DPR_format.append({"question": row["question"], "answers": [row["answer"]], "positive_ctxs": [positive_ctxs], "negative_ctxs": [], "hard_negative_ctxs": hard_negative_ctxs})

with open("qa_FN_DPR_format.json", "w") as fout:
  json.dump(qa_FN_DPR_format, fout)

In [None]:
# Split the DPR examples 80/20, in their existing order, into a train file and a dev file
split = math.floor(len(qa_FN_DPR_format) * 0.8)

for out_name, subset in (
    ("qa_FN_DPR_format_TRAIN.json", qa_FN_DPR_format[:split]),
    ("qa_FN_DPR_format_DEV.json", qa_FN_DPR_format[split:]),
):
  with open(out_name, "w") as fout:
    json.dump(subset, fout)


In [None]:
# Send the combined, train and dev DPR files to the local machine
for dpr_file in (
    "qa_FN_DPR_format.json",
    "qa_FN_DPR_format_TRAIN.json",
    "qa_FN_DPR_format_DEV.json",
):
  files.download(dpr_file)

# Training and Fine-tuning the DPR

Now that the data has been processed, we can train the DPR!

The code above preprocesses the data for DPR training, but its output is already saved on my Google Drive, so the two commands below download the preprocessed data again and we don't have to wait roughly 5 hours for the code above to run.

# New Section

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Download the processed data in the JSON format required for DPR
# (training split first, then dev split; each command removes its cookie file afterwards)
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1DG4sa6eCHuVCiuQqRtdk-YfYqezEj7rA' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1DG4sa6eCHuVCiuQqRtdk-YfYqezEj7rA" -O qa_FN_DPR_format_TRAIN.json && rm -rf /tmp/cookies.txt

!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1bxsfG-oZBJVG9qVCG9lCO2AH-h2lsR8C' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1bxsfG-oZBJVG9qVCG9lCO2AH-h2lsR8C" -O qa_FN_DPR_format_DEV.json && rm -rf /tmp/cookies.txt

In [None]:
# Fine tuning
# DPR-format JSON files produced or downloaded by the earlier cells
train_filename = "qa_FN_DPR_format_TRAIN.json"
dev_filename = "qa_FN_DPR_format_DEV.json"

# Pretrained question and passage encoders used as the starting point
query_model = "facebook/dpr-question_encoder-single-nq-base"
passage_model = "facebook/dpr-ctx_encoder-single-nq-base"

# Absolute path: the fine-tuned model is written under /dpr at the filesystem root
save_dir = "/dpr/saved_models"

In [None]:
## Initialize DPR model
# The retriever is given an empty in-memory document store; in this notebook it is
# only used for training before `retriever` is rebound to the reloaded model later.
# NOTE(review): max_seq_len_* are presumably token limits for queries and passages -- confirm.

retriever = DensePassageRetriever(
    document_store=InMemoryDocumentStore(),
    query_embedding_model=query_model,
    passage_embedding_model=passage_model,
    max_seq_len_query=64,
    max_seq_len_passage=256
)

In [None]:
!nvidia-smi

In [None]:
# Start training our model and save it when it is finished
# NOTE: test_filename reuses the dev file, so any test metrics are computed on the dev split.
# NOTE(review): num_hard_negatives=1 although 10 hard negatives are prepared per
# question -- presumably only one is used per example; confirm against the Haystack docs.
retriever.train(
    data_dir="",  # file names are given as-is, with no directory prefix
    train_filename=train_filename,
    dev_filename=dev_filename,
    test_filename=dev_filename,
    n_epochs=1,
    batch_size=16,
    grad_acc_steps=8,
    save_dir=save_dir,
    evaluate_every=3000,
    embed_title=True,
    num_positives=1,
    num_hard_negatives=1
)

Zip the DPR model folder so we can export/download

In [None]:
!zip -r dpr.zip /dpr

In [None]:
# Download fine-tuned DPR model and save to local machine
files.download("dpr.zip")

# Reloading Fine-Tuned DPR Model 

In [None]:
# Reload fine-tuned DPR
# Load from the directory the training cell saved to (`save_dir`, an absolute path).
# The previous relative "dpr/saved_models" only exists after dpr.zip has been
# unzipped into the working directory, so it fails right after training.
reloaded_retriever = DensePassageRetriever.load(load_dir=save_dir, document_store=None)

# QA Pipeline Using Fine-tuned DPR + RAG

In [None]:
# Download the fine-tuned DPR
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1--TphsSGk083rMNzSoEW6GaK_Qm8fiRe' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1--TphsSGk083rMNzSoEW6GaK_Qm8fiRe" -O dpr.zip && rm -rf /tmp/cookies.txt

# Unzip the downloaded DPR archive into the working directory
! unzip -q dpr.zip

In [None]:
# Download the COVID-QA data
# -O names the output file explicitly. Without it wget treats the trailing
# "COVID-QA.json" as a second URL to fetch, and re-runs save COVID-QA.json.1
# instead of refreshing the file that is loaded below.
!wget https://raw.githubusercontent.com/deepset-ai/COVID-QA/master/data/question-answering/COVID-QA.json -O COVID-QA.json

In [None]:
# Load the COVID-QA dataset; the context manager closes the file handle promptly
with open("COVID-QA.json") as covid_qa_file:
  COVID_QA = json.load(covid_qa_file)
# Number of articles in the dataset
len(COVID_QA["data"])

In [None]:
# Flatten COVID-QA: only the first paragraph of every article is used.
# `qas` feeds the DataFrame; `documents` holds the {"text", "meta"} dicts
# that are written to the document store later.
qas = [article["paragraphs"][0] for article in COVID_QA["data"]]
documents = [
    {"text": paragraph["context"], "meta": {"name": paragraph["document_id"]}}
    for paragraph in qas
]
COVID_QA_df = pd.DataFrame(qas)

In [None]:
# Initialize FAISS document store.
# Set `return_embedding` to `True`, so generator doesn't have to perform re-embedding
document_store = FAISSDocumentStore(
    faiss_index_factory_str="Flat",
    return_embedding=True
)

# Initialize DPR Retriever to encode documents, encode question and query documents
# Loads the fine-tuned model from the relative folder "dpr/saved_models" (the
# location dpr.zip unpacks to -- presumably; confirm the archive layout) and
# rebinds `retriever`, replacing the training-time instance.
retriever = DensePassageRetriever.load(
    load_dir = "dpr/saved_models", document_store = document_store
)

# Initialize RAG Generator
# NOTE(review): top_k=1 / num_beams=2 presumably mean one returned answer from a
# two-beam search -- confirm against the RAGenerator documentation.
generator = RAGenerator(
    model_name_or_path="facebook/rag-token-nq",
    use_gpu=True,
    top_k=1,
    max_length=200,
    min_length=2,
    embed_title=True,
    num_beams=2,
)

In [None]:
# Delete existing documents in documents store
# (keeps the cell re-runnable without writing the same documents twice)
document_store.delete_documents()

# Write documents to document store
# `documents` is the list of {"text", "meta"} dicts built from COVID-QA above
document_store.write_documents(documents)

# Add documents embeddings to index
# Uses the fine-tuned DPR retriever loaded above to embed the stored documents
document_store.update_embeddings(
    retriever=retriever
)

In [None]:
# Smoke-test retriever + generator on a few COVID-QA questions.
# This prints generated answers; it does not compute retrieval recall.
# QUESTIONS was never defined in this notebook (NameError on a fresh run), so a
# small sample is drawn from the COVID-QA questions loaded above.
NUM_SAMPLE_QUESTIONS = 5
QUESTIONS = [
    qa_pair["question"]
    for article in COVID_QA["data"]
    for qa_pair in article["paragraphs"][0]["qas"]
][:NUM_SAMPLE_QUESTIONS]

for question in QUESTIONS:
    # Retrieve related documents from retriever
    retriever_results = retriever.retrieve(
        query=question
    )

    # Now generate answer from question and retrieved documents
    predicted_result = generator.predict(
        query=question,
        documents=retriever_results,
        top_k=1
    )

    # Print the answer; fall back to an empty string if the generator returned none
    answers = predicted_result["answers"]
    generated_answer = answers[0]["answer"] if answers else ""
    print(f'Generated answer is \'{generated_answer}\' for the question = \'{question}\'')
#https://haystack.deepset.ai/docs/latest/tutorial7md


In [None]:
# Or alternatively use the Pipeline class
from haystack.pipeline import GenerativeQAPipeline
pipe = GenerativeQAPipeline(generator=generator, retriever=retriever)

# Run every COVID-QA question through retriever (top 5 documents) + generator (1 answer).
# Loop variables are named so they do not overwrite the `qas` list that
# COVID_QA_df was built from; clobbering it broke re-running that cell.
res = []
for _, article in COVID_QA_df.iterrows():
  for qa_item in article["qas"]:
    prediction = pipe.run(query=qa_item["question"], top_k_generator=1, top_k_retriever=5)
    # Alternative record that also keeps the gold answers:
    # res.append({"question": qa_item["question"], "true_answers": [a["text"] for a in qa_item["answers"]], "pred_answer": prediction})
    res.append(prediction)
res_df = pd.DataFrame(res)
print(res_df)


In [None]:
res_df.to_csv("DPR_RAG_COVID_QA.csv")

# **BELOW IS OLD CODE JUST KEEPING FOR REFERENCE**

In the DPR training notebook, we download these files to train the DPR from scratch

s3_url_train = "https://dl.fbaipublicfiles.com/dpr/data/retriever/biencoder-nq-train.json.gz"

s3_url_dev = "https://dl.fbaipublicfiles.com/dpr/data/retriever/biencoder-nq-dev.json.gz"

Unzipping the first file we get "biencoder-nq-train.json"

Reading the file with the following script gives us the JSON object format for training/fine tuning the DPR. This is just for reference to get an idea of file format required for training/fine tuning DPR. 

```python
import json
infile = "biencoder-nq-train.json"
count = 0 
with open(infile) as f_read:
    for line in f_read:
        if count > 100:
            break
        line = line.strip()
        if len(line) > 0:
            print(line)
        count = count + 1
```




```python
[
{
"dataset": "nq_train_psgs_w100",
"question": "big little lies season 2 how many episodes",
"answers": [
"seven"
],
"positive_ctxs": [
{
"title": "Big Little Lies (TV series)",
"text": "series garnered several accolades. It received 16 Emmy Award nominations and won eight, including Outstanding Limited Series and acting awards for Kidman, Skarsg\u00e5rd, and Dern. The trio also won Golden Globe Awards in addition to a Golden Globe Award for Best Miniseries or Television Film win for the series. Kidman and Skarsg\u00e5rd also received Screen Actors Guild Awards for their performances. Despite originally being billed as a miniseries, HBO renewed the series for a second season. Production on the second season began in March 2018 and is set to premiere in 2019. All seven episodes are being written by Kelley",   
"score": 1000,
"title_score": 1,
"passage_id": "18768923"
},
{
"title": "Little People, Big World",
"text": "TLC announced a spin-off series \u2013 \"\". It chronicles Matt and Amy as they jump-start their wedding business on the farm. The series premiered on November 13, 2012, and ran for six episodes. It was announced in October 2013 that \"Little People, Big World\" would return for a seventh season. Season seven consists of eight episodes, and premiered on October 29, 2013. An eighth season began on September 2, 2014, 
and a ninth season began on July 6, 2015. The show was renewed for a tenth season which premiered in fall 2016. As of 2018, the show has aired for",
"score": 13.371864,
"title_score": 0,
"passage_id": "7459110"
},
{
"title": "Big Little Lies (TV series)",
"text": "shows of 2017. A soundtrack for the series was released on Google Play and iTunes on March 31, 2017. The first season was released on Blu-ray and DVD on August 1, 2017. Big Little Lies (TV series) Big Little Lies is an American drama television series, based on the novel of the same name by Liane Moriarty, that premiered on February 19, 2017, on HBO. Created and written by David E. Kelley, the series' seven-episode 
first season was directed by Jean-Marc Vall\u00e9e. \"Big Little Lies\" stars Nicole Kidman, Reese Witherspoon and Shailene Woodley and tells the story of three emotionally",
"score": 12.932647,
"title_score": 0,
"passage_id": "18768935"
},
{
"title": "Big Little Lies (TV series)",
"text": "Big Little Lies (TV series) Big Little Lies is an American drama television series, based on the novel of the same name by Liane Moriarty, that premiered on February 19, 2017, on HBO. Created and written by David E. Kelley, the series' seven-episode first season was directed by Jean-Marc Vall\u00e9e. \"Big Little Lies\" stars Nicole Kidman, Reese Witherspoon and Shailene Woodley and tells the story of three emotionally troubled women in Monterey, California, who become embroiled in a murder investigation. Alexander Skarsg\u00e5rd, Laura Dern, Jeffrey Nordling, Adam Scott, Zo\u00eb Kravitz, and James Tupper feature in supporting roles. Critically acclaimed, the",
"score": 12.449134,
"title_score": 0,
"passage_id": "18768922"
},
{
"title": "Andrea Arnold",
"text": "Andrea Arnold Andrea Arnold, OBE (born 5 April 1961) is an English filmmaker and former actress. She won an Academy Award for her short film \"Wasp\" in 2005. She has since made the leap to feature films and television, including \"Red Road\" (2006), \"Fish Tank\" (2009), and \"American Honey\" (2016), all of which have won the Jury Prize at the Cannes Film Festival. Arnold has also directed four episodes of the Emmy Award-winning series \"Transparent\", as well as all seven episodes of the second season of the Emmy Award-winning series \"Big Little Lies\". Arnold was born in Dartford, Kent, the",
"score": 12.204561,
"title_score": 0,
"passage_id": "7854255"
},
{
"title": "Designing Women",
"text": "to whom she eventually loses. In reality, Dixie Carter was a Republican who disagreed with some of the liberal views expressed by her onscreen character, although she did become a Clinton supporter. Shout! Factory has released all seven seasons of \"Designing Women\" on DVD in Region 1. On September 2, 2003, Sony Pictures released \"The Best of Designing Women\", a single-disc DVD featuring five episodes ranging between seasons one through four: \"Designing Women (Pilot)\" (season 1), \"Killing All the Right People\" (season 2), \"Reservations for Eight\" (season 2), \"Big Haas and Little Falsie\" (season 3) and \"They Shoot 
Fat Women,",
"score": 11.899436,
"title_score": 0,
"passage_id": "1523654"
},
{
"title": "Big Little Lies (TV series)",
"text": "series garnered several accolades. It received 16 Emmy Award nominations and won eight, including Outstanding Limited Series and acting awards for Kidman, Skarsg\u00e5rd, and Dern. The trio also won Golden Globe Awards in addition to a Golden Globe Award for Best Miniseries or Television Film win for the series. Kidman and Skarsg\u00e5rd also received Screen Actors Guild Awards for their performances. Despite originally being billed as a miniseries, HBO renewed the series for a second season. Production on the second season began in March 2018 and is set to premiere in 2019. All seven episodes are being written by Kelley",   
"score": 11.830096,
"title_score": 0,
"passage_id": "18768923"
},
{
"title": "The X-Files (season 2)",
"text": "out of five to \"Little Green Men\", \"Duane Barry\", \"One Breath\", \"Irresistible\", \"Die Hand Die Verletzt\", and \"Anasazi\". However, several episodes rated poorly, with \"3\", \"Excelsis Dei\", and \"The Calusari\" being considered particularly poor. Many critics considered the \"Duane Barry\"/\"Ascension\"/\"One Breath\" story arc to be the best part of the season. Shearman singled out the three-parter as the highlight of the season, noting that the \"intimacy\" and \"sincerity [of] the emotion\" of the episodes allowed the mythology of \"The X-Files\" to play out for a further seven seasons. Tom Kessenich, in his book \"Examination: An Unauthorized Look at Seasons 6\u20139",
"score": 10.869938,
"title_score": 0,
"passage_id": "3670320"
},
{
"title": "The Big Bang Theory",
"text": "season. The second half of season seven aired in mid 2014. The eighth season premiered on E4 on October 23, 2014 at 8:30 p.m. During its eighth season, \"The Big Bang Theory\" shared its 8:30 p.m. time period with fellow CBS comedy, \"2 Broke Girls\". Following the airing of the first eight episodes of that show's fourth season, \"The Big Bang Theory\" returned to finish airing its eighth season on March 19, 2015. Netflix UK & Ireland announced on February 13, 2016 that seasons 1\u20138 would be available to stream from February 15, 2016. \"The Big Bang Theory\" started off",
"score": 10.384873,
"title_score": 0,
"passage_id": "10248582"
}
],
"negative_ctxs": [
{
"title": "Cormac McCarthy",
"text": "chores of the house, Lee was asked by Cormac to also get a day job so he could focus on his novel writing. Dismayed with the situation, she moved to Wyoming, where she filed for divorce and landed her first job teaching. Cormac McCarthy is fluent in Spanish and lived in Ibiza, Spain, in the 1960s and later settled in El Paso, Texas, where he lived for nearly 20 years. In an interview with Richard B. Woodward from \"The New York Times\", \"McCarthy doesn't drink anymore \u2013 he quit 16 years ago in El Paso, with one of his young",
"score": 0,
"title_score": 0,
"passage_id": "2145653"
},
{
"title": "Pragmatic Sanction of 1549",
"text": "one heir, Charles effectively united the Netherlands as one entity. After Charles' abdication in 1555, the Seventeen Provinces passed to his son, Philip II of Spain. The Pragmatic Sanction is said to be one example of the Habsburg contest with particularism that contributed to the Dutch Revolt. Each of the provinces had its own laws, customs and political practices. The new policy, imposed from the outside, angered 
many inhabitants, who viewed their provinces as distinct entities. It and other monarchical acts, such as the creation of bishoprics and promulgation of laws against heresy, stoked resentments, which fired the eruption of",
"score": 0,
"title_score": 0,
"passage_id": "2271902"
},
{
"title": "Hall Airport",
"text": "Hall Airport Hall Airport is a privately owned, public use airport located six nautical miles (11 km) northwest of the central business district of Kaufman, a city in Kaufman County, Texas, United States. 
Hall Airport covers an area of 27 acres (11 ha) at an elevation of 440 feet (134 m) above mean sea level. It has one runway designated 17/35 with a turf surface measuring 2,585 by 40 feet (788 x 12 m). For the 12-month period ending May 23, 2007, the airport had 201 general aviation aircraft operations, an average of 16 per month. At that time there",
"score": 0,
"title_score": 0,
"passage_id": "17333840"
},
{
"title": "St Columba's College, Largs",
"text": "early 1960s) who was followed by Brother Germanus (who later became David Germanus) then Brother Nicholas (who later left the Brothers to get married) who is mentioned as part of a 'holy fourball' of golfers (given his golfing skills) on pages 9 and 15 of \"Sam\" the autiobiography of Sam Torrance, the famous golfer and team captain for Europe in the Ryder Cup in 2002. Sam was a member, as was his father the Club Professional, of Routenburn Golf Club, situated above and adjacent to the school. The various headmasters kept up the wonderful traditions of devotion to the Marist",
"score": 0,
"title_score": 0,
"passage_id": "13284527"
},
```



In [None]:
# topK = 10
# fn = []
# for r in qa.index: 
#   question = qa.question[r]
#   question = preprocess(question) 
#   docScores = BM25.get_scores(question)
#   docScores = np.delete(docScores, r)
#   idxTopKDocs = np.argsort(docScores)[::-1][:topK] # Reverse order and get best ones
#   fn.append(qa.context[idxTopKDocs].tolist())
# qa["False Negatives"] = fn
# qa.to_csv("QA_FN.csv", index = False)

In [None]:
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1qz6Nv95mxhus03IqNnwbrxXu9m-dZI3X' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1qz6Nv95mxhus03IqNnwbrxXu9m-dZI3X" -O QA_FN.csv && rm -rf /tmp/cookies.txt


In [None]:
# qa_FN = pd.read_csv("QA_FN.csv")

The following code takes a reduced qa dataframe and puts the data into the dictionary format that is required for the DPR training model.

In [None]:
# # Creating a reduced qa dataframe to try use DPR training model with
# reduced_qa = qa.iloc[0:100,:]

# topK = 10
# fn = []
# p_ctxs = []
# for i in range(100):
#   question = qa.question[i]
#   question = preprocess(question) 
#   docScores = BM25.get_scores(question)
#   p_ctxs.append({"title":"Nothing", "text":qa.context[i], "score":docScores[i], "title_score":0, "passage_id":i})
#   docScores = np.delete(docScores, i)
#   idxTopKDocs = np.argsort(docScores)[::-1][:topK] # Reverse order and get best ones
#   pre_fn = [{"title":"Nothing", "text":qa.context[x], "score":docScores[x], "title_score":0, "passage_id":x} for x in idxTopKDocs] # Store each false negative using a dictionary structure
#   fn.append(pre_fn)

# reduced_qa["hard_negative_ctxs"] = fn
# reduced_qa["positive_ctxs"] = p_ctxs

In [None]:
# Install the latest master of Haystack
!pip install git+https://github.com/deepset-ai/haystack.git

In [None]:
from haystack.retriever.dense import DensePassageRetriever
from haystack.preprocessor.utils import fetch_archive_from_http
from haystack.document_store.memory import InMemoryDocumentStore

In [None]:
# df = []
# for i in qa.index:
#   df.append({"dataset":i,"question":qa.question[i],"answers":qa.answer[i].split(" "), "positive_ctxs":qa.positive_ctxs[i], "negative_ctxs":[], "hard_negative_ctxs":qa.hard_negative_ctxs[i]})
  
