# Question Answering System


Further reading:
- https://lilianweng.github.io/posts/2020-10-29-odqa/
- https://yjernite.github.io/lfqa.html

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the data and checkpoint paths used later in this notebook are
# relative ("./data", "./save_model"); confirm the working directory actually
# points inside the mounted drive before relying on this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Wiki-40B: Multilingual Language Model Dataset:
https://aclanthology.org/2020.lrec-1.297.pdf

In [None]:
!pip -q install sentence-transformers==2.2.2 transformers==4.26.1

In [None]:
import os
import json
import gzip
import torch
from transformers import *
from sentence_transformers import SentenceTransformer, CrossEncoder, util

In [None]:
def load_passages(data_path):
    """Load the first paragraph of every article in a gzipped JSON-lines file.

    Every non-blank line is parsed as a JSON object and the first element of
    its ``'paragraphs'`` list is kept. Blank lines and articles whose
    ``'paragraphs'`` list is empty are skipped instead of aborting the load.
    The number of collected passages is printed.

    Args:
        data_path: Path to a gzip-compressed, UTF-8 encoded JSON-lines file.

    Returns:
        list: The first paragraph of each article that has one, in file order.

    Raises:
        KeyError: If a record has no ``'paragraphs'`` key (kept loud on
            purpose: it indicates a file with an unexpected schema).
    """
    passages = []
    with gzip.open(data_path, 'rt', encoding='utf8') as fIn:
        for line in fIn:
            line = line.strip()
            if not line:
                # json.loads('') raises; a stray blank line is harmless.
                continue

            paragraphs = json.loads(line)['paragraphs']
            if paragraphs:
                passages.append(paragraphs[0])

    print("Passages:", len(passages))
    return passages

def predict(question, context):
    """Extract an answer span for ``question`` from ``context``.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        question: Question text.
        context: Passage in which to look for the answer.

    Returns:
        tuple: ``(answer, score)``. ``answer`` is the decoded text between the
        most likely start token and the most likely end token (an empty
        string when the predicted end lies before the start). ``score`` is
        the mean of the start and end logits at those two positions as a
        Python float; higher means the model is more confident in the span.
    """
    # The pair is encoded context-first, question-second. If the pair is too
    # long for the model, only the context (first segment) is truncated.
    encoded = tokenizer.encode_plus(
        context,
        question,
        return_tensors='pt',
        truncation='only_first',
    )
    encoded = encoded.to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**encoded)

    start_logits = outputs.start_logits[0]
    end_logits = outputs.end_logits[0]
    start_position = int(torch.argmax(start_logits))
    end_position = int(torch.argmax(end_logits))

    answer = tokenizer.decode(
        encoded['input_ids'][0][start_position:end_position + 1],
        skip_special_tokens=True,
    )
    # Score the span by the logits of the chosen tokens, not by where the
    # tokens happen to sit in the sequence.
    score = (start_logits[start_position] + end_logits[end_position]) / 2
    return answer, score.item()

def ranker(question, corpus_embeddings, top_k=20):
    """Retrieve the passages most similar to ``question`` and re-rank them.

    Uses the notebook-level ``bi_encoder``, ``cross_encoder`` and
    ``passages``. ``corpus_embeddings`` must be index-aligned with
    ``passages``, because the retrieved ``corpus_id`` values are used to look
    the passage texts up.

    Args:
        question: Question text.
        corpus_embeddings: Embeddings of ``passages`` to search in.
        top_k: Number of passages to retrieve before re-ranking.

    Returns:
        list: Up to ``top_k`` passage texts, ordered from the highest to the
        lowest cross-encoder score. Empty when nothing was retrieved.
    """
    # Retrieval
    question_embedding = bi_encoder.encode(question, convert_to_tensor=True)
    if isinstance(corpus_embeddings, torch.Tensor):
        # Follow the corpus onto whatever device it lives on instead of
        # assuming a GPU is present.
        question_embedding = question_embedding.to(corpus_embeddings.device)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)[0]
    if not hits:
        return []

    candidates = [passages[hit['corpus_id']] for hit in hits]

    # Re-ranking
    cross_scores = cross_encoder.predict(
        [[question, passage] for passage in candidates]
    )

    # Stable sort: passages with equal cross-encoder scores keep their
    # retrieval order.
    order = sorted(
        range(len(candidates)),
        key=lambda idx: cross_scores[idx],
        reverse=True,
    )
    return [candidates[idx] for idx in order]

def get_answer(question, ranked_passages):
    """Run the QA model on every passage and pick the best-scoring answer.

    Args:
        question: Question text.
        ranked_passages: Passage texts to read, in ranking order.

    Returns:
        tuple: ``(answers, best_answer)``. ``answers`` maps each score
        returned by ``predict`` to its answer text (two passages with exactly
        the same score share one entry, the later one winning).
        ``best_answer`` is the answer with the highest score; on a tie the
        earlier passage wins. It is ``None`` when ``ranked_passages`` is
        empty.
    """
    answers = {}
    best_answer = None
    best_score = None
    for passage in ranked_passages:
        answer, score = predict(question, passage)
        answers[score] = answer
        # Compare scores, not answer strings: ordering by text would simply
        # return the alphabetically last answer.
        if best_score is None or score > best_score:
            best_score, best_answer = score, answer
    return answers, best_answer

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pre-trained bi-encoder (retrieval)
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
bi_encoder.max_seq_length = 512
# Not passed to ranker() in the cells of this notebook; ranker() falls back to
# its own default of the same value.
top_k = 20

# Pre-trained cross-encoder (re-ranker), MS MARCO dataset
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# QA model: tokenizer comes from the base model name, weights from the
# fine-tuned checkpoint directory.
model_name = "xlm-roberta-base"
model_path = "./save_model/xlm-roberta-base/checkpoint-6000"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_path)
# Honour the CPU fallback chosen above instead of requiring CUDA.
model = model.to(device)
model.eval()

# Small Wikipedia passages
data_path = "./data/simplewiki-2020-11-01.jsonl.gz"
passages = load_passages(data_path)

# Passage embeddings (index-aligned with `passages`)
corpus_embeddings = bi_encoder.encode(
    passages,
    convert_to_tensor=True,
    show_progress_bar=True
)

## Passage Level

In [None]:
# End-to-end run on an English question: retrieve and re-rank passages, then
# extract one answer per passage and select one of them.
question = "What is the capital of the United States?"
ranked_passages = ranker(question, corpus_embeddings)
answers, best_answer = get_answer(question, ranked_passages)
# Bare last expression so the notebook displays the selected answer.
best_answer

'Washington, D.C'

In [None]:
# Same pipeline on a question that expects a numeric answer.
question = "How many continents in the world?"
ranked_passages = ranker(question, corpus_embeddings)
answers, best_answer = get_answer(question, ranked_passages)
# Bare last expression so the notebook displays the selected answer.
best_answer

'seven'

In [None]:
# Same pipeline on a Vietnamese question ("What is the capital of Vietnam?").
# NOTE(review): the recorded output of this cell is 'Việt Trì', which is not
# the capital (Hà Nội). The corpus file name suggests a Simple English
# Wikipedia dump, and the retriever / re-ranker are presumably English-only —
# confirm whether non-English questions are expected to work here.
question = "Thủ đô của nước Việt Nam là gì?"
ranked_passages = ranker(question, corpus_embeddings)
answers, best_answer = get_answer(question, ranked_passages)
# Bare last expression so the notebook displays the selected answer.
best_answer

'Việt Trì'