Import Libraries

In [1]:
import concurrent.futures
import itertools
import operator
import re

import requests
from gensim.summarization.bm25 import BM25
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, QuestionAnsweringPipeline

import os
import spacy

import pandas as pd
import torch

# Report how many CUDA devices PyTorch can see; the printed value is a count, not a yes/no flag.
print('GPU available:', torch.cuda.device_count())

  from .autonotebook import tqdm as notebook_tqdm


GPU available: 1


Question Answering Engine

In [2]:
class QueryProcessor:
    """Reduce a question to a keyword query by filtering tokens on their POS tag."""

    def __init__(self, nlp, keep=None):
        """Remember the tagger and the POS tags worth keeping.

        Args:
            nlp: Callable that maps text to an iterable of tokens exposing
                ``text`` and ``pos_`` attributes.
            keep: Optional collection of POS tags to retain. Any falsy value
                (``None``, empty set, ...) selects the default tag set.
        """
        self.nlp = nlp
        if keep:
            self.keep = keep
        else:
            self.keep = {'PROPN', 'NUM', 'VERB', 'NOUN', 'ADJ'}

    def generate_query(self, text):
        """Return the tokens of ``text`` whose POS tag is in ``self.keep``.

        Args:
            text: Raw question text.

        Returns:
            The retained token texts joined by single spaces, in their
            original order.
        """
        kept_words = []
        for token in self.nlp(text):
            if token.pos_ in self.keep:
                kept_words.append(token.text)
        return ' '.join(kept_words)

class DocumentRetrieval:
    """Search a MediaWiki API endpoint and return cleaned plain-text articles."""

    # Headings that open an article's back matter; everything from the first
    # one onwards is discarded by post_process. Compiled once instead of on
    # every call.
    _TRAILING_SECTIONS = re.compile('|'.join([
        '== References ==',
        '== Further reading ==',
        '== External links',
        '== See also ==',
        '== Sources ==',
        '== Notes ==',
        '== Further references ==',
        '== Footnotes ==',
        '=== Notes ===',
        '=== Sources ===',
        '=== Citations ===',
    ]))

    def __init__(self, url='https://en.wikipedia.org/w/api.php', timeout=10):
        """Configure the endpoint.

        Args:
            url: MediaWiki ``api.php`` endpoint to query.
            timeout: Seconds to wait for each HTTP request before giving up,
                so a stalled connection cannot hang the notebook.
        """
        self.url = url
        self.timeout = timeout

    def search_pages(self, query):
        """Run a full-text search and return the decoded JSON response.

        Args:
            query: Search string sent as ``srsearch``.

        Returns:
            The API response as a dict.

        Raises:
            requests.RequestException: On network failure, timeout, or a
                non-success HTTP status.
        """
        params = {
            'action': 'query',
            'list': 'search',
            'srsearch': query,
            'format': 'json'
        }
        res = requests.get(self.url, params=params, timeout=self.timeout)
        res.raise_for_status()
        return res.json()

    def search_page(self, page_id):
        """Fetch the plain-text content of one page by its page id.

        The original body called ``wikipedia.page`` although no ``wikipedia``
        module is imported, so it raised ``NameError`` on a fresh kernel. The
        text is now requested from the same endpoint through the MediaWiki
        ``prop=extracts`` query, using the ``requests`` import the file
        already has.

        Args:
            page_id: Page id as returned in the search results.

        Returns:
            The plain-text extract, or ``''`` when the response carries no
            extract for that page.

        Raises:
            requests.RequestException: On network failure, timeout, or a
                non-success HTTP status.
        """
        params = {
            'action': 'query',
            'prop': 'extracts',
            'explaintext': 1,
            'pageids': page_id,
            'format': 'json'
        }
        res = requests.get(self.url, params=params, timeout=self.timeout)
        res.raise_for_status()
        # The default JSON format keys the page map by the page id as a string.
        pages = res.json().get('query', {}).get('pages', {})
        return pages.get(str(page_id), {}).get('extract', '')

    def search(self, query):
        """Search for ``query`` and return the cleaned text of every hit.

        Pages are downloaded concurrently; results keep the search order.

        Args:
            query: Search string.

        Returns:
            List of article texts with trailing back matter removed.
        """
        pages = self.search_pages(query)
        # A response without hits (or an error payload) yields no documents
        # rather than a KeyError.
        hits = pages.get('query', {}).get('search', [])
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(self.search_page, hit['pageid']) for hit in hits]
            docs = [self.post_process(future.result()) for future in futures]
        return docs

    def post_process(self, doc):
        """Cut ``doc`` at the first back-matter heading.

        Args:
            doc: Article text.

        Returns:
            ``doc`` up to (not including) the earliest matching heading, or
            ``doc`` unchanged when no heading matches. The original
            ``min(*indices, len(doc))`` raised ``TypeError`` in that no-match
            case because it degenerated to ``min(len(doc))``.
        """
        # search() returns the leftmost match, i.e. the smallest start index.
        match = self._TRAILING_SECTIONS.search(doc)
        if match is None:
            return doc
        return doc[:match.start()]

class PassageRetrieval:
    """BM25 ranking of passages against a question."""

    def __init__(self, nlp):
        """Build the lemmatising tokenizer around ``nlp``.

        Args:
            nlp: Callable that maps text to an iterable of tokens exposing a
                ``lemma_`` attribute.
        """
        self.tokenize = lambda text: [token.lemma_ for token in nlp(text)]
        self.bm25 = None      # set by fit()
        self.passages = None  # set by fit(); same object that was passed in

    def preprocess(self, doc):
        """Split ``doc`` into non-empty lines that do not start with ``=``."""
        return [line for line in doc.split('\n') if line and not line.startswith('=')]

    def fit(self, docs):
        """Index ``docs`` for later ranking.

        Args:
            docs: Sequence of passages. Each entry is either a string or a
                list of strings (the notebook passes one-element lists). One
                BM25 document is built per entry, so the score at position
                ``i`` always belongs to ``docs[i]``; the original flattened
                the inner lists, which broke that alignment for any entry
                holding more than one string.
        """
        passages = docs
        corpus = []
        for entry in passages:
            text = entry if isinstance(entry, str) else ' '.join(entry)
            corpus.append(self.tokenize(text.lower()))
        self.bm25 = BM25(corpus)
        self.passages = passages

    def most_similar(self, question, topn=10):
        """Return the ``topn`` best-scoring passages for ``question``.

        Args:
            question: Query text. It is lower-cased before tokenising, the
                same way passages are in ``fit``, so capitalised query terms
                can still match.
            topn: Maximum number of results.

        Returns:
            List of ``[passage, score]`` pairs, best first; ties are ordered
            by descending passage index.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.bm25 is None:
            raise RuntimeError('PassageRetrieval.fit() must be called before most_similar()')
        tokens = self.tokenize(question.lower())
        scores = self.bm25.get_scores(tokens)
        ranked = sorted(((score, index) for index, score in enumerate(scores)), reverse=True)
        return [[self.passages[index], score] for score, index in ranked[:topn]]


class AnswerExtractor:
    """Runs an extractive question-answering pipeline over candidate passages."""

    def __init__(self, tokenizer, model):
        """Load the tokenizer and model and wrap them in a QA pipeline.

        Args:
            tokenizer: Name or path handed to ``AutoTokenizer.from_pretrained``.
            model: Name or path handed to
                ``AutoModelForQuestionAnswering.from_pretrained``.
        """
        qa_tokenizer = AutoTokenizer.from_pretrained(tokenizer)
        with torch.no_grad():
            qa_model = AutoModelForQuestionAnswering.from_pretrained(model)
        self.nlp = QuestionAnsweringPipeline(model=qa_model, tokenizer=qa_tokenizer)

    def extract(self, question, passages):
        """Answer ``question`` against each passage and rank the answers.

        Args:
            question: Question text.
            passages: Iterable of contexts, each passed unchanged to the
                pipeline as ``context``.

        Returns:
            List of the pipeline's answer dicts, each with the originating
            passage stored under ``'text'``, sorted by ``'score'`` descending.
            Passages for which the pipeline raises ``KeyError`` are skipped.
        """
        collected = []
        for context in passages:
            try:
                result = self.nlp(question=question, context=context)
                result['text'] = context
            except KeyError:
                continue
            collected.append(result)
        return sorted(collected, key=operator.itemgetter('score'), reverse=True)

In [3]:
# Model names can be overridden through environment variables.
SPACY_MODEL = os.getenv('SPACY_MODEL', 'en_core_web_sm')
QA_MODEL = os.getenv('QA_MODEL', 'distilbert-base-cased-distilled-squad')

# Load the spaCy pipeline with the ner, parser and textcat components disabled.
nlp = spacy.load(SPACY_MODEL, disable=['ner', 'parser', 'textcat'])

# Shared engine components; the same model name serves as tokenizer and model.
query_processor = QueryProcessor(nlp=nlp)
document_retriever = DocumentRetrieval()
passage_retriever = PassageRetrieval(nlp=nlp)
answer_extractor = AnswerExtractor(tokenizer=QA_MODEL, model=QA_MODEL)

Text Normalization

In [4]:
class TextNormalizer(object):
    """Word-level text normaliser driven by a CSV lookup table."""

    def __init__(self, wordmap_path='wordmap.csv'):
        """Load the replacement table.

        Args:
            wordmap_path: CSV file with ``original_text`` and
                ``normalize_text`` columns. Defaults to ``'wordmap.csv'``,
                the file the notebook originally hard-coded.
        """
        table = pd.read_csv(wordmap_path)
        # original word -> normalised word; a later duplicate row wins.
        self.wordmap = dict(zip(table['original_text'], table['normalize_text']))

    def replace(self, text):
        """Replace every mapped word in ``text``.

        The text is split into word tokens (letters, digits, underscore,
        apostrophes and parentheses) and single punctuation marks from
        ``.,!?;``; each token is looked up in the word map and the tokens are
        re-joined with single spaces, so punctuation ends up space-separated.

        Args:
            text: Text to normalise.

        Returns:
            The normalised string. On any exception the error is printed and
            the integer ``-1`` is returned instead, so callers must check the
            type before using string methods on the result.
        """
        try:
            tokens = re.findall(r"[(\w')]+|[.,!?;]", text)
            return ' '.join(self.wordmap.get(token, token) for token in tokens)
        except Exception as e:
            print('Fail!, Message:', e)
            return -1

def replaceNextLine(string):
    """Collapse each run of newlines into a newline plus one space, then strip.

    Args:
        string: Text to reformat.

    Returns:
        The reformatted text with leading and trailing whitespace removed.
    """
    collapsed = re.sub("\n+", "\n ", string)
    return collapsed.strip()
        
# Shared normaliser instance; constructing it reads 'wordmap.csv' from the working directory.
normalizer = TextNormalizer()

QA Engine Testing

In [5]:
# Load the programme table, replace missing cells with empty strings, and
# drop the 'Programme Name' and 'Programme Faculty' columns.
df = (
    pd.read_csv('MMU_Complete_Extra_Columns.csv')
    .fillna('')
    .drop(columns=['Programme Name', 'Programme Faculty'])
)
df.head()

Unnamed: 0,Topic,Programme Duration,Local Total Tuition Fee (RM),International Total Tuition Fee (RM),Description,URL,Entry Requirements,Programme Structure,Career Prospects,Professional Qualification Exemptions,Intake,Fields of Research,Entry Requirements International,Application Form,Form Submission,Details,Eligibility,Guideline,Advance Disbursement
0,"Programmes And Courses, Faculty Of Engineering...",Full-Time: 1 – 3 years\nPart-Time: 2 – 5 years,"RM19,500.00","RM24,375.00",,,,,,,,,,,,,,,
1,"Programmes And Courses, Faculty Of Engineering...",Full-Time: 1 – 3 years\nPart-Time: 2 – 5 years,"RM19,500.00","RM24,375.00",,,,,,,,,,,,,,,
2,"Programmes And Courses, Faculty Of Engineering...",Full-Time: 1 – 3 years\nPart-Time: 2 – 5 years,"RM19,500.00","RM23,750.00",,,,,,,,,,,,,,,
3,"Programmes And Courses, Faculty Of Information...",3 years,"RM62,250.00","RM75,000.00",Data Communications and networking graduates a...,https://www.mmu.edu.my/programmes-by-faculty-a...,Pass Foundation / Matriculation studies from a...,Core\n\nYear 1\n\nMathematical Techniques\nCom...,"System Programmer, Network Engineer, Network A...",,,,I. Pass Foundation / Matriculation studies fro...,,,,,,
4,"Programmes And Courses, Faculty Of Law (fol), ...","Full-Time: Min. 3 years, Max. 5 years\nPart-Ti...","RM2,900.00/year","RM3,550/year",Our post graduate programmes LLM by research a...,https://www.mmu.edu.my/programmes-by-faculty-a...,"A Bachelor degree (Level 6, MQF) in Law AND a ...",Advanced Research Methodology for Law\nQualita...,,,Throughout the year,Alternative Dispute Resolution\nBanking Law\nB...,"A Bachelor degree (Level 6, MQF) in Law AND a ...",,,,,,


In [6]:
# Layer 1 index: one single-element list per 'Topic' value, ranked with BM25.
docs_l1 = [[topic] for topic in df['Topic'].values]

passage_retriever_l1 = PassageRetrieval(nlp)
passage_retriever_l1.fit(docs_l1)

Sample questions and answers

---------------------------------------------------------------------------------------------

Translation API

In [7]:
# !pip install googletrans==4.0.0rc1 -q

In [8]:
import googletrans
from googletrans import Translator

In [9]:
def _normalize_query(text):
    """Normalise and lower-case ``text``, tolerating a normaliser failure."""
    normalized = normalizer.replace(text)
    if not isinstance(normalized, str):
        # TextNormalizer.replace returns -1 on failure; fall back to the raw
        # text so the following .lower() cannot crash.
        normalized = text
    return normalized.lower()


def _print_ranking(passages, clean=None):
    """Print one 'Rank n : Score: ...' line per ``[passage, score]`` pair."""
    for rank, (passage, score) in enumerate(passages, start=1):
        text = passage[0] if clean is None else clean(passage[0])
        print('Rank', rank, ': Score:', str(round(score, 4)).ljust(8), '| Text :', text)


def _retrieve_passages(query):
    """Run both retrieval layers for ``query`` and return the layer-2 passages.

    Layer 1 ranks the 'Topic' values; layer 2 ranks the remaining columns of
    the best-matching row, each rendered as 'column name:' plus its value.
    """
    print('\n----------Passage Retrieval------------')
    passages_l1 = passage_retriever_l1.most_similar(query, topn=3)
    print('-------Layer 1 (Document)-------')
    _print_ranking(passages_l1)
    print('-------Layer 2 (Paragraph)-------')
    if not passages_l1:
        # Nothing indexed: the original raised IndexError on passages_l1[0].
        return []

    matching_rows = df.loc[df['Topic'] == passages_l1[0][0][0]]
    # Take the first match: .item() raised ValueError whenever several rows
    # shared the same Topic.
    best_row = matching_rows.iloc[0]
    docs_l2 = [
        # str() keeps non-string cells from breaking the concatenation.
        [column_name + ':\n' + str(best_row[column_name])]
        for column_name in df.columns
        if column_name != 'URL' and column_name != 'Topic'
    ]
    passage_retriever_l2 = PassageRetrieval(nlp)
    passage_retriever_l2.fit(docs_l2)

    passages_l2 = passage_retriever_l2.most_similar(query, topn=10)
    _print_ranking(passages_l2, clean=replaceNextLine)
    return [passage for passage, _score in passages_l2]


def _print_answers(answers, translator=None):
    """Print ranked answers; with ``translator`` also print Malay translations."""
    shown = 0
    for answer in answers:
        source = answer['text'][0]
        # A passage that is only 'column name:' plus a newline had an empty
        # cell; stripping the last two characters leaves the column name.
        if source[:-2] in df.columns:
            continue
        shown += 1
        print('Rank', shown, ':', answer['answer'])
        print('Score', shown, ':', answer['score'])
        text = replaceNextLine(source)
        if translator is None:
            print('Text', shown, ':', text)
        else:
            translated_text = translator.translate(text, dest='ms').text
            translated_answer = translator.translate(answer['answer'], dest='ms').text
            print('\nJawapan Bahasa Melayu:')
            print('Rank', shown, ':', translated_answer)
            print('Sumber Teks', shown, ':', translated_text)
        print('')
    if shown == 0:
        print('No answer')
    print('--------------------------------------')


def QAPipeline2(question):
    """Answer ``question`` in English or Malay and print every pipeline stage.

    The language is detected first. English questions are normalised and
    answered directly. Questions detected as 'ms' or 'id' are translated to
    English with ``src='ms'``, answered, and the answers are translated back
    to Malay. Any other language prints a notice and stops.

    Args:
        question: The user's question.

    Returns:
        None. All results are printed.
    """
    translator = Translator()
    language = translator.detect(question).lang

    if language == 'en':
        print("English query detected \n----------------------\n")
        print('----------Text Normalization------------')
        query = _normalize_query(question)
        print('Normalized question:', query)
        answer_translator = None
    elif language == 'ms' or language == 'id':
        print("Soalan Bahasa Melayu dikesan \n----------------------------\n")
        print('----------Text Translation & Normalization------------')
        translated = translator.translate(question, src='ms')
        query = _normalize_query(translated.text)
        print('Normalized translated question:', query)
        answer_translator = translator
    else:
        print("Non-supported language \n-------------------------\n")
        return

    passages_l2_text = _retrieve_passages(query)

    print('\n----------Answer Extractor------------')
    answers = answer_extractor.extract(query, passages_l2_text)
    _print_answers(answers, answer_translator)

In [10]:
# Demo: a Malay question, exercising the translate -> retrieve -> answer path.
question = 'Apakah jumlah yuran program Foundation in Business?'
QAPipeline2(question)

Soalan Bahasa Melayu dikesan 
----------------------------

----------Text Translation & Normalization------------
Normalized translated question: what is the total foundation in business program fees ?

----------Passage Retrieval------------
-------Layer 1 (Document)-------
Rank 1 : Score: 5.7953   | Text : Programmes And Courses, Faculty Of Business (fob), Foundation In Business
Rank 2 : Score: 3.5301   | Text : Programmes And Courses, Faculty Of Business (fob), Diploma In Business Administration
Rank 3 : Score: 3.5301   | Text : Programmes And Courses, Faculty Of Business (fob), Diploma In Digital Business
-------Layer 2 (Paragraph)-------
Rank 1 : Score: 7.438    | Text : Description:
 This one-year foundation programme equips students with fundamental business knowledge and competency to pursue degrees in various fields of business prior to embarking in the business degree programme of their choice. Students will also be taught soft skills that include critical thinking, writing 

In [None]:
# question = 'What does the Foundation in Cinematics Arts focus on?' 
# Rank 1 : visual arts.
# Score 1 : 0.5700848110537677
# Text 1 : Description:
#  The Foundation in Cinematic Arts provides a broad scope in the humanities and social sciences with an emphasis on the visual arts. Students gain basic knowledge and skills required for enrolling in a higher education degree programme in the fields of cinematic and other arts and media, such as the degree programmes offered at Faculty of Cinematic Arts.

# question = 'What are the total fees for Foundation in Business?'
# Rank 1 : RM6,750.00
# Score 1 : 0.8529037468051825
# Text 1 : Local Total Tuition Fee (RM):
#  RM6,750.00

In [70]:
# Sample questions and their expected answers:
# question = 'What does the Foundation in Cinematics Arts focus on?' 
# Rank 1 : visual arts.
# Score 1 : 0.5700848110537677
# Text 1 : Description:
#  The Foundation in Cinematic Arts provides a broad scope in the humanities and social sciences with an emphasis on the visual arts. Students gain basic knowledge and skills required for enrolling in a higher education degree programme in the fields of cinematic and other arts and media, such as the degree programmes offered at Faculty of Cinematic Arts.

# question = 'What are the total fees for Foundation in Business?'
# 'Apakah jumlah yuran program Foundation in Business?'
# Rank 1 : RM6,750.00
# Score 1 : 0.8529037468051825
# Text 1 : Local Total Tuition Fee (RM):
#  RM6,750.00

# '日本語で聞聞いても良いですか？'

In [71]:
# English-to-Malay translation check (scratch code, disabled)
# translator = Translator()
# translated = translator.translate(question, dest='ms')
# translated = translator.detect(question)
# print(translated)