In [1]:
import re

from nltk.stem.lancaster import LancasterStemmer
from transformers import BertForQuestionAnswering, AutoTokenizer
# Lancaster stemmer instance; hardcode() uses it to stem handler keywords
# before looking for them in a question.
st = LancasterStemmer()

# Model identifier passed to both from_pretrained() calls below.
modelname = 'deepset/bert-base-cased-squad2'

# Load the question-answering model and its tokenizer under the same name.
model = BertForQuestionAnswering.from_pretrained(modelname)
tokenizer = AutoTokenizer.from_pretrained(modelname)

In [2]:
from transformers import pipeline
# Question-answering pipeline built from the model and tokenizer loaded above.
# The handle* functions call it with a {"question", "context"} dict and read
# the "answer" key of the result.
nlp = pipeline('question-answering', model=model, tokenizer=tokenizer)

In [3]:
# Locations of the nine set4 article files, one variable per article.
(
    a1_path, a2_path, a3_path,
    a4_path, a5_path, a6_path,
    a7_path, a8_path, a9_path,
) = (
    "./Project-Data/set4/a1.txt",
    "./Project-Data/set4/a2.txt",
    "./Project-Data/set4/a3.txt",
    "./Project-Data/set4/a4.txt",
    "./Project-Data/set4/a5.txt",
    "./Project-Data/set4/a6.txt",
    "./Project-Data/set4/a7.txt",
    "./Project-Data/set4/a8.txt",
    "./Project-Data/set4/a9.txt",
)

In [4]:
import io

In [5]:
from nltk.corpus import stopwords

In [6]:
# English stop words from the NLTK stopwords corpus, as a set for fast membership tests.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be available locally — confirm.
# NOTE(review): not referenced by any of the cells visible here.
stop_words = set(stopwords.words('english'))

In [7]:
def get_article(path):
    """Read the file at ``path`` as UTF-8 text and return its entire contents."""
    with io.open(path, mode="r", encoding="utf-8") as source:
        return source.read()

In [8]:
# Read the text of every set4 article, in order a1..a9.
a1, a2, a3, a4, a5, a6, a7, a8, a9 = (
    get_article(article_path)
    for article_path in (
        a1_path, a2_path, a3_path,
        a4_path, a5_path, a6_path,
        a7_path, a8_path, a9_path,
    )
)

In [9]:
# All nine article texts in file order (index 0 is a1, index 8 is a9).
articles = [a1, a2, a3, a4, a5, a6, a7, a8, a9]

In [10]:
# topic recognition 
def movie(text):
    """Return True when ``text`` contains "film" (case-insensitive)."""
    # assume text given is only the first sentence
    found = re.search("film", text, flags=re.IGNORECASE)
    return found is not None

def football(text):
    """Return True when ``text`` mentions a soccer player or footballer.

    The search is case-insensitive and matches either phrase anywhere in
    ``text``.
    """
    # Fixed: the original called the non-existent ``re.searh`` and therefore
    # raised AttributeError on every call.
    return re.search("soccer player|footballer", text, re.IGNORECASE) is not None

def constellation(text):
    """Return True when ``text`` contains "constellation" (case-insensitive)."""
    found = re.search("constellation", text, flags=re.IGNORECASE)
    return found is not None

def language(text):
    """Return True when ``text`` contains "language" (case-insensitive)."""
    found = re.search("language", text, flags=re.IGNORECASE)
    return found is not None

In [11]:
# Example questions about a film article.
# NOTE(review): not referenced by any of the cells visible here.
film_questions = [
    "What is the genre of the film?",
    "When was the film released?", 
    "Who is the director of the film?",
    "What is the box office of the film?",
    "What is the film criticised for?",
    "What are some characters in the film?",
    "Did the film win any awards?",
    "Is the film a success?"
]

In [12]:
def handle_1st(context, question):
    """Answer ``question`` using only the first sentence of the article intro.

    The intro is the second chunk of ``context`` when split on triple
    newlines; its first sentence is everything before the first ".".

    Args:
        context: Full article text.
        question: Question string passed to the ``nlp`` pipeline.

    Returns:
        The pipeline's ``"answer"`` string, or ``None`` when ``context`` has
        no second chunk or the first sentence is blank.
    """
    chunks = context.split("\n\n\n")
    if len(chunks) < 2:
        # No chunk after the leading one; the original raised IndexError here.
        return None
    # find in 1st sentence
    first_sent = chunks[1].split(".")[0]
    if not first_sent.strip():
        # Nothing to search; avoid handing the pipeline an empty context.
        return None
    ans = nlp({
        "question": question,
        "context": first_sent
    })
    return ans["answer"]

In [13]:
def handle_intro(context, question):
    """Answer ``question`` using the article's intro chunk.

    The intro is the second chunk of ``context`` when split on triple
    newlines.

    Args:
        context: Full article text.
        question: Question string passed to the ``nlp`` pipeline.

    Returns:
        The pipeline's ``"answer"`` string, or ``None`` when ``context`` has
        no second chunk or that chunk is blank.
    """
    chunks = context.split("\n\n\n")
    if len(chunks) < 2:
        # No chunk after the leading one; the original raised IndexError here.
        return None
    intro = chunks[1]
    if not intro.strip():
        # Nothing to search; avoid handing the pipeline an empty context.
        return None
    ans = nlp({
        "question": question,
        "context": intro
    })
    return ans["answer"]

In [14]:
def handle(context, question, sect_name):
    """Answer ``question`` using the text that follows a named section heading.

    ``context`` is first split on triple newlines; the part right after the
    first part starting with ``sect_name`` is used. If that yields nothing,
    the same search is repeated on single-newline splits.

    Args:
        context: Full article text.
        question: Question string passed to the ``nlp`` pipeline.
        sect_name: Prefix identifying the section heading.

    Returns:
        The pipeline's ``"answer"`` string, or ``None`` when no non-empty
        text follows a matching heading.
    """
    sect = None
    # Coarse chunks first, then individual lines as a fallback.
    for separator in ("\n\n\n", "\n"):
        parts = context.split(separator)
        # Pair each part with its successor. A heading in the very last
        # position has no successor, so it is skipped instead of raising
        # IndexError as the original ``temp[i+1]`` did.
        for heading, following in zip(parts, parts[1:]):
            if heading.startswith(sect_name):
                sect = following
                break
        if sect:
            break
    if not sect:
        return None
    ans = nlp({
        "question": question,
        "context": sect
    })
    return ans["answer"]

In [15]:
# Film routing table: each handler entry is (keywords, handler function, section name or None).
film_keywords_1st_sent = {"genre", "country", "time"}
film_1st_sent_handler = (film_keywords_1st_sent, handle_1st, None)
film_keywords_intro = {
    "written", "directed", "edited", "produced", "box office",
    "review", "acclaimed", "praised", "criticized", "nominate", "award", "success",
}
film_intro_handler = (film_keywords_intro, handle_intro, None)

film_handlers = [film_1st_sent_handler, film_intro_handler]

In [16]:
# Footballer routing table: each handler entry is (keywords, handler function, section name or None).
football_keywords_1st_sent = {"who", "born", "where"}
football_1st_sent_handler = (football_keywords_1st_sent, handle_1st, None)
football_keywords_sec1 = {
    "childhood", "family", "parents", "recruited", "accepted",
    "grow up",
}
football_sec1_handler = (football_keywords_sec1, handle, "Early life")
football_keywords_sec2 = {"honors", "awards"}
football_sec2_handler = (football_keywords_sec2, handle, "Honours")

football_handlers = [football_1st_sent_handler, football_sec1_handler, football_sec2_handler]

In [17]:
# Constellation routing table: each handler entry is (keywords, handler function, section name or None).
cons_keywords_intro = {"located", "where", "name"}
cons_intro_handler = (cons_keywords_intro, handle_intro, None)
cons_keywords_sec1 = {"stars"}
cons_sec1_handler = (cons_keywords_sec1, handle, "Stars")
cons_keywords_sec2 = {"galaxy", "cluster", "nebula"}
cons_sec2_handler = (cons_keywords_sec2, handle, "Deep-sky objects")

cons_handlers = [cons_intro_handler, cons_sec1_handler, cons_sec2_handler]

In [19]:
# Language routing table: each handler entry is (keywords, handler function, section name).
# Fixed: the "History" and "Grammar" handlers were wired to the constellation
# keyword sets (cons_keywords_sec1 / cons_keywords_sec2), so the language
# keyword sets defined here were never used.
lang_keywords_sec1 = {
    "earliest", "develop", "transform", "simplify", "rise",
    "spread", "evolve", "change", "retain", "rename", "acquire",
    "obtain",  # was misspelled "abtain", which could not match "obtain"
}
lang_sec1_handler = (lang_keywords_sec1, handle, "History")
lang_keywords_sec2 = {
    "grammatical", "noun", "adjective", "pronouns", "determiner",
    "verb", "syllable", "morpheme", "word classes", "syntax",
}
lang_sec2_handler = (lang_keywords_sec2, handle, "Grammar")
lang_keywords_sec3 = {
    "phonetic", "dialect", "pronunciation", "phoneme", "consonant",
    "tone", "vowel", "sound", "nasal", "rhythm", "intonation",
}
lang_sec3_handler = (lang_keywords_sec3, handle, "Phonology")

lang_handlers = [lang_sec1_handler, lang_sec2_handler, lang_sec3_handler]

In [20]:
def hardcode(context, question, handlers):
    """Route ``question`` to the first handler whose keywords it mentions.

    Args:
        context: Full article text, forwarded to the chosen handler.
        question: Question string; matched case-insensitively against the
            stemmed keywords and forwarded unchanged to the handler.
        handlers: Iterable of ``(keywords, handler, param)`` tuples. When
            ``param`` is ``None`` the handler is called as
            ``handler(context, question)``; otherwise as
            ``handler(context, question, param)``.

    Returns:
        Whatever the first matching handler returns, or ``None`` when no
        keyword of any handler occurs in the question.
    """
    # The Lancaster stemmer lower-cases its output, so compare against a
    # lower-cased question; otherwise "who" never matches "Who is ...".
    lowered = question.lower()
    for keywords, handler, param in handlers:
        if any(st.stem(keyword) in lowered for keyword in keywords):
            # Fixed: the original had these two branches swapped, passing a
            # third argument to two-argument handlers and omitting the
            # section name for ``handle``, which raised TypeError either way.
            if param is None:
                return handler(context, question)
            return handler(context, question, param)
    return None # answer can't be found through hardcode

In [21]:
# Module-level accumulator; check1 appends every truthy hard-coded answer to it.
answers = []
def check1(context, question):
    """Try the topic-specific hard-coded handlers for ``question``.

    The first topic detector that matches ``context`` (film, footballer,
    constellation, language, in that order) selects the handler table passed
    to ``hardcode``. A truthy answer is appended to the global ``answers``
    list. ``check2`` is then called whether or not an answer was found.
    """
    ans = None
    # NOTE(review): the detectors receive the whole ``context`` here, while the
    # comment in ``movie`` assumes only the first sentence is passed — confirm.
    if movie(context):
        ans = hardcode(context, question, film_handlers)
    elif football(context):
        ans = hardcode(context, question, football_handlers)
    elif constellation(context):
        ans = hardcode(context, question, cons_handlers)
    elif language(context):
        ans = hardcode(context, question, lang_handlers)
    
    if ans:
        answers.append(ans)
    # NOTE(review): check2 is not defined in the visible cells, and it runs even
    # when an answer was appended above — confirm whether it is meant to be a
    # fallback only.
    check2(context, question)