In [3]:
import spacy
import nltk
import stanza
import neuralcoref
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
nlp_s = stanza.Pipeline('en')
nlp = spacy.load("en_core_web_sm")
neuralcoref.add_to_pipe(nlp)
from nltk import Tree
import io

2021-11-11 22:06:19 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| depparse     | combined  |
| sentiment    | sstplus   |
| constituency | wsj       |
| ner          | ontonotes |

2021-11-11 22:06:19 INFO: Use device: cpu
2021-11-11 22:06:19 INFO: Loading: tokenize
2021-11-11 22:06:19 INFO: Loading: pos
2021-11-11 22:06:20 INFO: Loading: lemma
2021-11-11 22:06:20 INFO: Loading: depparse
2021-11-11 22:06:20 INFO: Loading: sentiment
2021-11-11 22:06:21 INFO: Loading: constituency
2021-11-11 22:06:21 INFO: Loading: ner
2021-11-11 22:06:22 INFO: Done loading processors!


## Load the data

In [4]:
a1_path = "./Desktop/11411/project/Project-Data/set4/a1.txt"
a2_path = "./Desktop/11411/project/Project-Data/set4/a2.txt"
a3_path = "./Desktop/11411/project/Project-Data/set4/a3.txt"
a4_path = "./Desktop/11411/project/Project-Data/set4/a4.txt"
a5_path = "./Desktop/11411/project/Project-Data/set4/a5.txt"
a6_path = "./Desktop/11411/project/Project-Data/set4/a6.txt"
a7_path = "./Desktop/11411/project/Project-Data/set4/a7.txt"
a8_path = "./Desktop/11411/project/Project-Data/set4/a8.txt"
a9_path = "./Desktop/11411/project/Project-Data/set4/a9.txt"

In [5]:
def get_article(path):
    with io.open(path, "r", encoding="utf-8") as f:
        article = f.read()
    return article

In [6]:
a1 = get_article(a1_path)
a2 = get_article(a2_path)
a3 = get_article(a3_path)
a4 = get_article(a4_path)
a5 = get_article(a5_path)
a6 = get_article(a6_path)
a7 = get_article(a7_path)
a8 = get_article(a8_path)
a9 = get_article(a9_path)

In [7]:
a1_annotated = nlp(a1)
a2_annotated = nlp(a2)
a3_annotated = nlp(a3)
a4_annotated = nlp(a4)
a5_annotated = nlp(a5)
a6_annotated = nlp(a6)
a7_annotated = nlp(a7)
a8_annotated = nlp(a8)
a9_annotated = nlp(a9)

In [8]:
film_questions = [
    "What is the genre of the film?",
    "When was the film released?", 
    "Who is the director of the film?",
    "What is the box office of the film?",
    "What is the film criticised for?",
    "What are some characters in the film?",
    "Did the film win any awards?",
    "Is the film a success?"
]

## Try to get candidate sentences

In [9]:
# This function extracts a set of keywords from the question given
def keyword(question):
    q = nlp_s(question)  # Use stanza to annotate the question since it's more accurate
    keywords = set([st.stem(sent.words[word.head-1].text.lower()) if word.head > 0 else word.text for sent in q.sentences for word in sent.words])
    return keywords

In [11]:
# This function returns a list of candidate sentences by searching the keywords in the doc 
# params: doc is annotated and keywords is a set
# ret: candidates is a list of sentence
def candidates(doc, keywords):
    candidates = []
    for sent in doc.sents:
        check1 = set([st.stem(token.lemma_) for token in sent if token.dep_ in {"ROOT", "nbsuj", "obj"}])
        check2 = set([st.stem(chunk.root.head.lemma_) for chunk in sent.noun_chunks])
        check1.update(check2)
        for keyword in keywords:
            if keyword in check1:
                candidates.append(sent)
                break
        
    return candidates

### Test out code

In [14]:
for question in film_questions:
    print("Quesion: ", question)
    keywords = keyword(question)
    keywords.remove("film")
    l = candidates(a1_annotated, keywords)
    print("Candidates:", l)
    print("\n\n")

Quesion:  What is the genre of the film?
Candidates: []



Quesion:  When was the film released?
Candidates: [The soundtrack was released on 21 October 2011 through Sony Classical Records.


, Following its wins at the 69th Golden Globe Awards, it was announced Warner Bros. would re-release the film in France in 362 theaters on 25 January 2012., It was also re-released in Belgium on 22 February 2012.


, In response, director Hazanavicius released a statement:

"The Artist was made as a love letter to cinema, and grew out of my (and all of my cast and crew’s) admiration and respect for movies throughout history., The Artist was released on region 1 DVD and Blu-ray on 26 June 2012., It was released in the UK on 28 May 2012.


]



Quesion:  Who is the director of the film?
Candidates: [In January 2012, the film was nominated for twelve BAFTAs, the most of any film from 2011, and won seven, including Best Film, Best Director and Best Original Screenplay for Hazanavicius, and Best Actor f

In [15]:
for question in film_questions:
    print("Quesion: ", question)
    keywords = keyword(question)
    keywords.remove("film")
    l = candidates(a2_annotated, keywords)
    print("Candidates:", l)
    print("\n\n")

Quesion:  What is the genre of the film?
Candidates: []



Quesion:  When was the film released?
Candidates: [The film was released in France on 2 February 2011, under the title, An alternate version, with some of the profanities muted out, was classified as "PG-13" in the United States; this version was released to cinemas on 1 April 2011, replacing the R-rated one.]



Quesion:  Who is the director of the film?
Candidates: [The film also won four Academy Awards: Best Picture, Best Director (Hooper), Best Actor (Firth), and Best Original Screenplay (Seidler).


, I think a less courageous director than Tom [Hooper] – and indeed a less courageous actor than Colin, At the 83rd Academy Awards, The King's Speech won the Academy Award for Best Picture, Best Director (Hooper), Best Actor (Firth), and Best Original Screenplay (Seidler)., Besides the four categories it won, the film received nominations for Best Cinematography (Danny Cohen) and two for the supporting actors (Bonham Carter and