In [None]:
'''
'''

In [None]:
# load the files
import json
def _load_json(path):
    """Parse one JSON file and return its content, closing the file handle afterwards."""
    with open(path) as json_file:
        return json.load(json_file)


# The four inputs of the pipeline: the document collection and the three question splits.
documents_set = _load_json('documents.json')
training_set = _load_json('training.json')
devel_set = _load_json('devel.json')
testing_set = _load_json('testing.json')

<b> 1. Information Retrieval to find the best matching sentence </b>

In [None]:
import nltk
import re
from math import log
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize
from collections import defaultdict, Counter
from nltk.corpus import stopwords
# Fetch the NLTK resources requested by this notebook (WordNet data and the punkt tokenizer models).
nltk.download("wordnet") 
nltk.download('punkt')

# Rebinds `stopwords` (imported above as the corpus reader) to a plain set of English stopwords.
# NOTE(review): the 'stopwords' corpus is not downloaded in this cell — confirm it is already installed.
stopwords = set(nltk.corpus.stopwords.words('english')) 
# Shared stemmer (used by preprocess) and lemmatizer (used by get_head_word).
stemmer = nltk.stem.PorterStemmer()
lmtzr = nltk.stem.wordnet.WordNetLemmatizer()


# Preprocessing: lowercase, remove stopwords, stem the words and form words representation 
def preprocess(doc):
    """Turn raw text into a list of lowercased, stopword-free, stemmed tokens.

    Punctuation (anything that is neither a word character nor whitespace) is
    stripped before tokenization. Relies on the module-level `stopwords` set
    and `stemmer`.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', doc)
    lowered_tokens = (raw.lower() for raw in word_tokenize(without_punctuation))
    return [stemmer.stem(tok) for tok in lowered_tokens if tok not in stopwords]


# collect term frequencies for each sentence (a bag of words)
def extract_term_freqs(sentence):
    """Return a Counter mapping each preprocessed term of `sentence` to its number of occurrences."""
    return Counter(preprocess(sentence))


# compute document frequencies(here refers to a term occurs in how many sentences within a document)
def compute_doc_freqs(doc_term_freqs):
    """Count, for every term, in how many sentences it appears.

    `doc_term_freqs` maps sentence id -> term-frequency mapping; each sentence
    contributes at most 1 to a term's count regardless of the term frequency.
    """
    sentence_counts = Counter()
    for sentence_tfs in doc_term_freqs.values():
        # a keys view is treated as an iterable of elements, so each term is counted once
        sentence_counts.update(sentence_tfs.keys())
    return sentence_counts


# process documents_set into sentences
def sent_tokenize(documents_set):
    """Split every document into a flat list of sentences.

    Each document's 'text' field is iterated as a sequence of paragraphs; the
    sentences of all paragraphs are concatenated in order. Returns a dict
    docid -> list of sentence strings.

    Note: this definition shadows the `sent_tokenize` imported from nltk, which
    is why the library function is called through the `nltk.` prefix here.
    """
    documents_set_sents = {}
    for docid in range(len(documents_set)):
        documents_set_sents[docid] = [
            sentence
            for paragraph in documents_set[docid]['text']
            for sentence in nltk.sent_tokenize(paragraph)
        ]
    return documents_set_sents
# Sentence-split the whole collection once; the result feeds the indexing steps below.
documents_set_sents = sent_tokenize(documents_set)


# process the document_set into term frequencies
def get_term_frequencies(documents_set_sents):
    """Build docid -> {sentence id -> term-frequency Counter} for every document."""
    doc_term_freqs = {}
    for docid in range(len(documents_set_sents)):
        sentences = documents_set_sents[docid]
        doc_term_freqs[docid] = {
            sent_id: extract_term_freqs(sentences[sent_id])
            for sent_id in range(len(sentences))
        }
    return doc_term_freqs
# Per-sentence term frequencies for every document (docid -> sent_id -> Counter).
doc_term_freqs = get_term_frequencies(documents_set_sents)


# process the document_set into document frequencies
def get_doc_freqs(doc_term_freqs):
    """Map every docid to its sentence-level document frequencies (see compute_doc_freqs)."""
    return {
        docid: compute_doc_freqs(sent_term_freqs)
        for docid, sent_term_freqs in doc_term_freqs.items()
    }
# For each document: term -> number of its sentences containing the term.
doc_freqs = get_doc_freqs(doc_term_freqs)


# build an inverted index to allow for efficient lookup by term
def inverted_index(doc_term_freqs):
    """Build one inverted index per document.

    Returns docid -> defaultdict(list) mapping each term to a postings list of
    [sent_id, term_frequency] pairs, in sentence iteration order.
    """
    inverted_index_dict = {}
    for docid, sent_term_freqs in doc_term_freqs.items():
        postings = defaultdict(list)
        # invert sent_id -> {term: tf} into term -> [[sent_id, tf], ...]
        for sent_id, term_freqs in sent_term_freqs.items():
            for term, freq in term_freqs.items():
                postings[term].append([sent_id, freq])
        inverted_index_dict[docid] = postings
    return inverted_index_dict
# Per-document inverted indexes (docid -> term -> [[sent_id, tf], ...]).
inverted_index_dict = inverted_index(doc_term_freqs)


# Store the number of tokens in each sentence of a document
def get_token_num(documents_set_sents):
    """Return docid -> {sentence id -> number of tokens left after preprocessing}."""
    token_num_dict = {}
    for docid in range(len(documents_set_sents)):
        sentences = documents_set_sents[docid]
        token_num_dict[docid] = {
            sent_id: len(preprocess(sentences[sent_id]))
            for sent_id in range(len(sentences))
        }
    return token_num_dict
# Preprocessed sentence lengths (docid -> sent_id -> token count); passed to Okapi_BM25 below.
token_num_dict = get_token_num(documents_set_sents)


# store the number of sentences in a documents
def get_sent_num(documents_set_sents):
    """Return docid -> number of sentences in that document."""
    return {
        docid: len(documents_set_sents[docid])
        for docid in range(len(documents_set_sents))
    }
# Number of sentences per document (docid -> count).
sent_num = get_sent_num(documents_set_sents)


# compute BM25
def Okapi_BM25(query, docid, sent_num, inverted_index_dict, token_num_dict, doc_term_freqs, doc_freqs):
    """Score every sentence of one document against `query` with Okapi BM25.

    Sentences play the role of "documents" in the BM25 formula.

    Args:
        query: raw question text; it is passed through `preprocess`.
        docid: id of the document whose sentences are scored.
        sent_num: docid -> number of sentences.
        inverted_index_dict: docid -> term -> postings list; its length is the
            number of sentences containing the term.
        token_num_dict: docid -> sent_id -> preprocessed token count; used for
            the average sentence length.
        doc_term_freqs: docid -> sent_id -> Counter of term frequencies.
        doc_freqs: accepted for interface compatibility; not needed because the
            postings length already gives the sentence frequency.

    Returns:
        dict sent_id -> BM25 score (0 for sentences sharing no term with the query).
    """
    k1 = 1.2
    k3 = 1.5
    b = 0.75

    # Preprocess the query once: the Counter yields both the distinct terms and their counts.
    query_terms_freqs = Counter(preprocess(query))

    N = sent_num[docid]                       # the number of sentences
    postings = inverted_index_dict[docid]
    sent_term_freqs = doc_term_freqs[docid]

    # Average sentence length in tokens, so that it is measured in the same unit as Ld.
    # (The previous version averaged document frequencies, i.e. distinct terms per sentence.)
    Lavg = sum(token_num_dict[docid].values()) / N if N else 0

    # idf and the query-side tf do not depend on the sentence: compute them once per term.
    term_weights = {}
    for term, fqt in query_terms_freqs.items():
        # .get avoids inserting empty postings into the defaultdict for unseen terms
        f = len(postings.get(term, ()))       # number of sentences containing the term
        idf = log((N - f + 0.5) / (f + 0.5))
        query_tf = (k3 + 1) * fqt / (k3 + fqt)
        term_weights[term] = idf * query_tf

    score = {}
    for sent_id in range(N):
        tfs = sent_term_freqs[sent_id]
        Ld = sum(tfs.values())                # length of this sentence in tokens
        # Lavg is 0 only when every sentence is empty; skip the length ratio then
        # instead of dividing by zero.
        length_ratio = Ld / Lavg if Lavg else 0
        length_norm = k1 * ((1 - b) + b * length_ratio)
        sent_score = 0
        for term, weight in term_weights.items():
            fdt = tfs[term]                   # occurrences of the term in this sentence
            tf_doc = ((k1 + 1) * fdt) / (length_norm + fdt)
            sent_score += weight * tf_doc
        score[sent_id] = sent_score
    return score

# return the best matching sentence
def best_matching_sent_id(scores):
    """Return the key of `scores` with the highest value (the first such key on ties)."""
    return max(scores, key=scores.get)


def best_match_sent(documents_set_sents, data_set):
    """Return query index -> text of the highest-scoring sentence of the question's document.

    Scoring is done with Okapi_BM25 using the module-level indexes
    (`sent_num`, `inverted_index_dict`, `token_num_dict`, `doc_term_freqs`,
    `doc_freqs`).
    """
    chosen_sentences = {}
    for query_id in range(len(data_set)):
        record = data_set[query_id]
        question = record['question']
        docid = record['docid']
        sentence_scores = Okapi_BM25(question, docid, sent_num, inverted_index_dict,
                                     token_num_dict, doc_term_freqs, doc_freqs)
        top_sent_id = max(sentence_scores, key=lambda sent_id: sentence_scores[sent_id])
        chosen_sentences[query_id] = documents_set_sents[docid][top_sent_id]
    return chosen_sentences

# generate best match sentence for questions in training set, development set and testing set.

# Retrieve the best-matching sentence for every question of each split
# (dicts keyed by the question's position in the split).
best_match_sent_devel = best_match_sent(documents_set_sents, data_set = devel_set)
best_match_sent_test = best_match_sent(documents_set_sents, data_set = testing_set)
best_match_sent_train = best_match_sent(documents_set_sents, data_set = training_set)

<b> 2. Named Entity Recognition</b>

In [None]:
'''
Reference: Jenny Rose Finkel, Trond Grenager, and Christopher Manning. 2005. Incorporating Non-local Information into Information Extraction Systems 
by Gibbs Sampling. Proceedings of the 43nd Annual Meeting of the Association for Computational Linguistics (ACL 2005), 
pp. 363-370. http://nlp.stanford.edu/~manning/papers/gibbscrf3.pdf
'''

import nltk
from nltk.tag import StanfordNERTagger
from nltk import pos_tag, sent_tokenize, word_tokenize
from nltk.chunk import conlltags2tree
from nltk.tree import Tree
nltk.download('averaged_perceptron_tagger')

import os

# StanfordNERTagger 7 class model for recognizing locations, persons, organizations, times, money, percents, and dates
# The install directory can be overridden with the STANFORD_NER_HOME environment
# variable; the default is the path this notebook was originally written against.
STANFORD_NER_HOME = os.environ.get('STANFORD_NER_HOME', '/Users/yue/stanford-ner')
st = StanfordNERTagger(
    os.path.join(STANFORD_NER_HOME, 'classifiers', 'english.muc.7class.distsim.crf.ser.gz'),
    os.path.join(STANFORD_NER_HOME, 'stanford-ner.jar'),
    encoding='utf-8')


# obtain stanford entity information for best match sentences
def get_tagged_sents(best_match_sents):
    """Tag every sentence with the Stanford NER tagger `st`.

    Args:
        best_match_sents: dict keyed by consecutive integers 0..n-1 whose values
            are sentence strings.

    Returns:
        A list (ordered by key) of lists of (token, tag) pairs, with pairs whose
        token is the empty string dropped.
    """
    sents_list = [word_tokenize(best_match_sents[i]) for i in range(len(best_match_sents))]
    tagged_sents = st.tag_sents(sents_list)
    # Build filtered copies rather than calling list.remove() while iterating the
    # same list, which skips the element following each removed one.
    return [
        [(token, tag) for token, tag in tagged_sent if token != '']
        for tagged_sent in tagged_sents
    ]
        
# NER-tag the retrieved best-matching sentences of each split.
st_best_match_sents_devel = get_tagged_sents(best_match_sent_devel)
st_best_match_sents_test = get_tagged_sents(best_match_sent_test)
st_best_match_sents_train = get_tagged_sents(best_match_sent_train)

In [None]:
# convert Stanford NER Tagger result into NLTK trees
## Adapted from https://stackoverflow.com/questions/30664677/extract-list-of-persons-and-organizations-using-stanford-ner-tagger-in-nltk

from nltk.chunk import conlltags2tree
from nltk.tree import Tree

# named entity boundaries
def stanfordNE2bio(st_tag):
    """Convert (token, tag) pairs from the Stanford tagger into BIO-tagged pairs.

    "O" tags are kept as is. A non-"O" tag becomes "I-<tag>" when the previous
    token carried the same tag and "B-<tag>" otherwise (start of an entity, or
    an entity of a different type directly following another one).
    """
    bio_tagged_sent = []
    previous = 'O'
    for token, tag in st_tag:
        if tag == "O":
            label = tag
        elif previous == tag:   # continuing the current entity
            label = "I-" + tag
        else:                   # new entity, possibly adjacent to another one
            label = "B-" + tag
        bio_tagged_sent.append((token, label))
        previous = tag
    return bio_tagged_sent


# convert Stanford NER Tagger result into NLTK trees
def stanfordNE2tree(st_tag):
    """Build an NLTK chunk tree from one Stanford-tagged sentence.

    The sentence is BIO-encoded with stanfordNE2bio, POS-tagged with
    `pos_tag`, and the (token, pos, ne) triples are handed to
    `conlltags2tree`. An empty sentence yields whatever `conlltags2tree`
    returns for an empty list instead of failing on tuple unpacking.
    """
    bio_tagged_sent = stanfordNE2bio(st_tag)
    # Explicit comprehensions instead of `zip(*...)`, which cannot be unpacked
    # into two names when the sentence has no tokens.
    sent_tokens = [token for token, _ in bio_tagged_sent]
    sent_ne_tags = [ne for _, ne in bio_tagged_sent]
    sent_pos_tags = [pos for _, pos in pos_tag(sent_tokens)] if sent_tokens else []
    sent_conlltags = list(zip(sent_tokens, sent_pos_tags, sent_ne_tags))
    return conlltags2tree(sent_conlltags)


# get continuous named entity words
def get_continuous_NE(st_tag):
    """Return the named entities of one tagged sentence as (lowercased phrase, label) pairs.

    Consecutive tokens of the same entity are joined with single spaces;
    tokens outside any entity are ignored.
    """
    entities = []
    for node in stanfordNE2tree(st_tag):
        if type(node) != Tree:
            continue  # plain (token, pos) leaf, not an entity chunk
        phrase = " ".join([token for token, pos in node.leaves()])
        entities.append((phrase.lower(), node.label()))
    return entities

## End of adaptation 


# obtain final named entity by joining contiguous words with the same type for each data set
def get_final_en(st_best_match_sents):
    """Apply get_continuous_NE to every tagged sentence and return the results as a list."""
    return [get_continuous_NE(st_tag) for st_tag in st_best_match_sents]

# Named entities found in the best-matching sentence of every question, per split.
devel_set_ne = get_final_en(st_best_match_sents_devel)
train_set_ne = get_final_en(st_best_match_sents_train)
test_set_ne = get_final_en(st_best_match_sents_test)

# Persist the results with IPython's %store magic.
%store devel_set_ne
%store train_set_ne
%store test_set_ne


In [None]:
# get best sentence full tag: combine POS tag feature and Named Entity feature
def get_continuous_NE_and_postag(st_tag):
    """Return the whole sentence as a sequence mixing entities and POS-tagged tokens.

    Entity chunks become (lowercased phrase, entity label); every other token
    is kept as the (token, pos) leaf produced by stanfordNE2tree, in sentence order.
    """
    merged = []
    for node in stanfordNE2tree(st_tag):
        if type(node) == Tree:
            phrase = " ".join([token for token, pos in node.leaves()])
            merged.append((phrase.lower(), node.label()))
        else:
            merged.append(node)
    return merged

def get_final_en_and_postag(st_best_match_sents):
    """Apply get_continuous_NE_and_postag to every tagged sentence and return the results as a list."""
    return [get_continuous_NE_and_postag(st_tag) for st_tag in st_best_match_sents]

# Entity/POS sequence of the best-matching sentence of every question, per split.
devel_best_sent_ne_postag = get_final_en_and_postag(st_best_match_sents_devel)
train_best_sent_ne_postag = get_final_en_and_postag(st_best_match_sents_train)
test_best_sent_ne_postag = get_final_en_and_postag(st_best_match_sents_test)

# Persist the results with IPython's %store magic.
%store devel_best_sent_ne_postag
%store train_best_sent_ne_postag
%store test_best_sent_ne_postag


In [None]:
# get POS tag for each token in each query
def get_query_full_postag(dataset):
    """POS-tag the lowercased, word-tokenized question of every record, in dataset order."""
    return [
        pos_tag(word_tokenize(dataset[query_id]['question'].lower()))
        for query_id in range(len(dataset))
    ]

# POS-tagged questions of each split (list index = question position in the split).
train_query_full_postag = get_query_full_postag(training_set)
devel_query_full_postag = get_query_full_postag(devel_set)
test_query_full_postag = get_query_full_postag(testing_set)

# Persist the results with IPython's %store magic.
%store train_query_full_postag
%store devel_query_full_postag
%store test_query_full_postag

In [None]:
# get named entity and POS tag for questions 
def get_queries(dataset):
    """Return position -> question text for every record of `dataset`."""
    return {
        query_id: dataset[query_id]['question']
        for query_id in range(len(dataset))
    }

# Run the same NER pipeline used for the best-matching sentences on the raw questions.
devel_queries = get_queries(devel_set)
test_queries = get_queries(testing_set)
train_queries = get_queries(training_set)
st_devel_queries = get_tagged_sents(devel_queries)
st_test_queries = get_tagged_sents(test_queries)
st_train_queries = get_tagged_sents(train_queries)

# Entity/POS sequence of every question.
devel_query_ne_postag = get_final_en_and_postag(st_devel_queries)
train_query_ne_postag = get_final_en_and_postag(st_train_queries)
test_query_ne_postag = get_final_en_and_postag(st_test_queries)


# named entity
# NOTE(review): `trian_query_ne` looks like a misspelling of `train_query_ne`;
# it is left as is because code outside this view may reference it.
devel_query_ne = get_final_en(st_devel_queries)
trian_query_ne = get_final_en(st_train_queries)
test_query_ne = get_final_en(st_test_queries)


<b> 3. Question type classifier </b>

In [None]:
# build a question classifier
'''
Acknowledgement:
https://shirishkadam.com/2017/07/03/nlp-question-classification-using-support-vector-machines-spacyscikit-learnpandas/
WH-word: The WH-word in a question holds a lot of information about the intent of the question and what basically it is trying to seek. (What, When, How, Where and so on)
WH-word POS: The part of speech of the WH-word (wh-determiner, wh-pronoun, wh-adverb)
POS of the word next to WH-word: The part of speech of the word adjacent to WH-word or the word at 1st position in the bigram (0th being the WH-word).
head word: the first NP after the question’s wh-word.
'''
'''
The question types:
“person” (who/whom/whose)
“date”/“time”(when)
“location” (where), 
“entity: physical object”(what/which), 
“numeric value”/“Money” (how many/how much/how far/how long etc)
“description” (how/why). 
wh_word_list = ['what', 'which', 'who', 'whom', 'whose', 'when', 'where', 'why', 'how']
'''

# tag the answers of training set 
def label_ans(dataset, ne_dict):
    """Label each gold answer with the entity type it received in its best-matching sentence.

    Args:
        dataset: sequence of records; only the 'text' field (the gold answer)
            is read, so records without a 'question' field are accepted.
        ne_dict: indexable by record position, giving the (entity string,
            entity type) pairs of the record's best-matching sentence.

    Returns:
        dict position -> entity type of the first entity whose string equals
        the gold answer exactly, or 'OTHER' when no entity matches.
    """
    ans_type_dict = {}
    for query_id in range(len(dataset)):
        gold_text = dataset[query_id]['text']
        # single pass: first matching entity wins, as with list.index()
        ans_type_dict[query_id] = next(
            (ne_type for ne_string, ne_type in ne_dict[query_id] if ne_string == gold_text),
            'OTHER',
        )
    return ans_type_dict

# Answer-type labels for the splits that have gold answers (training and development).
train_ans_labbled = label_ans(training_set, train_set_ne)
devel_ans_labbled = label_ans(devel_set, devel_set_ne)

# Persist the results with IPython's %store magic.
%store train_ans_labbled 
%store devel_ans_labbled 


In [None]:
# get head word 
def get_head_word(wh_word_index, full_postag):
    """Return the lemma of the question's head noun, or '' when there is none.

    The first token tagged NN or NNS at or after `wh_word_index` is preferred;
    if that search yields nothing, the tokens before `wh_word_index` are
    scanned from the start. Lemmatization uses the module-level `lmtzr`.
    """
    def first_noun_lemma(token_ids):
        for token_id in token_ids:
            if full_postag[token_id][1] in ('NN', 'NNS'):
                return lmtzr.lemmatize(full_postag[token_id][0])
        return ''

    head_word = first_noun_lemma(range(wh_word_index, len(full_postag)))
    if head_word == '':
        head_word = first_noun_lemma(range(0, wh_word_index))
    return head_word

In [None]:
# get features to conduct machine learning
def get_data_feature(query_full_postag):
    """Extract the classifier features of one POS-tagged question.

    Returns [wh_word, wh_word_pos, neigh_word, neigh_word_pos, head_word],
    based on the first token tagged WDT, WP, WP$ or WRB. The neighbour is the
    token right after the wh-word ('' when the wh-word is last). Without a
    wh-word the first four features are 'unk' and the head word is searched
    from the start of the question.
    """
    wh_tags = ('WDT', 'WP', 'WP$', 'WRB')
    wh_word = wh_word_pos = neigh_word = neigh_word_pos = head_word = ''

    for wh_word_index, (token, pos) in enumerate(query_full_postag):
        if pos not in wh_tags:
            continue
        wh_word, wh_word_pos = token, pos
        head_word = get_head_word(wh_word_index, query_full_postag)
        if wh_word_index != len(query_full_postag) - 1:
            neighbour = query_full_postag[wh_word_index + 1]
            neigh_word, neigh_word_pos = neighbour[0], neighbour[1]
        break  # only the first wh-word is considered

    if wh_word == '':
        wh_word = wh_word_pos = neigh_word = neigh_word_pos = 'unk'
        head_word = get_head_word(0, query_full_postag)

    return [wh_word, wh_word_pos, neigh_word, neigh_word_pos, head_word]
    
# train feature dict
def get_train_features():
    """Return query id -> feature list for training questions whose answer label is not 'OTHER'.

    Reads the module-level `train_ans_labbled` and `train_query_full_postag`.
    """
    return {
        query_id: get_data_feature(train_query_full_postag[query_id])
        for query_id, label in train_ans_labbled.items()
        if label != 'OTHER'
    }

def get_devel_features():
    """Return query id -> feature list for every development question.

    Reads the module-level `devel_ans_labbled` (for the ids) and
    `devel_query_full_postag`.
    """
    return {
        query_id: get_data_feature(devel_query_full_postag[query_id])
        for query_id in devel_ans_labbled.keys()
    }

def get_test_features():
    """Return query id -> feature list for every test question (module-level `test_query_full_postag`)."""
    return {
        query_id: get_data_feature(test_query_full_postag[query_id])
        for query_id in range(len(test_query_full_postag))
    }
    
# Feature lists per split (dicts keyed by query id).
train_feature = get_train_features()
devel_feature = get_devel_features()
test_feature = get_test_features()

# Persist the results with IPython's %store magic.
%store train_feature 
%store devel_feature 
%store test_feature 
    
# 33904 'other'

In [None]:
import sklearn
import numpy as np
from sklearn.feature_extraction import DictVectorizer

# Shared vectorizer: fitted on the training feature dicts below and reused for devel/test.
vectorizer = DictVectorizer()

def get_BOW(feature_list):
    """Return a dict mapping each distinct item of `feature_list` to its number of occurrences."""
    bag = {}
    for item in feature_list:
        if item in bag:
            bag[item] += 1
        else:
            bag[item] = 1
    return bag

def prepare_train_data(train_feature, feature_extractor):
    """Apply `feature_extractor` to every feature list of `train_feature`, in key order, and return the list."""
    return [feature_extractor(train_feature[query_id]) for query_id in train_feature.keys()]

# One bag-of-words dict per question, for each split (prepare_train_data is reused for devel/test).
train_matrix = prepare_train_data(train_feature, get_BOW)
devel_matrix = prepare_train_data(devel_feature, get_BOW)
test_matrix = prepare_train_data(test_feature, get_BOW)

def get_classification(features, ans_labbled):
    """Return the labels of `ans_labbled` for the query ids of `features`, in the key order of `features`."""
    return [ans_labbled[query_id] for query_id in features.keys()]

# Target labels aligned with the rows of train_matrix / devel_matrix
# (both are built by iterating the same feature dict).
train_classification = get_classification(train_feature, train_ans_labbled)
devel_classification = get_classification(devel_feature, devel_ans_labbled)

# transform data
# The vocabulary is learned from the training split only; devel and test reuse it.
X_train = vectorizer.fit_transform(train_matrix)
X_devel = vectorizer.transform(devel_matrix)
X_test = vectorizer.transform(test_matrix)


In [None]:
# Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from sklearn.metrics import accuracy_score

def clf_NB_tune_parameters():
    """Grid-search the MultinomialNB smoothing parameter on the development split.

    Tries 100 log-spaced alphas between 1e-2 and 1e3, training on the
    module-level X_train/train_classification and scoring accuracy on
    X_devel/devel_classification. Returns [alpha, accuracy] pairs sorted by
    accuracy, best first.
    """
    result = []
    for alpha in np.logspace(-2, 3, num = 100):
        model = MultinomialNB(alpha = alpha)
        model.fit(X_train, train_classification)
        predictions = model.predict(X_devel)
        result.append([alpha, accuracy_score(devel_classification, predictions)])
    result.sort(key = lambda pair: pair[1], reverse = True)
    return result

# Keep the ten best (alpha, accuracy) pairs; the top-ranked alpha is used by get_NB_predict.
top_ten_NB_result = clf_NB_tune_parameters()[:10]
best_alpha = top_ten_NB_result[0][0]

def get_NB_predict(X_matric):
    """Train MultinomialNB with the module-level `best_alpha` on the training split and predict labels for `X_matric`."""
    model = MultinomialNB(alpha = best_alpha)
    model.fit(X_train, train_classification)
    return model.predict(X_matric)

# Naive Bayes question-type predictions for the test and development questions.
NB_test_query_type = get_NB_predict(X_test)
NB_devel_query_type = get_NB_predict(X_devel)

'''
0.7884615384615384 accuracy for NE type question
'''

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
import numpy as np
def clf_LR_tune_penalty(penalty):
    """Return the development accuracy of a LogisticRegression trained with the given `penalty`."""
    model = LogisticRegression(penalty = penalty)
    model.fit(X_train, train_classification)
    predictions = model.predict(X_devel)
    return accuracy_score(devel_classification, predictions)

def clf_LR_tune_hyperparameter():
    """Grid-search the LogisticRegression regularization strength C on the development split.

    Tries 200 evenly spaced values between 1e-2 and 100 and returns
    [C, accuracy] pairs sorted by accuracy, best first.
    """
    result = []
    for C in np.linspace(1e-2, 100, num = 200):
        model = LogisticRegression(C = C)
        model.fit(X_train, train_classification)
        predictions = model.predict(X_devel)
        result.append([C, accuracy_score(devel_classification, predictions)])
    result.sort(key = lambda pair: pair[1], reverse = True)
    return result
# Keep the ten best (C, accuracy) pairs and remember the top-ranked C.
# NOTE(review): best_C is not referenced by the get_LR_predict calls in this notebook —
# confirm whether that is intended.
top_ten_LR_result = clf_LR_tune_hyperparameter()[:10]
best_C = top_ten_LR_result[0][0]

def get_LR_predict(X_matrix, C = 1.21, penalty = 'l2'):
    """Train LogisticRegression on the training split and predict labels for `X_matrix`.

    Args:
        X_matrix: feature matrix produced by the shared `vectorizer`.
        C: inverse regularization strength. The default keeps the value that
            was previously hard-coded; pass the tuned `best_C` to use the
            grid-search result instead.
        penalty: regularization norm, 'l2' by default as before.

    Returns:
        The predicted labels for the rows of `X_matrix`.
    """
    clf_LR = LogisticRegression(penalty = penalty, C = C)
    clf_LR.fit(X_train, train_classification)
    return clf_LR.predict(X_matrix)

# Logistic-regression question-type predictions; these feed the answer-ranking step.
LR_test_query_type = get_LR_predict(X_test)
LR_devel_query_type = get_LR_predict(X_devel)

'''
0.8028846153846154 accuracy for NE type question: Logistic Regression will be adopted 
'''

<b> 4. Answer ranking </b>

In [None]:
# get all the candidate answers
def get_ans_candidate(dataset, data_set_ne, query_type, dataset_best_sent_ne_postag):
    """Collect the candidate answers of every question from its best-matching sentence.

    Args:
        dataset: sequence of records with a 'question' field.
        data_set_ne: per question, the (entity string, entity type) pairs of
            the best-matching sentence.
        query_type: per question, the predicted answer type.
        dataset_best_sent_ne_postag: per question, the full sentence as
            (token, tag) pairs where entities carry their entity type and
            other tokens their POS tag.

    Returns:
        dict query index -> list of (token, tag, type_score, query_score)
        tuples. Scores are 1 (preferred) or 2:
          * numbers (tag 'CD') are added first with (1, 1) when the question
            contains 'far', 'many', 'much' or 'long';
          * entities absent from the question follow: (1, 1) when their type
            equals the predicted type, otherwise (1, 2);
          * entities whose string occurs in the question come last with (2, 2).
        A sentence without any named entity yields an empty list.

    Note: earlier versions also looked at the token following each preposition
    when the sentence had no entity, but that result was always discarded
    afterwards and the look-ahead raised IndexError when a preposition ended
    the sentence; the dead step is omitted here, so results are unchanged.
    """
    numeric_cues = ('far', 'many', 'much', 'long')
    ans_candidate_dict = {}
    for query_id in range(len(dataset)):
        lowercased_query = dataset[query_id]['question'].lower()
        final_ne_sent = data_set_ne[query_id]          # named entities of the best match sentence
        ans_type = query_type[query_id]                # predicted question type
        full_ne_postag = dataset_best_sent_ne_postag[query_id]

        if final_ne_sent == []:
            # without entities no candidate is kept (numeric ones included)
            ans_candidate_dict[query_id] = []
            continue

        ans_candidate = []
        # numeric question: complement the Stanford named entities with plain numbers
        if any(cue in lowercased_query for cue in numeric_cues):
            ans_candidate.extend(
                (token, pos, 1, 1) for token, pos in full_ne_postag if pos == 'CD')

        hi = []
        low = []
        for token, pos in final_ne_sent:
            if token in lowercased_query:
                low.append((token, pos, 2, 2))   # entity already mentioned in the question
            elif pos == ans_type:
                hi.append((token, pos, 1, 1))    # right type and new information
            else:
                low.append((token, pos, 1, 2))   # new information but unexpected type

        ans_candidate.extend(hi)
        ans_candidate.extend(low)
        ans_candidate_dict[query_id] = ans_candidate
    return ans_candidate_dict

# Candidate answers for the test and development questions, using the logistic-regression question types.
test_ans_candidate = get_ans_candidate(testing_set, test_set_ne, LR_test_query_type, test_best_sent_ne_postag)
devel_ans_candidate = get_ans_candidate(devel_set, devel_set_ne, LR_devel_query_type, devel_best_sent_ne_postag)


In [None]:
# get distance between NE or token and a NE from question if NEs are the same type
# and rank all the candidate answers
def get_distance_NE_in_query(dataset, ans_candidate_dict, dataset_best_sent_ne_postag, dataset_query_ne):
    """Rank the candidate answers of every question and return the best one.

    Each candidate gets a distance: the smallest gap, in positions of the
    best-matching sentence, between the candidate and any named entity of the
    question that also occurs in that sentence. The distance is 1000 when it
    cannot be computed (no shared entity, candidate not found in the sentence)
    or when the candidate string occurs in the question.

    Candidates are sorted by (query_score, type_score, distance), smallest
    first; the sort is stable, so ties keep the candidate order.

    Args:
        dataset: sequence of records with a 'question' field.
        ans_candidate_dict: query id -> list of
            (token, tag, type_score, query_score) tuples.
        dataset_best_sent_ne_postag: per question, the best-matching sentence
            as (token, tag) pairs.
        dataset_query_ne: per question, the (entity string, entity type) pairs
            of the question itself.

    Returns:
        list of (query_id, answer) tuples; the answer is '' when a question
        has no candidate.
    """
    no_distance = 1000
    ans_submission = []
    for query_id in ans_candidate_dict.keys():
        ans_candidate = ans_candidate_dict[query_id]
        lowercased_query = dataset[query_id]['question'].lower()

        answers_index_list = []    # (candidate, positions of the candidate in the sentence)
        query_ne_index_list = []   # positions of the question's entities in the sentence
        if ans_candidate != []:
            best_sent_token = [token for token, pos in dataset_best_sent_ne_postag[query_id]]
            distinct_answers = set(
                token for token, pos, question_type_score, token_in_query in ans_candidate)
            for answer in distinct_answers:
                positions = [i for i, tok in enumerate(best_sent_token) if tok == answer]
                answers_index_list.append((answer, positions))
            for token, ne_type in dataset_query_ne[query_id]:
                query_ne_index_list += [i for i, tok in enumerate(best_sent_token) if tok == token]

        distance_by_answer = {}
        if answers_index_list != [] and query_ne_index_list != []:
            for answer, positions in answers_index_list:
                if answer in lowercased_query or not positions:
                    # repeated from the question, or not locatable in the sentence
                    # (min() of an empty sequence would raise ValueError)
                    distance_by_answer[answer] = no_distance
                else:
                    distance_by_answer[answer] = min(
                        abs(ans_pos - query_pos)
                        for ans_pos in positions
                        for query_pos in query_ne_index_list)

        new_ans = [
            (token, pos, question_type_score, token_in_query,
             distance_by_answer.get(token, no_distance))
            for token, pos, question_type_score, token_in_query in ans_candidate
        ]

        # rank all the answers
        new_ans.sort(key=lambda x: (x[3], x[2], x[4]))

        final_ans = new_ans[0][0] if new_ans else ''
        ans_submission.append((query_id, final_ans))
    return ans_submission

# Final (query id, answer) pairs for the development and test questions.
ans_devel_sub13 = get_distance_NE_in_query(devel_set, devel_ans_candidate, devel_best_sent_ne_postag, devel_query_ne)
ans_test_sub13 = get_distance_NE_in_query(testing_set, test_ans_candidate, test_best_sent_ne_postag, test_query_ne)

<b> 5. Answer: output csv </b>

In [None]:
import csv
# Write one "id,answer" CSV per split. The development file is the one read back by
# the measurement cell, so it is produced here as well to keep the notebook runnable
# from a fresh kernel. The `with` block closes each file; no explicit close() is needed.
header = ['id', 'answer']
for csv_path, submission_rows in (('test_ans13.csv', ans_test_sub13),
                                  ('devel_ans13.csv', ans_devel_sub13)):
    with open(csv_path, 'w', newline='', encoding='UTF-8') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(header)
        writer.writerows(submission_rows)


<b> 6. Measurement </b>

In [None]:
import csv

# Load the development answers as {query id: answer string}.
# The file is opened with the same newline/encoding settings used when writing the
# answer CSVs, and is closed as soon as it has been read.
ans_dict = {}
with open('devel_ans13.csv', 'r', newline='', encoding='UTF-8') as devel_csv:
    ans_file = csv.reader(devel_csv)
    next(ans_file, None)  # skip the header row
    for item in ans_file:
        ans_dict[int(item[0])] = item[1]

def average_f1_score(ans_dict):
    """Token-level F1 of the predicted answers against the gold answers of `devel_set`.

    Both answers are lowercased, stripped of punctuation and word-tokenized.
    True positives, false positives and false negatives are pooled over all
    questions before precision and recall are computed, so this is a
    micro-averaged F1 rather than a mean of per-question scores.

    Args:
        ans_dict: query id -> predicted answer string.

    Returns:
        The F1 score, or 0.0 when no predicted token matches any gold token
        (including the case of an empty `ans_dict`).
    """
    punctuation = r'[^\w\s]'
    TP = 0
    FP = 0
    FN = 0
    for ans_id, predicted in ans_dict.items():
        my_ans = word_tokenize(re.sub(punctuation, '', predicted.lower()))
        gold_ans = word_tokenize(re.sub(punctuation, '', devel_set[ans_id]['text'].lower()))
        for token in my_ans:
            if token in gold_ans:
                TP += 1
            else:
                FP += 1
        for gold_token in gold_ans:
            if gold_token not in my_ans:
                FN += 1

    # Without a single true positive, precision and recall are 0 or undefined;
    # report 0.0 instead of dividing by zero.
    if TP == 0:
        return 0.0
    recall = TP / (TP + FN)
    prec = TP / (TP + FP)
    return 2 * (recall * prec) / (recall + prec)

# Token-level F1 of the development answers loaded from the CSV above.
final_QA_f1 = average_f1_score(ans_dict)
