In [52]:
import csv
import json
import os
import traceback
import uuid

import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

import tfidf_functions

In [74]:
# Read in an entire story based on document id
def get_document_text(document_id):
    """Return the full text of the story stored for ``document_id``.

    The file ``../narrativeqa/tmp/{document_id}.content`` is decoded as UTF-8
    first; if its bytes are not valid UTF-8 it is re-read as ISO-8859-1.

    Args:
        document_id: Identifier used to build the ``.content`` file name.

    Returns:
        The file contents as a string, or ``None`` when the file could not be
        read (the error is printed instead of being raised).
    """
    path = f"../narrativeqa/tmp/{document_id}.content"
    try:
        try:
            # ``with`` closes the handle even if decoding fails part-way through.
            with open(path, "r", encoding="utf-8") as f:
                return f.read()
        except UnicodeDecodeError:
            # Only a decoding failure justifies the fallback; a missing file
            # would fail the same way again, so it goes straight to the
            # outer handler.
            with open(path, "r", encoding="ISO-8859-1") as f:
                return f.read()
    except Exception as e:
        print(f"Error getting document {document_id}")
        print(f"Exception: {e}")
        return None


In [75]:
# Split text into passages to serve as documents in tfidf
def split_document_and_tfidf_vectorize(text, num_characters=500):
    """Chunk ``text`` into fixed-width passages and fit a TF-IDF model on them.

    Args:
        text: The story text to split.
        num_characters: Width of each passage in characters; the final passage
            may be shorter.

    Returns:
        A tuple ``(passages, tfidf, vectorizer)``: the list of passage strings,
        the matrix returned by ``fit_transform`` (one row per passage), and the
        fitted ``TfidfVectorizer``.

    Raises:
        ValueError: If ``num_characters`` is not positive.
            NOTE(review): ``fit_transform`` is also expected to raise
            ``ValueError`` when ``text`` yields no vocabulary (e.g. empty
            text) -- confirm against the installed scikit-learn.
    """
    if num_characters < 1:
        raise ValueError(f"num_characters must be a positive integer, got {num_characters}")

    passages = [text[i:i + num_characters] for i in range(0, len(text), num_characters)]

    # Pass the stop words as a list: scikit-learn's parameter validation accepts
    # 'english', a list, or None for ``stop_words`` and rejects a set.
    vectorizer = TfidfVectorizer(stop_words=list(stopwords.words("english")))
    tfidf = vectorizer.fit_transform(passages)

    return passages, tfidf, vectorizer

In [76]:
class QAPair:
    """One question with its two reference answers for a single document.

    Attributes:
        document_id: Identifier of the document the question belongs to.
        question: The question text.
        answer1: First reference answer.
        answer2: Second reference answer.
        id: Random UUID (``uuid.uuid4()``) generated when the pair is created.
        passages: Passages retrieved for the question; starts as an empty list.
    """

    def __init__(self, document_id, question, answer1, answer2):
        self.document_id = document_id
        self.question = question
        self.answer1 = answer1
        self.answer2 = answer2
        self.id = uuid.uuid4()
        # Created per instance: a class-level ``passages = []`` is a single list
        # shared by every pair until it is reassigned.
        self.passages = []


In [77]:
# Load all question answer pairs for the available documents
#document_id, set, question, answer1, answer2, question_tokenized, answer1_tokenized, answer2_tokenized.
def get_question_answer_pairs():
    """Read ``../narrativeqa/qaps.csv`` and group its QA pairs by document.

    Rows are parsed with the ``csv`` module so that quoted fields containing
    commas stay in one column. Columns 0, 2, 3 and 4 are used as document id,
    question, answer1 and answer2 respectively.

    Returns:
        A dict mapping each document id (str) to a list of ``QAPair`` objects,
        in file order.
    """
    document_questions = {}

    # newline="" is what the csv module expects so it can handle line breaks
    # inside quoted fields itself.
    with open("../narrativeqa/qaps.csv", "r", encoding="ISO-8859-1", newline="") as f:
        for row in csv.reader(f):
            # Blank or truncated lines cannot provide the four fields we need.
            if len(row) < 5:
                continue

            document_id = str(row[0])
            # The header row is column names, not a document.
            if document_id == "document_id":
                continue

            document_questions.setdefault(document_id, []).append(
                QAPair(document_id, row[2], row[3], row[4])
            )

    return document_questions

In [11]:
# Build the {document_id: [QAPair, ...]} lookup once; later cells index into it.
document_questions = get_question_answer_pairs()

In [78]:
# Get the top n passage indices in regards to a query within a document
# Based on cosine similarity
def get_related_passage_indices(question, vectorizer, tfidf, num_passages_to_return=5):
    """Rank passages against ``question`` and return the best matching indices.

    The question is vectorized with ``vectorizer`` and scored against every row
    of ``tfidf`` with ``linear_kernel`` (a dot product).
    NOTE(review): this equals cosine similarity only when the rows are
    L2-normalised, which is presumably the case for the TfidfVectorizer
    output used here -- confirm.

    Args:
        question: The query string.
        vectorizer: Fitted vectorizer exposing ``transform``.
        tfidf: Matrix of passage vectors, one row per passage.
        num_passages_to_return: Maximum number of indices to return; values
            of zero or less return an empty list.

    Returns:
        A list of at most ``num_passages_to_return`` row indices, ordered from
        highest to lowest score, containing only passages with a score > 0.
    """
    q_vec = vectorizer.transform([question])
    cosine_similarities = linear_kernel(q_vec, tfidf).flatten()

    # Reverse the ascending argsort, then take the first n. Slicing with
    # ``[:-n:-1]`` stops one short and returns only n-1 indices.
    limit = max(num_passages_to_return, 0)
    ranked_indices = cosine_similarities.argsort()[::-1][:limit]

    # Drop passages that share no terms with the question.
    return [index for index in ranked_indices if cosine_similarities[index] > 0]

# Get the top n passages in regards to a query
def get_related_passages(passages, related_docs_indices):
    """Look up every index of ``related_docs_indices`` in ``passages``.

    Args:
        passages: Indexable collection of passage strings.
        related_docs_indices: Iterable of positions into ``passages``.

    Returns:
        A new list holding the selected passages, in the order the indices
        were given.
    """
    return [passages[position] for position in related_docs_indices]

In [33]:
#TESTING
# Ad-hoc check: rank the passages for one question of one document and print
# them next to the two reference answers.
# NOTE(review): `vectorizer`, `tfidf` and `passages` are not assigned at top
# level in any cell visible above -- presumably they were left in the kernel
# by an earlier call to split_document_and_tfidf_vectorize; confirm before
# relying on Restart & Run All.

# q1 is wrapped in a list because it is handed directly to vectorizer.transform.
q1 = [document_questions["0029bdbe75423337b551e42bb31f9a102785376f"][1].question]
answer1 = document_questions["0029bdbe75423337b551e42bb31f9a102785376f"][1].answer1
answer2 = document_questions["0029bdbe75423337b551e42bb31f9a102785376f"][1].answer2
q_vec = vectorizer.transform(q1)
cosine_similarities = linear_kernel(q_vec, tfidf).flatten()

num_passages_to_return = 5
# [:-n:-1] walks the ascending argsort backwards and stops before position -n,
# so it yields n-1 indices (4 here), highest score first.
related_docs_indices_a = cosine_similarities.argsort()[:-num_passages_to_return:-1]
related_docs_indices = []
# Keep only passages whose score is strictly positive.
for index in related_docs_indices_a:
    if cosine_similarities[index] > 0:
        related_docs_indices.append(index)
        
print(f'Q: {q1}\n')
for i in related_docs_indices:
    print(f'index: {i}')
    print(passages[i])
    print()
print('\nAnswers')
print(answer1)
print(answer2)

Q: ['Who does Arabella Mason wed?']

index: 6
he
captain, liked attention, and liked sailors; this was Miss Arabella
Mason, a very pretty young woman of eighteen years of age, who
constantly looked in the glass merely to ascertain if she had ever seen
a face which she preferred to her own, and who never read any novel
without discovering that there was a remarkable likeness between the
heroine and her pretty self.

Miss Arabella Mason was the eldest daughter of the steward of the old
Lord de Versely, brother to the Honourable Miss Delmar, a

index: 17
again came over to Madeline Hall, accompanied
as usual, by Ben, and the second day after their arrival it was made
known to all whom it might concern, that Miss Arabella Mason had
actually contracted a secret marriage with the handsome Benjamin Keene.

Of course, the last person made acquainted with this interesting
intelligence was the Honourable Miss Delmar, and her nephew took upon
himself to make the communication.  At first the honou

In [79]:
# Loop through available documents, retrieve the top passages for each question/answer pair
# Write the returned passages to document_qa_passages directory for later use
# Pairs written as dictionary of form {q: [passage, passage, etc]}

#document_id,set,kind,story_url,story_file_size,wiki_url,wiki_title,story_word_count,story_start,story_end
def _trim_to_story(text, story_start, story_end):
    """Slice ``text`` from the first ``story_start`` to the last ``story_end``.

    The slice begins at the start marker and stops where the end marker begins.
    A marker that cannot be found falls back to the corresponding end of
    ``text`` instead of letting the ``-1`` from ``find``/``rfind`` be used as
    a slice bound.
    """
    doc_start = text.find(story_start)
    if doc_start == -1:
        doc_start = 0

    doc_end = text.rfind(story_end)
    # Also guard against an end marker located before the start marker, which
    # would otherwise produce an empty story.
    if doc_end == -1 or doc_end <= doc_start:
        doc_end = len(text)

    return text[doc_start:doc_end]


def get_and_write_qa_passages(document_questions, max_stories=2):
    """Retrieve the top passages for every question of each story and save them.

    Reads ``../narrativeqa/documents.csv`` (first line skipped as header) and,
    for each story, loads its text, trims it to the story body using columns 8
    and 9 (story_start / story_end), fits TF-IDF on its passages and looks up
    the best passages for every question in ``document_questions``. The result
    is written as JSON ``{question: [passage, ...]}`` to
    ``./document_qa_passages/{document_id}.q_passages`` and also stored on each
    ``QAPair`` as ``passages``.

    Args:
        document_questions: Mapping of document id to a list of QA pair objects
            exposing ``question`` and accepting a ``passages`` attribute.
        max_stories: Maximum number of stories to write; ``-1`` means no limit.

    Returns:
        None. Stories whose text cannot be loaded are skipped; any other error
        prints a traceback and stops the loop.
    """
    output_dir = "./document_qa_passages"
    os.makedirs(output_dir, exist_ok=True)

    stories_written = 0
    document_id = ""
    # csv.reader keeps quoted fields (titles, story markers) intact and strips
    # the line ending that a plain split(',') leaves on the last column.
    with open("../narrativeqa/documents.csv", "r", newline="") as f:
        rows = csv.reader(f)
        next(rows, None)  # header line

        for row in rows:
            if max_stories != -1 and stories_written >= max_stories:
                break

            try:
                if not row:
                    continue

                document_id = row[0]

                text = get_document_text(document_id)
                if text is None:
                    # get_document_text has already reported why it failed.
                    continue

                text = _trim_to_story(text, row[8], row[9])

                passages, tfidf, vectorizer = split_document_and_tfidf_vectorize(text)
                passages_to_write = {}

                for qa_pair in document_questions[document_id]:
                    related_indices = get_related_passage_indices(
                        qa_pair.question, vectorizer, tfidf, num_passages_to_return=5
                    )
                    related_passages = get_related_passages(passages, related_indices)
                    qa_pair.passages = related_passages

                    passages_to_write[qa_pair.question] = related_passages

                json_question_pairs = json.dumps(passages_to_write)
                with open(f"{output_dir}/{document_id}.q_passages", "w") as fq:
                    fq.write(json_question_pairs)

                stories_written += 1

            except Exception:
                traceback.print_exc()
                print(f"Error processing qa pairs and passages for document {document_id}")
                break


In [81]:
# Run the passage retrieval; each processed story gets a .q_passages file
# under ./document_qa_passages/.
get_and_write_qa_passages(document_questions, max_stories=2)