In [74]:
import sparknotes_parser
import csv
import uuid
import re
import requests
import traceback
import os.path


In [2]:
def get_sparknotes_quizzes_for_book(urls):
    book_questions = []
    for url in urls:
        quiz_questions = sparknotes_parser.get_quiz_for_book(url)
        book_questions = book_questions + quiz_questions
    
    return book_questions

In [3]:
siddartha_urls = ['https://www.sparknotes.com/lit/siddhartha/quiz/', 
                  'https://www.sparknotes.com/lit/siddhartha/summary/?quickquiz_id=1758',
                 'https://www.sparknotes.com/lit/siddhartha/characters/?quickquiz_id=1759',
                 'https://www.sparknotes.com/lit/siddhartha/themes/?quickquiz_id=1761',
                 'https://www.sparknotes.com/lit/siddhartha/section1/?quickquiz_id=1762',
                 'https://www.sparknotes.com/lit/siddhartha/section2/?quickquiz_id=1763',
                 ]

adambede = ['https://www.sparknotes.com/lit/adambede/quiz/']
huckfin = ['https://www.sparknotes.com/lit/huckfinn/quiz/']

questions = get_sparknotes_quizzes_for_book(siddartha_urls)
print(len(questions))

50


In [72]:
def load_page(url):
    headers = {'content-type': 'text/html', 'Accept-Charset': 'UTF-8'}
    r = requests.get(url,  headers=headers)

    if (r == None):
        print(f"Unable to find url {url}")
    
    return r.content.decode('utf-8')

#"#document_id, set, question, answer1, answer2, question_tokenized, answer1_tokenized, answer2_tokenized.\n",
#        self.question = question
 #       self.answers = answers
 #       self.correct_answer = correct_answer
def save_quiz_questions(doc_quizzes, document_id):
    
    filename='./qaps_sparknotes.csv'
    file_exists = os.path.isfile(filename)
    
    with open(filename, 'a+', newline='') as csvfile:
        fieldnames = ['document_id', 'set', 'question', 'answer1', 'answer2', 'question_tokenized', 'answer1_tokenized', 'answer2_tokenized', 'answera', 'answerb', 'answerc', 'answerd']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        
        if not file_exists:
            writer.writeheader()
            
        for doc in doc_quizzes:
            
            # avoid true false questions
            if len(doc.answers) < 4:
                continue 
            
            writer.writerow({'document_id': document_id, 
                             'set': 'test', 
                             'question': doc.question, 
                             'answer1': doc.correct_answer, 
                             'answer2': doc.correct_answer, 
                             'question_tokenized': ' '.join(doc.question.split()), 
                             'answer1_tokenized': ' '.join(doc.correct_answer.split()), 
                             'answer2_tokenized': ' '.join(doc.correct_answer.split()), 
                             'answera': doc.answers[0], 
                             'answerb': doc.answers[1], 
                             'answerc': doc.answers[2], 
                             'answerd': doc.answers[3]
                            })
            
#
#document_id,set,kind,story_url,story_file_size,wiki_url,wiki_title,story_word_count,story_start,story_end
def save_document(document_id, book_title, book_url):
    
    filename='./documents_sparknotes.csv'
    file_exists = os.path.isfile(filename)
    
    with open(filename, 'a+', newline='') as csvfile:
        fieldnames = ['document_id','set','kind','story_url','story_file_size','wiki_url','wiki_title','story_word_count','story_start','story_end']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        
        if not file_exists:
            writer.writeheader()

        writer.writerow({'document_id': document_id, 
                 'set': 'test', 
                 'kind': 'gutenberg', 
                 'story_url': book_url, 
                 'story_file_size': 0, 
                 'wiki_url': '', 
                 'wiki_title': book_title, 
                 'story_word_count': 0, 
                 'story_start': '<NOSTART>', 
                 'story_end': '<NOEND>'
                })

In [77]:
with open('./book_quizzes.csv') as csvfile:
    books = csv.reader(csvfile, delimiter=',')
    next(books)
    #documentid, book, quizurl, bookurl
    document_id = ''
    total_questions = 0
    try:
        for row in books:
            document_id = row[0]
            new_document_id = False
            if row[0] != None:
                document_id = uuid.uuid1()
                new_document_id = True

            # Skip books without quiz link
            if row[2] == None or row[2] == '':
                continue
                
            # Skip rows without narrativeQA doc and 
            if new_document_id == True and row[3] == '':
                continue

            document_quiz_urls = [row[2]]
            page_content = load_page(row[2])
            other_quizzes = re.findall('href=.*quickquiz_id.*">', page_content)
            for quiz_tag in other_quizzes:
                quiz = quiz_tag[6:-2]
                quiz = "https://www.sparknotes.com" + quiz

                # Don't want to duplicate existing quiz, or context required quizzes
                if quiz == row[2] or re.search('context', quiz) != None:
                    continue

                document_quiz_urls.append(quiz)

            doc_quizzes = get_sparknotes_quizzes_for_book(document_quiz_urls)
            save_quiz_questions(doc_quizzes, document_id)
            
            if new_document_id:
                save_document(document_id, row[1], row[3])
            
            num_questions = len(doc_quizzes)
            print(f'Book: {row[1]} | Doc: {document_id} | Total questions: {num_questions}')
            total_questions = total_questions + num_questions

        print(f'Completed | {total_questions} questions created')
    except Exception as e:
        print(e)
        traceback.print_exc()
        print(f"Error scraping sparknotes quizzes {document_id}.")
        

        


Book: THE ADVENTURES OF HUCKLEBERRY FINN Mark Twain | Doc: f5f30064-0b24-11ea-9221-c9dc32036b5d | Total questions: 110
Book: AENEID Virgil | Doc: 02c245e8-0b25-11ea-9221-c9dc32036b5d | Total questions: 105
Book: AGAMEMNON, THE CHOEPHORI, AND THE EUMENIDES Aeschylus | Doc: 0885169a-0b25-11ea-9221-c9dc32036b5d | Total questions: 50
Book: ALICE'S ADVENTURES IN WONDERLAND Lewis Carroll | Doc: 14785d22-0b25-11ea-9221-c9dc32036b5d | Total questions: 25
Book: ALL'S WELL THAT ENDS WELL William Shakespeare | Doc: 16bf21ba-0b25-11ea-9221-c9dc32036b5d | Total questions: 25
Book: ANTONY AND CLEOPATRA William Shakespeare | Doc: 18b6eafc-0b25-11ea-9221-c9dc32036b5d | Total questions: 25
Book: ARMS AND THE MAN George Bernard Shaw | Doc: 1a92e48e-0b25-11ea-9221-c9dc32036b5d | Total questions: 25
Book: THE AUTOBIOGRAPHY OF BENJAMIN FRANKLIN Benjamin Franklin | Doc: 1ab8d28e-0b25-11ea-9221-c9dc32036b5d | Total questions: 70
Book: BABBITT Sinclair Lewis | Doc: 2390db68-0b25-11ea-9221-c9dc32036b5d | Total