In [1]:
from dotenv import load_dotenv
import os
import requests
from bs4 import BeautifulSoup
import cloudscraper
import re
import time
import csv
import pandas as pd

load_dotenv()

True

In [23]:
class Source:

    @staticmethod
    def get_soup_object(level: str, exercise_number: str, flag_mock: bool = True) -> BeautifulSoup:
        '''
        Gets data from os.getenv() and returns beautiful Soup object.

        If flag_mock = True, it returns the contents of a local file to prevent making requests during tests.
        '''
        def replace_string_url(level: str, exercise: str):
            '''
            Replace url string with exercise and level for os.environ['SOURCE_1']
            '''
            url = list(os.getenv('SOURCE_1'))
            url[69] = level[0]
            url[70] = level[1]
            url[92] = exercise
            return ''.join(url)

        soup_obj = None
        response = str()


        if flag_mock:
            with open('test.html', 'r') as f:
                response =  f.read()
        else:
            source = replace_string_url(level, exercise_number)
            scraper = cloudscraper.create_scraper()
            response = scraper.get(source).text

        soup = BeautifulSoup(response, 'html.parser')
        return soup
    
    @staticmethod
    def get_raw_question_data(soup: BeautifulSoup) -> dict:
        '''
        Scrapes the soup object to get text for p elements which contain question and answer.
        '''

        ptags = soup.find_all('p')
        regex_question = r'^\d+\..*'
        regex_answer = r'^Question.*'
        relevant_info = {'Questions': [], 'Answers': []}
        for ptag in ptags:
            re_match_questions = re.match(regex_question, ptag.text)
            re_match_answers = re.match(regex_answer, ptag.text)
            if re_match_questions:
                relevant_info['Questions'].append(re_match_questions.group(0))
                continue
            if re_match_answers:
                relevant_info['Answers'].append(re_match_answers.group(0))

        relevant_info['Answers'] = relevant_info['Answers'][0].split('Question ')[1:]

        return relevant_info
    
    @staticmethod
    def extract_question_str_choices(question_text: str) -> dict:
        '''
        Create a dictionary with question text and question choices
        '''
        match = re.match(r'^(\d+)\.\s*(.*?)\s*([^。]+)$', question_text)
        if match:
            question_number = int(match.group(1).strip())
            question_str = match.group(2).strip()  
            question_choices = match.group(3).strip()    
            
            question_choices = question_choices.split(' ')

            question_choices = {str(index+1): choice for index, choice in enumerate(question_choices)}
            question_choices = str(question_choices)

            return {'question_number': question_number, 'question_str': question_str, 'question_choices': question_choices}
            
        return dict()
    
    @staticmethod
    def extract_question_answer(answer_text: str):
        '''
        Create a tuple containing the answer of question
        '''
        pattern = r'^(\d+):\s(\d+).*$'
        result = re.match(pattern, answer_text)
        if result:
            return (result.group(1), result.group(2))
        
    @staticmethod
    def get_questions_data(level: str, exercise_number: str, flag_mock: bool = True) -> dict:
        '''
        Uses previous method from Source to generate final dictionary of questions
        '''
        soup = Source.get_soup_object(level, exercise_number, flag_mock)
        raw_question_data = Source.get_raw_question_data(soup)
        
        questions, answers = raw_question_data['Questions'], raw_question_data['Answers']

        myQuestions = list()

        for question, answer in zip(questions, answers):
            question_data = Source.extract_question_str_choices(question)
            question_answer = Source.extract_question_answer(answer)
            
            if question_data['question_number'] == int(question_answer[0]):
                question_data['question_answer'] = question_answer[1]

                del question_data['question_number']
                question_data['question_level'] = level
                question_data['question_explanation'] = ''
                question_data['question_type'] = 'vocabulary'
                question_data['question_topics'] = ''
                myQuestions.append(question_data)

        return myQuestions

class Utils:

    @staticmethod
    def generate_question_id(start):
        '''
        Python generator to always generate a new question id when called.
        '''
        index = start
        while True:
            yield index
            index += 1
    
    @staticmethod
    def write_to_csv(output, row: dict):
        with open(output, 'a') as f:
            f.write(str(row))
            f.write('\n')

In [28]:
def extract_vocabulary_exercises(levels_exercises: dict):

    current_delay = 5

    for level, number_questions in levels_exercises.items():
        for question_i in range(1, number_questions+1):
            response = Source.get_questions_data(level, str(question_i), flag_mock=False)
            for question in response:
                Utils.write_to_csv('extracted_data.csv', question)
                print(f'{level}: Page {question_i}, {question}')
            time.sleep(current_delay)
        
        current_delay += 1
        time.sleep(current_delay)

levels_exercises = {'n1': 22, 'n2': 26, 'n3': 21, 'n4': 31, 'n5': 24}
extract_vocabulary_exercises(levels_exercises)

n1: Page 1, {'question_str': 'ケネディ殺害の容疑者は _______ に謎を残したままマフィアに撃たれて死亡した。', 'question_choices': "{'1': '動機', '2': '本音', '3': '動力', '4': '下心'}", 'question_answer': '1', 'question_level': 'n1', 'question_explanation': '', 'question_type': 'vocabulary', 'question_topics': ''}
n1: Page 1, {'question_str': 'いつ見つけても _______ の早いがんでは予後が悪く、遅いがんは予後がいい。早くても遅くても意味はないのです。', 'question_choices': "{'1': '先進', '2': '増進', '3': '進出', '4': '進行'}", 'question_answer': '4', 'question_level': 'n1', 'question_explanation': '', 'question_type': 'vocabulary', 'question_topics': ''}
n1: Page 1, {'question_str': 'インターネット広告 _______ と広告の効果の関係について考えてみよう。', 'question_choices': "{'1': '値', '2': '費', '3': '料', '4': '額'}", 'question_answer': '2', 'question_level': 'n1', 'question_explanation': '', 'question_type': 'vocabulary', 'question_topics': ''}
n1: Page 1, {'question_str': 'この数年間で千葉や隣接県では女児連れ去り事件が多発していた。まだ _______ 解決の案件もあるが、未遂に終わり、容疑者が逮捕されたケースもある。', 'question_choices': "{'1': '非', '2': '双', '3': '未', '4': '無'}", 'qu

KeyError: 'question_number'

In [31]:
questions = list()
with open('extracted_data.csv', "r") as f:
    for line in f:
        questions.append(eval(line))

print(questions)

[{'question_str': 'ケネディ殺害の容疑者は _______ に謎を残したままマフィアに撃たれて死亡した。', 'question_choices': "{'1': '動機', '2': '本音', '3': '動力', '4': '下心'}", 'question_answer': '1', 'question_level': 'n1', 'question_explanation': '', 'question_type': 'vocabulary', 'question_topics': ''}, {'question_str': 'いつ見つけても _______ の早いがんでは予後が悪く、遅いがんは予後がいい。早くても遅くても意味はないのです。', 'question_choices': "{'1': '先進', '2': '増進', '3': '進出', '4': '進行'}", 'question_answer': '4', 'question_level': 'n1', 'question_explanation': '', 'question_type': 'vocabulary', 'question_topics': ''}, {'question_str': 'インターネット広告 _______ と広告の効果の関係について考えてみよう。', 'question_choices': "{'1': '値', '2': '費', '3': '料', '4': '額'}", 'question_answer': '2', 'question_level': 'n1', 'question_explanation': '', 'question_type': 'vocabulary', 'question_topics': ''}, {'question_str': 'この数年間で千葉や隣接県では女児連れ去り事件が多発していた。まだ _______ 解決の案件もあるが、未遂に終わり、容疑者が逮捕されたケースもある。', 'question_choices': "{'1': '非', '2': '双', '3': '未', '4': '無'}", 'question_answer': '3', 'question_level': 'n1',

In [38]:
id_generator = Utils.generate_question_id(1)
for question in questions:
    question['question_id'] = next(id_generator)
    requests.post('http://127.0.0.1:5000/questions/new', json = question)
    print(question)
    time.sleep(1)


{'question_str': 'ケネディ殺害の容疑者は _______ に謎を残したままマフィアに撃たれて死亡した。', 'question_choices': "{'1': '動機', '2': '本音', '3': '動力', '4': '下心'}", 'question_answer': '1', 'question_level': 'n1', 'question_explanation': '', 'question_type': 'vocabulary', 'question_topics': '', 'question_id': 1}
{'question_str': 'いつ見つけても _______ の早いがんでは予後が悪く、遅いがんは予後がいい。早くても遅くても意味はないのです。', 'question_choices': "{'1': '先進', '2': '増進', '3': '進出', '4': '進行'}", 'question_answer': '4', 'question_level': 'n1', 'question_explanation': '', 'question_type': 'vocabulary', 'question_topics': '', 'question_id': 2}
{'question_str': 'インターネット広告 _______ と広告の効果の関係について考えてみよう。', 'question_choices': "{'1': '値', '2': '費', '3': '料', '4': '額'}", 'question_answer': '2', 'question_level': 'n1', 'question_explanation': '', 'question_type': 'vocabulary', 'question_topics': '', 'question_id': 3}
{'question_str': 'この数年間で千葉や隣接県では女児連れ去り事件が多発していた。まだ _______ 解決の案件もあるが、未遂に終わり、容疑者が逮捕されたケースもある。', 'question_choices': "{'1': '非', '2': '双', '3': '未', '4': '無'}