In [1]:
from web_parser import AnswerParser, QuestionParser
import multiprocessing
from multiprocessing import Queue, Process
from selenium import webdriver
from pathlib import Path
import os
import yaml
from config import drivers, nr_answer_parsers, get_db_path, get_vcs_path, get_config_path
from data_handler import Master, init_new_repo
from util.logger import Logger

In [3]:
# IDs of the questions to scrape.
q_list = [623539562, 327003589]
# Location of the archive repository: <parent directory>/<repository name>.
archiv_path = os.path.join("/media/ywatcher/ExtDisk1/Files/web_archiv_", "archiv_1")

In [4]:
# Check whether the parent directory of the archive path exists.
archiv_parent_dir = Path(archiv_path).parent.absolute()
os.path.exists(archiv_parent_dir)

True

In [14]:
# Create the archive repository at `archiv_path`.
# An exception is raised, and no repository is created, when
#   - the archive path already exists and is not a directory,
#   - the archive path already exists or is not empty, or
#   - the parent directory of the archive path is missing or is not a directory.
# The error is printed instead of propagated, so the cells below still run
# (e.g. when the repository already exists).
try:
    init_new_repo(repo_path=archiv_path)
except Exception as err:
    print(err)

/media/ywatcher/ExtDisk1/Files/web_archiv_/archiv_1 exists, and is not empty


# TODO
 - [x] test scrape, then store
 - [ ] use an event to communicate
 - [ ] test scraping and storing at the same time
 - [x] test on a git repo
 - [x] test creating a git repo and making a commit<br>
 https://stackoverflow.com/questions/71947784/what-does-webdriverwaitdriver-20-mean<br>
https://stackoverflow.com/questions/26869200/python-queue-module-get-stuck

---
 - [x] scrape author
 - [x] init git repo (partly done)
 - [x] init db (partly done)
 - [ ] merge repo
 - [ ] revert git operation
 - [ ] scroll
 - [x] git message
 - [ ] configurable XPath rules
 - [ ] set logger for master


In [6]:
# Queues used for communication between the processes:
#   q_task   - handed to the question parser as `queue_put_task` and to the
#              answer parsers as `queue_get_task`
#   q_result - handed to every parser as `queue_put_result` and to the master
q_task, q_result = Queue(), Queue()

In [15]:
# Derive the paths of the repository's sub-parts from the repository path,
# then read the repository configuration file.
db_path = get_db_path(archiv_repo_path=archiv_path)
vcs_path = get_vcs_path(archiv_repo_path=archiv_path)
repo_config_path = get_config_path(archiv_repo_path=archiv_path)
# Open the file in a context manager so the handle is closed as soon as the
# YAML has been parsed, rather than staying open until garbage collection.
with open(repo_config_path, 'r') as config_file:
    repo_config = yaml.safe_load(config_file)

In [9]:
# Display the loaded repository configuration.
repo_config

{'owner_email': 'me@email', 'owner_name': 'me'}

In [10]:
# Set up the selenium web drivers and the scrapers.
# The question scraper gets a Chrome driver of its own.
question_driver = webdriver.Chrome(drivers["chrome"])
question_parser = QuestionParser(
    queue_put_result=q_result,
    queue_put_task=q_task,
    driver=question_driver,
    logger=Logger("q"),
)
# Several answer scrapers, each with its own Chrome driver and a logger
# named "a_<index>".
answer_drivers = []
answer_parsers = []
for i in range(nr_answer_parsers):
    answer_driver = webdriver.Chrome(drivers["chrome"])
    answer_drivers.append(answer_driver)
    answer_parsers.append(
        AnswerParser(
            queue_put_result=q_result,
            queue_get_task=q_task,
            driver=answer_driver,
            logger=Logger("a_{}".format(i)),
        )
    )
# Handler for the scraped results.
master = Master(db_path=db_path, git_repo_path=vcs_path)


In [11]:
# Build one (target function, args) pair per process for the scrapers and the
# result handler. The master's entry stays last: the cells below address the
# scraper processes as `processes[:-1]` and the master as `processes[-1]`.
question_target = (question_parser.start_parsing_list, (q_list,))
answer_targets = [
    (answer_parser.start_parsing, [])
    for answer_parser in answer_parsers
]
master_target = (
    master.start_parse,
    (repo_config['owner_name'], repo_config["owner_email"], q_result),
)
targets = [question_target, *answer_targets, master_target]

# One (not yet started) process per target.
processes = [
    Process(target=func, args=func_args)
    for func, func_args in targets
]






In [12]:
# Start the scraper processes, i.e. every entry of `processes` except the last.
scraper_processes = processes[:-1]
for proc in scraper_processes:
    proc.start()

623539562
q: parsing question with id=623539562
a_0: parsing answer with id=3226696148
q: a_1:parsed 623539562 
327003589parsing answer with id=3226907077

q: parsing question with id=327003589
a_2: parsing answer with id=3255789284
q: parsed 327003589
q: fin
a_0: parsing answer with id=701487690
a_1: parsing answer with id=728565226
a_0: parsing answer with id=1890558990
a_1: parsing answer with id=722240235
a_2: stop
a_2: put
a_2: out
a_0: stop
a_0: put
a_0: out
a_1: stop
a_1: put
a_1: out


In [13]:
# Start the result-handling process (the last entry of `processes`).
handler_process = processes[-1]
handler_process.start()
# Wait for every process to finish.
# NOTE(review): a process that has put items on a multiprocessing queue does
# not exit until those items have been flushed, so these joins presumably rely
# on the handler draining q_result and terminating on its own - confirm.
for proc in processes:
    proc.join()

Search term: `process queue self._wait_for_tstate_lock()`

In [None]:
class Controller:
    """Work-in-progress stub; so far it only keeps a list of processes."""

    def __init__(self):
        # Starts out empty; nothing in this class adds to it yet.
        self.processes = []