In [2]:
from web_parser import AnswerParser, QuestionParser
import multiprocessing
from multiprocessing import Queue, Process
from selenium import webdriver
from pathlib import Path
import os
import yaml
from config import drivers, nr_answer_parsers, get_db_path, get_vcs_path, get_config_path
from data_handler import Master, init_new_repo
from util.logger import Logger

In [3]:
# IDs of the questions to crawl.
# Other candidate IDs, currently disabled: 623539562, 327003589
q_list = [624316466]

# Location of the archive repository.
archiv_path = os.path.join("/media/ywatcher/ExtDisk1/Files/web_archiv_", "archiv_1")

In [4]:
# Sanity check: does the parent directory of the archive path exist?
# The boolean result is displayed as the cell output.
os.path.exists(Path(archiv_path).parent.absolute())

True

In [5]:
# Initialise the archive repository.
# Per the original author's notes, init_new_repo raises and aborts the
# creation when:
#   - the archive path already exists and is not a directory, or
#   - the archive path already exists or is non-empty, or
#   - the parent of the archive path does not exist / is not a directory.
# The error is only printed so that re-running the notebook against an
# already initialised repository does not stop execution.
try:
    init_new_repo(repo_path=archiv_path)
except Exception as init_error:
    print(init_error)

/media/ywatcher/ExtDisk1/Files/web_archiv_/archiv_1 exists, and is not empty


# TODO
 - [x] test crawl then store
 - [ ] use event to communicate
 - [ ] test crawling and storing at same time
 - [x] test on a git repo
 - [x] test to create git repo and make a commit

References:
 - https://stackoverflow.com/questions/71947784/what-does-webdriverwaitdriver-20-mean
 - https://stackoverflow.com/questions/26869200/python-queue-module-get-stuck

---
 - [x] crawl author
 - [x] init git repo (partly done)
 - [x] init db (partly done)
 - [ ] merge repo
 - [ ] revert git operation
 - [x] scroll
 - [x] git message
 - [ ] configurable xpath rules
 - [ ] set logger for master
 - [ ] load pics 
 - [ ] set different scroll strategy


In [6]:
# Create the queues used for inter-process communication:
# one carrying tasks, one carrying results.
q_task, q_result = Queue(), Queue()

In [7]:
# Derive the paths of the repository's sub-components from the archive
# repository path, then load the repository configuration file.
db_path = get_db_path(archiv_repo_path=archiv_path)
vcs_path = get_vcs_path(archiv_repo_path=archiv_path)
repo_config_path = get_config_path(archiv_repo_path=archiv_path)
# Read the YAML config inside a context manager so the file handle is closed
# as soon as parsing is done, instead of leaking for the kernel's lifetime.
with open(repo_config_path, 'r') as config_file:
    repo_config = yaml.safe_load(config_file)

In [8]:
repo_config

{'owner_email': 'me@email', 'owner_name': 'me'}

In [9]:
# Set up the selenium web drivers and the parsers.
# Creating a web driver opens a corresponding browser window.

# Question parser: gets q_task as the queue it puts tasks on and q_result
# as the queue it puts results on.
question_driver = webdriver.Chrome(drivers["chrome"])
question_parser = QuestionParser(
    queue_put_result=q_result,
    queue_put_task=q_task,
    driver=question_driver,
    logger=Logger("q"),
)

# Answer parsers: nr_answer_parsers of them, each with its own web driver.
# They get q_task as the queue to take tasks from and q_result for results.
answer_drivers, answer_parsers = [], []
for i in range(nr_answer_parsers):
    answer_driver = webdriver.Chrome(drivers["chrome"])
    answer_drivers.append(answer_driver)
    answer_parser = AnswerParser(
        queue_put_result=q_result,
        queue_get_task=q_task,
        driver=answer_driver,
        logger=Logger("a_{}".format(i)),
    )
    answer_parsers.append(answer_parser)

# Handler for the crawl results, bound to the database and the git repo paths.
master = Master(db_path=db_path, git_repo_path=vcs_path)


In [10]:
# Build one process per parser plus one for the result handler.
# Each entry of `targets` is a (target function, args) pair.

# 1) The question parser walks the list of question IDs.
targets = [(question_parser.start_parsing_list, (q_list,))]

# 2) Every answer parser runs start_parsing without arguments.
targets.extend(
    (answer_parser.start_parsing, [])
    for answer_parser in answer_parsers
)

# 3) The result handler comes last; it receives the repo owner's name and
#    email from the repo config, and the result queue.
targets.append(
    (
        master.start_parse,
        (
            repo_config['owner_name'],
            repo_config["owner_email"],
            q_result,
        ),
    )
)

# Wrap every (function, args) pair in a Process; nothing is started yet.
processes = [
    Process(target=func, args=func_args)
    for func, func_args in targets
]






In [11]:
# Start the crawler processes (every process except the last one, which is
# the result handler).
# The scroll-based crawling mechanism is not robust yet: for now you can bring
# up the question parser's webdriver window and scroll manually, otherwise
# crawling of the answers under the question ends too early.
# Different simulated-scroll strategies are planned for later.
crawler_processes = processes[:-1]
for crawler in crawler_processes:
    crawler.start()

624316466
q: parsing question with id=624316466
q: Scrolled to bottom.
a_0: parsing answer with id=3231891300
a_1: parsing answer with id=3231687115
a_2: parsing answer with id=3233322005
q: parsed 624316466
q: fin
a_0: parsing answer with id=3232043923
a_2: parsing answer with id=3232542923
a_1: parsing answer with id=3231673839
a_2: parsing answer with id=3232757331
a_0: parsing answer with id=3233434244
a_1: parsing answer with id=3232887153
a_2: parsing answer with id=3232379001
a_1: parsing answer with id=3234150650
a_0: parsing answer with id=3232693304
a_2: parsing answer with id=3233178056
a_0: parsing answer with id=3233643704
a_1: parsing answer with id=3233851495
a_2: parsing answer with id=3232816339
a_0: parsing answer with id=3232755057
a_1: parsing answer with id=3238810287
a_2: parsing answer with id=3233049526
a_0: parsing answer with id=3250888586
a_2: parsing answer with id=3232837658
a_0: parsing answer with id=3231817684
a_1: parsing answer with id=3232815705
a_2: 

In [12]:
# Start the result-handling process (the last entry in `processes`).
master_process = processes[-1]
master_process.start()

# Wait for every process to finish.
# NOTE(review): join() can block indefinitely if a child still has items
# buffered in a multiprocessing Queue that nobody consumes - confirm that
# q_task and q_result are fully drained before the children exit.
for proc in processes:
    proc.join()

search:process queue self._wait_for_tstate_lock() 

In [None]:
class Controller:
    """Stub for a controller object; currently it only holds a process list."""

    def __init__(self):
        """Create a controller with an empty list of processes."""
        self.processes = []