In [19]:
import requests, lxml
from lxml import etree
import time
import random
import io
import logging
import os
from abc import ABC, abstractmethod
import argparse


def get_logger(name, log_filename=None):
    """Return the logger called ``name``, configured to log at INFO level.

    A stdout handler is always attached; a UTF-8 file handler appending to
    ``log_filename`` is attached as well when that argument is given.

    ``logging.getLogger`` hands back the same object for the same name, so
    calling this function again (for example by re-running a notebook cell)
    must not attach the handlers a second time -- otherwise every record is
    emitted once per call. Handlers are therefore only added when an
    equivalent one is not already present.

    Args:
        name: Logger name passed to ``logging.getLogger``.
        log_filename: Optional path of a log file to append to.

    Returns:
        The configured ``logging.Logger``.
    """
    # Local import: the import block of this cell does not bring `sys` into scope.
    import sys

    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    # Note: `datefmt` only takes effect if `fmt` contains %(asctime)s, which it currently does not.
    formatter = logging.Formatter(fmt="%(name)s - %(levelname)s\t%(message)s", datefmt="%Y/%m/%d %H:%M:%S")

    # FileHandler subclasses StreamHandler, hence the exact type comparison.
    has_stream_handler = any(type(h) is logging.StreamHandler for h in logger.handlers)
    if not has_stream_handler:
        shandler = logging.StreamHandler(sys.stdout)
        shandler.setFormatter(formatter)
        logger.addHandler(shandler)

    if log_filename is not None:
        # FileHandler stores the absolute path in `baseFilename`.
        target = os.path.abspath(os.fspath(log_filename))
        has_file_handler = any(
            isinstance(h, logging.FileHandler) and h.baseFilename == target
            for h in logger.handlers
        )
        if not has_file_handler:
            fhandler = logging.FileHandler(log_filename, mode='a', encoding='utf8')
            fhandler.setFormatter(formatter)
            logger.addHandler(fhandler)

    return logger
def get_args(argv=None):
    """Build the run configuration namespace.

    Args:
        argv: Optional list of command-line style tokens to parse. Defaults to
            an empty list, so the defaults below are returned and the
            notebook kernel's own ``sys.argv`` is never consulted.

    Returns:
        ``argparse.Namespace`` with boolean fields ``debug`` (default False)
        and ``verbose`` (default True).
    """
    def _to_bool(text):
        # `type=bool` would turn every non-empty string -- including "False" --
        # into True, so the textual value is interpreted explicitly instead.
        lowered = str(text).strip().lower()
        if lowered in ('1', 'true', 't', 'yes', 'y', 'on'):
            return True
        if lowered in ('0', 'false', 'f', 'no', 'n', 'off'):
            return False
        raise argparse.ArgumentTypeError(f"expected a boolean value, got {text!r}")

    parser = argparse.ArgumentParser()
    parser.add_argument("--debug", type=_to_bool, default=False)
    parser.add_argument("--verbose", type=_to_bool, default=True)
    return parser.parse_args([] if argv is None else list(argv))

In [14]:
import requests, lxml
from lxml import etree
import time
import random
import io
import logging
import os, sys
from abc import ABC, abstractmethod
from tqdm import tqdm

# Put the parent directory on the module search path so the `utils` import
# below can be resolved (presumably utils.py lives one level up -- confirm).
sys.path.append("..")

from utils import get_logger, get_args

# Module-level logger used by the crawler classes below; besides stdout it
# also appends to 'crawler.log' (relative to the current working directory).
logger = get_logger(__name__, 'crawler.log')

class Sleep:
    """Callable wrapper that throttles calls to ``func`` after a warm-up.

    The first five calls are forwarded immediately; every later call is
    preceded by a random pause of 2 to 6 seconds.
    """

    def __init__(self, func):
        self.func = func
        self.visit_cnt = 0  # number of un-throttled calls made so far (caps at 5)

    def __call__(self, *args, **kwargs):
        """Optionally pause, then call ``func`` and return its result."""
        if self.visit_cnt > 4:
            time.sleep(2 + random.random() * 4)
        else:
            self.visit_cnt += 1
        # Propagate the wrapped function's result instead of discarding it.
        return self.func(*args, **kwargs)

class Crawler(ABC):
    """Abstract base for crawlers that download attachment files from a site.

    Subclasses set ``name``, ``main_url``, ``main_page``, ``src_store_url``
    and ``page_url`` and implement ``_get_src_urls`` (list the entries of one
    download center) and ``_get_src_from_page`` (list the files linked from
    one article page). ``crawl_src`` drives the whole process.
    """

    # Seconds passed as `timeout` to every HTTP request so a stalled
    # connection raises instead of hanging the crawl forever.
    request_timeout = 30

    def __init__(self, args):
        """Store ``args`` (needs ``verbose`` and ``debug`` attributes) and set defaults."""
        self.name = None
        self.main_url = None
        self.main_page = None
        self.src_store_url = None
        # File extensions that are considered downloadable resources.
        self.src_ext = ['doc', 'docx', 'pdf', 'txt', 'xlsx', 'xls', 'ppt', 'zip', 'tar', '7z', 'rar',
                        'png', 'jpg', 'gif', 'jpeg']

        self.args = args
        self.page_url = None
        # self.store_2_page_list_xpath = None
        # self.download_page_2_file_xpath = None

        self.visit_cnt = 0      # to control QPS, not used currently
        self.desc = "Crawler for {name}\n" \
                    "Home page: {main_page}"

    def __repr__(self):
        return self.desc.format(name=self.name, main_page=self.main_page)

    def sleep(self):
        """Pause for 0.5 to 1.5 seconds between requests."""
        time.sleep(0.5 + random.random() * 1)

    @staticmethod
    def get_etree_html(url):
        """Fetch ``url`` (after a short random pause) and return it parsed by ``etree.HTML``."""
        time.sleep(0.5 + random.random() * 1)
        response = requests.get(url, timeout=Crawler.request_timeout)
        return etree.HTML(response.content)

    def download_src(self, url, ext, name, save_path='./'):
        """Download ``url`` into ``save_path`` under the file name ``name``.

        Args:
            url: Absolute URL of the file.
            ext: Extension of the file; must be one of ``self.src_ext``.
            name: Target file name; ``'.' + ext`` is appended when ``name``
                does not already end in a known extension.
            save_path: Destination directory, created if missing.

        Nothing is downloaded when the extension is not allowed, when the
        target file already exists, or when the server answers with an error
        status.
        """
        # makedirs also creates missing parents (e.g. '../cache' for '../cache/sist').
        os.makedirs(save_path, exist_ok=True)
        if ext not in self.src_ext:
            logger.info(f"Illegal ext to download: {url}")
            return
        # self.visit_cnt = Crawler.sleep(self.visit_cnt)

        if name.split('.')[-1] not in self.src_ext:
            name = name + '.' + ext
        # `name` comes from scraped page content: neutralise path separators so
        # it can neither escape `save_path` nor point into a missing sub-directory.
        name = name.replace('/', '_').replace(os.sep, '_')
        fname = os.path.join(save_path, name)

        # Check for an existing file *before* pausing and hitting the network.
        if os.path.exists(fname):
            logger.info(f'>>> {name} exists! Pass <<<')
            return

        self.sleep()
        response = requests.get(url, timeout=self.request_timeout)
        if not response.ok:
            # Do not store an HTTP error page under the attachment's name.
            logger.warning(f"Failed to download {url}: HTTP {response.status_code}")
            return

        with open(fname, 'wb') as f:
            f.write(response.content)
        logger.info(f'>>> {fname} has been downloaded <<<')

    def process_urls(self, urls, host_url):
        """Turn site-relative URLs (those starting with '/') into absolute ones.

        Args:
            urls: A single URL string or a list/tuple of URL strings.
            host_url: Prefix for relative URLs, or None if none is available.

        Returns:
            ``urls`` unchanged when ``host_url`` is None; otherwise a string
            (for a string input) or a list with ``host_url`` prepended to
            every relative URL.

        Raises:
            AssertionError: If ``host_url`` is None but a relative URL is present.
        """
        if isinstance(urls, (list, tuple)):
            assert host_url is not None or all([not url.startswith('/') for url in urls]), \
                f"{urls} contains some urls without prefix while host_url is None!"
        else:
            assert host_url is not None or not urls.startswith('/'), \
                f"{urls} has no prefix while host_url is None!"
        if host_url is None:
            return urls

        # Drop trailing slashes of the host so that 'https://host/' + '/x'
        # becomes 'https://host/x' rather than 'https://host//x'.
        base = host_url.rstrip('/')

        def absolutize(url):
            return base + url if url.startswith('/') else url

        if isinstance(urls, str):
            return absolutize(urls)
        return [absolutize(url) for url in urls]

    def get_src_from_store(self, src_urls, src_names, host_url=None, save_path='./'):
        """Download every entry of a download center.

        An entry whose URL ends in a known file extension is downloaded
        directly; any other entry is treated as an article page whose linked
        files are collected via ``get_src_from_page`` and downloaded. In
        debug mode (``self.args.debug``) only the first two entries are
        processed.
        """
        for cnt, (url, name) in enumerate(zip(src_urls, src_names)):
            ext = url.split('.')[-1]
            if self.args.verbose:
                logger.info(f"Downloading from src page {cnt} - Current EXT: {ext}")
            if ext not in self.src_ext:     # url directs to a page but not src file
                cur_src_urls, cur_src_names = self.get_src_from_page(url, host_url=host_url)
                if self.args.verbose:
                    logger.info(f"Files of current page: {list(zip(cur_src_names, cur_src_urls))}")
                for url_, name_ in zip(cur_src_urls, cur_src_names):
                    ext_ = url_.split('.')[-1]
                    self.download_src(url_, ext_, name_, save_path)
            else:
                self.download_src(url, ext, name, save_path)

            if self.args.debug and cnt >= 1:
                break

    def get_src_urls(self, url, host_url=None):
        """Collect the entry URLs and names of one or several download centers.

        Args:
            url: A center URL, or a list/tuple of center URLs. For a list,
                ``self.page_url`` supplies the matching per-center pagination
                templates (``None`` means no template for any center).
            host_url: Prefix used to absolutize relative entry URLs.

        Returns:
            ``(src_urls, src_names)``.
        """
        # get src pages from the download center(s)
        if isinstance(url, (list, tuple)):
            page_urls = self.page_url if self.page_url is not None else [None] * len(url)
            src_urls, src_names = [], []
            for url_, page_url_ in zip(url, page_urls):
                logger.info(f"getting the src_urls list of center {url_}...")
                src_urls_, src_names_ = self._get_src_urls(url_, page_url_)
                src_urls += src_urls_
                src_names += src_names_
        else:
            src_urls, src_names = self._get_src_urls(url, self.page_url)
        src_urls = self.process_urls(src_urls, host_url)
        return src_urls, src_names

    @abstractmethod
    def _get_src_urls(self, url, page_url=None):
        """Return ``(urls, names)`` of the entries listed by the center at ``url``."""
        raise NotImplementedError()

    @abstractmethod
    def _get_src_from_page(self, element):
        """Return ``(urls, names)`` of the files linked from a parsed article page."""
        raise NotImplementedError()

    def get_src_from_page(self, url, host_url=None):
        """Fetch the article page at ``url`` and return the ``(urls, names)`` of its files."""
        url = self.process_urls(url, host_url)
        element = Crawler.get_etree_html(url)
        src_urls, src_names = self._get_src_from_page(element)
        src_urls = self.process_urls(src_urls, host_url)
        return src_urls, src_names
    # @abstractmethod
    # def get_nav_item(self, url):
    #     raise NotImplementedError()

    def crawl_src(self, host_url=None, save_path='./'):
        """Download all resources reachable from ``self.src_store_url`` into ``save_path``."""
        if self.src_store_url is None:
            logger.info(f"{self.name}没有资源下载网页！")
            return
        logger.info(f"Downloading src files for {self.name}...")

        src_urls, src_names = self.get_src_urls(self.src_store_url, host_url)
        logger.info(f"All src pages: \n{list(zip(src_names, src_urls))}")
        self.get_src_from_store(src_urls, src_names, host_url, save_path)

        logger.info("Done")
        

In [26]:
class SISTCrawler(Crawler):
    """Crawler for the download centers of sist.ustc.edu.cn."""

    def __init__(self, args):
        super().__init__(args)
        self.name = '信息科学技术学院'
        self.main_url = 'https://sist.ustc.edu.cn/'
        self.main_page = 'https://sist.ustc.edu.cn/main.htm'
        self.src_store_url = [
            # 'https://sist.ustc.edu.cn/5104/list.htm',       # graduate students
            'https://sist.ustc.edu.cn/5111/list.htm',       # undergraduates
            # 'https://sist.ustc.edu.cn/5128/list.htm',       # Party building
            'https://sist.ustc.edu.cn/5095/list.htm',       # student affairs
            # 'https://sist.ustc.edu.cn/5085/list.htm',       # scientific research
            'https://sist.ustc.edu.cn/5079/list.htm',       # information services
        ]
        # One pagination template per entry of `src_store_url`, in the same
        # order; None means the center is read from its list page only.
        self.page_url = ['https://sist.ustc.edu.cn/5111/list{id}.htm', 'https://sist.ustc.edu.cn/5095/list{id}.htm',
                          None]
        # self.max_page_num_xpath = None

        # self.store_2_page_list_xpath = "//div[@class='view_bg']//a"
        # self.name_holder = ""

    @staticmethod
    def _sudyfile_title(anchor):
        """Return the 'title' stored in the anchor's ``sudyfile-attr``, or None.

        NOTE(security): this attribute is text scraped from a web page and
        used to be passed to ``eval``, which would execute arbitrary code
        embedded in the page. It is parsed with ``ast.literal_eval`` instead,
        which only accepts Python literals.
        """
        import ast  # local import: not part of the notebook's import cells

        raw = anchor.attrib.get('sudyfile-attr')
        if raw is None:
            return None
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            return None
        return parsed.get('title') if isinstance(parsed, dict) else None

    def _get_src_urls(self, url, page_url=None):
        """List the entries of the download center at ``url``.

        Args:
            url: URL of the center's list page.
            page_url: Template containing ``{id}`` for the numbered list
                pages, or None to read ``url`` only.

        Returns:
            ``(src_urls, src_names)``.
        """
        if page_url is None:
            return self.get_page_src_urls(url)

        element = self.get_etree_html(url)
        page_counters = element.xpath("//em[@class='all_pages']")
        try:
            max_page = int(page_counters[0].text)
        except (IndexError, TypeError, ValueError):
            # No usable page counter on the list page: read that page alone.
            return self.get_page_src_urls(url)

        src_urls, src_names = [], []
        for page_id in range(1, max_page + 1):
            page_src_urls, page_names = self.get_page_src_urls(page_url.format(id=page_id))
            if self.args.verbose:
                print(f'page: {page_id}')
                print(f'page_names: {page_names}')
            src_urls += page_src_urls
            src_names += page_names

        return src_urls, src_names

    def get_page_src_urls(self, url):
        """Return ``(urls, names)`` of the entries shown on one list page.

        The name is the anchor's ``title`` attribute when present, otherwise
        the title from its ``sudyfile-attr``. Anchors without an ``href`` or
        without any resolvable name are skipped.
        """
        element = Crawler.get_etree_html(url)
        anchors = element.xpath("//h5[contains(@class,'card-title')]/a | //div[@class='wp_entry']//a")
        src_urls, src_names = [], []
        for anchor in anchors:
            href = anchor.attrib.get('href')
            if 'title' in anchor.attrib:
                title = anchor.attrib['title']
            else:
                title = self._sudyfile_title(anchor)
            if href is None or title is None:
                continue
            src_urls.append(href)
            src_names.append(title)

        return src_urls, src_names

    def _get_src_from_page(self, element):
        """Return ``(urls, names)`` of the attachments linked in an article page.

        Only anchors carrying both an ``href`` and a ``sudyfile-attr`` title
        are reported; other links in the article body are skipped.
        """
        src_urls, src_names = [], []
        for anchor in element.xpath("//div[@class='wp_articlecontent']//a"):
            href = anchor.attrib.get('href')
            title = self._sudyfile_title(anchor)
            if href is None or title is None:
                continue
            src_urls.append(href)
            src_names.append(title)
        return src_urls, src_names

In [28]:
# Build the run configuration and show it for the record.
args = get_args()
print(args)

# Crawl the SIST download centers; relative links are resolved against the
# site root and the files are stored under ../cache/sist.
sist = SISTCrawler(args)
sist.crawl_src(
    host_url=sist.main_url,
    save_path='../cache/sist',
)

Namespace(debug=False, verbose=True)
__main__ - INFO	Downloading src files for 信息科学技术学院...
__main__ - INFO	Downloading src files for 信息科学技术学院...
__main__ - INFO	Downloading src files for 信息科学技术学院...
__main__ - INFO	Downloading src files for 信息科学技术学院...
__main__ - INFO	getting the src_urls list of center https://sist.ustc.edu.cn/5111/list.htm...
__main__ - INFO	getting the src_urls list of center https://sist.ustc.edu.cn/5111/list.htm...
__main__ - INFO	getting the src_urls list of center https://sist.ustc.edu.cn/5111/list.htm...
__main__ - INFO	getting the src_urls list of center https://sist.ustc.edu.cn/5111/list.htm...
page: 1
page_names: ['中国科学技术大学本科生参加暑期交流项目申请表', '缓修成绩取消申请单', '中国科学技术大学录播教室使用申请表', '中国科学技术大学学生“开学考试”成绩登记表', '中国科学技术大学本科生开学考试申请单', '大研（毕设）协议书', '大学生赴院所做大研（毕设）差旅费统计表', '本科生课程请假申请表', '警示期及退学复议学生选课调整申请表']
page: 2
page_names: ['学生个性化学习申请表（因学习困难申请缓修）', '学生个性化学习申请表（因拟转专业申请缓修）', '试卷送印单', '跨院系调整所修专业申请表', '本科生听课记录表（实验）', '本科生听课记录表（理论）', '本科生学业指导谈话记录（学业警示）', '本科生学业指导谈话记录（通用）', '在读证

KeyboardInterrupt: 