# In [2]:
import json
import os
import pyarrow.parquet as pq
import requests
import bs4
import urllib.parse
import hashlib
import pandas as pd
import regex
from unidecode import unidecode
from random import choice
import logging

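# Output of the cell above when it was last run:
#   ModuleNotFoundError: No module named 'pyarrow'
# (pyarrow must be installed before the imports above can succeed.)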

# In [None]:
def check_dir_exist():
    """Create the ``data`` folder in the current working directory if missing.

    Prints ``Folder data created`` only when the folder was actually created;
    nothing is printed when it already existed.
    """
    if os.path.isdir('data'):
        return

    # exist_ok guards against another process creating the folder between
    # the check above and this call.
    os.makedirs('data', exist_ok=True)
    print('Folder data created')

def append_to_parquet_file(df: pd.DataFrame, file_path: str):
    """Write ``df`` to the parquet file at ``file_path`` using fastparquet.

    If the file already exists the rows are appended to it; otherwise a new
    file is created. An empty frame is skipped entirely, so no file is
    created or modified for it.

    Args:
        df: Rows to store.
        file_path: Destination parquet file.
    """
    if df.empty:
        # An empty frame (e.g. ``pd.DataFrame([])``, which has no columns)
        # carries no rows to store; writing it would presumably either fail
        # or leave behind a file whose schema later appends cannot match.
        print(f"No data to write to {file_path}")
        return

    if os.path.exists(file_path):
        df.to_parquet(file_path, engine='fastparquet', append=True)

        print(f"Data appended to {file_path}")

    else:
        df.to_parquet(file_path, engine='fastparquet')

        print(f"New parquet file created at {file_path}")

# In [None]:
def get_md5(s):
    """Return the hexadecimal MD5 digest of the UTF-8 encoding of ``s``."""
    encoded = s.encode('utf-8')
    return hashlib.md5(encoded).hexdigest()


class TVPLCrawler():
    """Crawler for the document search pages of thuvienphapluat.vn.

    ``crawl_link`` walks the paginated search results and stores one row per
    document link; ``crawl_html`` downloads and parses a single document
    page; ``crawl`` reads stored links in batches and stores the parsed
    documents.
    """

    BATCH_SIZE = 50
    CRAWLER = 'tungks1'
    OUTPUT_PATH = 'output.json'
    PROXY_URL = 'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all'
    # Seconds to wait for a single HTTP response, so that one stalled
    # connection cannot hang the whole crawl.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.name = self.__class__.__name__
        self.log = logging.getLogger(self.name)
        self.log.setLevel(logging.INFO)

    @staticmethod
    def _extract_date(text):
        """Return the part after the last ':' when ``text`` is a 'Ban hành' line.

        Any other text is returned unchanged.
        """
        if 'Ban hành' in text:
            return text.split(':')[-1].strip()
        return text

    @staticmethod
    def _parse_proxy_list(text):
        """Split a newline-separated proxy listing into non-empty entries."""
        return [line.strip() for line in text.splitlines() if line.strip()]

    def _pick_proxy(self):
        """Return a random proxy from ``PROXY_URL``, or ``None`` if unavailable."""
        try:
            response = requests.get(self.PROXY_URL, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            # Proxies are optional: fall back to a direct connection.
            self.log.warning('Could not fetch proxy list; continuing without proxy')
            return None

        if response.status_code != 200:
            return None

        candidates = self._parse_proxy_list(response.text)
        return choice(candidates) if candidates else None

    def crawl_link(self, file_path):
        """Collect link, title and issue date of every listed document.

        Starts at result page 3000 and stops when a page yields no rows or
        once a page number beyond ``start + 3000`` has been processed. The
        rows of each page are appended to the parquet file ``file_path``.

        Args:
            file_path: Parquet file that receives the link records.
        """
        url = 'https://thuvienphapluat.vn/page/tim-van-ban.aspx?keyword=&area=0&match=True&type=0&status=0&signer=0&sort=1&lan=1&scan=0&org=0&fields=&page='
        page = 3000
        limit = page + 3000

        while True:
            response = requests.get(url + str(page), timeout=self.REQUEST_TIMEOUT)
            soup = bs4.BeautifulSoup(response.text, "html.parser")
            rows = soup.select(".content-0") + soup.select(".content-1")

            if not rows:
                break

            items = []
            for row in rows:
                title_tag = row.select_one('.nqTitle')
                if not title_tag:
                    continue
                anchor = title_tag.select_one('a')
                if not anchor:
                    continue
                link_url = anchor.get('href')
                if not link_url:
                    # Without an href there is nothing to hash or crawl later.
                    continue

                date_tag = row.select_one('.right-col > p')
                date = self._extract_date(date_tag.text) if date_tag else ''

                items.append({'_id': get_md5(link_url),
                              'title': title_tag.text.strip(),
                              'link': link_url,
                              'date': date,
                              'crawler': None,
                              'status': 'not_crawled'})

            # One append per page instead of one per link keeps the number of
            # file writes proportional to the number of pages.
            if items:
                append_to_parquet_file(pd.DataFrame(items), file_path)

            self.log.info('Finish page %s', page)

            if page > limit:
                break

            page += 1

    def crawl_html(self, link, proxy=None):
        """Download one document page and extract its content.

        Args:
            link: URL of the document page.
            proxy: Optional proxy address handed to ``requests``.

        Returns:
            A dict with ``link``, ``title``, the raw HTML fragments
            (``thuoc_tinh_html``, ``tom_tat_html``, ``noi_dung_html``), the
            attribute key/value pairs parsed from the attribute table,
            ``tom_tat`` (when a summary exists), ``danh_sach_bang`` (tables
            found in the content) and, when the content node exists,
            ``noi_dung`` and ``van_ban_duoc_dan``.
        """
        # Local import: newer pandas versions deprecate passing literal HTML
        # strings to read_html, a file-like wrapper works on all versions.
        from io import StringIO

        item = {'link': link}

        # NOTE(review): only the 'http' scheme is mapped here, so requests
        # will not use the proxy for https:// links - confirm this is intended.
        proxies = {'http': proxy} if proxy else None
        req = requests.get(link, proxies=proxies, timeout=self.REQUEST_TIMEOUT)
        print('Done')

        soup = bs4.BeautifulSoup(req.text, 'html.parser')

        title_tag = soup.select_one('#divThuocTinh > h1')
        item['title'] = title_tag.text.strip() if title_tag else None

        thuoc_tinh_html = soup.select_one('#divThuocTinh > table')
        item['thuoc_tinh_html'] = str(thuoc_tinh_html) if thuoc_tinh_html else ''

        tom_tat_html = soup.select_one('.Tomtatvanban')
        item['tom_tat_html'] = str(tom_tat_html) if tom_tat_html else None

        noi_dung_html = soup.select_one('.content1')
        # Always set the key so later steps can rely on it being present.
        item['noi_dung_html'] = str(noi_dung_html) if noi_dung_html else ''

        # Split the attribute table text into alternating key / value lines.
        if thuoc_tinh_html:
            lines = thuoc_tinh_html.text.replace('\r', '').replace(':', '').split('\n')
            lines = [line.strip() for line in lines if line.strip()]
            thuoc_tinh = {unidecode(key).lower().replace(' ', '_'): value
                          for key, value in zip(lines[::2], lines[1::2])}
            item.update(thuoc_tinh)

        # Plain text of the summary section.
        if tom_tat_html:
            item['tom_tat'] = tom_tat_html.text.strip()

        # Tables are taken only from the content fragment.
        tables = {}
        if item['noi_dung_html']:
            try:
                frames = pd.read_html(StringIO(item['noi_dung_html']), header=0)
            except ValueError:
                # pandas raises ValueError when the HTML holds no tables.
                frames = []
            tables = {f'bang_{i}': frame.to_dict(orient="records")
                      for i, frame in enumerate(frames)}
        item['danh_sach_bang'] = tables

        if noi_dung_html:
            # Replace each <table>...</table> in turn with a named placeholder
            # so the extracted text references the table by name.
            content_html = item['noi_dung_html']
            for name in tables:
                content_html = regex.sub(r"<table[^>]*>(.|\n)*?</table>",
                                         f'&lt;jsontable name="{name}"&gt; &lt;/jsontable&gt;', content_html, count=1)

            # Plain text of the content section.
            content_soup = bs4.BeautifulSoup(content_html, 'html.parser')
            item['noi_dung'] = content_soup.text.strip()

            # Links cited inside the content, keyed by the anchor's title.
            cited = {}
            for anchor in content_soup.select('a[href]'):
                name = anchor.get('title')
                href = anchor.get('href')
                if name is not None and href is not None:
                    cited[name] = urllib.parse.urljoin(item['link'], href)
            item['van_ban_duoc_dan'] = cited

        return item

    def crawl(self, batch_size=5, max_batches=1):
        """Crawl the documents listed in ``data/url1.parquet``.

        Parsed documents of each batch are appended to
        ``data/crawled_items.parquet``.

        Args:
            batch_size: Number of links read from the URL file per batch.
            max_batches: Positive number of batches to process, or ``None``
                to process the whole file. The default of 1 keeps the
                previous behaviour of stopping after the first batch.
        """
        count = 0
        success = 0
        fail = 0

        parquet_file = pq.ParquetFile('data/url1.parquet')

        for batch_index, batch in enumerate(parquet_file.iter_batches(batch_size=batch_size)):
            print("RecordBatch")
            batch_df = batch.to_pandas()

            proxy = self._pick_proxy()

            items = []

            for _, link in batch_df.iterrows():
                try:
                    item = self.crawl_html(link['link'], proxy)
                    item['_id'] = link['_id']
                    item['title'] = link['title']
                    items.append(item)

                    # NOTE(review): these assignments touch only the row
                    # object yielded by iterrows(); nothing in this method
                    # writes them back to the URL parquet file.
                    link['status'] = 'crawled'
                    link['crawler'] = self.CRAWLER
                    success += 1

                except Exception:
                    # Per-link boundary: record the failure and keep going.
                    self.log.exception('Failed to crawl %s', link.get('link'))
                    link['status'] = 'failed'
                    link['crawler'] = self.CRAWLER
                    fail += 1

                count += 1

            # Store the crawled items; skip the write when the batch
            # produced nothing.
            if items:
                append_to_parquet_file(pd.DataFrame(items), 'data/crawled_items.parquet')

            # Checked at the end so no extra batch is read from the file.
            if max_batches is not None and batch_index + 1 >= max_batches:
                break

        self.log.info('Done %s items, %s success, %s fail', count, success, fail)

if __name__ == '__main__':
    url_file_path = 'data/url1.parquet'

    # The link file lives inside the data folder, so make sure the folder
    # exists before the crawler tries to write into it.
    check_dir_exist()

    crawler = TVPLCrawler()
    crawler.crawl_link(url_file_path)
    # Second stage (currently disabled): fetch the document bodies for the
    # stored links.
#     crawler.crawl()
