In [None]:
from pyspark.sql import Row
from xinghe.spark import *
from app.common.json_util import *
from xinghe.s3 import *
from pyspark.sql.types import StructType, StructField, StringType
import re
import hashlib
from lxml.etree import HTML
import traceback
from datetime import datetime
import uuid
from typing import Optional

# Job-level options handed to the project helpers: new_spark_session below, and
# read_any_path / write_any_path in later cells.
# NOTE(review): the meaning of each key is defined by those helpers, not by this
# notebook -- confirm before changing a value.
config = {
    "spark_conf_name": "spark_4",
    "skip_success_check": True,
}


# Evaluates to the float 1610612736.0 (1.5 * 1024**3); presumably a byte limit.
# Not referenced by any other code visible in this notebook.
MAX_OUTPUT_ROW_SIZE = 1024 * 1024 * 1024 * 1.5
# Only referenced by the commented-out WARC listing cell further down.
DUMPS = [
    ...
]

# ERROR_PATH is used as a raw string prefix when building error-log paths
# (f"{ERROR_PATH}{target_theme}/..."), so it needs to end with a path separator.
# NOTE(review): "xx" looks like a redacted placeholder -- set real locations before running.
ERROR_PATH = "xx"
# List of input locations; joined with "," before being passed to read_any_path.
input_path = ['xx']
# Destination passed to write_any_path in the final cell.
output_path = "xx"
# Creating the session is a side effect of running this cell; every later cell
# depends on `spark`.
spark = new_spark_session("cc_dumps.dedup.fir", config)
sc = spark.sparkContext

# html source

In [2]:
# Build the list of CC WARC paths (currently disabled; kept for reference)
#warc_paths = []
#for dump in DUMPS:
#    dump_path = f'{CC_WARC}{dump}/'
#    warc_paths.extend([x for x in list(list_s3_objects(dump_path, recursive=True)) if "/warc/" in x])

In [3]:
def html_to_content(html_str: str, url: str) -> Optional[str]:
    """Return a SHA-256 fingerprint of the text content of an HTML document.

    The markup is parsed with lxml, ``<script>`` and ``<style>`` elements are
    removed, the remaining text nodes are concatenated, every character that is
    neither a word character nor whitespace is dropped, all whitespace is
    removed, and the result is hashed with ``sha256_hash``.

    Args:
        html_str: HTML markup. Anything that is not a non-blank ``str`` yields
            ``None`` instead of raising.
        url: Source URL of the page. Currently unused; kept so existing callers
            keep working.

    Returns:
        The hex digest string, or ``None`` when the input is not a string, is
        blank, cannot be parsed, or parses to a root element with no children.
    """
    # The type check has to come before .strip(): None or any other non-str
    # value must fall through to "no fingerprint" rather than raise.
    if not isinstance(html_str, str):
        return None
    html_str = html_str.strip()
    if not html_str:
        return None

    # Strip processing instructions such as an XML declaration
    # (presumably because lxml rejects str input that carries an encoding
    # declaration -- confirm).
    html_str = re.sub(r'<\?[^>]*\?>', '', html_str)
    try:
        html_etree = HTML(html_str)
    except Exception:
        # Unparseable markup is treated as "no fingerprint".
        return None

    # Explicit checks instead of `if html_etree:`: truth-testing an lxml element
    # means "has children" and is deprecated. Same outcome as before.
    if html_etree is None or len(html_etree) == 0:
        return None

    # NOTE(review): removing an element through its parent also discards the
    # element's tail text (text that follows the closing tag). That text is
    # therefore excluded from the fingerprint. Left as-is on purpose so hashes
    # stay comparable with previously produced output.
    for element in html_etree.xpath('//*[self::script or self::style]'):
        element.getparent().remove(element)

    text = ''.join(html_etree.xpath("//text()"))
    cleaned_text = re.sub(r'[^\w\s]', '', text, flags=re.UNICODE)
    cleaned_text = re.sub(r'\s+', '', cleaned_text)
    return sha256_hash(cleaned_text)

def sha256_hash(string):
    """Return the hexadecimal SHA-256 digest of ``string`` encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(string.encode("utf-8"))
    return hasher.hexdigest()
    
# Error log
def get_s3_doctor(target_theme):
    """Create an ``S3DocWriter`` pointing at a fresh error-log file.

    The target path is ``ERROR_PATH`` + ``target_theme`` / current local date
    (``YYYYMMDD``) / a random UUID4 with a ``.jsonl`` suffix, so every call
    gets its own file.

    Args:
        target_theme: Name of the sub-directory under ``ERROR_PATH``.

    Returns:
        The newly constructed ``S3DocWriter``.
    """
    day_stamp = datetime.now().strftime("%Y%m%d")
    file_id = str(uuid.uuid4())
    return S3DocWriter(path=f"{ERROR_PATH}{target_theme}/{day_stamp}/{file_id}.jsonl")

def _error_record(exc, input_data):
    """Build the error-log entry for ``exc``; must be called while ``exc`` is being handled."""
    return {
        "error_type": type(exc).__name__,
        "error_message": str(exc),
        "traceback": traceback.format_exc(),
        "input_data": input_data,
        "timestamp": datetime.now().isoformat(),
    }


def parse_path_to_html(row_iter):
    """Turn the rows of one partition into de-duplicated content-hash records.

    Each input row is expected to expose a ``value`` attribute holding a JSON
    document. For every document the generator reads ``layout_id``,
    ``sub_path``, ``main_html``, ``url`` and ``track_id``, skips the row when
    the integer after the last ``_`` of ``layout_id`` is negative, fingerprints
    ``main_html`` with ``html_to_content`` and yields one ``Row(value=<json>)``
    carrying ``sub_path`` (last path component only), ``hash_html`` and
    ``track_id`` for each fingerprint not yet seen by this call.

    De-duplication uses an in-memory set local to the call, so it only covers
    the rows of the iterator passed in.

    Rows that fail to parse or process are not raised: an error record is
    written to an error-log writer obtained from ``get_s3_doctor("dedup_fir")``
    and the row is skipped.

    Args:
        row_iter: Iterable of input rows.

    Yields:
        ``Row`` objects with a single ``value`` field containing a JSON string.
    """
    seen = set()

    # Error-log writer for this call.
    s3_doc_writer = get_s3_doctor("dedup_fir")
    had_error = False

    # try/finally so buffered error records are flushed even when the consumer
    # stops iterating early (generator closed) or an exception escapes.
    try:
        for zz in row_iter:
            try:
                try:
                    detail_datas = json_loads(zz.value)
                    layout_id = detail_datas.get("layout_id", "")
                    sub_path = detail_datas.get("sub_path", "").split('/')[-1]
                    layout = layout_id.split("_")[-1]
                    if int(layout) < 0:
                        continue
                    # Missing fields fall back to empty strings.
                    html_content = detail_datas.get("main_html", "")
                    url = detail_datas.get("url", "")
                    track_id = detail_datas.get("track_id", "")

                    hash_html = html_to_content(html_content, url) if html_content else None
                    if hash_html and hash_html not in seen:
                        seen.add(hash_html)
                        line = {
                            "sub_path": sub_path,
                            "hash_html": hash_html,
                            "track_id": track_id,
                        }
                        yield Row(value=json_dumps(line))

                except Exception as e:
                    # Parsing / processing of this row failed: log it with the raw input.
                    had_error = True
                    s3_doc_writer.write(
                        _error_record(e, zz.value if hasattr(zz, 'value') else str(zz))
                    )

            except Exception as e:
                # Only reached when recording the error above failed itself;
                # retry once with a minimal record that omits the input data.
                had_error = True
                s3_doc_writer.write(_error_record(e, "N/A"))
    finally:
        if had_error:
            s3_doc_writer.flush()


In [None]:
# Run the per-partition extraction / de-duplication in parallel with mapPartitions.
# Output schema: a single nullable string column holding one JSON document per row.
schema = StructType([
    StructField("value", StringType(), True),
])
#page_content = sc.parallelize(warc_paths, len(warc_paths))
input_df = read_any_path(spark, ",".join(input_path), config)
# Pass the schema explicitly. Without it toDF() infers the schema from the first
# row, which costs an extra Spark job that runs parse_path_to_html (including its
# error logging) an additional time and raises when the RDD is empty.
dump_html_df = input_df.rdd.mapPartitions(parse_path_to_html).toDF(schema)

# Write the result to S3

In [None]:
# Add the output options to the shared config, then write the result.
# NOTE(review): the effect of these keys is defined by write_any_path -- confirm.
config.update({
    "skip_output_version": True,
    "output_compression": "gz",
})
write_any_path(dump_html_df, output_path, config)