In [1]:
# 加载spark配置
from pyspark.sql import Row, DataFrame
from pyspark.sql.types import StructType, StructField, StringType
from xinghe.spark import *
from app.common.json_util import *
from xinghe.s3 import *
from pyspark.sql.functions import col, count, when
from collections import defaultdict
# config = {
#     "spark_conf_name": "spark_2",
#     "skip_success_check": True,
#     "input_format": "parquet",
#     "spark.dynamicAllocation.maxExecutors": 800,
# }
schema = StructType([StructField('value', StringType(), True)])
config = {
    "spark_conf_name": "spark_2",
    "skip_success_check": True,
    "spark.sql.shuffle.partitions":10000,
    "spark.executor.memory":"20g",  # 默认30g
    "spark.driver.memory":"10g",  # 默认20g
    #"spark.yarn.queue": "root.quyuan",
    #"input_format": "parquet",
}

spark = new_spark_session("cc-extract-index-test", config)
sc = spark.sparkContext
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
input_path = ["s3://web-parse-hw60p/quyuan/output_v0007/"]

In [3]:
input_df = read_any_path(spark, ",".join(input_path), config)

                                                                                

In [4]:
input_df.count()

                                                                                

15847

In [5]:
inline_tags = ["li", "td", "tr", "br"]
def is_block_element(node) -> bool:
    """如果标签不在内联元素集合中，默认为块级元素。 但是，如果一个内联元素包含块级元素，则该内联元素被视为块级元素。"""
    if node.tag in inline_tags:
        return any(is_block_element(child) for child in node.iterchildren())
    return isinstance(node, html.HtmlElement)

In [6]:
def json_data(row_iter):
    for row in row_iter:
        data = json_loads(row.value)
        yield Row(**{"value": json_dumps(data), "layout_id": data["layout_id"]})

In [7]:
repartition_df = input_df.rdd.mapPartitions(json_data).toDF()

                                                                                

In [8]:
repartition_df.take(1)

                                                                                

[Row(value='{"layout_id":"www.tellingtimeworksheets.net_0","track_id":"9efd1d8b-cd2d-4935-a2a5-d5953dae6c93","typical_main_html_success":true,"main_html_success":true,"url":"https://www.tellingtimeworksheets.net/tag/telling-time-worksheet-for-kindergarten-pdf/","url_host_name":"www.tellingtimeworksheets.net","html":"<!DOCTYPE html>\\n<html lang=\\"en-US\\" prefix=\\"og: https://ogp.me/ns#\\">\\n<head>\\n\\t<meta charset=\\"UTF-8\\" />\\n\\t<meta name=\\"viewport\\" content=\\"width=device-width, initial-scale=1\\" />\\n\\t<style>img:is([sizes=\\"auto\\" i], [sizes^=\\"auto,\\" i]) { contain-intrinsic-size: 3000px 1500px }</style>\\n\\t\\n<!-- Search Engine Optimization by Rank Math - https://rankmath.com/ -->\\n<meta name=\\"robots\\" content=\\"follow, index, max-snippet:-1, max-video-preview:-1, max-image-preview:large\\"/>\\n<link rel=\\"canonical\\" href=\\"https://www.tellingtimeworksheets.net/tag/telling-time-worksheet-for-kindergarten-pdf/\\" />\\n<meta property=\\"og:locale\\" 

In [9]:
def is_in_p_tag(xpath: str) -> bool:
    """检查节点的XPath是否以p标签结束"""
    if not xpath:
        return False
    # 分割XPath并获取最后一部分
    parts = xpath.strip('/').split('/')
    if not parts:
        return False
    last_segment = parts[-1]
    # 检查最后一段是否是p标签（带或不带索引）
    if last_segment.startswith('p'):
        # 处理带索引的情况如 p[1]
        rest = last_segment[1:]
        if not rest:  # 只有'p'
            return True
        if rest.startswith('[') and rest.endswith(']') and rest[1:-1].isdigit():
            return True
    return False

In [10]:
import json
from lxml.html import HtmlComment, fromstring, tostring
from typing import Generator, List, Dict, Any, Tuple, Optional
from collections import defaultdict

def json_dumps(data):
    """自定义JSON序列化函数"""
    return json.dumps(data, ensure_ascii=False)

class Row:
    """模拟行对象"""
    def __init__(self, value):
        self.value = value

def find_common_prefixes(sequences: List[List[str]], min_occurrence: int = 2) -> List[Tuple[List[str], int]]:
    """
    找到所有序列中至少出现min_occurrence次的最长公共前缀
    返回一个列表，包含(公共前缀, 出现次数)元组
    """
    if not sequences:
        return []
    
    # 统计所有前缀及其出现频率
    prefix_counts = defaultdict(int)
    
    # 遍历每个序列的所有可能前缀
    for seq in sequences:
        for i in range(1, len(seq) + 1):
            prefix = tuple(seq[:i])  # 使用元组作为可哈希键
            prefix_counts[prefix] += 1
    
    # 过滤出出现次数足够的候选前缀
    candidates = [prefix for prefix, count in prefix_counts.items() 
                 if count >= min_occurrence and len(prefix) > 0]
    
    # 按长度排序（最长的在前）
    candidates.sort(key=len, reverse=True)
    
    # 选择最长的有效前缀
    result = []
    selected_prefixes = set()
    
    for prefix in candidates:
        prefix_tuple = tuple(prefix)
        
        # 检查是否已被更长的前缀包含
        is_subset = False
        for selected in selected_prefixes:
            if prefix_tuple == selected[:len(prefix_tuple)]:
                is_subset = True
                break
        
        if not is_subset:
            result.append((list(prefix), prefix_counts[prefix_tuple]))
            selected_prefixes.add(prefix_tuple)
    
    return result

def find_common_suffixes(sequences: List[List[str]], min_occurrence: int = 2) -> List[Tuple[List[str], int]]:
    """
    找到所有序列中至少出现min_occurrence次的最长公共后缀
    返回一个列表，包含(公共后缀, 出现次数)元组
    """
    if not sequences:
        return []
    
    # 反转所有序列
    reversed_seqs = [list(reversed(seq)) for seq in sequences]
    
    # 使用前缀查找方法
    reversed_prefixes = find_common_prefixes(reversed_seqs, min_occurrence)
    
    # 反转结果返回
    return [(list(reversed(prefix)), count) for prefix, count in reversed_prefixes]

def extract_text_nodes(row_iter) -> Generator[Any, None, None]:
    # 收集所有文档的节点序列及元数据
    doc_sequences = []
    doc_data_map = {}
    
    for row in row_iter:
        try:
            data = json.loads(row.value)
        except json.JSONDecodeError:
            continue
            
        html_content = data.get("main_html", "")
        track_id = data.get("track_id", "")
        
        if not html_content.strip():
            continue
            
        try:
            tree = fromstring(html_content)
            root_tree = tree.getroottree()
            
            sequence = []       # 文本节点序列
            nodes_list = []     # 节点对象列表
            xpaths = []         # 节点的XPath
            
            # 遍历所有节点，收集叶子文本节点
            for node in tree.iter():
                if isinstance(node, HtmlComment):
                    continue
                    
                if len(node) > 0:  # 非叶子节点跳过
                    continue
                    
                # 拼接文本内容
                text_parts = []
                if node.text and node.text.strip():
                    text_parts.append(node.text.strip())
                if node.tail and node.tail.strip():
                    text_parts.append(node.tail.strip())
                
                text = " ".join(text_parts).strip()
                if not text:
                    continue
                    
                sequence.append(text)
                nodes_list.append(node)
                xpaths.append(root_tree.getpath(node))
            
            doc_sequences.append(sequence)
            doc_data_map[track_id] = {
                "original_data": data,
                "sequence": sequence,
                "nodes_list": nodes_list,
                "xpaths": xpaths,
                "tree": tree,
                "root_tree": root_tree
            }
        except Exception as e:
            print(f"Error processing {track_id}: {str(e)}")
            continue
    
    # 查找公共前缀和后缀（至少出现2次）
    common_heads = find_common_prefixes(doc_sequences, min_occurrence=2)
    common_tails = find_common_suffixes(doc_sequences, min_occurrence=2)
    
    print(f"Found {len(common_heads)} common heads and {len(common_tails)} common tails")
    
    for track_id, doc_data in doc_data_map.items():
        sequence = doc_data["sequence"]
        tree = doc_data["tree"]
        nodes_list = doc_data["nodes_list"]
        xpaths = doc_data["xpaths"]
        
        dedup_info = {
            "head_xpaths": [],
            "tail_xpaths": [],
            "removed_head": 0,
            "removed_tail": 0,
            "matched_head": None,
            "matched_tail": None,
            "head_count": 0,
            "tail_count": 0
        }
        
        # 移除公共头部（跳过<p>标签内的节点）
        for head, count in common_heads:
            head_len = len(head)
            if sequence[:head_len] == head:
                # 检查是否有节点在<p>标签内
                skip_removal = any(is_in_p_tag(xpath) for xpath in xpaths[:head_len])
                if skip_removal:
                    print(f"Skipping head removal for {track_id} due to <p> tag")
                    continue
                    
                dedup_info["matched_head"] = head
                dedup_info["head_count"] = count
                dedup_info["removed_head"] = head_len
                dedup_info["head_xpaths"] = xpaths[:head_len]
                
                for node in reversed(nodes_list[:head_len]):
                    parent = node.getparent()
                    if parent is not None:
                        parent.remove(node)
                break
        
        # 移除公共尾部（跳过<p>标签内的节点）
        for tail, count in common_tails:
            tail_len = len(tail)
            if sequence[-tail_len:] == tail:
                # 检查是否有节点在<p>标签内
                skip_removal = any(is_in_p_tag(xpath) for xpath in xpaths[-tail_len:])
                if skip_removal:
                    print(f"Skipping tail removal for {track_id} due to <p> tag")
                    continue
                    
                dedup_info["matched_tail"] = tail
                dedup_info["tail_count"] = count
                dedup_info["removed_tail"] = tail_len
                dedup_info["tail_xpaths"] = xpaths[-tail_len:]
                
                for node in reversed(nodes_list[-tail_len:]):
                    parent = node.getparent()
                    if parent is not None:
                        parent.remove(node)
                break
        
        new_html = tostring(tree, encoding="unicode", pretty_print=False)
        doc_data["original_data"]["new_html"] = new_html
        doc_data["original_data"]["dedup_info"] = dedup_info
        
        yield Row(value=json_dumps(doc_data["original_data"]))


In [11]:
repartitioned_df = repartition_df.repartition("layout_id").rdd.mapPartitions(extract_text_nodes).toDF()

                                                                                

In [12]:
repartitioned_df.take(1)

[Row(value='{"layout_id": "www.msgclub.net_19", "track_id": "f5e0783a-57c9-48eb-b888-fea3497faf9a", "typical_main_html_success": true, "main_html_success": true, "url": "http://www.msgclub.net/blog/post/what-are-the-business-benefits-of-bulk-voice-call/", "url_host_name": "www.msgclub.net", "html": "<!DOCTYPE html>\\n<html lang=\\"en-US\\">\\n<head>\\n\\t<meta charset=\\"UTF-8\\" />\\n\\t<meta name=\\"viewport\\" content=\\"width=device-width, initial-scale=1\\" />\\n<meta name=\'robots\' content=\'max-image-preview:large\' />\\n<title>What are the business benefits of Bulk Voice Call? &#8211; MSGCLUB Blog</title>\\n<link rel=\'dns-prefetch\' href=\'//s.w.org\' />\\n<link rel=\\"alternate\\" type=\\"application/rss+xml\\" title=\\"MSGCLUB Blog &raquo; Feed\\" href=\\"http://www.msgclub.net/blog/feed/\\" />\\n<link rel=\\"alternate\\" type=\\"application/rss+xml\\" title=\\"MSGCLUB Blog &raquo; Comments Feed\\" href=\\"http://www.msgclub.net/blog/comments/feed/\\" />\\n<link rel=\\"alte

In [14]:
write_any_path(repartitioned_df, "s3://web-parse-hw60p/quyuan/output_v010/", config)

                                                                                

{'rows': 15847,
 'bytes': {'sum': 3601074031,
  'min': 21421,
  'max': 3576382,
  'cnt': 15847,
  'avg': 227240.11},
 'files': 112,
 'sub_paths': {'retained/retained/0ww1.com_6': {'rows': 35,
   'bytes': {'sum': 6368955,
    'min': 154716,
    'max': 221787,
    'cnt': 35,
    'avg': 181970.143},
   'files': 1},
  'retained/retained/apazones.com_32': {'rows': 9,
   'bytes': {'sum': 3817666,
    'min': 412466,
    'max': 450933,
    'cnt': 9,
    'avg': 424185.111},
   'files': 1},
  'retained/retained/blog.agapechristiansingles.com_1': {'rows': 38,
   'bytes': {'sum': 12709267,
    'min': 322372,
    'max': 372897,
    'cnt': 38,
    'avg': 334454.395},
   'files': 1},
  'retained/retained/blog.msummersphotography.com_8': {'rows': 34,
   'bytes': {'sum': 10905991,
    'min': 301772,
    'max': 371971,
    'cnt': 34,
    'avg': 320764.441},
   'files': 1},
  'retained/retained/bowldetroit.com_19': {'rows': 23,
   'bytes': {'sum': 3968727,
    'min': 166573,
    'max': 185854,
    'cnt':