In [1]:
from xinghe.spark import *
from app.common.json_util import *
from xinghe.s3 import *
from xinghe.s3.read import *
from xinghe.ops.spark import spark_resize_file

import os
# Point llm_web_kit at its config file. This is assigned before the
# llm_web_kit imports below — presumably the library reads the variable at
# import time; TODO confirm.
os.environ["LLM_WEB_KIT_CFG_PATH"] = "/xxx.jsonc"

from llm_web_kit.libs.standard_utils import compress_and_decompress_str
from llm_web_kit.html_layout.html_layout_cosin import cluster_html_struct, get_feature, similarity, sum_tags

# Options handed to new_spark_session(); the write cell later adds more keys
# to this same dict before calling write_any_path().
config = {}
config["spark_conf_name"] = "spark_4"  # another value is "spark_2"
config["skip_success_check"] = True
# Same config path as the driver-side LLM_WEB_KIT_CFG_PATH set above,
# passed through a spark.executorEnv.* key.
config["spark.executorEnv.LLM_WEB_KIT_CFG_PATH"] = "/xxx.jsonc"

spark = new_spark_session("cc_dumps.layoutID.index", config)
sc = spark.sparkContext
sc.setLogLevel("ERROR")

# Bare expression so the notebook renders the session summary.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

import time
import uuid
import traceback
from datetime import datetime
from pyspark.sql.functions import struct, to_json, sum as _sum, collect_list, first
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, LongType


# Timeout in seconds. NOTE(review): 3600 * 5 is 5 hours, although the original
# comment on this line said 5 minutes — confirm which one was intended.
TIMEOUT_SECONDS = 3600 * 5
# 1.5 * 2**30; the multiplication by 1.5 makes this a float, not an int.
MAX_OUTPUT_ROW_SIZE = 1024 * 1024 * 1024 * 1.5
SIMILARITY_THRESHOLD = 0.95


# Get layout data: list the input .jsonl files

In [3]:
input_path = "xxx"
# Keep only the .jsonl objects found anywhere under input_path.
input_path_lst = [
    obj_path
    for obj_path in list_s3_objects(input_path, recursive=True)
    if obj_path.endswith(".jsonl")
]
len(input_path_lst)

933

# 边读数据边生成index data

In [4]:
def read_to_index(_iter):
    """Turn an iterator of S3 file paths into per-layout index records.

    Every file is scanned row by row. Consecutive rows that share the same
    ``layout_id`` form one group, and one record is yielded per group
    describing where that group sits inside the file.

    Args:
        _iter: iterable of file paths accepted by ``read_s3_rows`` (this is
            the partition iterator when used with ``mapPartitions``).

    Yields:
        dict with the keys ``layout_id``, ``url_host_name``, ``count`` and
        ``file``; ``file`` holds ``filepath``, ``offset`` (offset of the
        group's first row), ``length`` (sum of the group's row lengths),
        ``record_count`` and ``timestamp`` (``int(time.time())`` at emit).

    Notes:
        * ``row.loc`` is expected to end with ``bytes=<offset>,<length>``.
        * Rows that cannot be parsed are written to the error log and
          skipped; they contribute to neither ``count`` nor ``length``.
        * Groups are closed at the end of each file, so a partition holding
          several files yields the trailing group of every file, and an
          empty partition yields nothing.
        * ``url_host_name`` is taken from the first row of the group itself.
        * ``None`` is the "no open group" sentinel, so rows whose
          ``layout_id`` is ``None`` never produce a record.
    """
    # TODO: decide where the error log should be stored
    error_log_path = "s3://xxx.jsonl"
    print(f"error_log_path: {error_log_path}")
    s3_doc_writer = S3DocWriter(path=error_log_path)
    has_error = False

    def _index_line(fpath, layout_id, url_host_name, offset, length, record_count):
        # Assemble the output record for one finished group.
        return {
            "layout_id": layout_id,
            "url_host_name": url_host_name,
            "count": record_count,
            "file": {
                "filepath": fpath,
                "offset": offset,
                "length": length,
                "record_count": record_count,
                "timestamp": int(time.time()),
            },
        }

    for fpath in _iter:
        # Group state is per file: a group never spans two files.
        current_layout_id = None
        current_host_name = None
        start_offset = None
        layout_length = 0
        record_count = 0
        print(f"fpath:{fpath}")
        for row in read_s3_rows(fpath):
            try:
                detail_data = json_loads(row.value)
                layout_id = detail_data["layout_id"]
                offset, length = map(int, row.loc.split("bytes=")[-1].split(","))
                starts_new_group = layout_id != current_layout_id
                # Only the row that opens a group has to supply the host name.
                host_name = detail_data["url_host_name"] if starts_new_group else current_host_name
            except Exception as e:
                has_error = True
                s3_doc_writer.write({
                    "error_type": type(e).__name__,
                    "error_message": str(e),
                    "traceback": traceback.format_exc(),
                    "input_data": row.value if hasattr(row, 'value') else str(row),
                    "timestamp": datetime.now().isoformat()
                })
                continue

            if not starts_new_group:
                layout_length += length
                record_count += 1
                continue

            # A different layout_id closes the open group (if any) ...
            if current_layout_id is not None:
                print(f"{current_layout_id} 该批数据批次结束, 总数据量为： {record_count}")
                yield _index_line(fpath, current_layout_id, current_host_name,
                                  start_offset, layout_length, record_count)
            # ... and opens a new one starting at this row.
            current_layout_id = layout_id
            current_host_name = host_name
            start_offset = offset
            layout_length = length
            record_count = 1
            print(f"新批次数据： {current_layout_id}, start_offset: {start_offset}, layout_length: {layout_length}")

        # Close the trailing group of this file before moving to the next one.
        if current_layout_id is not None:
            print(f"last: {current_layout_id} 该批数据批次结束, 总数据量为： {record_count}")
            yield _index_line(fpath, current_layout_id, current_host_name,
                              start_offset, layout_length, record_count)

    # Only flush the error log when something was actually written to it.
    if has_error:
        s3_doc_writer.flush()



In [5]:
# Spark schema mirroring the dicts yielded by read_to_index.
file_fields = [
    StructField("filepath", StringType(), True),
    StructField("offset", LongType(), True),
    StructField("length", LongType(), True),
    StructField("record_count", LongType(), True),
    StructField("timestamp", IntegerType(), True),
]
schema = StructType([
    StructField("layout_id", StringType(), True),
    StructField("url_host_name", StringType(), True),
    StructField("count", LongType(), True),
    StructField("file", StructType(file_fields), True),
])

# Ask for as many partitions as there are input files.
num_input_files = len(input_path_lst)
page_content = sc.parallelize(input_path_lst, num_input_files)
dump_html_df = page_content.mapPartitions(read_to_index).toDF(schema)

                                                                                

133024

# 基于layout_id合并index data

In [7]:
# Merge the per-file records of each layout_id into a single row: total the
# counts, gather the file entries, and keep one url_host_name.
layout_groups = dump_html_df.groupBy("layout_id")
result_df = layout_groups.agg(
    _sum("count").alias("count"),
    collect_list("file").alias("files"),
    first("url_host_name").alias("url_host_name"),
)

# Write the merged index to S3

In [8]:
# Serialise each merged row into one JSON string held in a "value" column.
struct_col = struct(
    result_df["layout_id"],
    result_df["count"],
    result_df["files"],
    result_df["url_host_name"],
)
output_df = result_df.select(to_json(struct_col).alias("value"))

In [9]:
# Size argument for spark_resize_file; the variable name suggests gigabytes
# per output file — TODO confirm the unit against spark_resize_file.
output_file_size_gb = 2
# spark_resize_file returns a callable that is then applied to the DataFrame.
resize_func = spark_resize_file(output_file_size_gb)
new_output_df = resize_func(output_df)


                                                                                

In [10]:
output_path = "s3://xxx/"
# Reuse the session config dict, adding the two write-time switches in place.
config.update({
    "skip_output_version": True,
    "skip_output_check": True,
})

# Left as the last expression so the notebook displays the returned value.
write_any_path(new_output_df, output_path, config)

                                                                                

{'rows': 132303,
 'bytes': {'sum': 43342667,
  'min': 293,
  'max': 626,
  'cnt': 132303,
  'avg': 327.602},
 'files': 1,
 'sub_paths': {}}

In [15]:
# Shut down the Spark session created in the first cell.
spark.stop()