Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions realcrawl/cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from loguru import logger

from realcrawl.exception.base import ConfigFileNotFoundException
from realcrawl.libs.path_lib import get_py_pkg_root_dir


def load_config(suppress_error: bool = False) -> dict:
Expand Down Expand Up @@ -60,3 +61,17 @@ def load_config(suppress_error: bool = False) -> dict:
config = json.load(f)

return config


def load_pipe_tpl(pipe_name: str) -> dict:
    """Read an extraction pipe template shipped inside the package.

    The template is looked up at
    ``<package root>/config/extract_tpl/<pipe_name>.jsonc`` and decoded with
    this module's ``json`` loader.

    Args:
        pipe_name (str): Base name of the template file, without the
            ``.jsonc`` extension.

    Returns:
        dict: The parsed pipe template.
    """
    tpl_dir = os.path.join(get_py_pkg_root_dir(), 'config', 'extract_tpl')
    tpl_file = os.path.join(tpl_dir, f'{pipe_name}.jsonc')
    with open(tpl_file, 'r', encoding='utf-8') as fp:
        return json.load(fp)
30 changes: 30 additions & 0 deletions realcrawl/config/extract_tpl/extractor_pipe.jsonc
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"extractor_pipe": {
"enable": true,
"validate_input_format": false,
"pre_extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor"
},
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor",
"class_init_kwargs": {}
}
],
"extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor",
"class_init_kwargs": {}
}
],
"post_extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor"
}
]
}
}
43 changes: 43 additions & 0 deletions realcrawl/extract/html_extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@

from func_timeout import func_timeout
from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory
from llm_web_kit.input.datajson import DataJson

from realcrawl.cfg import load_pipe_tpl


class HtmlExtract:
    """Run the llm_web_kit extractor chain over a single local HTML file.

    The chain is built from the ``extractor_pipe`` template and fed a
    ``DataJson`` record whose ``html`` field holds the file's text. All other
    record fields are fixed placeholder values.
    """

    # Output formats accepted by get_html_content().
    _OUTPUT_FORMATS = ('md', 'json')

    def __init__(self, html_file_path: str, output_format: str = 'md', timeout: float = 10):
        """Build the extractor chain and load the HTML to process.

        Args:
            html_file_path (str): Path of the HTML file to extract from.
            output_format (str): ``'md'`` or ``'json'``; selects what
                get_html_content() returns.
            timeout (float): Seconds allowed for one extraction run.
        """
        self.config = load_pipe_tpl('extractor_pipe')
        self.extractor_chain = ExtractSimpleFactory.create(self.config)
        # Read through a context manager so the handle is closed, and decode
        # explicitly as UTF-8 instead of the platform's locale default.
        with open(html_file_path, 'r', encoding='utf-8') as f:
            html = f.read()
        # Everything except 'html' is fixed placeholder metadata; none of it
        # is derived from the input file.
        self.d = {
            'track_id': '1',
            'html': html,
            'url': 'https://www.google.com',
            'domain': 'google.com',
            'dataset_name': 'cc',
            'data_source_category': 'HTML',
            'file_bytes': 4096,
            'page_layout_type': 'article',
            'meta_info': {'input_datetime': '2020-01-01 00:00:00'},
        }
        self.output_format = output_format
        self.timeout = timeout

    def _extract(self) -> 'DataJson':
        """Run the extractor chain once on the stored record, under the timeout."""
        input_data = DataJson(self.d)
        # func_timeout aborts the call once self.timeout seconds have elapsed
        # (it signals this with its FunctionTimedOut exception).
        return func_timeout(self.timeout, self.extractor_chain.extract, args=(input_data,))

    def get_html_content(self):
        """Return the extracted content in the configured output format.

        Returns:
            The result of ``to_mm_md()`` when ``output_format`` is ``'md'``,
            or of ``to_mm_json()`` when it is ``'json'``.

        Raises:
            ValueError: If ``output_format`` is neither ``'md'`` nor ``'json'``.
        """
        # Reject an unsupported format before paying for the extraction run.
        if self.output_format not in self._OUTPUT_FORMATS:
            raise ValueError(f'Invalid output format: {self.output_format}')
        content_list = self._extract().get_content_list()
        if self.output_format == 'md':
            return content_list.to_mm_md()
        return content_list.to_mm_json()

    def get_main_html(self):
        """Return the main HTML produced by the extractor chain."""
        return self._extract().get_main_html()
17 changes: 17 additions & 0 deletions realcrawl/libs/path_lib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import os


def get_proj_root_dir():
    """Return the project root directory.

    This is the absolute path three levels above this file, i.e. the
    directory that contains the Python package directory.
    """
    path = os.path.abspath(__file__)
    # file -> its directory -> package directory -> project root
    for _ in range(3):
        path = os.path.dirname(path)
    return path


def get_py_pkg_root_dir():
    """Return the root directory of the Python package.

    This is the absolute path two levels above this file, i.e. the directory
    that holds the package's ``__init__.py``.

    Returns:
        str: The package root directory.
    """
    module_path = os.path.abspath(__file__)
    module_dir = os.path.dirname(module_path)
    return os.path.dirname(module_dir)
4 changes: 4 additions & 0 deletions requirements/runtime.txt
Original file line number Diff line number Diff line change
@@ -1 +1,5 @@
commentjson==0.9.0
func-timeout
git+https://github.com/ccprocessor/llm-webkit-mirror.git@dev
loguru
122 changes: 122 additions & 0 deletions tests/realcrawl/assets/1.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>

<!-- Path: 2.html -->

<h1>Heading 1</h1>
<p>Paragraph 1</p>
<div>
<img alt="image-alt" title="image-title" src="test.png" />
<p>Paragraph 2</p>
</div>

<!-- 简单table -->
<table>
<tr>
<td>1</td>
<td>2</td>
</tr>
<tr>
<td>3</td>
<td>4</td>
</tr>
</table>

<div>
<span>
<!-- 复杂table -->
<table>
<tr>
<td rowspan="2">1</td>
<td>2</td>
<td>3</td>
</tr>
<tr>
<td colspan="2">4</td>
</tr>
<tr>
<td>5</td>
<td>6</td>
<td>7</td>
</tr>
</table>
</span>
</div>

<!-- 简单list -->
<ul>
<li>1</li>
<li>2</li>
</ul>

<!-- 列表项里有子列表 -->
<ul>
<li>1
<ul>
<li>1.1</li>
<li>1.2</li>
</ul>
</li>
<li>2
<ul>
<li>2.1</li>
<li>2.2</li>
</ul>
</li>
</ul>

<!-- 数学公式 -->
<math xmlns="http://www.w3.org/1998/Math/MathML" display="block">
<mi>x</mi>
<mo>=</mo>
<mrow>
<mfrac>
<mrow>
<mo>&#x2212;</mo>
<mi>b</mi>
<mo>&#x00B1;</mo>
<msqrt>
<msup>
<mi>b</mi>
<mn>2</mn>
</msup>
<mo>&#x2212;</mo>
<mn>4</mn>
<mi>a</mi>
<mi>c</mi>
</msqrt>
</mrow>
<mrow>
<mn>2</mn>
<mi>a</mi>
</mrow>
</mfrac>
</mrow>
<mtext>.</mtext>
</math>

<!-- 代码 -->
<pre><code class="language-js">const Prism = require('prismjs');

// The code snippet you want to highlight, as a string
const code = `var data = 1;`;

// Returns a highlighted HTML string
const html = Prism.highlight(code, Prism.languages.javascript, 'javascript');</code></pre>

<!-- 有序列表 -->
<ol>
<li>100</li>
<li>200</li>
</ol>

<!-- 带链接的 inline code -->
<p>reference: <code>#include&lt;<a href="xxxx.xxxx.com">xxxx.hpp</a>&gt;</code></p>

</body>
</html>
18 changes: 18 additions & 0 deletions tests/realcrawl/test_html_extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import os
import unittest

from realcrawl.extract.html_extract import HtmlExtract


class TestHtmlExtract(unittest.TestCase):
    """Smoke test for HtmlExtract against the bundled HTML fixture."""

    def setUp(self):
        # Resolve fixtures relative to this test file so the test does not
        # depend on the current working directory.
        self.base_path = os.path.dirname(os.path.abspath(__file__))

    def test_html_extract(self):
        """Extracting assets/1.html with the default format yields non-empty content."""
        fixture_path = os.path.join(self.base_path, 'assets', '1.html')
        html_content = HtmlExtract(fixture_path).get_html_content()
        # Use the TestCase assertion: a bare ``assert`` is stripped under -O
        # and gives no diagnostic message on failure.
        self.assertGreater(len(html_content), 0)


if __name__ == '__main__':
unittest.main()