From 6ff63ee2ba5d51c5d165eea1754080773d1a4b99 Mon Sep 17 00:00:00 2001 From: dashi6174 Date: Wed, 15 May 2024 16:34:28 +0800 Subject: [PATCH] Support for code files parse (#789) ### What problem does this PR solve? Adds common source-code extensions (py, js, java, c, cpp, h, php, go, ts, sh, cs, kt) to the document pattern in `filename_type` (api/utils/file_utils.py), so these files are classified as `FileType.DOC`, and to the `txt|md` branch of `chunk` (rag/app/naive.py), so they are parsed by the same branch that handles `.txt` and `.md` files. ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- api/utils/file_utils.py | 2 +- rag/app/naive.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/api/utils/file_utils.py b/api/utils/file_utils.py index 1b34d23fc6..a062b93eed 100644 --- a/api/utils/file_utils.py +++ b/api/utils/file_utils.py @@ -156,7 +156,7 @@ def filename_type(filename): return FileType.PDF.value if re.match( - r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename): + r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename): return FileType.DOC.value if re.match( diff --git a/rag/app/naive.py b/rag/app/naive.py index c557a62670..01bb4de1d3 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -136,7 +136,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, excel_parser = ExcelParser() sections = [(excel_parser.html(binary), "")] - elif re.search(r"\.(txt|md)$", filename, re.IGNORECASE): + elif re.search(r"\.(txt|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") txt = "" if binary: