From 6ff63ee2ba5d51c5d165eea1754080773d1a4b99 Mon Sep 17 00:00:00 2001
From: dashi6174 <dashi6174@163.com>
Date: Wed, 15 May 2024 16:34:28 +0800
Subject: [PATCH] Support parsing of code files (#789)

### What problem does this PR solve?

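Adds support for parsing source-code files. File names ending in `.py`,
`.js`, `.java`, `.c`, `.cpp`, `.h`, `.php`, `.go`, `.ts`, `.sh`, `.cs` or
`.kt` are now classified as `FileType.DOC` by `filename_type` in
`api/utils/file_utils.py`, and `chunk` in `rag/app/naive.py` now routes
them through the same branch that already handles `.txt` and `.md` files.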

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
---
 api/utils/file_utils.py | 2 +-
 rag/app/naive.py        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/api/utils/file_utils.py b/api/utils/file_utils.py
index 1b34d23fc6..a062b93eed 100644
--- a/api/utils/file_utils.py
+++ b/api/utils/file_utils.py
@@ -156,7 +156,7 @@ def filename_type(filename):
         return FileType.PDF.value
 
     if re.match(
-            r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename):
+            r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename):
         return FileType.DOC.value
 
     if re.match(
diff --git a/rag/app/naive.py b/rag/app/naive.py
index c557a62670..01bb4de1d3 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -136,7 +136,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         excel_parser = ExcelParser()
         sections = [(excel_parser.html(binary), "")]
 
-    elif re.search(r"\.(txt|md)$", filename, re.IGNORECASE):
+    elif re.search(r"\.(txt|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         txt = ""
         if binary: