Skip to content

Commit

Permalink
improved pdf_converter (#317)
Browse files (browse the repository at this point in the history)
* improved pdf_converter

* fix test
  • Loading branch information
andrelmfarias committed Dec 2, 2019
1 parent 424e450 commit 5d2e4b3
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 7 deletions.
10 changes: 5 additions & 5 deletions cdqa/utils/converters.py
Expand Up @@ -170,8 +170,8 @@ def pdf_converter(directory_path, min_length=200, include_line_breaks=False):
try:
df.loc[i] = [pdf.replace(".pdf",''), None]
raw = parser.from_file(os.path.join(directory_path, pdf))
s = raw["content"]
paragraphs = re.split("\n(?=\u2028|[A-Z-0-9])", s)
s = raw["content"].strip()
paragraphs = re.split("\n\n(?=\u2028|[A-Z-0-9])", s)
list_par = []
temp_para = "" # variable that stores paragraphs with length<min_length
# (considered as a line)
Expand All @@ -198,9 +198,9 @@ def pdf_converter(directory_path, min_length=200, include_line_breaks=False):
else:
# appending paragraph p as is to list_par
list_par.append(p.replace("\n", ""))
else:
if temp_para:
list_par.append(temp_para.strip())
else:
if temp_para:
list_par.append(temp_para.strip())

df.loc[i, "paragraphs"] = list_par
except:
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -8,7 +8,7 @@ def read(file):

setup(
name="cdqa",
version="1.3.8",
version="1.3.9",
author="Félix MIKAELIAN, André FARIAS, Matyas AMROUCHE, Olivier SANS, Théo NAZON",
description="An End-To-End Closed Domain Question Answering System",
long_description=read("README.md"),
Expand Down
2 changes: 1 addition & 1 deletion tests/test_converters.py
Expand Up @@ -39,7 +39,7 @@ def df_converter_check(self, df, include_line_breaks=False):
if include_line_breaks:
para_len = [len(df.paragraphs[i]) for i in range(df.shape[0])]
para_len.sort()
if not para_len == [144, 220, 265]:
if not para_len == [58, 80, 87]:
errors.append(f"error in number of paragraphs : {para_len}")

# assert no error message has been registered, else print messages
Expand Down

0 comments on commit 5d2e4b3

Please sign in to comment.