Skip to content

Commit

Permalink
Removed .pdf from document title in pdf_converter (#315)
Browse files (browse the repository at this point in the history)
* removed .pdf from document title in pdf_converter

* bump version
  • Loading branch information
andrelmfarias committed Nov 29, 2019
1 parent 7919dad commit 424e450
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 2 deletions.
2 changes: 1 addition & 1 deletion cdqa/utils/converters.py
Expand Up @@ -168,7 +168,7 @@ def pdf_converter(directory_path, min_length=200, include_line_breaks=False):
df = pd.DataFrame(columns=["title", "paragraphs"])
for i, pdf in enumerate(list_pdf):
try:
- df.loc[i] = [pdf, None]
+ df.loc[i] = [pdf.replace(".pdf",''), None]
raw = parser.from_file(os.path.join(directory_path, pdf))
s = raw["content"]
paragraphs = re.split("\n(?=\u2028|[A-Z-0-9])", s)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -8,7 +8,7 @@ def read(file):

setup(
name="cdqa",
- version="1.3.7",
+ version="1.3.8",
author="Félix MIKAELIAN, André FARIAS, Matyas AMROUCHE, Olivier SANS, Théo NAZON",
description="An End-To-End Closed Domain Question Answering System",
long_description=read("README.md"),
Expand Down

0 comments on commit 424e450

Please sign in to comment.