Merge branch 'develop'

amaiya · Nov 8, 2020 · 48cf8d2 · 48cf8d2
2 parents a1ba6d3 + dbb47e0
commit 48cf8d2
Show file tree

Hide file tree

Showing 7 changed files with 50 additions and 15 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,14 +6,27 @@ Most recent releases are shown at the top. Each release shows:
 - **Changed**: Additional parameters, changes to inputs or outputs, etc
 - **Fixed**: Bug fixes that don't change documented behaviour
 
-## 0.24.1 (2020-11-06)
+## 0.24.2 (2020-11-07)
 
 ### New:
 - N/A
 
 ### Changed
+- `ktrain.text.textutils.extract_copy` now uses `textract` to extract text from many file types (e.g., PDF, DOC, PPT)
+  instead of just PDFs.
+
+### Fixed:
+- N/A
+
+
+
+## 0.24.1 (2020-11-06)
+
+### New:
 - N/A
 
+### Changed
+- N/A
 
 ### Fixed:
 - Change exception in model ID check in `Translator` to warning to better allow offline language translations

diff --git a/FAQ.md b/FAQ.md
@@ -6,6 +6,9 @@
 
 - [What kinds of applications have been built with *ktrain*?](#what-kinds-of-applications-have-been-built-with-ktrain)
 
+- [How do I use ktrain with documents in PDF, DOC, or PPT formats?](#how-do-i-use-ktrain-with-documents-in-pdf-doc-or-ppt-formats)
+
+
 ## Installation/Deployment Issues
 - [How do I install ktrain on a Windows machine?](#how-do-i-install-ktrain-on-a-windows-machine)
 
@@ -745,6 +748,17 @@ predictor = ktrain.load_predictor('/path/to/folder')
 
 [[Back to Top](#frequently-asked-questions-about-ktrain)]
 
+
+### How do I use ktrain with documents in PDF, DOC, or PPT formats?
+
+If you have documents in formats like `.pdf`, `.docx`, or `.pptx` and want to use them in a training set or with various **ktrain** features
+like question-answering or zero-shot learning, they must first be converted to plain text (i.e., `.txt` files).  You can use the
+`ktrain.text.textutils.extract_copy` function to do this automatically. Alternatively, you can use other tools like [Apache Tika](https://tika.apache.org/) to do the conversion.
+
+[[Back to Top](#frequently-asked-questions-about-ktrain)]
+
+
+
 ### What kinds of applications have been built with *ktrain*?
 
 Examples include:

diff --git a/examples/text/ktrain-ONNX-TFLite-examples.ipynb b/examples/text/ktrain-ONNX-TFLite-examples.ipynb
@@ -12,7 +12,7 @@
     "import os\n",
     "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\";\n",
     "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"\"  # Enforce CPU usage\n",
-    "from psutil import cpu_count\n",
+    "from psutil import cpu_count  # Do \"pip install psutil\" if not already installed\n",
     "import tensorflow as tf\n",
     "import numpy as np\n",
     "\n",

diff --git a/examples/text/question_answering_with_bert.ipynb b/examples/text/question_answering_with_bert.ipynb
@@ -99,7 +99,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "For documents sets that are too large to be loaded into a Python list, you can use `SimpleQA.index_from_folder`, which will crawl a folder and index all plain text documents (e.g.,, `.txt` files).\n",
+    "For document sets that are too large to be loaded into a Python list, you can use `SimpleQA.index_from_folder`, which will crawl a folder and index all plain text documents (e.g., `.txt` files).  If your documents are in formats like `.pdf`, `.docx`, or `.pptx`, you can convert them to `.txt` files with tools like [Apache Tika](https://tika.apache.org/) or [textract](https://textract.readthedocs.io/en/stable/).  You can also use the `ktrain.text.textutils.extract_copy` function, which will automatically use `textract` to extract plain text from your documents and copy them to a different directory.\n",
     "\n",
     "#### Speeding Up Indexing\n",
     "By default, `index_from_list` and `index_from_folder` use a single processor (`procs=1`) with each processor using a maximum of 256MB of memory (`limitmb=256`) and merging results into a single segment (`multisegment=False`).  These values can be changed to speedup indexing as arguments to `index_from_list` or `index_from_folder`.  See the [whoosh documentation](https://whoosh.readthedocs.io/en/latest/batch.html) for more information on these parameters and how to use them to speedup indexing.  In this case, we've used `multisegment=True` and `procs=4`.\n",

diff --git a/ktrain/text/textutils.py b/ktrain/text/textutils.py
@@ -9,40 +9,47 @@
 
 def extract_copy(corpus_path, output_path):
     """
-    Crawl <corpus_path>, extract or read plain text from application/pdf
-    and text/plain files and then copy them to output_path.
+    Crawl <corpus_path>, extract plain text from documents
+    and then copy them to output_path.
+    Requires textract package
     Args:
         corpus_path(str):  root folder containing documents
         output_path(str):  root folder of output directory
     Returns:
         list: list of skipped filenames
     """
+    try:
+        import textract
+    except ImportError:
+        raise Exception('extract_copy requires textract: pip install textract')
+
     skipped = set()
     num_skipped = 0
     corpus_path = os.path.normpath(corpus_path)
     output_path = os.path.normpath(output_path)
     for idx, filename in enumerate(extract_filenames(corpus_path)):
         if idx %1000 == 0: print('processed %s doc(s)' % (idx+1))
         mtype = get_mimetype(filename)
-        if mtype == 'application/pdf':
-            text = pdftotext(filename)
-            text = text.strip()
-        elif mtype and mtype.split('/')[0] == 'text':
-            with open(filename, 'r') as f:
-                text = f.read()
-                text = str.encode(text)
-        else:
+        try:
+            if mtype and mtype.split('/')[0] == 'text':
+                with open(filename, 'r') as f:
+                    text = f.read()
+                    text = str.encode(text)
+            else:
+                text = textract.process(filename)
+        except:
             num_skipped += 1
             if not mtype:
                 mtype =  os.path.splitext(filename)[1]
                 if not mtype: mtype == 'unknown'
             skipped.add(mtype)
             continue
+
         if not text: 
             num_skipped += 1
             continue
         fpath, fname = os.path.split(filename)
-        if mtype == 'application/pdf': fname = fname+'.txt'
+        if mtype and mtype.split('/')[0] != 'text': fname = fname+'.txt'
         relfpath = fpath.replace(corpus_path, '')
         relfpath = relfpath[1:] if relfpath and relfpath[0] == os.sep else relfpath
         opath = os.path.join(output_path, relfpath)

diff --git a/ktrain/version.py b/ktrain/version.py
@@ -1,2 +1,2 @@
 __all__ = ['__version__']
-__version__ = '0.24.1'
+__version__ = '0.24.2'
diff --git a/setup.py b/setup.py
@@ -49,6 +49,7 @@
           #'stellargraph>=0.8.2', # forked version used by graph module
           #'allennlp', # required for Elmo embeddings since TF2 TF_HUB does not work
           #'textblob', # used by textutils.extract_noun_phrases
+          #'textract', # used by textutils.extract_copy
       ],
   classifiers=[  # Optional
     # How mature is this project? Common values are