
Commit

Merge branch 'develop'

amaiya committed Mar 4, 2020
2 parents 0038b38 + 0eb962c commit 84a8527
Showing 8 changed files with 56 additions and 37 deletions.
17 changes: 17 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,23 @@ Most recent releases are shown at the top. Each release shows:
- **Fixed**: Bug fixes that don't change documented behaviour


## 0.10.1 (2020-03-04)

### New:
- N/A

### Changed:
- `shallownlp.Classifier.texts_from_folder` changed to `shallownlp.Classifier.load_texts_from_folder`
- `shallownlp.Classifier.texts_from_csv` changed to `shallownlp.Classifier.load_texts_from_csv`
- In `text.preprocessor`, added a warning that the `class_names` argument is ignored when it is supplied but `y_train` and `y_test` contain string labels
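
A minimal before/after sketch of the renamed loaders, assuming the import path used in the project's shallownlp examples; the data paths are placeholders:

```python
from ktrain.text import shallownlp as snlp

# prior to 0.10.1:
# (x_train, y_train, label_names) = snlp.Classifier.texts_from_folder('data/train')
# (texts, labels, _) = snlp.Classifier.texts_from_csv('data/reviews.csv')

# 0.10.1 and later (renamed class methods):
(x_train, y_train, label_names) = snlp.Classifier.load_texts_from_folder('data/train')
(texts, labels, _) = snlp.Classifier.load_texts_from_csv('data/reviews.csv')
```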

### Fixed:
- N/A




## 0.10.0 (2020-03-03)

### New:
2 changes: 2 additions & 0 deletions README.md
@@ -1,4 +1,6 @@
### [Overview](#overview) | [Tutorials](#tutorials) | [Examples](#examples) | [Installation](#installation)
[![PyPI Status](https://badge.fury.io/py/ktrain.svg)](https://badge.fury.io/py/ktrain) [![ktrain python compatibility](https://img.shields.io/pypi/pyversions/ktrain.svg)](https://pypi.python.org/pypi/ktrain) [![license](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/amaiya/ktrain/blob/master/LICENSE) [![Downloads](https://pepy.tech/badge/ktrain)](https://pepy.tech/project/ktrain) [![Downloads](https://pepy.tech/badge/ktrain/month)](https://pepy.tech/project/ktrain/month)


# ktrain

52 changes: 26 additions & 26 deletions examples/text/shallownlp-examples.ipynb
@@ -11,7 +11,7 @@
"%matplotlib inline\n",
"import os\n",
"os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # CPU\n",
"os.environ['DISABLE_V2_BEHAVIOR'] = '1'"
"os.environ['DISABLE_V2_BEHAVIOR'] = '1' # disable V2 Behavior - required for NER in TF2 right now"
]
},
{
@@ -340,8 +340,8 @@
],
"source": [
"datadir = r'/home/amaiya/data/aclImdb'\n",
"(x_train, y_train, label_names) = snlp.Classifier.texts_from_folder(datadir+'/train')\n",
"(x_test, y_test, _) = snlp.Classifier.texts_from_folder(datadir+'/test', shuffle=False)\n",
"(x_train, y_train, label_names) = snlp.Classifier.load_texts_from_folder(datadir+'/train')\n",
"(x_test, y_test, _) = snlp.Classifier.load_texts_from_folder(datadir+'/test', shuffle=False)\n",
"print('label names: %s' % (label_names))\n",
"clf = snlp.Classifier().fit(x_train, y_train, ctype='nbsvm')\n",
"print('validation accuracy: %s%%' % (round(clf.evaluate(x_test, y_test)*100, 2)))\n",
@@ -370,9 +370,9 @@
"output_type": "stream",
"text": [
"Building prefix dict from the default dictionary ...\n",
"I0302 23:02:22.177368 140669362145088 __init__.py:111] Building prefix dict from the default dictionary ...\n",
"I0304 17:09:57.303427 140636486911808 __init__.py:111] Building prefix dict from the default dictionary ...\n",
"Loading model from cache /tmp/jieba.cache\n",
"I0302 23:02:22.179908 140669362145088 __init__.py:131] Loading model from cache /tmp/jieba.cache\n"
"I0304 17:09:57.305477 140636486911808 __init__.py:131] Loading model from cache /tmp/jieba.cache\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading model cost 0.672 seconds.\n",
"I0302 23:02:22.851611 140669362145088 __init__.py:163] Loading model cost 0.672 seconds.\n",
"Loading model cost 0.640 seconds.\n",
"I0304 17:09:57.945362 140636486911808 __init__.py:163] Loading model cost 0.640 seconds.\n",
"Prefix dict has been built succesfully.\n",
"I0302 23:02:22.853565 140669362145088 __init__.py:164] Prefix dict has been built succesfully.\n"
"I0304 17:09:57.946959 140636486911808 __init__.py:164] Prefix dict has been built succesfully.\n"
]
},
{
],
"source": [
"datadir = '/home/amaiya/data/ChnSentiCorp_htl_ba_6000'\n",
"(texts, labels, label_names) = snlp.Classifier.texts_from_folder(datadir+'/train')\n",
"(texts, labels, label_names) = snlp.Classifier.load_texts_from_folder(datadir+'/train')\n",
"print('label names: %s' % (label_names))\n",
"from sklearn.model_selection import train_test_split\n",
"x_train, x_test, y_train, y_test = train_test_split(texts, labels, test_size=0.1, random_state=42)\n",
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 14,
"metadata": {},
"outputs": [
{
"source": [
"# setup data\n",
"datadir = r'/home/amaiya/data/aclImdb'\n",
"(x_train, y_train, label_names) = snlp.Classifier.texts_from_folder(datadir+'/train')\n",
"(x_test, y_test, _) = snlp.Classifier.texts_from_folder(datadir+'/test', shuffle=False)\n",
"(x_train, y_train, label_names) = snlp.Classifier.load_texts_from_folder(datadir+'/train')\n",
"(x_test, y_test, _) = snlp.Classifier.load_texts_from_folder(datadir+'/test', shuffle=False)\n",
"\n",
"# initialize a model to optimize\n",
"clf = snlp.Classifier()\n",
@@ -483,7 +483,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
@@ -529,7 +529,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 16,
"metadata": {},
"outputs": [
{
" ('doc2', 'Arabic', 1)]"
]
},
"execution_count": 15,
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"[('doc1', '合肥微尺度国家物理科学实验室', 7)]"
]
},
"execution_count": 16,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -602,7 +602,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 18,
"metadata": {},
"outputs": [
{
@@ -631,7 +631,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 19,
"metadata": {},
"outputs": [
{
"[('doc1', 'сегодня надейся на завтра', 1)]"
]
},
"execution_count": 18,
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 20,
"metadata": {},
"outputs": [
{
"['合肥微尺度国家物理科学实验室']"
]
},
"execution_count": 19,
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 21,
"metadata": {},
"outputs": [
{
"['живи', 'сегодня', 'надейся', 'на', 'завтра']"
]
},
"execution_count": 20,
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 22,
"metadata": {},
"outputs": [
{
"['عش', 'اليوم', 'الأمل', 'ليوم', 'غد']"
]
},
"execution_count": 21,
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
2 changes: 1 addition & 1 deletion ktrain/tests/test_shallownlp.py
@@ -42,7 +42,7 @@ def test_classifier(self):
#@skip('temporarily disabled')
def test_classifier_chinese(self):
fpath = './text_data/chinese_hotel_reviews.csv'
(x_train, y_train, label_names) = snlp.Classifier.texts_from_csv(fpath, text_column='content', label_column='pos', sep='|')
(x_train, y_train, label_names) = snlp.Classifier.load_texts_from_csv(fpath, text_column='content', label_column='pos', sep='|')
print('label names: %s' % (label_names))
clf = snlp.Classifier()
clf.fit(x_train, y_train, ctype='nbsvm')
4 changes: 2 additions & 2 deletions ktrain/text/preprocessor.py
@@ -419,7 +419,7 @@ def _transform_y(self, y_data):
y_data = np.array(y_data) if type(y_data) == list else y_data

# check for errors and warnings
if not isinstance(y_data, str) and len(y_data.shape) ==1 and not self.get_classes():
if not isinstance(y_data[0], str) and len(y_data.shape) ==1 and not self.get_classes():
warnings.warn('Task is being treated as TEXT REGRESSION because ' +\
'class_names argument was not supplied. ' + \
'If this is incorrect, supply class_names argument.')
if self.label_encoder is None:
self.label_encoder = LabelEncoder()
self.label_encoder.fit(y_data)
#if self.get_classes(): warnings.warn('class_names argument is being overridden by string labels from data')
if self.get_classes(): warnings.warn('class_names argument was ignored, as they were extracted from string labels in dataset')
self.set_classes(self.label_encoder.classes_)
y_data = self.label_encoder.transform(y_data)
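
The hunk above fixes the type check to inspect the first label (`y_data[0]`) rather than the array object itself. A self-contained sketch of that label-routing logic, under the assumption of 1-D targets; the helper name below is illustrative, not ktrain's API:

```python
import warnings

import numpy as np
from sklearn.preprocessing import LabelEncoder

def transform_labels(y_data, class_names=None):
    """Illustrative: route 1-D targets to regression, integer classes, or string labels."""
    y = np.array(y_data) if isinstance(y_data, list) else y_data
    if not isinstance(y[0], str) and y.ndim == 1 and not class_names:
        # numeric 1-D targets with no class_names -> task is treated as regression
        warnings.warn('Task is being treated as TEXT REGRESSION because '
                      'class_names argument was not supplied.')
        return y, None
    if isinstance(y[0], str):
        # string labels -> fit a LabelEncoder; any supplied class_names are ignored
        if class_names:
            warnings.warn('class_names argument was ignored, as they were '
                          'extracted from string labels in dataset')
        encoder = LabelEncoder()
        y = encoder.fit_transform(y)
        return y, list(encoder.classes_)
    return y, class_names

# String labels plus an explicit class_names triggers the "ignored" warning:
# y, classes = transform_labels(['pos', 'neg', 'pos'], class_names=['neg', 'pos'])
```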

12 changes: 6 additions & 6 deletions ktrain/text/shallownlp/classifier.py
@@ -92,10 +92,10 @@ def create_model(self, ctype, texts, hp_dict={}, ngram_range=(1,3), binary=True)


@classmethod
def texts_from_folder(cls, folder_path,
subfolders=None,
shuffle=True,
encoding=None):
def load_texts_from_folder(cls, folder_path,
subfolders=None,
shuffle=True,
encoding=None):
"""
load text files from folder
@@ -137,8 +137,8 @@


@classmethod
def texts_from_csv(cls, csv_filepath, text_column='text', label_column='label',
sep=',', encoding=None):
def load_texts_from_csv(cls, csv_filepath, text_column='text', label_column='label',
sep=',', encoding=None):
"""
load text files from csv file
CSV should have at least two columns.
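
A short usage sketch of the two renamed class methods, following the signatures shown above; the data paths, folder layout (one subfolder of text files per class), and CSV column names are assumptions for illustration:

```python
from ktrain.text import shallownlp as snlp

# folder loader: expects one subfolder of text files per class (e.g., train/pos, train/neg)
(x_train, y_train, label_names) = snlp.Classifier.load_texts_from_folder(
    '/path/to/aclImdb/train', subfolders=None, shuffle=True, encoding=None)
(x_test, y_test, _) = snlp.Classifier.load_texts_from_folder(
    '/path/to/aclImdb/test', shuffle=False)

# CSV loader: expects a delimited file with a text column and a label column
(texts, labels, csv_label_names) = snlp.Classifier.load_texts_from_csv(
    '/path/to/reviews.csv', text_column='text', label_column='label', sep=',')

# as in the accompanying notebook, the loaded data can train an NBSVM classifier
clf = snlp.Classifier().fit(x_train, y_train, ctype='nbsvm')
print('validation accuracy: %s%%' % round(clf.evaluate(x_test, y_test) * 100, 2))
```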
2 changes: 1 addition & 1 deletion ktrain/text/textutils.py
@@ -203,7 +203,7 @@ def detect_lang(texts, sample_size=32):
except:
continue
if len(lst) == 0:
raise Exception('could not detect language in random sample of %s docs.' % (sample_size))
raise Exception('could not detect language in random sample of %s docs. Are you sure you provided a list of strings?' % (sample_size))
return max(set(lst), key=lst.count)
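
The clarified exception message points at the expected call pattern: `detect_lang` takes a list of document strings, not a single string. A small sketch, assuming ktrain and its language-detection dependency are installed; the inputs are illustrative:

```python
from ktrain.text.textutils import detect_lang

docs = ['This is an English movie review.',
        'Another short English document.',
        'A third English sentence for good measure.']
print(detect_lang(docs))  # most common language detected in a random sample, e.g. 'en'

# Passing one long string instead of a list means individual characters are sampled
# as "documents", which is exactly the misuse the new message asks about:
# detect_lang('a single long document')  # may raise: could not detect language ...
```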


2 changes: 1 addition & 1 deletion ktrain/version.py
@@ -1,2 +1,2 @@
__all__ = ['__version__']
__version__ = '0.10.0'
__version__ = '0.10.1'
