
Commit

Merge branch 'develop'

amaiya committed Mar 4, 2020
2 parents 0038b38 + 0eb962c commit 84a8527
Showing 8 changed files with 56 additions and 37 deletions.
17 changes: 17 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,23 @@ Most recent releases are shown at the top. Each release shows:
- **Fixed**: Bug fixes that don't change documented behaviour


## 0.10.1 (2020-03-04)

### New:
- N/A

### Changed:
- `shallownlp.Classifier.texts_from_folder` changed to `shallownlp.Classifier.load_texts_from_folder`
- `shallownlp.Classifier.texts_from_csv` changed to `shallownlp.Classifier.load_texts_from_csv`
- In `text.preprocessor`, added a warning that the `class_names` argument is ignored when it is supplied but `y_train` and `y_test` contain string labels
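
A minimal before/after sketch of the renamed loaders, assuming the import path used in the project's shallownlp examples; the data paths are placeholders:

```python
from ktrain.text import shallownlp as snlp

# prior to 0.10.1:
# (x_train, y_train, label_names) = snlp.Classifier.texts_from_folder('data/train')
# (texts, labels, _) = snlp.Classifier.texts_from_csv('data/reviews.csv')

# 0.10.1 and later (renamed class methods):
(x_train, y_train, label_names) = snlp.Classifier.load_texts_from_folder('data/train')
(texts, labels, _) = snlp.Classifier.load_texts_from_csv('data/reviews.csv')
```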

### Fixed:
- N/A




## 0.10.0 (2020-03-03)

### New:
2 changes: 2 additions & 0 deletions README.md
@@ -1,4 +1,6 @@
### [Overview](#overview) | [Tutorials](#tutorials) | [Examples](#examples) | [Installation](#installation)
[![PyPI Status](https://badge.fury.io/py/ktrain.svg)](https://badge.fury.io/py/ktrain) [![ktrain python compatibility](https://img.shields.io/pypi/pyversions/ktrain.svg)](https://pypi.python.org/pypi/ktrain) [![license](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/amaiya/ktrain/blob/master/LICENSE) [![Downloads](https://pepy.tech/badge/ktrain)](https://pepy.tech/project/ktrain) [![Downloads](https://pepy.tech/badge/ktrain/month)](https://pepy.tech/project/ktrain/month)


# ktrain

52 changes: 26 additions & 26 deletions examples/text/shallownlp-examples.ipynb
@@ -11,7 +11,7 @@
"%matplotlib inline\n",
"import os\n",
"os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # CPU\n",
"os.environ['DISABLE_V2_BEHAVIOR'] = '1'"
"os.environ['DISABLE_V2_BEHAVIOR'] = '1' # disable V2 Behavior - required for NER in TF2 right now"
]
},
{
@@ -340,8 +340,8 @@
],
"source": [
"datadir = r'/home/amaiya/data/aclImdb'\n",
"(x_train, y_train, label_names) = snlp.Classifier.texts_from_folder(datadir+'/train')\n",
"(x_test, y_test, _) = snlp.Classifier.texts_from_folder(datadir+'/test', shuffle=False)\n",
"(x_train, y_train, label_names) = snlp.Classifier.load_texts_from_folder(datadir+'/train')\n",
"(x_test, y_test, _) = snlp.Classifier.load_texts_from_folder(datadir+'/test', shuffle=False)\n",
"print('label names: %s' % (label_names))\n",
"clf = snlp.Classifier().fit(x_train, y_train, ctype='nbsvm')\n",
"print('validation accuracy: %s%%' % (round(clf.evaluate(x_test, y_test)*100, 2)))\n",
@@ -370,9 +370,9 @@
"output_type": "stream",
"text": [
"Building prefix dict from the default dictionary ...\n",
"I0302 23:02:22.177368 140669362145088 __init__.py:111] Building prefix dict from the default dictionary ...\n",
"I0304 17:09:57.303427 140636486911808 __init__.py:111] Building prefix dict from the default dictionary ...\n",
"Loading model from cache /tmp/jieba.cache\n",
"I0302 23:02:22.179908 140669362145088 __init__.py:131] Loading model from cache /tmp/jieba.cache\n"
"I0304 17:09:57.305477 140636486911808 __init__.py:131] Loading model from cache /tmp/jieba.cache\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading model cost 0.672 seconds.\n",
"I0302 23:02:22.851611 140669362145088 __init__.py:163] Loading model cost 0.672 seconds.\n",
"Loading model cost 0.640 seconds.\n",
"I0304 17:09:57.945362 140636486911808 __init__.py:163] Loading model cost 0.640 seconds.\n",
"Prefix dict has been built succesfully.\n",
"I0302 23:02:22.853565 140669362145088 __init__.py:164] Prefix dict has been built succesfully.\n"
"I0304 17:09:57.946959 140636486911808 __init__.py:164] Prefix dict has been built succesfully.\n"
]
},
{
],
"source": [
"datadir = '/home/amaiya/data/ChnSentiCorp_htl_ba_6000'\n",
"(texts, labels, label_names) = snlp.Classifier.texts_from_folder(datadir+'/train')\n",
"(texts, labels, label_names) = snlp.Classifier.load_texts_from_folder(datadir+'/train')\n",
"print('label names: %s' % (label_names))\n",
"from sklearn.model_selection import train_test_split\n",
"x_train, x_test, y_train, y_test = train_test_split(texts, labels, test_size=0.1, random_state=42)\n",
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 14,
"metadata": {},
"outputs": [
{
"source": [
"# setup data\n",
"datadir = r'/home/amaiya/data/aclImdb'\n",
"(x_train, y_train, label_names) = snlp.Classifier.texts_from_folder(datadir+'/train')\n",
"(x_test, y_test, _) = snlp.Classifier.texts_from_folder(datadir+'/test', shuffle=False)\n",
"(x_train, y_train, label_names) = snlp.Classifier.load_texts_from_folder(datadir+'/train')\n",
"(x_test, y_test, _) = snlp.Classifier.load_texts_from_folder(datadir+'/test', shuffle=False)\n",
"\n",
"# initialize a model to optimize\n",
"clf = snlp.Classifier()\n",
@@ -483,7 +483,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
@@ -529,7 +529,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 16,
"metadata": {},
"outputs": [
{
" ('doc2', 'Arabic', 1)]"
]
},
"execution_count": 15,
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"[('doc1', '合肥微尺度国家物理科学实验室', 7)]"
]
},
"execution_count": 16,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -602,7 +602,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 18,
"metadata": {},
"outputs": [
{
@@ -631,7 +631,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 19,
"metadata": {},
"outputs": [
{
"[('doc1', 'сегодня надейся на завтра', 1)]"
]
},
"execution_count": 18,
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 20,
"metadata": {},
"outputs": [
{
"['合肥微尺度国家物理科学实验室']"
]
},
"execution_count": 19,
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 21,
"metadata": {},
"outputs": [
{
"['живи', 'сегодня', 'надейся', 'на', 'завтра']"
]
},
"execution_count": 20,
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 22,
"metadata": {},
"outputs": [
{
"['عش', 'اليوم', 'الأمل', 'ليوم', 'غد']"
]
},
"execution_count": 21,
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
2 changes: 1 addition & 1 deletion ktrain/tests/test_shallownlp.py
@@ -42,7 +42,7 @@ def test_classifier(self):
#@skip('temporarily disabled')
def test_classifier_chinese(self):
fpath = './text_data/chinese_hotel_reviews.csv'
(x_train, y_train, label_names) = snlp.Classifier.texts_from_csv(fpath, text_column='content', label_column='pos', sep='|')
(x_train, y_train, label_names) = snlp.Classifier.load_texts_from_csv(fpath, text_column='content', label_column='pos', sep='|')
print('label names: %s' % (label_names))
clf = snlp.Classifier()
clf.fit(x_train, y_train, ctype='nbsvm')
4 changes: 2 additions & 2 deletions ktrain/text/preprocessor.py
@@ -419,7 +419,7 @@ def _transform_y(self, y_data):
y_data = np.array(y_data) if type(y_data) == list else y_data

# check for errors and warnings
if not isinstance(y_data, str) and len(y_data.shape) ==1 and not self.get_classes():
if not isinstance(y_data[0], str) and len(y_data.shape) ==1 and not self.get_classes():
warnings.warn('Task is being treated as TEXT REGRESSION because ' +\
'class_names argument was not supplied. ' + \
'If this is incorrect, supply class_names argument.')
if self.label_encoder is None:
self.label_encoder = LabelEncoder()
self.label_encoder.fit(y_data)
#if self.get_classes(): warnings.warn('class_names argument is being overridden by string labels from data')
if self.get_classes(): warnings.warn('class_names argument was ignored, as they were extracted from string labels in dataset')
self.set_classes(self.label_encoder.classes_)
y_data = self.label_encoder.transform(y_data)
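
The hunk above fixes the type check to inspect the first label (`y_data[0]`) rather than the array object itself. A self-contained sketch of that label-routing logic, under the assumption of 1-D targets; the helper name below is illustrative, not ktrain's API:

```python
import warnings

import numpy as np
from sklearn.preprocessing import LabelEncoder

def transform_labels(y_data, class_names=None):
    """Illustrative: route 1-D targets to regression, integer classes, or string labels."""
    y = np.array(y_data) if isinstance(y_data, list) else y_data
    if not isinstance(y[0], str) and y.ndim == 1 and not class_names:
        # numeric 1-D targets with no class_names -> task is treated as regression
        warnings.warn('Task is being treated as TEXT REGRESSION because '
                      'class_names argument was not supplied.')
        return y, None
    if isinstance(y[0], str):
        # string labels -> fit a LabelEncoder; any supplied class_names are ignored
        if class_names:
            warnings.warn('class_names argument was ignored, as they were '
                          'extracted from string labels in dataset')
        encoder = LabelEncoder()
        y = encoder.fit_transform(y)
        return y, list(encoder.classes_)
    return y, class_names

# String labels plus an explicit class_names triggers the "ignored" warning:
# y, classes = transform_labels(['pos', 'neg', 'pos'], class_names=['neg', 'pos'])
```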

12 changes: 6 additions & 6 deletions ktrain/text/shallownlp/classifier.py
@@ -92,10 +92,10 @@ def create_model(self, ctype, texts, hp_dict={}, ngram_range=(1,3), binary=True)


@classmethod
def texts_from_folder(cls, folder_path,
subfolders=None,
shuffle=True,
encoding=None):
def load_texts_from_folder(cls, folder_path,
subfolders=None,
shuffle=True,
encoding=None):
"""
load text files from folder
@@ -137,8 +137,8 @@


@classmethod
def texts_from_csv(cls, csv_filepath, text_column='text', label_column='label',
sep=',', encoding=None):
def load_texts_from_csv(cls, csv_filepath, text_column='text', label_column='label',
sep=',', encoding=None):
"""
load text files from csv file
CSV should have at least two columns.
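
A short usage sketch of the two renamed class methods, following the signatures shown above; the data paths, folder layout (one subfolder of text files per class), and CSV column names are assumptions for illustration:

```python
from ktrain.text import shallownlp as snlp

# folder loader: expects one subfolder of text files per class (e.g., train/pos, train/neg)
(x_train, y_train, label_names) = snlp.Classifier.load_texts_from_folder(
    '/path/to/aclImdb/train', subfolders=None, shuffle=True, encoding=None)
(x_test, y_test, _) = snlp.Classifier.load_texts_from_folder(
    '/path/to/aclImdb/test', shuffle=False)

# CSV loader: expects a delimited file with a text column and a label column
(texts, labels, csv_label_names) = snlp.Classifier.load_texts_from_csv(
    '/path/to/reviews.csv', text_column='text', label_column='label', sep=',')

# as in the accompanying notebook, the loaded data can train an NBSVM classifier
clf = snlp.Classifier().fit(x_train, y_train, ctype='nbsvm')
print('validation accuracy: %s%%' % round(clf.evaluate(x_test, y_test) * 100, 2))
```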
2 changes: 1 addition & 1 deletion ktrain/text/textutils.py
@@ -203,7 +203,7 @@ def detect_lang(texts, sample_size=32):
except:
continue
if len(lst) == 0:
raise Exception('could not detect language in random sample of %s docs.' % (sample_size))
raise Exception('could not detect language in random sample of %s docs. Are you sure you provided a list of strings?' % (sample_size))
return max(set(lst), key=lst.count)
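
The clarified exception message points at the expected call pattern: `detect_lang` takes a list of document strings, not a single string. A small sketch, assuming ktrain and its language-detection dependency are installed; the inputs are illustrative:

```python
from ktrain.text.textutils import detect_lang

docs = ['This is an English movie review.',
        'Another short English document.',
        'A third English sentence for good measure.']
print(detect_lang(docs))  # most common language detected in a random sample, e.g. 'en'

# Passing one long string instead of a list means individual characters are sampled
# as "documents", which is exactly the misuse the new message asks about:
# detect_lang('a single long document')  # may raise: could not detect language ...
```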


2 changes: 1 addition & 1 deletion ktrain/version.py
@@ -1,2 +1,2 @@
__all__ = ['__version__']
__version__ = '0.10.0'
__version__ = '0.10.1'
