update code examples

amaiya · May 20, 2022 · 5fc3f11 · 5fc3f11
1 parent 7d571cd
commit 5fc3f11
Show file tree

Hide file tree

Showing 4 changed files with 108 additions and 99 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,7 +6,7 @@ Most recent releases are shown at the top. Each release shows:
 - **Changed**: Additional parameters, changes to inputs or outputs, etc
 - **Fixed**: Bug fixes that don't change documented behaviour
 
-## 0.31.2 (TBD)
+## 0.31.2 (2022-05-20)
 
 ### new:
 - N/A

diff --git a/README.md b/README.md
@@ -34,16 +34,16 @@ kwe = KeywordExtractor()
 kwe.extract_keywords(text, candidate_generator='noun_phrases')
 
 # OUTPUT
-#[('machine learning', 0.0784313725490196),
-# ('text classification', 0.049019607843137254),
-# ('image classification', 0.049019607843137254),
-# ('exact answers', 0.0392156862745098),
-# ('augmented machine learning', 0.0392156862745098),
-# ('graph data', 0.029411764705882353),
-# ('node classification', 0.029411764705882353),
-# ('entity recognition', 0.029411764705882353),
-# ('code example', 0.029411764705882353),
-# ('index documents', 0.029411764705882353)]
+[('machine learning', 0.5341716824761019),
+ ('augmented machine learning', 0.5208544167057394),
+ ('text classification', 0.5134074336523509),
+ ('image classification', 0.5071170746851726),
+ ('node classification', 0.4973034499292447),
+ ('tabular data', 0.49645958463369566),
+ ('entity recognition', 0.45195059648705926),
+ ('exact answers', 0.4462502183477142),
+ ('import ktrain', 0.32891369271775894),
+ ('load model', 0.32052348289886556)]
 ```
 - **2022-01-28**
   - **ktrain v0.29.x** is released and includes miscellaneous enhancements contributed by [Sandy Khosasi](https://github.com/ilos-vigil) such as [support for MobileNetV3 and EfficientNet](https://colab.research.google.com/drive/1EJHpMVG6fBCg33UPla_Ly_6LQdswU2Ur?usp=sharing), [plotting improvements](https://colab.research.google.com/drive/1_WaRQ0J4g0VTn6HWS3kszdFZbBBWoa7R?usp=sharing), and [raw confidence scores in QA](https://colab.research.google.com/drive/1ParprLN9hFX6cxJ1w7bv91PYx4o0J1zm?usp=sharing).

diff --git a/examples/text/keyword_extraction_example.ipynb b/examples/text/keyword_extraction_example.ipynb
@@ -43,7 +43,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -60,7 +60,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "# of words in downloaded paper: 4551\n"
+      "# of words in downloaded paper: 4316\n"
      ]
     }
    ],
@@ -95,23 +95,23 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 396 ms, sys: 19.8 ms, total: 416 ms\n",
-      "Wall time: 415 ms\n"
+      "CPU times: user 341 ms, sys: 16.9 ms, total: 358 ms\n",
+      "Wall time: 357 ms\n"
      ]
     },
     {
      "data": {
       "text/plain": [
-       "[('machine learning', 0.10548523206751055),\n",
-       " ('step', 0.06751054852320675),\n",
-       " ('learning rate', 0.046413502109704644),\n",
-       " ('arxiv preprint', 0.046413502109704644),\n",
-       " ('text classification', 0.03375527426160337),\n",
-       " ('augmented machine', 0.02531645569620253),\n",
-       " ('open-domain question-answering', 0.02531645569620253),\n",
-       " ('augmented machine learning', 0.02531645569620253),\n",
-       " ('bert', 0.02109704641350211),\n",
-       " ('low-code library', 0.02109704641350211)]"
+       "[('machine learning', 0.5503444817814314),\n",
+       " ('augmented machine', 0.5123881190828152),\n",
+       " ('augmented machine learning', 0.5123881190828152),\n",
+       " ('low-code library', 0.5107922072149182),\n",
+       " ('step', 0.5092460272048237),\n",
+       " ('text classification', 0.5044526957819503),\n",
+       " ('open-domain question-answering', 0.4996712653266335),\n",
+       " ('learning rate', 0.4894264238049616),\n",
+       " ('bert', 0.424790141017796),\n",
+       " ('arxiv preprint', 0.16264098705836771)]"
       ]
      },
      "execution_count": 6,
@@ -143,23 +143,23 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 1.04 s, sys: 0 ns, total: 1.04 s\n",
-      "Wall time: 1.04 s\n"
+      "CPU times: user 855 ms, sys: 103 µs, total: 856 ms\n",
+      "Wall time: 855 ms\n"
      ]
     },
     {
      "data": {
       "text/plain": [
-       "[('machine learning', 0.0784313725490196),\n",
-       " ('text classification', 0.049019607843137254),\n",
-       " ('image classification', 0.049019607843137254),\n",
-       " ('exact answers', 0.0392156862745098),\n",
-       " ('augmented machine learning', 0.0392156862745098),\n",
-       " ('graph data', 0.029411764705882353),\n",
-       " ('node classification', 0.029411764705882353),\n",
-       " ('entity recognition', 0.029411764705882353),\n",
-       " ('code example', 0.029411764705882353),\n",
-       " ('index documents', 0.029411764705882353)]"
+       "[('machine learning', 0.5341716824761019),\n",
+       " ('augmented machine learning', 0.5208544167057394),\n",
+       " ('text classification', 0.5134074336523509),\n",
+       " ('image classification', 0.5071170746851726),\n",
+       " ('node classification', 0.4973034499292447),\n",
+       " ('tabular data', 0.49645958463369566),\n",
+       " ('entity recognition', 0.45195059648705926),\n",
+       " ('exact answers', 0.4462502183477142),\n",
+       " ('import ktrain', 0.32891369271775894),\n",
+       " ('load model', 0.32052348289886556)]"
       ]
      },
      "execution_count": 8,
@@ -188,16 +188,16 @@
     {
      "data": {
       "text/plain": [
-       "[('augmented machine learning', 0.07017543859649122),\n",
-       " ('a. s. maiya', 0.05263157894736842),\n",
-       " ('optimal learning rate', 0.03508771929824561),\n",
-       " ('natural language questions', 0.03508771929824561),\n",
-       " ('support text data', 0.017543859649122806),\n",
-       " ('learning rate schedules', 0.017543859649122806),\n",
-       " ('machine learning model', 0.017543859649122806),\n",
-       " ('unsupervised topic modeling', 0.017543859649122806),\n",
-       " ('large text corpus', 0.017543859649122806),\n",
-       " ('social media accounts', 0.017543859649122806)]"
+       "[('augmented machine learning', 0.541435342459079),\n",
+       " ('machine learning model', 0.4982195592681719),\n",
+       " ('support text data', 0.49549171563837363),\n",
+       " ('learning rate schedules', 0.47765279578595193),\n",
+       " ('a. s. maiya', 0.4612715229636928),\n",
+       " ('unsupervised topic modeling', 0.44648865417358047),\n",
+       " ('large text corpus', 0.4374416332143215),\n",
+       " ('optimal learning rate', 0.42667304584617965),\n",
+       " ('non-supervised ml tasks', 0.2330746472277638),\n",
+       " ('natural language questions', 0.21662908635171388)]"
       ]
      },
      "execution_count": 9,
@@ -224,16 +224,16 @@
     {
      "data": {
       "text/plain": [
-       "[('machine learning', 0.0784313725490196),\n",
-       " ('text classification', 0.049019607843137254),\n",
-       " ('image classification', 0.049019607843137254),\n",
-       " ('exact answers', 0.0392156862745098),\n",
-       " ('augmented machine learning', 0.0392156862745098),\n",
-       " ('graph data', 0.029411764705882353),\n",
-       " ('node classification', 0.029411764705882353),\n",
-       " ('entity recognition', 0.029411764705882353),\n",
-       " ('code example', 0.029411764705882353),\n",
-       " ('index documents', 0.029411764705882353)]"
+       "[('machine learning', 0.5341716824761019),\n",
+       " ('augmented machine learning', 0.5208544167057394),\n",
+       " ('text classification', 0.5134074336523509),\n",
+       " ('image classification', 0.5071170746851726),\n",
+       " ('node classification', 0.4973034499292447),\n",
+       " ('tabular data', 0.49645958463369566),\n",
+       " ('entity recognition', 0.45195059648705926),\n",
+       " ('exact answers', 0.4462502183477142),\n",
+       " ('import ktrain', 0.32891369271775894),\n",
+       " ('load model', 0.32052348289886556)]"
       ]
      },
      "execution_count": 10,
@@ -263,25 +263,35 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Building prefix dict from the default dictionary ...\n",
+      "Loading model from cache /tmp/jieba.cache\n",
+      "Loading model cost 0.669 seconds.\n",
+      "Prefix dict has been built successfully.\n"
+     ]
+    },
     {
      "data": {
       "text/plain": [
-       "[('监督 学习', 0.06),\n",
-       " ('训练 数据', 0.06),\n",
-       " ('学习 算法', 0.04),\n",
-       " ('机器 学习', 0.02),\n",
-       " ('学习 任务', 0.02),\n",
-       " ('样本 输入', 0.02),\n",
-       " ('输入 输出', 0.02),\n",
-       " ('输入 映射', 0.02),\n",
-       " ('自由 一组', 0.02),\n",
-       " ('一组 训练', 0.02)]"
+       "[('监督 学习', 0.53),\n",
+       " ('机器 学习', 0.48103658536585364),\n",
+       " ('学习 任务', 0.4764634146341463),\n",
+       " ('样本 输入', 0.4627439024390244),\n",
+       " ('输入 映射', 0.4398780487804878),\n",
+       " ('自由 一组', 0.39719512195121953),\n",
+       " ('一组 训练', 0.3926219512195122),\n",
+       " ('训练 数据', 0.38670731707317074),\n",
+       " ('学习 算法', 0.22731707317073171),\n",
+       " ('输入 输出', 0.01152439024390244)]"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -312,25 +322,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "[(\"données d'entraînement\", 0.0392156862745098),\n",
-       " (\"l'apprentissage supervisé\", 0.0196078431372549),\n",
-       " (\"tâche d'apprentissage\", 0.0196078431372549),\n",
-       " (\"d'apprentissage automatique\", 0.0196078431372549),\n",
-       " ('automatique consistant', 0.0196078431372549),\n",
-       " (\"base d'exemples\", 0.0196078431372549),\n",
-       " ('paires entrée-sortie', 0.0196078431372549),\n",
-       " (\"d'entraînement étiquetées\", 0.0196078431372549),\n",
-       " ('étiquetées constituées', 0.0196078431372549),\n",
-       " (\"constituées d'un\", 0.0196078431372549)]"
+       "[(\"l'apprentissage supervisé\", 0.5098039215686274),\n",
+       " (\"tâche d'apprentissage\", 0.4928634698232476),\n",
+       " (\"d'apprentissage automatique\", 0.489783387687724),\n",
+       " ('automatique consistant', 0.4815698353263277),\n",
+       " (\"base d'exemples\", 0.43588195031606075),\n",
+       " ('paires entrée-sortie', 0.4261283568869026),\n",
+       " (\"données d'entraînement\", 0.4051314571002939),\n",
+       " (\"d'entraînement étiquetées\", 0.39122075935096834),\n",
+       " ('étiquetées constituées', 0.3835205540121593),\n",
+       " (\"constituées d'un\", 0.37787373676369934)]"
       ]
      },
-     "execution_count": 13,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -360,7 +370,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
@@ -410,7 +420,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -440,15 +450,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 2.19 s, sys: 225 ms, total: 2.42 s\n",
-      "Wall time: 9.51 s\n"
+      "CPU times: user 3.94 s, sys: 95 ms, total: 4.04 s\n",
+      "Wall time: 9.36 s\n"
      ]
     }
    ],
@@ -460,7 +470,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
@@ -473,19 +483,19 @@
     {
      "data": {
       "text/plain": [
-       "[('supervised learning', 0.07317073170731707),\n",
-       " ('training data', 0.07317073170731707),\n",
-       " ('learning algorithm', 0.04878048780487805),\n",
-       " ('machine learning', 0.024390243902439025),\n",
-       " ('learning task', 0.024390243902439025),\n",
-       " ('output based', 0.024390243902439025),\n",
-       " ('example input-output', 0.024390243902439025),\n",
-       " ('input-output pairs', 0.024390243902439025),\n",
-       " ('labeled training', 0.024390243902439025),\n",
-       " ('data consisting', 0.024390243902439025)]"
+       "[('supervised learning', 0.5357142857142857),\n",
+       " ('machine learning', 0.4946192305347235),\n",
+       " ('learning task', 0.4894975916102677),\n",
+       " ('output based', 0.44980488994573503),\n",
+       " ('example input-output', 0.4395616120968234),\n",
+       " ('input-output pairs', 0.43443997317236754),\n",
+       " ('training data', 0.4236784342418145),\n",
+       " ('labeled training', 0.40499054935674655),\n",
+       " ('data consisting', 0.3941070666422779),\n",
+       " ('learning algorithm', 0.2632461435278337)]"
       ]
      },
-     "execution_count": 20,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }

diff --git a/ktrain/text/kw/core.py b/ktrain/text/kw/core.py
@@ -156,7 +156,6 @@ def extract_keywords(
         else:
             noun_phrases = blob.noun_phrases
             for np in noun_phrases:
-                print(np)
                 words = np.split()
                 n = len(words)
                 if n not in ngram_lens: