diff --git a/docs/Detection.rst b/docs/Detection.rst index cac9e3e..32baf73 100644 --- a/docs/Detection.rst +++ b/docs/Detection.rst @@ -68,7 +68,7 @@ the confidence in the detection went down for the first line .. code:: python for line in mixed_text.strip().splitlines(): - print(line, "\n") + print(line + u"\n") for language in Detector(line).languages: print(language) print("\n") @@ -76,13 +76,15 @@ the confidence in the detection went down for the first line .. parsed-literal:: - (u'China (simplified Chinese: \u4e2d\u56fd; traditional Chinese: \u4e2d\u570b),', '\n') + China (simplified Chinese: 中国; traditional Chinese: 中國), + name: English code: en confidence: 71.0 read bytes: 887 name: Chinese code: zh_Hant confidence: 11.0 read bytes: 1755 name: un code: un confidence: 0.0 read bytes: 0 - (u"officially the People's Republic of China (PRC), is a sovereign state located in East Asia.", '\n') + officially the People's Republic of China (PRC), is a sovereign state located in East Asia. + name: English code: en confidence: 98.0 read bytes: 1291 name: un code: un confidence: 0.0 read bytes: 0 name: un code: un confidence: 0.0 read bytes: 0 diff --git a/docs/Download.rst b/docs/Download.rst index b55447a..a3bea7a 100644 --- a/docs/Download.rst +++ b/docs/Download.rst @@ -77,22 +77,6 @@ Library Interface from polyglot.downloader import downloader downloader.download("embeddings2.en") - -.. parsed-literal:: - - [polyglot_data] Downloading package embeddings2.en to - [polyglot_data] /home/rmyeid/polyglot_data... - [polyglot_data] Package embeddings2.en is already up-to-date! - - - - -.. parsed-literal:: - - True - - - Collections ----------- @@ -198,54 +182,25 @@ polyglot named entity recognition subsystem, as the following: .. code:: python - downloader.supported_languages(task="ner2") - - + print(downloader.supported_languages_table(task="ner2")) .. parsed-literal:: - ['Polish', - 'Turkish', - 'Russian', - 'Indonesian', - 'Czech', - 'Arabic', - 'Korean', - 'Catalan; Valencian', - 'Italian', - 'Thai', - 'Romanian, Moldavian, Moldovan', - 'Tagalog', - 'Danish', - 'Finnish', - 'German', - 'Persian', - 'Dutch', - 'Chinese', - 'French', - 'Portuguese', - 'Slovak', - 'Hebrew (modern)', - 'Malay', - 'Slovene', - 'Bulgarian', - 'Hindi', - 'Japanese', - 'Hungarian', - 'Croatian', - 'Ukrainian', - 'Serbian', - 'Lithuanian', - 'Norwegian', - 'Latvian', - 'Swedish', - 'English', - 'Greek, Modern', - 'Spanish; Castilian', - 'Vietnamese', - 'Estonian'] - + 1. Polish 2. Turkish 3. Russian + 4. Indonesian 5. Czech 6. Arabic + 7. Korean 8. Catalan; Valencian 9. Italian + 10. Thai 11. Romanian, Moldavian, ... 12. Tagalog + 13. Danish 14. Finnish 15. German + 16. Persian 17. Dutch 18. Chinese + 19. French 20. Portuguese 21. Slovak + 22. Hebrew (modern) 23. Malay 24. Slovene + 25. Bulgarian 26. Hindi 27. Japanese + 28. Hungarian 29. Croatian 30. Ukrainian + 31. Serbian 32. Lithuanian 33. Norwegian + 34. Latvian 35. Swedish 36. English + 37. Greek, Modern 38. Spanish; Castilian 39. Vietnamese + 40. Estonian You can view all the available and/or installed collections or packages diff --git a/docs/Transliteration.rst b/docs/Transliteration.rst index fc23aef..0b55af9 100644 --- a/docs/Transliteration.rst +++ b/docs/Transliteration.rst @@ -14,14 +14,10 @@ Dēmokratía". Languages Coverage ------------------ -**TODO** - -Describe how did we get these models - .. code:: python from polyglot.downloader import downloader - print(downloader.supported_languages_table("transliteration2", 3)) + print(downloader.supported_languages_table("transliteration2")) .. parsed-literal:: @@ -52,8 +48,8 @@ Describe how did we get these models -Download Necessary Models -^^^^^^^^^^^^^^^^^^^^^^^^^ +Downloading Necessary Models +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: python diff --git a/notebooks/Detection.ipynb b/notebooks/Detection.ipynb index 5d8a764..8e98c42 100644 --- a/notebooks/Detection.ipynb +++ b/notebooks/Detection.ipynb @@ -128,7 +128,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 14, "metadata": { "collapsed": false }, @@ -137,13 +137,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "(u'China (simplified Chinese: \\u4e2d\\u56fd; traditional Chinese: \\u4e2d\\u570b),', '\\n')\n", + "China (simplified Chinese: 中国; traditional Chinese: 中國),\n", + "\n", "name: English code: en confidence: 71.0 read bytes: 887\n", "name: Chinese code: zh_Hant confidence: 11.0 read bytes: 1755\n", "name: un code: un confidence: 0.0 read bytes: 0\n", "\n", "\n", - "(u\"officially the People's Republic of China (PRC), is a sovereign state located in East Asia.\", '\\n')\n", + "officially the People's Republic of China (PRC), is a sovereign state located in East Asia.\n", + "\n", "name: English code: en confidence: 98.0 read bytes: 1291\n", "name: un code: un confidence: 0.0 read bytes: 0\n", "name: un code: un confidence: 0.0 read bytes: 0\n", @@ -154,7 +156,7 @@ ], "source": [ "for line in mixed_text.strip().splitlines():\n", - " print(line, \"\\n\")\n", + " print(line + u\"\\n\")\n", " for language in Detector(line).languages:\n", " print(language)\n", " print(\"\\n\")" diff --git a/notebooks/Download.ipynb b/notebooks/Download.ipynb index 9b6969e..7fbb76f 100644 --- a/notebooks/Download.ipynb +++ b/notebooks/Download.ipynb @@ -125,31 +125,11 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[polyglot_data] Downloading package embeddings2.en to\n", - "[polyglot_data] /home/rmyeid/polyglot_data...\n", - "[polyglot_data] Package embeddings2.en is already up-to-date!\n" - ] - }, - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from polyglot.downloader import downloader\n", "downloader.download(\"embeddings2.en\")" @@ -179,7 +159,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 6, "metadata": { "collapsed": false }, @@ -234,7 +214,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "metadata": { "collapsed": false }, @@ -245,7 +225,7 @@ "True" ] }, - "execution_count": 3, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -270,7 +250,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 8, "metadata": { "collapsed": false }, @@ -287,7 +267,7 @@ " u'tsne2']" ] }, - "execution_count": 4, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -305,63 +285,34 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { - "data": { - "text/plain": [ - "['Polish',\n", - " 'Turkish',\n", - " 'Russian',\n", - " 'Indonesian',\n", - " 'Czech',\n", - " 'Arabic',\n", - " 'Korean',\n", - " 'Catalan; Valencian',\n", - " 'Italian',\n", - " 'Thai',\n", - " 'Romanian, Moldavian, Moldovan',\n", - " 'Tagalog',\n", - " 'Danish',\n", - " 'Finnish',\n", - " 'German',\n", - " 'Persian',\n", - " 'Dutch',\n", - " 'Chinese',\n", - " 'French',\n", - " 'Portuguese',\n", - " 'Slovak',\n", - " 'Hebrew (modern)',\n", - " 'Malay',\n", - " 'Slovene',\n", - " 'Bulgarian',\n", - " 'Hindi',\n", - " 'Japanese',\n", - " 'Hungarian',\n", - " 'Croatian',\n", - " 'Ukrainian',\n", - " 'Serbian',\n", - " 'Lithuanian',\n", - " 'Norwegian',\n", - " 'Latvian',\n", - " 'Swedish',\n", - " 'English',\n", - " 'Greek, Modern',\n", - " 'Spanish; Castilian',\n", - " 'Vietnamese',\n", - " 'Estonian']" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + " 1. Polish 2. Turkish 3. Russian \n", + " 4. Indonesian 5. Czech 6. Arabic \n", + " 7. Korean 8. Catalan; Valencian 9. Italian \n", + " 10. Thai 11. Romanian, Moldavian, ... 12. Tagalog \n", + " 13. Danish 14. Finnish 15. German \n", + " 16. Persian 17. Dutch 18. Chinese \n", + " 19. French 20. Portuguese 21. Slovak \n", + " 22. Hebrew (modern) 23. Malay 24. Slovene \n", + " 25. Bulgarian 26. Hindi 27. Japanese \n", + " 28. Hungarian 29. Croatian 30. Ukrainian \n", + " 31. Serbian 32. Lithuanian 33. Norwegian \n", + " 34. Latvian 35. Swedish 36. English \n", + " 37. Greek, Modern 38. Spanish; Castilian 39. Vietnamese \n", + " 40. Estonian \n" + ] } ], "source": [ - "downloader.supported_languages(task=\"ner2\")" + "print(downloader.supported_languages_table(task=\"ner2\"))" ] }, { diff --git a/notebooks/Transliteration.ipynb b/notebooks/Transliteration.ipynb index 86e18e6..bb12851 100644 --- a/notebooks/Transliteration.ipynb +++ b/notebooks/Transliteration.ipynb @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "metadata": { "collapsed": false }, @@ -33,18 +33,9 @@ "## Languages Coverage" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**TODO**\n", - "\n", - " Describe how did we get these models" - ] - }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": { "collapsed": false }, @@ -82,14 +73,14 @@ ], "source": [ "from polyglot.downloader import downloader\n", - "print(downloader.supported_languages_table(\"transliteration2\", 3))" + "print(downloader.supported_languages_table(\"transliteration2\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### Download Necessary Models" + "#### Downloading Necessary Models" ] }, {