### nltk modules
[modules](https://www.nltk.org/py-modindex.html)

#### nltk 分词 - Tokenize

In [15]:
import nltk
from nltk.corpus import words

# One-time setup (left disabled, as in the original cell); run these once if
# the tokenizers below raise a LookupError for a missing resource:
# nltk.download('words')
# nltk.download('punkt')

# NOTE(review): the "... " at the start of the second line looks like a REPL
# continuation prompt copied along with the text -- confirm. It is kept
# because the recorded output shows it being tokenized as '...'.
sentence = """At eight o'clock on Thursday morning
... Arthur didn't feel very good."""

# Word tokenizer: per the recorded output it keeps "o'clock" whole and
# splits "didn't" into 'did' + "n't".
word_tokens = nltk.word_tokenize(sentence)
print(word_tokens)

# Word/punctuation tokenizer: per the recorded output every apostrophe
# becomes its own token.
punct_tokens = nltk.wordpunct_tokenize(sentence)
print(punct_tokens)

# Regular-expression tokenizer. The pattern is a raw string so that \w, \$,
# \d, \. and \S reach the regex engine untouched; in a plain string literal
# they are invalid escape sequences and trigger a warning on modern Python.
# The pattern value itself is unchanged.
reg_tokens = nltk.regexp_tokenize(sentence, pattern=r'\w+|\$[\d\.]+|\S+')
print(reg_tokens)

# Blank-line tokenizer: the sentence contains no blank line, so the recorded
# output is a single element holding the whole text.
blank_tokens = nltk.blankline_tokenize(sentence)
print(blank_tokens)

['At', 'eight', "o'clock", 'on', 'Thursday', 'morning', '...', 'Arthur', 'did', "n't", 'feel', 'very', 'good', '.']
['At', 'eight', 'o', "'", 'clock', 'on', 'Thursday', 'morning', '...', 'Arthur', 'didn', "'", 't', 'feel', 'very', 'good', '.']
['At', 'eight', 'o', "'clock", 'on', 'Thursday', 'morning', '...', 'Arthur', 'didn', "'t", 'feel', 'very', 'good', '.']
["At eight o'clock on Thursday morning\n... Arthur didn't feel very good."]


#### nltk 词性标注 – POS Tagging

In [20]:
import nltk

# One-time setup (disabled): the tagger resource this cell relies on.
# nltk.download('averaged_perceptron_tagger')

# Input under test: an ISO-style date string.
sentence = "2020-09-03"

# Step 1: tokenize and show the tokens (the recorded output keeps the date
# as a single token).
tokens = nltk.word_tokenize(sentence)
print(tokens)

# Step 2: tag the tokens and show (token, tag) pairs (the recorded output
# tags the date as 'JJ').
tagged = nltk.pos_tag(tokens)
print(tagged)

# To look up what a tag means, the tagset help functions can be called:
# nltk.help.upenn_tagset(), nltk.help.brown_tagset(), nltk.help.claws5_tagset()

['2020-09-03']
[('2020-09-03', 'JJ')]


#### nltk 命名实体识别 – Named Entity Recognition

In [3]:
import nltk

# One-time setup (disabled): the POS tagger resource used by nltk.pos_tag.
# nltk.download('averaged_perceptron_tagger')

# Fetch the named-entity chunker only when it is not installed yet, so that
# re-running the cell does not contact the download server every time.
# nltk.data.find raises LookupError for a missing resource; in that case we
# fall back to exactly the download call the cell used before.
try:
    nltk.data.find('chunkers/maxent_ne_chunker')
except LookupError:
    nltk.download('maxent_ne_chunker')

sentence = """...my children were gonna know who their father was."""

# Pipeline: raw text -> tokens -> (token, POS tag) pairs -> chunk tree.
tokens = nltk.word_tokenize(sentence)
tagged = nltk.pos_tag(tokens)
entities = nltk.chunk.ne_chunk(tagged)

# The recorded output for this sentence is a flat tree: no token was grouped
# into a named-entity subtree.
print(entities)

(S
  .../:
  my/PRP$
  children/NNS
  were/VBD
  gon/VBG
  na/RB
  know/VB
  who/WP
  their/PRP$
  father/NN
  was/VBD
  ./.)


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/fqdl123/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


#### nltk 词形还原 – Lemmatisation

In [4]:
import nltk
from nltk.stem.wordnet import WordNetLemmatizer

# One-time setup (disabled): the WordNet data used by the lemmatizer.
# nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# Lemmatize a colloquial contraction; the recorded output shows the word
# comes back unchanged.
lemma = lemmatizer.lemmatize("gonna")
print(lemma)


gonna


#### nltk 词干提取 – Stemming

In [40]:
from nltk.stem import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer

# One stemmer per algorithm: Porter, Lancaster and Snowball (English).
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()
snowball_stemmer = SnowballStemmer("english")

# Each row: output label, stemmer to use, inflected form of "play" to stem.
# The recorded output shows all three reduce their word to "play".
stemming_demo = (
    ("porter: ", porter_stemmer, "plays"),
    ("lancaster: ", lancaster_stemmer, "played"),
    ("snowball: ", snowball_stemmer, "playing"),
)

for label, stemmer, word in stemming_demo:
    print(label + stemmer.stem(word))

porter: play
lancaster: play
snowball: play


#### nltk 词性标记说明 – Tagset Help

In [33]:
import nltk

# One-time setup (disabled): the tagset documentation resource.
# nltk.download('tagsets')

# Calling a help function without arguments is left disabled here:
# nltk.help.brown_tagset()
# nltk.help.claws5_tagset()
# nltk.help.upenn_tagset()

# Print the Penn Treebank description and examples for a single tag.
tag_pattern = "JJ"
nltk.help.upenn_tagset(tag_pattern)


JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...


#### nltk tag – HunposTagger

In [16]:
import nltk
from nltk.tag import HunposTagger
import os

# Hunpos
# https://code.google.com/archive/p/hunpos/downloads

# en_wsj.model
# https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/hunpos/en_wsj.model.gz

basedir = os.path.dirname(os.path.abspath("__file__"))
# print(basedir)
# print(HunposTagger.__init__.__code__.co_varnames)
%ll hunpos-1.0-macosx/

path_to_model = os.path.join(basedir, 'en_wsj.model')
# path_to_bin = os.path.join(basedir, os.path.join('hunpos-1.0-macosx', 'hunpos-tag'))

ht = HunposTagger(path_to_model)

ht.tag('What is the airspeed ofs an unladen swallow ?'.split())
# [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'VB'), ('?', '.')]


total 1320
-rwxr-xr-x@ 1 fqdl123  staff  343528  6 26  2007 [31mhunpos-tag[m[m*
-rwxr-xr-x@ 1 fqdl123  staff  328500  6 26  2007 [31mhunpos-train[m[m*


LookupError: 

===========================================================================
NLTK was unable to find the hunpos-tag file!
Use software specific configuration paramaters or set the HUNPOS_TAGGER environment variable.

  Searched in:
    - .
    - /usr/bin
    - /usr/local/bin
    - /opt/local/bin
    - /Applications/bin
    - /Users/fqdl123/bin
    - /Users/fqdl123/Applications/bin

  For more information on hunpos-tag, see:
    <http://code.google.com/p/hunpos/>
===========================================================================