In [1]:
import nltk
from nltk.corpus import wordnet as wn

## POS Tagging

In [3]:
# Tokenize a short sentence, then tag each token with its part of speech.
sentence = "I went to Florida last week."
text = nltk.word_tokenize(sentence)
nltk.pos_tag(text)

[('I', 'PRP'),
 ('went', 'VBD'),
 ('to', 'TO'),
 ('Florida', 'NNP'),
 ('last', 'JJ'),
 ('week', 'NN'),
 ('.', '.')]

In [4]:
# "refuse" and "permit" each appear twice in this sentence; the displayed tags
# show the tagger assigning them a verb tag once and a noun tag once.
sentence = "They refuse to permit us to obtain the refuse permit"
text = nltk.word_tokenize(sentence)
nltk.pos_tag(text)

[('They', 'PRP'),
 ('refuse', 'VBP'),
 ('to', 'TO'),
 ('permit', 'VB'),
 ('us', 'PRP'),
 ('to', 'TO'),
 ('obtain', 'VB'),
 ('the', 'DT'),
 ('refuse', 'NN'),
 ('permit', 'NN')]

In [5]:
# Wrap the lower-cased Brown corpus words in an nltk.Text and print the words
# that Text.similar() reports for 'woman'.
brown_words_lower = (word.lower() for word in nltk.corpus.brown.words())
text = nltk.Text(brown_words_lower)
text.similar('woman')

man time day year moment car world family house country child boy
state job war way place girl room week


In [None]:
# Some corpora ship with part-of-speech tags already attached to each word.

In [6]:
# The Brown corpus reader exposes pre-tagged tokens as (word, tag) pairs.
nltk.corpus.brown.tagged_words()
# Alternative corpus to try (left disabled):
# nltk.corpus.nps_chat.tagged_words()

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]

In [37]:
# Request the same tagged words with tagset='universal'; the displayed result
# uses coarse tags such as 'DET' and 'NOUN' instead of Brown-specific ones.
nltk.corpus.brown.tagged_words(tagset='universal')

[('The', 'DET'), ('Fulton', 'NOUN'), ...]

## WordNet
* WordNet is a semantically oriented dictionary of English, similar to a traditional thesaurus but with a richer structure.
* NLTK includes the English WordNet, with 155,287 words and 117,659 synonym sets.

In [2]:
# `wn` is nltk.corpus.wordnet, imported with the other dependencies in the
# notebook's first cell so that all imports live in one place.
# Look up every synset (sense) that contains the word 'motorcar'.
wn.synsets('motorcar')

[Synset('car.n.01')]

* A synset (“synonym set”) is a collection of synonymous words (or “lemmas”).

In [4]:
# List the words (lemma names) grouped under the 'car.n.01' synset.
car_synset = wn.synset('car.n.01')
car_synset.lemma_names()

['car', 'auto', 'automobile', 'machine', 'motorcar']

In [7]:
# Show the dictionary-style definition of the 'car.n.01' synset.
car_synset = wn.synset('car.n.01')
car_synset.definition()

'a motor vehicle with four wheels; usually propelled by an internal combustion engine'

In [8]:
# Show the example sentences recorded for the 'car.n.01' synset.
car_synset = wn.synset('car.n.01')
car_synset.examples()

['he needs a car to get to work']

In [10]:
# A lemma is the pairing of one synset with one of its words; list the
# Lemma objects that belong to 'car.n.01'.
car_synset = wn.synset('car.n.01')
car_synset.lemmas()

[Lemma('car.n.01.car'),
 Lemma('car.n.01.auto'),
 Lemma('car.n.01.automobile'),
 Lemma('car.n.01.machine'),
 Lemma('car.n.01.motorcar')]

In [12]:
# Fetch one lemma by its dotted name and print, on separate lines divided by
# a row of ten '#' characters: the lemma, its synset, and its bare word.
automobile_lemma = wn.lemma('car.n.01.automobile')
divider = '#' * 10
print(
    automobile_lemma,
    divider,
    automobile_lemma.synset(),
    divider,
    automobile_lemma.name(),
    sep='\n',
)

Lemma('car.n.01.automobile')
##########
Synset('car.n.01')
##########
automobile


In [16]:
# 'car' belongs to several synsets; print the lemma names of each one,
# one synset per output line.
car_senses = wn.synsets('car')
for sense in car_senses:
    names = sense.lemma_names()
    print(names)

['car', 'auto', 'automobile', 'machine', 'motorcar']
['car', 'railcar', 'railway_car', 'railroad_car']
['car', 'gondola']
['car', 'elevator_car']
['cable_car', 'car']


### WordNet Hierarchy

!['Common dictionary literals and operations'](img/l1.png)

In [17]:
# `motorcar` and `types_of_motorcar` are reused by the following cells.
motorcar = wn.synset('car.n.01')
# hyponyms() returns the synsets directly below this one in the hierarchy.
types_of_motorcar = motorcar.hyponyms()
# Preview only the first five hyponym synsets.
types_of_motorcar[:5]

[Synset('ambulance.n.01'),
 Synset('beach_wagon.n.01'),
 Synset('bus.n.04'),
 Synset('cab.n.03'),
 Synset('compact.n.03')]

In [21]:
# Flatten the lemma names of every hyponym synset into a single list and
# preview the first ten entries.
hyponym_lemma_names = [
    lemma.name()
    for hyponym in types_of_motorcar
    for lemma in hyponym.lemmas()
]
hyponym_lemma_names[:10]

['ambulance',
 'beach_wagon',
 'station_wagon',
 'wagon',
 'estate_car',
 'beach_waggon',
 'station_waggon',
 'waggon',
 'bus',
 'jalopy']

In [22]:
# hypernyms() returns the synset(s) directly above `motorcar` in the hierarchy.
motorcar.hypernyms()

[Synset('motor_vehicle.n.01')]

Besides moving up and down the hypernym/hyponym hierarchy, WordNet can also be navigated:

1. from items to their components (meronyms), or
2. to the things they are contained in (holonyms).
 
##### part_meronyms() - the parts of a tree are its trunk, crown, and so on
##### substance_meronyms() - The substance a tree is made of includes heartwood and sapwood
##### member_holonyms() - A collection of trees forms a forest

In [32]:
# Parts that make up a tree.
tree_synset = wn.synset('tree.n.01')
tree_synset.part_meronyms()

[Synset('burl.n.02'),
 Synset('crown.n.07'),
 Synset('limb.n.02'),
 Synset('stump.n.01'),
 Synset('trunk.n.01')]

In [33]:
# Substances a tree is made of.
tree_synset = wn.synset('tree.n.01')
tree_synset.substance_meronyms()

[Synset('heartwood.n.01'), Synset('sapwood.n.01')]

In [34]:
# Collections that a tree is a member of.
tree_synset = wn.synset('tree.n.01')
tree_synset.member_holonyms()

[Synset('forest.n.01')]

In [35]:
# The act of walking involves the act of stepping, so walking entails stepping.
# Print the entailments of 'walk' and of 'eat', separated by a divider line.
walk_entailments = wn.synset('walk.v.01').entailments()
eat_entailments = wn.synset('eat.v.01').entailments()
print(walk_entailments, '#' * 10, eat_entailments, sep='\n')

[Synset('step.v.01')]
##########
[Synset('chew.v.01'), Synset('swallow.v.01')]


In [36]:
# Antonyms are looked up on a specific lemma: here, 'supply' in sense supply.n.02.
supply_lemma = wn.lemma('supply.n.02.supply')
supply_lemma.antonyms()

[Lemma('demand.n.02.demand')]

## Normalizing Text

1. Stemmers

In [23]:
# Sample passage; `tokens` is reused by the stemming and lemmatization cells.
raw = (
    "DENNIS: Listen, strange women lying in ponds distributing swords "
    "is no basis for a system of government. Supreme executive power "
    "derives from a mandate from the masses, not from some farcical "
    "aquatic ceremony."
)
tokens = nltk.word_tokenize(raw)

In [25]:
# Create both stemmers here; `lancaster` is used by the next cell.
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
# Porter-stem every token and preview the first ten results.
porter_stems = list(map(porter.stem, tokens))
porter_stems[:10]

['denni',
 ':',
 'listen',
 ',',
 'strang',
 'women',
 'lie',
 'in',
 'pond',
 'distribut']

In [26]:
# Lancaster-stem every token and preview the first ten results.
lancaster_stems = list(map(lancaster.stem, tokens))
lancaster_stems[:10]

['den', ':', 'list', ',', 'strange', 'wom', 'lying', 'in', 'pond', 'distribut']

2. Lemmatization

In [27]:
# Lemmatize every token (no part-of-speech argument is supplied) and preview
# the first ten results.
wnl = nltk.WordNetLemmatizer()
lemmatized_tokens = list(map(wnl.lemmatize, tokens))
lemmatized_tokens[:10]

['DENNIS',
 ':',
 'Listen',
 ',',
 'strange',
 'woman',
 'lying',
 'in',
 'pond',
 'distributing']

## N-Grams

In [28]:
# Pre-tokenized example sentence for the n-gram cells; the final '.' is a
# separate token.
sent = "In the beginning God created the heaven and the earth .".split()

In [29]:
# The bare call displays a generator object rather than the pairs themselves;
# the result has to be consumed (e.g. turned into a list) to see the bigrams.
nltk.bigrams(sent)

<generator object bigrams at 0x115a519e8>

In [30]:
# Materialize the bigram generator so the adjacent-token pairs are displayed.
list(nltk.bigrams(sent))

[('In', 'the'),
 ('the', 'beginning'),
 ('beginning', 'God'),
 ('God', 'created'),
 ('created', 'the'),
 ('the', 'heaven'),
 ('heaven', 'and'),
 ('and', 'the'),
 ('the', 'earth'),
 ('earth', '.')]

In [31]:
# Materialize the trigram generator so the adjacent-token triples are displayed.
list(nltk.trigrams(sent))

[('In', 'the', 'beginning'),
 ('the', 'beginning', 'God'),
 ('beginning', 'God', 'created'),
 ('God', 'created', 'the'),
 ('created', 'the', 'heaven'),
 ('the', 'heaven', 'and'),
 ('heaven', 'and', 'the'),
 ('and', 'the', 'earth'),
 ('the', 'earth', '.')]