In [1]:
import nltk

In [2]:
# Download the required resources using nltk's download window:
# stopwords (Corpora), words (Corpora), brown (Corpora), treebank (Corpora),
# wordnet (Corpora, needed by WordNetLemmatizer), maxent_ne_chunker (Models), word2vec_sample (Models),
# sample_grammars (Models), porter_test (Models), punkt (Models), averaged_perceptron_tagger (Models)

# This opens the download window
# nltk.download()

## Tokenization
### Given a character sequence and a defined document unit, tokenization is the task of chopping it up into pieces, called tokens, perhaps at the same time throwing away certain characters, such as punctuation.


In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize, wordpunct_tokenize

In [4]:
a = "We are enjoying Pydata session very much"

In [5]:
word_tokenize(a)

['We', 'are', 'enjoying', 'Pydata', 'session', 'very', 'much']

In [6]:
b = "I can't do this task, I am very tired."
word_tokenize(b)

['I', 'ca', "n't", 'do', 'this', 'task', ',', 'I', 'am', 'very', 'tired', '.']

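#### Tokenization can give surprising results at times.
#### can't -> ca + n't: `word_tokenize` splits contractions Penn Treebank-style, so the negation "n't" becomes its own token.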

In [7]:
c = "It's a rainy day, I'm enjoying the weather, I like it here."
sent_tokenize(c)

["It's a rainy day, I'm enjoying the weather, I like it here."]

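#### The text above has only one sentence-ending period, so `sent_tokenize` returns it as a single sentence. Sentence tokenizers appear to be more reliable than word tokenizers (TODO: verify with multi-sentence input).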

In [8]:
# Fact: The stopwords corpus contains word lists for several languages; select one by name.
# Fact: A stopword list might contain repeated words. Converting it to a set keeps only the unique words.
a = set(stopwords.words('english'))

In [9]:
b = ["my", "name", "is", "Mansi"]

In [10]:
# Why a set? Besides removing repetitions, a set makes each `word not in a` membership check
# constant time on average instead of a scan through the whole stopword list,
# even if the gain is negligible for an input this small.
c = [word for word in b if word not in a]
# c

In [11]:
# Punctuation tokenization
a = "It's a rainy day, I'm enjoying the weather, I like it here."
wordpunct_tokenize(a)

['It',
 "'",
 's',
 'a',
 'rainy',
 'day',
 ',',
 'I',
 "'",
 'm',
 'enjoying',
 'the',
 'weather',
 ',',
 'I',
 'like',
 'it',
 'here',
 '.']

#### ^Tokenized punctuations! Mind = blown yet?

In [12]:
from nltk import RegexpTokenizer, WhitespaceTokenizer, SExprTokenizer, TweetTokenizer

In [13]:
# `RegexpTokenizer` splits a string into substrings using a regular expression.
tknzr = RegexpTokenizer(r'\w+')
tknzr.tokenize(a)

['It',
 's',
 'a',
 'rainy',
 'day',
 'I',
 'm',
 'enjoying',
 'the',
 'weather',
 'I',
 'like',
 'it',
 'here']

## Stemming and Lemmatization
### The goal of both stemming and lemmatization is to reduce inflectional forms and sometimes derivationally related forms of a word to a common base form.

### However, the two words differ in their flavor. Stemming usually refers to a crude heuristic process that chops off the ends of words in the hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes. Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma.


In [14]:
# Common Stemmers: Porter (Most popular, fast, gentle), Lancaster, Snowball
# Common Lemmatizer: WordNet
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer

In [15]:
p = PorterStemmer()
l = LancasterStemmer()

In [16]:
print("Porter - maximum: ", p.stem('maximum'))
print("Lancaster - maximum: ", l.stem('maximum'))

Porter - maximum:  maximum
Lancaster - maximum:  maxim


#### Porter is gentler, Lancaster is slightly more aggressive.

In [17]:
wordnet_lemmatizer = WordNetLemmatizer()

In [18]:
wordnet_lemmatizer.lemmatize('maximum')

'maximum'

## POS Tagging
### The process of assigning a part-of-speech tag to each word/token in a sentence. There are 8 common parts of speech (verb, noun, etc.), but tag sets can extend to as many as 100.
### It's non-trivial due to the intricacies of human language. For example, different words can have different meanings in different contexts.
### Two broad methods: - Probabilistic approach - Machine Learning

In [19]:
from nltk import pos_tag, tokenize
from nltk.corpus import brown
import string

In [20]:
def get_tags(document):
    r"""Print the POS tag of every token in ``document`` and a tag-frequency summary.

    The text is split with ``tokenize.word_tokenize`` and tagged with
    ``pos_tag``. Printed output, in order:

    1. one ``word\tag `` line per token,
    2. a blank separator,
    3. the ``(tag, count)`` pairs from ``FreqDist.most_common()``,
    4. the most frequent tag (skipped when the document produced no tags).

    Args:
        document: Raw text to tokenize and tag.

    Returns:
        None. All results are printed.
    """
    tokens = tokenize.word_tokenize(document)
    # Tag once and reuse the result for both the listing and the counts,
    # instead of running the tagger over the same tokens twice.
    tagged_document = pos_tag(tokens)

    for word, pos in tagged_document:
        print(word + "\\" + pos + " ")
    print("\n")

    tag_fd = nltk.FreqDist(tag for _, tag in tagged_document)
    print(tag_fd.most_common())
    # FreqDist.max() raises ValueError on an empty distribution, so only
    # report the most frequent tag when at least one tag was counted.
    if tag_fd:
        print(tag_fd.max())
get_tags("Alice chased the rabbit.")

Alice\NNP 
chased\VBD 
the\DT 
rabbit\NN 
.\. 


[('NN', 1), ('DT', 1), ('.', 1), ('NNP', 1), ('VBD', 1)]
NN


## Parsing vs Shallow Parsing
### Parsing the sentence would convert the sentence into a tree whose leaves will hold POS tags. A shallow parser or 'chunker' comes somewhere in between a parser and a POS tagger.

### Alice chased the rabbit.
### Breakdown into a parse tree: Alice -> noun phrase, chased the rabbit -> verb phrase.

## Support for third-party software
### Stanford tagger, NER, Tokenizer and Parser; REPP Tokenizer; CRFSuite for CRF Tagger; Hunpos Tagger