# Primeiros passos com Natural Language Toolkit 

Referência: https://towardsdatascience.com/text-preprocessing-with-nltk-9de5de891658

In [6]:
import nltk

# Usando um dataset disponibilizado pelo nltk
nltk.download('inaugural')
from nltk.corpus import inaugural

corpus = inaugural.raw('1789-Washington.txt')
corpus[:200]

[nltk_data] Downloading package inaugural to
[nltk_data]     /home/anandaheino/nltk_data...
[nltk_data]   Package inaugural is already up-to-date!


'Fellow-Citizens of the Senate and of the House of Representatives:\n\nAmong the vicissitudes incident to life no event could have filled me with greater anxieties than that of which the notification was'

In [9]:
from nltk.tokenize import word_tokenize, sent_tokenize
sentences = nltk.sent_tokenize(corpus)
print('número de frases:', len(sentences))

número de frases: 23


In [10]:
words = nltk.word_tokenize(corpus)
print('número de palavras:', len(words))

número de palavras: 1537


In [12]:
print('número de palavras únicas:', len(set(words)))

número de palavras únicas: 626


In [15]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
print('Em ingles, o numero de stopwords é:',len(stop_words))

Em ingles, o numero de stopwords é: 179


In [17]:
tokens = list(w for w in words if w not in stop_words)
len(tokens)

800

## Stemming 
* NLTK provides many inbuilt stemmers such as: 
  * Porter Stemmer * also used  <---
  * Snowball Stemmer * better  <---
  * Lancaster Stemmer



In [23]:
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer 

example_words = ["grows","leaves","fairly","cats","trouble",
                 "misunderstanding","friendships","easily", 
                 "rational", "relational"]

#Create instances of both stemmers, and stem the words using them.
stemmer_ps = PorterStemmer() 

#an instance of Porter Stemmer

stemmed_words_ps = [stemmer_ps.stem(word) for word in example_words]
print("*** Porter stemmed words: ***\n ", stemmed_words_ps)

*** Porter stemmed words: ***
  ['grow', 'leav', 'fairli', 'cat', 'troubl', 'misunderstand', 'friendship', 'easili', 'ration', 'relat']


In [22]:
stemmer_ss = SnowballStemmer("english")   

#an instance of Snowball Stemmer
stemmed_words_ss = [stemmer_ss.stem(word) for word in example_words]
print("*** Snowball stemmed words: ***\n ", stemmed_words_ss)

*** Snowball stemmed words: ***
  ['grow', 'leav', 'fair', 'cat', 'troubl', 'misunderstand', 'friendship', 'easili', 'ration', 'relat']


## Lemmatization
* Lemmatization is the algorithmic process of finding the lemma of a word depending on their meaning. 
* Usually refers to the morphological analysis of words removing inflectional endings. 
* It helps in returning the base or dictionary form of a word, which is known as the lemma.

In [24]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet') 

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/anandaheino/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [25]:
lemmatizer = WordNetLemmatizer()  

# instanciating Word Net Lemmatizer
lemmatized_words = [lemmatizer.lemmatize(word) for word in example_words] 
print("*** The lemmatized words: ***\n ", lemmatized_words) 

*** The lemmatized words: ***
  ['grows', 'leaf', 'fairly', 'cat', 'trouble', 'misunderstanding', 'friendship', 'easily', 'rational', 'relational']


In [27]:
#prints the lemmatized words
lemmatized_words_pos = [lemmatizer.lemmatize(word, pos = "v") for word in example_words]
print("*** The lemmatized words using a POS tag: ***\n ", lemmatized_words_pos) 

*** The lemmatized words using a POS tag: ***
  ['grow', 'leave', 'fairly', 'cat', 'trouble', 'misunderstand', 'friendships', 'easily', 'rational', 'relational']


# How are Stemming and Lemmatization Different?
1. Stemming reduces word-forms to stems in order to reduce size, whereas lemmatization reduces the word-forms to linguistically valid lemmas.
2. Lemmatization is usually more sophisticated and requires some sort of lexica. Stemming, on the other hand, can be achieved with simple rule-based approaches.
3. A stemmer operates on a single word without knowledge of the context, and cannot discriminate between words which have similar/different meanings depending on part of speech.