**Tokenizing Words and Sentences with NLTK**
https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/

In [1]:
import nltk

In [2]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [5]:
#tokenizing - word tokenizers / sentence tokenizers
# lexicon and corporas
# corpora - body of text. ex: medical journals, presidential speeches, English language
# lexicon - words and their means

# investor-speak .... regular english-speak

# investor speak 'bull' - someone who is positive about the market
# english-speak 'bull' - scary animal you dont want running at you

In [6]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [11]:
example_text = "Hello Mr. Smith, how are you doing today? The weather is great and Python is awesome. The sky is pinkish-blue. You should not eat cardboard."

In [12]:
print(sent_tokenize(example_text))

['Hello Mr. Smith, how are you doing today?', 'The weather is great and Python is awesome.', 'The sky is pinkish-blue.', 'You should not eat cardboard.']


In [13]:
print(word_tokenize(example_text))

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', 'and', 'Python', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', 'not', 'eat', 'cardboard', '.']


In [14]:
for i in word_tokenize(example_text):
    print(i)

Hello
Mr.
Smith
,
how
are
you
doing
today
?
The
weather
is
great
and
Python
is
awesome
.
The
sky
is
pinkish-blue
.
You
should
not
eat
cardboard
.


**Stop words with NLTK**
https://pythonprogramming.net/stop-words-nltk-tutorial/

In [15]:
from nltk.corpus import stopwords

In [16]:
from nltk.tokenize import word_tokenize

In [17]:
example_sentence = "This is an example showing off stopwords filtration."

In [18]:
stop_words = set(stopwords.words("english"))

In [22]:
print(len(stop_words))
print(stop_words)

179
{'during', 'have', 'has', 'himself', 'don', 'yourselves', 've', 'we', 'o', 'too', 'isn', 'up', 'their', 'i', 'aren', 'he', 'both', 'down', 'while', 'a', "didn't", 'here', 'myself', "that'll", "she's", 'wasn', 'and', "don't", 'each', 'hers', 'because', 'why', 'through', 'shouldn', 'until', 'before', 'themselves', 'doesn', 'same', 'she', 'it', 'as', 'yours', 'its', 'few', 'yourself', 'm', "should've", 'are', 'herself', 'your', 'these', 'into', 'or', 'ain', 'haven', 'ourselves', 'them', "weren't", 'ma', 'how', 'wouldn', "shan't", 'ours', 'did', 'in', 'when', 'some', 'about', 'had', 'not', 'shan', 'hasn', 'any', 'been', "wouldn't", 'should', 'me', 'if', 'didn', 'by', 'the', 'be', 'further', 'was', "wasn't", "mightn't", 'so', 'such', "couldn't", 'were', 'her', 'below', 'nor', "it's", 'y', 'you', 'than', 'those', 'then', 'which', 'can', 'doing', 'other', "hasn't", 'at', 'only', 'couldn', "needn't", 'is', "hadn't", 'will', "mustn't", 'this', 'who', 'they', 'his', 're', 'once', "aren't", '

In [23]:
words = word_tokenize(example_sentence)

In [24]:
filtered_sentence = [w for w in words if w not in stop_words]

In [25]:
print(filtered_sentence)

['This', 'example', 'showing', 'stopwords', 'filtration', '.']


**Stemming words with NLTK** https://pythonprogramming.net/stemming-nltk-tutorial/

In [26]:
from nltk.stem import PorterStemmer

In [27]:
from nltk.tokenize import word_tokenize

In [28]:
ps = PorterStemmer()

In [29]:
example_words = ["python","pythoner","pythoning","pythonned","pythonly"]

In [30]:
stemmed_words = [ps.stem(w) for w in example_words]

In [31]:
stemmed_words

['python', 'python', 'python', 'python', 'pythonli']

In [32]:
new_text = "It is in very important to be pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."

In [33]:
words = word_tokenize(new_text)

In [34]:
stemmed_words = [ps.stem(w) for w in words]

In [35]:
stemmed_words

['It',
 'is',
 'in',
 'veri',
 'import',
 'to',
 'be',
 'pythonli',
 'while',
 'you',
 'are',
 'python',
 'with',
 'python',
 '.',
 'all',
 'python',
 'have',
 'python',
 'poorli',
 'at',
 'least',
 'onc',
 '.']

**Lemmatizing with NLTK** https://pythonprogramming.net/lemmatizing-nltk-tutorial/

In [39]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/andresdelrio/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [40]:
lemmatizer = WordNetLemmatizer()

In [41]:
print(lemmatizer.lemmatize("cats"))

cat


In [42]:
print(lemmatizer.lemmatize("cacti"))

cactus


In [43]:
print(lemmatizer.lemmatize("geese"))

goose


In [44]:
print(lemmatizer.lemmatize("rocks"))

rock


In [45]:
print(lemmatizer.lemmatize("python"))

python


In [46]:
print(lemmatizer.lemmatize("better"))

better


In [51]:
print(lemmatizer.lemmatize("better",pos="a"))
# part of speech - pos - for better is adjective (a)

good
