In [1]:
import nltk

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [7]:
example_string = """Muad'Dib learned rapidly because his first training was in how to learn. 
And the first lesson of all was the basic trust that he could learn. It's shocking to find how many people do not believe
they can  learn, and how many more believe learning to be difficult."""

In [8]:
example_string

"Muad'Dib learned rapidly because his first training was in how to learn. \nAnd the first lesson of all was the basic trust that he could learn. It's shocking to find how many people do not believe\nthey can  learn, and how many more believe learning to be difficult."

# Sentence Tokenization

In [9]:
sent_tokenize(example_string)


["Muad'Dib learned rapidly because his first training was in how to learn.",
 'And the first lesson of all was the basic trust that he could learn.',
 "It's shocking to find how many people do not believe\nthey can  learn, and how many more believe learning to be difficult."]

# word Tokenization

In [10]:
word_tokenize(example_string)

["Muad'Dib",
 'learned',
 'rapidly',
 'because',
 'his',
 'first',
 'training',
 'was',
 'in',
 'how',
 'to',
 'learn',
 '.',
 'And',
 'the',
 'first',
 'lesson',
 'of',
 'all',
 'was',
 'the',
 'basic',
 'trust',
 'that',
 'he',
 'could',
 'learn',
 '.',
 'It',
 "'s",
 'shocking',
 'to',
 'find',
 'how',
 'many',
 'people',
 'do',
 'not',
 'believe',
 'they',
 'can',
 'learn',
 ',',
 'and',
 'how',
 'many',
 'more',
 'believe',
 'learning',
 'to',
 'be',
 'difficult',
 '.']

# Filtering Stop Words - Data Cleaning
Stop words are words that you want to ignore, so you filter them out
of your text when you’re processing it. Very common words like 'in','is'
, and 'an' are often used as stop words since they don’t add a lot of
meaning to a text in and of themselves.



In [12]:

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aravindv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
worf_quote = "Sir, I protest. I am not a merry man!"


In [14]:
words_in_quote = word_tokenize(worf_quote)
words_in_quote

['Sir', ',', 'I', 'protest', '.', 'I', 'am', 'not', 'a', 'merry', 'man', '!']

In [15]:
stop_words = set(stopwords.words("english"))


In [16]:
filtered_list = []


In [17]:
for words in words_in_quote:
     if words.casefold() not in stop_words:
         filtered_list.append(words)
filtered_list

['Sir', ',', 'protest', '.', 'merry', 'man', '!']

 # List Comprehension


In [18]:
filtered_list=[words for words in words_in_quote if words.casefold() not in stop_words]
filtered_list


['Sir', ',', 'protest', '.', 'merry', 'man', '!']

# Stemming - is a text processing task in which you reduce words to their root. NLTK has more than one stemmer, but we are using the Porter stemmer.


In [19]:
from nltk.stem import PorterStemmer

In [20]:
stemmer = PorterStemmer()


In [21]:
string_for_stemming = """The crew of the USS Discovery discovered many discoveries. Discovering is what explorers do."""

In [22]:
string_for_stemming


'The crew of the USS Discovery discovered many discoveries. Discovering is what explorers do.'

In [23]:
words = word_tokenize(string_for_stemming)

In [24]:
stemmed_words = [stemmer.stem(word) for word in words]


In [25]:
stemmed_words


['the',
 'crew',
 'of',
 'the',
 'uss',
 'discoveri',
 'discov',
 'mani',
 'discoveri',
 '.',
 'discov',
 'is',
 'what',
 'explor',
 'do',
 '.']

In [28]:
!pip install porter2stemmer





In [27]:
from porter2stemmer import Porter2Stemmer
stemmer = Porter2Stemmer()
print(stemmer.stem('conspicuous'))

conspicu


In [29]:
stemmed_words_new = [stemmer.stem(word) for word in words]

In [30]:

stemmed_words_new

['The',
 'crew',
 'of',
 'the',
 'USS',
 'Discoveri',
 'discov',
 'mani',
 'discoveri',
 '.',
 'Discov',
 'is',
 'what',
 'explor',
 'do',
 '.']

# Part of Speech

In [31]:
words

['The',
 'crew',
 'of',
 'the',
 'USS',
 'Discovery',
 'discovered',
 'many',
 'discoveries',
 '.',
 'Discovering',
 'is',
 'what',
 'explorers',
 'do',
 '.']

In [32]:
lotr_pos_tags = nltk.pos_tag(words)

LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle[0m

  Searched in:
    - 'C:\\Users\\aravindv/nltk_data'
    - 'C:\\ProgramData\\Anaconda3\\nltk_data'
    - 'C:\\ProgramData\\Anaconda3\\share\\nltk_data'
    - 'C:\\ProgramData\\Anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\aravindv\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [33]:
lotr_pos_tags

NameError: name 'lotr_pos_tags' is not defined

# Lemmatization

In [34]:
from nltk.stem import WordNetLemmatizer


In [35]:
lemmatizer = WordNetLemmatizer()

In [36]:
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

LookupError: 
**********************************************************************
  Resource [93mwordnet[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('wordnet')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/wordnet[0m

  Searched in:
    - 'C:\\Users\\aravindv/nltk_data'
    - 'C:\\ProgramData\\Anaconda3\\nltk_data'
    - 'C:\\ProgramData\\Anaconda3\\share\\nltk_data'
    - 'C:\\ProgramData\\Anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\aravindv\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [38]:
lemmatized_words

NameError: name 'lemmatized_words' is not defined

# Grammer Tree

In [39]:
grammar = "NP: {<DT>?<JJ>*<NN>}"

In [40]:
chunk_parser = nltk.RegexpParser(grammar)

In [41]:
tree = chunk_parser.parse(lotr_pos_tags)


NameError: name 'lotr_pos_tags' is not defined

In [42]:
tree.draw()

NameError: name 'tree' is not defined