In [1]:
# Importing library
import nltk

In [2]:
# # downloading relevant dependencies
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('tagsets')

In [3]:
# Importing Dataset
dataset = "Hello Evreyone. Welcome to this course. We are studying NLP."

In [4]:
# Tokenization
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

In [5]:
# Tokenizing Sentences
sent_tokenize(text = dataset, language = 'english')

['Hello Evreyone.', 'Welcome to this course.', 'We are studying NLP.']

In [6]:
# Tokenizing words
word_tokenize(text = dataset, language = 'english')

['Hello',
 'Evreyone',
 '.',
 'Welcome',
 'to',
 'this',
 'course',
 '.',
 'We',
 'are',
 'studying',
 'NLP',
 '.']

In [7]:
dataset = "Hello Evreyone. Welcome to this course. We are studying NLP."

In [10]:
word_tokenize(text = dataset, language='english')

['Hello',
 'Evreyone',
 '.',
 'Welcome',
 'to',
 'this',
 'course',
 '.',
 'We',
 'are',
 'studying',
 'NLP',
 '.']

In [11]:
# Stemming
from nltk.stem import PorterStemmer

In [12]:
dataset = ['love', 'loving', 'lover', 'loved', 'lovingly']
new_data = """It feels very special when you are loving someone.
              We care for our loved ones.
              specially when we love each other unconditionally."""
new_data

'It feels very special when you are loving someone.\n              We care for our loved ones.\n              specially when we love each other unconditionally.'

In [13]:
ps = PorterStemmer()
for i in dataset:
    print(ps.stem(i))

love
love
lover
love
lovingli


In [14]:
word = word_tokenize(text = new_data)
for i in word:
    print(ps.stem(i))

it
feel
veri
special
when
you
are
love
someon
.
we
care
for
our
love
one
.
special
when
we
love
each
other
uncondit
.


In [15]:
# Lemmatization
from nltk.stem import WordNetLemmatizer

In [16]:
wnl = WordNetLemmatizer()

In [17]:
dataset = ['churches', 'feet', 'lot', 'gave', 'sat', 'dogs']
for i in dataset:
    print(wnl.lemmatize(i))

church
foot
lot
gave
sat
dog


In [18]:
wnl.lemmatize('better', pos = 'a')
# pos is part os speech and 'a' is adjective

'good'

In [19]:
# Stop Words
from nltk.corpus import stopwords

In [20]:
dataset = """Hello Mr. Watson, how are you doing today?
         The weather is awesome. The garden is Green.
         We should go out for a walk."""
dataset = dataset.lower()

In [21]:
stop_words = set(stopwords.words('english'))
# stop_words

In [22]:
filtered_sentences = []

word_tokenize = word_tokenize(text = dataset)

for word in word_tokenize:
    if word not in stop_words:
        filtered_sentences.append(word)
        
filtered_sentences

['hello',
 'mr.',
 'watson',
 ',',
 'today',
 '?',
 'weather',
 'awesome',
 '.',
 'garden',
 'green',
 '.',
 'go',
 'walk',
 '.']

In [8]:
# Part of Speech
from nltk.tag import pos_tag

In [9]:
data = """Taj Mahal is one of the world's most celebrated structures
             in the world.
             It is a syunning symbol of the indian rich history"""

In [10]:
# Tokenizing words
word = word_tokenize(text = data, language = 'english')

In [11]:
# applying POS Tagging
pos = pos_tag(word)

In [13]:
nltk.help.upenn_tagsets()

AttributeError: module 'nltk.help' has no attribute 'upenn_tagsets'

In [14]:
# Chunking
from nltk.chunk import RegexpParser

In [15]:
sequence_chunk = """
chunk:
    {<NNPS>+}
    {<NNP>+}
    {<NN>+}
"""

In [16]:
# creating object 
re = RegexpParser(sequence_chunk)
chunked_result = re.parse(pos)
print(chunked_result)

(S
  (chunk Taj/NNP Mahal/NNP)
  is/VBZ
  one/CD
  of/IN
  the/DT
  (chunk world/NN)
  's/POS
  most/RBS
  celebrated/JJ
  structures/NNS
  in/IN
  the/DT
  (chunk world/NN)
  ./.
  It/PRP
  is/VBZ
  a/DT
  syunning/JJ
  (chunk symbol/NN)
  of/IN
  the/DT
  indian/JJ
  rich/JJ
  (chunk history/NN))
