In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
text = """The AI, called Pluribus, defeated poker professional Darren Elias, who holds the 
record for most World Poker Tour titles, and Chris "Jesus" Ferguson, winner of six World Series of Poker events. 
Each pro separately played 5,000 hands of poker against five copies of Pluribus. In another experiment involving 
13 pros, all of whom have won more than $1 million playing poker, Pluribus played five pros at a time for a total 
of 10,000 hands and again emerged victorious."""


In [3]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
nltk_sentenses = sent_tokenize(text)
nltk_sentenses

['The AI, called Pluribus, defeated poker professional Darren Elias, who holds the \nrecord for most World Poker Tour titles, and Chris "Jesus" Ferguson, winner of six World Series of Poker events.',
 'Each pro separately played 5,000 hands of poker against five copies of Pluribus.',
 'In another experiment involving \n13 pros, all of whom have won more than $1 million playing poker, Pluribus played five pros at a time for a total \nof 10,000 hands and again emerged victorious.']

In [5]:
nltk_words = word_tokenize(text)
nltk_words

['The',
 'AI',
 ',',
 'called',
 'Pluribus',
 ',',
 'defeated',
 'poker',
 'professional',
 'Darren',
 'Elias',
 ',',
 'who',
 'holds',
 'the',
 'record',
 'for',
 'most',
 'World',
 'Poker',
 'Tour',
 'titles',
 ',',
 'and',
 'Chris',
 '``',
 'Jesus',
 "''",
 'Ferguson',
 ',',
 'winner',
 'of',
 'six',
 'World',
 'Series',
 'of',
 'Poker',
 'events',
 '.',
 'Each',
 'pro',
 'separately',
 'played',
 '5,000',
 'hands',
 'of',
 'poker',
 'against',
 'five',
 'copies',
 'of',
 'Pluribus',
 '.',
 'In',
 'another',
 'experiment',
 'involving',
 '13',
 'pros',
 ',',
 'all',
 'of',
 'whom',
 'have',
 'won',
 'more',
 'than',
 '$',
 '1',
 'million',
 'playing',
 'poker',
 ',',
 'Pluribus',
 'played',
 'five',
 'pros',
 'at',
 'a',
 'time',
 'for',
 'a',
 'total',
 'of',
 '10,000',
 'hands',
 'and',
 'again',
 'emerged',
 'victorious',
 '.']

In [6]:
text_with_punctuation = "John's burger was so! delicious that I ate it fully, #Whataburger."


In [7]:
from nltk.tokenize import RegexpTokenizer

In [8]:
tokenize_text = RegexpTokenizer(r'\w+')

In [9]:
output = tokenize_text.tokenize(text_with_punctuation)
output

['John',
 's',
 'burger',
 'was',
 'so',
 'delicious',
 'that',
 'I',
 'ate',
 'it',
 'fully',
 'Whataburger']

In [10]:
text_st_words = "An apple a day keeps a doctor away, who was the person quoted this saying?"


In [11]:
from nltk.corpus import stopwords

In [12]:
nltk.download('stopwords')
stop_word_list = set(stopwords.words('english'))
stop_word_list

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [13]:
new_word_list = []

In [14]:
text_st_words

'An apple a day keeps a doctor away, who was the person quoted this saying?'

In [15]:
text = word_tokenize(text_st_words)
text

['An',
 'apple',
 'a',
 'day',
 'keeps',
 'a',
 'doctor',
 'away',
 ',',
 'who',
 'was',
 'the',
 'person',
 'quoted',
 'this',
 'saying',
 '?']

In [16]:
for w in text:
  if w not in stop_word_list:
    new_word_list.append(w)

In [17]:
new_word_list

['An',
 'apple',
 'day',
 'keeps',
 'doctor',
 'away',
 ',',
 'person',
 'quoted',
 'saying',
 '?']

In [18]:
from nltk.stem import PorterStemmer

In [19]:
porter_stemmer = PorterStemmer()

In [20]:
print(porter_stemmer.stem('dogs'))
print(porter_stemmer.stem('hunt'))
print(porter_stemmer.stem('hunted'))
print(porter_stemmer.stem('hunting'))

dog
hunt
hunt
hunt


In [21]:
sentense = 'How is your father? How is your mother?. Hardly you remember.'

In [22]:
porter_stemmer.stem(sentense)

'how is your father? how is your mother?. hardly you remember.'

In [23]:
from nltk.tokenize import word_tokenize,sent_tokenize

In [24]:
tokens = word_tokenize(sentense)
tokens

['How',
 'is',
 'your',
 'father',
 '?',
 'How',
 'is',
 'your',
 'mother',
 '?',
 '.',
 'Hardly',
 'you',
 'remember',
 '.']

In [25]:
stemmed_sentense = []
for w in tokens:
  stemmed_sentense.append(porter_stemmer.stem(w))
  stemmed_sentense.append(" ")

stemmed_sentense


['how',
 ' ',
 'is',
 ' ',
 'your',
 ' ',
 'father',
 ' ',
 '?',
 ' ',
 'how',
 ' ',
 'is',
 ' ',
 'your',
 ' ',
 'mother',
 ' ',
 '?',
 ' ',
 '.',
 ' ',
 'hardli',
 ' ',
 'you',
 ' ',
 'rememb',
 ' ',
 '.',
 ' ']

In [26]:
from nltk.stem import WordNetLemmatizer

In [50]:
sentense = 'I am a better boy. I have two feet. I am a caring.'

In [51]:
words = word_tokenize(sentense)
words

['I',
 'am',
 'a',
 'better',
 'boy',
 '.',
 'I',
 'have',
 'two',
 'feet',
 '.',
 'I',
 'am',
 'a',
 'caring',
 '.']

In [52]:
lmt = WordNetLemmatizer()

In [53]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [54]:
for w in words:
  print(lmt.lemmatize(w,pos='v'))

I
be
a
better
boy
.
I
have
two
feet
.
I
be
a
care
.
