**Tokenization**

In [2]:
import nltk
from nltk.tokenize import sent_tokenize

my_message = "Hello there. Goodbye everybody."
tokens = sent_tokenize(my_message)
print(tokens)

['Hello there.', 'Goodbye everybody.']


In [3]:
from nltk.tokenize import word_tokenize
nltk.download("punkt")

my_message = "@Everybody: Hello NLP-world!"
tokens = word_tokenize(my_message)
print(tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
['@', 'Everybody', ':', 'Hello', 'NLP-world', '!']


In [None]:
from nltk.tokenize import wordpunct_tokenize

tokens = wordpunct_tokenize(my_message)
print(tokens)

['@', 'Everybody', ':', 'Hello', 'NLP', '-', 'world', '!']


In [None]:
from nltk.tokenize import regexp_tokenize

tokens = regexp_tokenize(my_message, r"\w+")
print(tokens)

['Everybody', 'Hello', 'NLP', 'world']


In [None]:
from nltk.tokenize import regexp_tokenize

input_sentences = ["Hello world", "this is only an example"]
tokens = []
for word in input_sentences:
   tokens.extend(regexp_tokenize(word, r"\w+"))

print(tokens)

['Hello', 'world', 'this', 'is', 'only', 'an', 'example']


**Stemming**

In [None]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
tokens = ["Enjoy", "enjoying", "enjoys", "enjoyable"]
stems = [stemmer.stem(token) for token in tokens]
print(stems)

['enjoy', 'enjoy', 'enjoy', 'enjoy']


In [None]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")
stems = [stemmer.stem(token) for token in tokens]
print(stems)

['enjoy', 'enjoy', 'enjoy', 'enjoy']


List-comprehension in Python

In [None]:
input_list = ["red", "white", "purple", "yellow", "blue", "green", "black"]

# Example non-list comprehesion
output_list = []
for item in input_list:
    output_list.append(item)
print(output_list)

# Example of list-comprehension
output_list = [item for item in input_list]
print(output_list)

# Example non-list comprehesion (with conditional)
output_list = []
for item in input_list:
    if "u" in item:
      output_list.append(item)
print(output_list)

# Example of list-comprehension (with conditional)
output_list = [item for item in input_list if "u" in item]
print(output_list)

['red', 'white', 'purple', 'yellow', 'blue', 'green', 'black']
['red', 'white', 'purple', 'yellow', 'blue', 'green', 'black']
['purple', 'blue']
['purple', 'blue']


**Removing stop words**

In [None]:
from nltk.corpus import stopwords
nltk.download("stopwords")

example_text = "This is an example sentence to test stopwords"
sw_en = stopwords.words("english")

text_no_stopwords = [word for word in example_text.split() if word not in sw_en]
print(example_text)
print(text_no_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
This is an example sentence to test stopwords
['This', 'example', 'sentence', 'test', 'stopwords']


In [None]:
from nltk.corpus import gutenberg
nltk.download("gutenberg")

words = gutenberg.words("shakespeare-hamlet.txt")
words_no_stopwords = [word for word in words if word not in sw_en]

stopwords_percentage = len(text_no_stopwords) * 100 / len(words)
print("The percentage of words without stopwords in Hamlet is", stopwords_percentage, "%")

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
The percentage of words without stopwords in Hamlet is 0.013383297644539615 %


**Lemmatization**

In [None]:
import spacy
nlp = spacy.load("en")

sentence="We are putting in efforts to enhance our understanding of Lemmatization"

lemmas = [token.lemma_ for token in nlp(sentence)]
print(lemmas)

lemmas = [w.lemma_ if w.lemma_ !='-PRON-' else w.text for w in nlp(sentence)]
print(lemmas)

['-PRON-', 'be', 'put', 'in', 'effort', 'to', 'enhance', '-PRON-', 'understanding', 'of', 'lemmatization']
['We', 'be', 'put', 'in', 'effort', 'to', 'enhance', 'our', 'understanding', 'of', 'lemmatization']
