**Tokenization**

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize depends on NLTK's Punkt tokenizer data. Fetch it here (quietly,
# so the cell output stays clean) so the notebook also runs on a fresh kernel.
# Older NLTK releases look for "punkt", newer ones for "punkt_tab".
for punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(punkt_resource, quiet=True)

# Sample text mixing punctuation, a mention-style prefix and a hyphenated word;
# it is reused by the tokenizer cells below.
my_message = "@Everybody: Hello NLP-world!"
word_tokenize(my_message)

['@', 'Everybody', ':', 'Hello', 'NLP-world', '!']

In [None]:
from nltk.tokenize import wordpunct_tokenize

# Tokenize the same message with the punctuation-splitting tokenizer so the
# result can be compared with the word_tokenize output above.
wordpunct_tokens = wordpunct_tokenize(my_message)
wordpunct_tokens

['@', 'Everybody', ':', 'Hello', 'NLP', '-', 'world', '!']

In [None]:
from nltk.tokenize import regexp_tokenize

# Keep runs of word characters, plus the single characters "!", "," and "-";
# everything else (e.g. "@" and ":") is dropped.
# The pattern is a raw string so that \w and \- reach the regex engine as
# written instead of being treated as (invalid) Python string escapes.
regexp_tokenize(my_message, r"\w+|[!,\-]")

['Everybody', 'Hello', 'NLP', '-', 'world', '!']

**Stemming**

In [None]:
from nltk.stem import PorterStemmer

stemming = PorterStemmer()
word_list = ["Enjoy", "enjoying", "enjoys", "enjoyable"]

# Stem every word and print one stem per line.
print("\n".join(stemming.stem(word) for word in word_list))

enjoy
enjoy
enjoy
enjoy


**Removing stop words**

In [None]:
from nltk.corpus import stopwords
nltk.download("stopwords")

sw_en = stopwords.words("english")
example_text = "This is an example sentence to test stopwords"
example_text_without_stopwords = [word for word in example_text.split() if word not in sw_en]
print(example_text)
print(example_text_without_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
This is an example sentence to test stopwords
['This', 'example', 'sentence', 'test', 'stopwords']


In [None]:
input_list = ["red", "white", "purple", "yellow", "blue", "green", "black"]

# Explicit loop: collect every colour name that contains the letter "u".
output_list = []
for color in input_list:
    if "u" in color:
        output_list.append(color)
print(output_list)

# Same filter written as a list comprehension: one expression, same result.
output_list = [color for color in input_list if "u" in color]
print(output_list)

['purple', 'blue']
['purple', 'blue']


In [None]:
from nltk.corpus import gutenberg
nltk.download("gutenberg")

words_in_hamlet = gutenberg.words("shakespeare-hamlet.txt")
# Look tokens up in a set: membership is O(1) per token, whereas testing
# against the sw_en list rescans the whole list for every token.
sw_en_lookup = set(sw_en)
# NOTE(review): the comparison is case-sensitive, so capitalised stop words
# are kept; lowercasing the token first would change the reported figure.
words_in_hamlet_without_stopwords = [word for word in words_in_hamlet if word not in sw_en_lookup]

# Despite its name, this is the share of tokens that REMAIN after stop-word
# removal, which is what the printed message reports.
stopwords_percentage = len(words_in_hamlet_without_stopwords) * 100 / len(words_in_hamlet)
print("The percentage of words without stopwords in Hamlet is", stopwords_percentage, "%")

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
The percentage of words without stopwords in Hamlet is 69.26124197002142 %


**Lemmatization**

In [None]:
import spacy

# Load the English pipeline by its full package name. The "en" shortcut link
# was removed in spaCy v3, while the explicit name works wherever the
# en_core_web_sm package is installed.
nlp = spacy.load("en_core_web_sm")

# Inflected verb and noun forms to be reduced to their dictionary form.
input_str = "been had done languages cities mice"
output_list = [token.lemma_ for token in nlp(input_str)]
print(output_list)

['be', 'have', 'do', 'language', 'city', 'mouse']
