In [1]:
from nltk.tokenize import (
    sent_tokenize,
    word_tokenize,
    RegexpTokenizer,
    WhitespaceTokenizer,
    WordPunctTokenizer,
)
from nltk.corpus import stopwords, wordnet, movie_reviews
import random, nltk

from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

# [1.0 tokenize]()


[1.1 sent_tokenize & word_tokenize]()


In [2]:
text = """Hello Mr. Hany, how are you doing today? The weater is great and Python is awesome.
        The sky is pinhish-blue. You should not eat carboard."""

# sent_tokenize: split the text by sentence


print("Spliting Text by sentence: \n")
print("\n".join(sent_tokenize(text=text, language="english")), "\n")

# word_tokenize: split the text by Words
print("Spliting Text by Words: \n")
print(" - ".join(word_tokenize(text=text, language="english", preserve_line=True)))

Spliting Text by sentence: 

Hello Mr. Hany, how are you doing today?
The weater is great and Python is awesome.
The sky is pinhish-blue.
You should not eat carboard. 

Spliting Text by Words: 

Hello - Mr. - Hany - , - how - are - you - doing - today - ? - The - weater - is - great - and - Python - is - awesome. - The - sky - is - pinhish-blue. - You - should - not - eat - carboard - .


[1.2 RegexpTokenizer]()


In [3]:
text = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."

# RegexpTokenizer: used like regular expressions, which takes a letter to split based on them

took = RegexpTokenizer(r"\w+|\$[\d\.]+|\S+[.]+")

print(took.tokenize(text))

['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', 'Please', 'buy', 'me', 'two', 'of', 'them', 'Thanks']


[1.3 WhitespaceTokenizer and WordPunctTokenizer]()


In [4]:
# WordPunctTokenizer: Tokenize a text into a sequence of alphabetic and non-alphabetic characters, using the regexp \w+|[^\w\s]+.
took = WordPunctTokenizer()

print(took.tokenize(text))


# WhitespaceTokenizer : Tokenize a string on whitespace (space, tab, newline). In general, users should use the string split() method instead.
took = WhitespaceTokenizer()

print(took.tokenize(text))

['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']


# [2.0 Stop]()


[2.1 stop words]()


In [5]:
stop_words = set(stopwords.words(fileids="english"))
# Stop words are a set of commonly used words in a language, Stop words are commonly used NLP, in which they are removed from the sentance
# filter the words based on the stop words in the language
text = "This is an example showing off stop words filtration."

words = word_tokenize(text)

filtered_words = [word for word in words if not word in stop_words]


filtered_words

['This', 'example', 'showing', 'stop', 'words', 'filtration', '.']

In [6]:
stop_words = set(stopwords.words(fileids="english"))
# Stop words are a set of commonly used words in a language, Stop words are commonly used NLP, in which they are removed from the sentance
# filter the words based on the stop words in the language
text = "This is an example showing off stop words filtration."

words = word_tokenize(text)

filtered_words = [word for word in words if not word in stop_words]

print("Before filtering : ", words, "\n")
print("after filtering : ", filtered_words, "\n")

Before filtering :  ['This', 'is', 'an', 'example', 'showing', 'off', 'stop', 'words', 'filtration', '.'] 

after filtering :  ['This', 'example', 'showing', 'stop', 'words', 'filtration', '.'] 



# [3.0 stem]()

Interfaces used to remove morphological affixes from words, leaving only the word stem. Stemming algorithms aim to remove those affixes required for eg. grammatical role, tense, derivational morphology leaving only the stem of the word


[3.1 porter]()

An algorithm for suffix stripping


In [7]:
ps = PorterStemmer()

some_words = [
    "maximum",
    "presumably",
    "multiply",
    "provision",
    "owed",
    "saying",
    "meant",
    "cement",
]


for w in some_words:
    print(ps.stem(w))

print()

text = """
    It is very imporatant to be pythonly while you are pythoning with python.
    All pythoners have pythoned poorly at least ones.
"""

words = word_tokenize(text)
print("words before : ", words, "\n")


filtered_words = set()

for w in words:
    filtered_words.add(ps.stem(w))

print("words after : ", filtered_words, "\n")

maximum
presum
multipli
provis
owe
say
meant
cement

words before :  ['It', 'is', 'very', 'imporatant', 'to', 'be', 'pythonly', 'while', 'you', 'are', 'pythoning', 'with', 'python', '.', 'All', 'pythoners', 'have', 'pythoned', 'poorly', 'at', 'least', 'ones', '.'] 

words after :  {'is', 'veri', 'at', 'have', 'it', 'least', 'to', 'python', 'while', '.', 'pythonli', 'one', 'you', 'be', 'with', 'poorli', 'all', 'are', 'imporat'} 



[3.2 LancasterStemmer]()

The Lancaster Stemmer is an algorithm for stemming words in the English language, It aims to efficiently remove prefixes and suffixes from words to find their root form.


In [8]:
ps = LancasterStemmer(strip_prefix_flag=True)

some_words = [
    "maximum",
    "presumably",
    "multiply",
    "provision",
    "owed",
    "saying",
    "meant",
    "cement",
    "kilometer",
]
for w in some_words:
    print(ps.stem(w))
print()

text = """It is very imporatant to be pythonly while you are pythoning with python. All pythoners have pythoned poorly at least ones."""
words = word_tokenize(text)

print("words before : ", words, "\n")

filtered_words = set()

for w in words:
    filtered_words.add(ps.stem(w))

print("words after : ", filtered_words, "\n")

maxim
presum
multiply
provid
ow
say
meant
cem
met

words before :  ['It', 'is', 'very', 'imporatant', 'to', 'be', 'pythonly', 'while', 'you', 'are', 'pythoning', 'with', 'python', '.', 'All', 'pythoners', 'have', 'pythoned', 'poorly', 'at', 'least', 'ones', '.'] 

words after :  {'very', 'whil', 'poor', 'is', 'to', 'imp', 'python', 'you', 'ar', 'on', '.', 'hav', 'at', 'be', 'it', 'al', 'least', 'with'} 



[3.3 WordNetLemmatizer]()

It's used for lemmatization, which is the process of reducing words to their base or root form, known as the lemma.

For example:

    The lemma of "running" is "run".

    The lemma of "better" is "good".

Lemmatization is useful in NLP tasks for standardizing words so that variations of the same word are treated as identical, which can improve the accuracy of analyses like text classification or information retrieval


In [9]:
lemma = WordNetLemmatizer()
print(lemma.lemmatize(word="running", pos="v"))
print(lemma.lemmatize("dogs"))
print(lemma.lemmatize("churches"))
print(lemma.lemmatize("hardrock", pos="v"))
print(lemma.lemmatize("reading", pos="v"))
print(lemma.lemmatize("reader", pos="v"))
print(lemma.lemmatize("better"))
print(lemma.lemmatize("better", pos="a"))

run
dog
church
hardrock
read
reader
better
good


# [4.0 wordnet]()

WordNet is a lexical database for the English language. It groups English words into sets of synonyms called synsets, each expressing a distinct concept. These synsets are interconnected by means of conceptual-semantic and lexical relations

Here are some key features of WordNet:

- Synonym Sets (Synsets): WordNet organizes words into synsets, which are groups of words that are synonymous with each other. For example, the synset for the word "car" might include synonyms such as "automobile" and "vehicle".

- Antonyms: WordNet includes antonyms for many words, allowing for the representation of opposite concepts. For example, "hot" is an antonym of "cold".


In [10]:
snys = wordnet.synsets("good")

# synsets
print("synsets of word good : \n")
print(snys)

# words

words = set()

for i, s in enumerate(snys):
    words.add(snys[i].lemmas()[0].name())

print("all words of word good : \n")
print(words)

synsets of word good : 

[Synset('good.n.01'), Synset('good.n.02'), Synset('good.n.03'), Synset('commodity.n.01'), Synset('good.a.01'), Synset('full.s.06'), Synset('good.a.03'), Synset('estimable.s.02'), Synset('beneficial.s.01'), Synset('good.s.06'), Synset('good.s.07'), Synset('adept.s.01'), Synset('good.s.09'), Synset('dear.s.02'), Synset('dependable.s.04'), Synset('good.s.12'), Synset('good.s.13'), Synset('effective.s.04'), Synset('good.s.15'), Synset('good.s.16'), Synset('good.s.17'), Synset('good.s.18'), Synset('good.s.19'), Synset('good.s.20'), Synset('good.s.21'), Synset('well.r.01'), Synset('thoroughly.r.02')]
all words of word good : 

{'commodity', 'adept', 'good', 'thoroughly', 'well', 'full', 'estimable', 'effective', 'dear', 'dependable', 'beneficial'}


In [11]:
synonyms = set()
antonyms = []
for syn in snys:
    for l in syn.lemmas():
        synonyms.add(l.name())
        if l.antonyms():
            antonyms.append([l.name(), l.antonyms()[0].name()])


print("all words of word good : \n")
print(synonyms, "\n")
print("words and thier antonyms :\n")
print(antonyms)

all words of word good : 

{'unspoilt', 'sound', 'well', 'practiced', 'honest', 'beneficial', 'commodity', 'right', 'in_force', 'respectable', 'goodness', 'soundly', 'effective', 'near', 'dear', 'proficient', 'upright', 'in_effect', 'adept', 'skillful', 'ripe', 'thoroughly', 'full', 'estimable', 'skilful', 'salutary', 'dependable', 'expert', 'trade_good', 'just', 'undecomposed', 'safe', 'good', 'unspoiled', 'honorable', 'serious', 'secure'} 

words and thier antonyms :

[['good', 'evil'], ['goodness', 'evilness'], ['good', 'bad'], ['goodness', 'badness'], ['good', 'bad'], ['good', 'evil'], ['well', 'ill']]


# [4.0 Wu Palmer Similarity]()

used to get the Similarity percentage between two words


In [25]:
w1 = wordnet.synsets("cake")
w2 = wordnet.synsets("loaf")
w3 = wordnet.synsets("bread")
cake = w1[0]
loaf = w2[0]
bread = w3[0]
print(w1, "\n")
print(w2, "\n")
print(w3, "\n")
print(cake.lemmas()[0].name(), "\n")
print(loaf, "\n")
print(bread, "\n")

words = [cake, loaf, bread]

for w in words:
    for ww in words:
        if w != ww:
            print(
                w.lemmas()[0].name(),
                " is Similar to ",
                ww.lemmas()[0].name(),
                " by ",
                w.wup_similarity(ww) * 100,
                "\n",
            )

[Synset('cake.n.01'), Synset('patty.n.01'), Synset('cake.n.03'), Synset('coat.v.03')] 

[Synset('loaf_of_bread.n.01'), Synset('loaf.n.02'), Synset('bum.v.02'), Synset('loiter.v.01')] 

[Synset('bread.n.01'), Synset('boodle.n.01'), Synset('bread.v.01')] 

cake 

Synset('loaf_of_bread.n.01') 

Synset('bread.n.01') 

cake  is Similar to  loaf_of_bread  by  26.666666666666668 

cake  is Similar to  bread  by  28.57142857142857 

loaf_of_bread  is Similar to  cake  by  26.666666666666668 

loaf_of_bread  is Similar to  bread  by  94.11764705882352 

bread  is Similar to  cake  by  28.57142857142857 

bread  is Similar to  loaf_of_bread  by  94.11764705882352 



cake  is Similar to  loaf_of_bread  by  26.666666666666668 

cake  is Similar to  bread  by  28.57142857142857 

loaf_of_bread  is Similar to  cake  by  26.666666666666668 

loaf_of_bread  is Similar to  bread  by  94.11764705882352 

bread  is Similar to  cake  by  28.57142857142857 

bread  is Similar to  loaf_of_bread  by  94.11764705882352 



# [5.0 Text Classification problem]()


In [12]:
documents = [
    (list(movie_reviews.words(fileid)), cat)
    for cat in movie_reviews.categories()
    for fileid in movie_reviews.fileids(cat)
]

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:6000]


def find_features(document):

    words = set(document)

    features = {}
    for w in word_features:
        features[w] = w in words

    return features


features = [(find_features(review), cat) for (review, cat) in documents]


train = features[:2000]
test = features[1900:]


clf = nltk.NaiveBayesClassifier.train(train)

print("train acc = ", nltk.classify.accuracy(clf, train) * 100)
print("test acc = ", nltk.classify.accuracy(clf, test) * 100)

train acc =  91.3
test acc =  94.0
