# Lemmatizing Words Using Wordnet

In [1]:
import nltk
from nltk.stem import *
import pandas as pd

In [2]:
# Download the 'punkt' NLTK data package (the call returns True on success).
# Presumably needed by `word_tokenize`, which is used further down - confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Download the WordNet corpus used by the WordNetLemmatizer in the cells below.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

# Lemmatizing Words

In [4]:
from nltk.stem import WordNetLemmatizer

# Create the lemmatizer once; `wnl` is reused by the following cells.
wnl = WordNetLemmatizer()
# No `pos` argument is passed here; the plural noun comes back as 'definition'.
print(wnl.lemmatize('definitions'))

definition


# Lemmatizing words by specifying parts-of-speech

In [5]:
# Lemmatize the same word under each part-of-speech tag.
# Only the verb tag reduces 'running' to 'run'; the other tags leave it as is.
for label, pos_tag in (('Adjective: ', 'a'),
                       ('Adverb: ', 'r'),
                       ('Noun: ', 'n'),
                       ('Verb: ', 'v')):
    print(label, wnl.lemmatize('running', pos=pos_tag))

Adjective:  running
Adverb:  running
Noun:  running
Verb:  run


In [6]:
# Sample vocabulary for comparing stemming with lemmatization.
# Each source line groups related surface forms of a word.
input_tokens = ['dictionaries', 'dictionary', 
                'hushed', 'hush', 'hushing',
                'functional', 'functionally',
                'lying', 'lied', 'lies',
                'flawed', 'flaws', 'flawless', 
                'friendship', 'friendships', 'friendly', 'friendless', 
                'definitions', 'definition', 'definitely',  
                'the', 'these', 'those',
                'motivational', 'motivate', 'motivating']

In [7]:
# Snowball stemmer configured for English, applied to every sample token in order.
ss = SnowballStemmer('english')

ss_stemmed_tokens = [ss.stem(token) for token in input_tokens]

In [8]:
# Lemmatize every sample token with the verb tag (pos='v'). Tokens that are
# not reduced under that tag come back unchanged (e.g. 'dictionaries' in the
# comparison table below).
wnl_lemmatized_tokens = [wnl.lemmatize(token, pos='v') for token in input_tokens]

In [9]:
# One row per input token: the original word next to its stem and its lemma.
comparison_columns = {
    'words': input_tokens,
    'Snowball Stemmer': ss_stemmed_tokens,
    'WordNet Lemmatizer': wnl_lemmatized_tokens,
}
stems_lemmas_df = pd.DataFrame(comparison_columns)

stems_lemmas_df

Unnamed: 0,words,Snowball Stemmer,WordNet Lemmatizer
0,dictionaries,dictionari,dictionaries
1,dictionary,dictionari,dictionary
2,hushed,hush,hush
3,hush,hush,hush
4,hushing,hush,hush
5,functional,function,functional
6,functionally,function,functionally
7,lying,lie,lie
8,lied,lie,lie
9,lies,lie,lie


In [10]:
from nltk.tokenize import word_tokenize
 
# Load the whole sample document into memory; the handle is closed when the
# `with` block exits.
with open('DLdata.txt', 'r') as dl_file:
    file_contents = dl_file.read()

print(file_contents)

The advent of computer graphic processing units, improvement in mathematical models and availability of big data has allowed artificial intelligence (AI) using machine learning (ML) and deep learning (DL) techniques to achieve robust performance for broad applications in social-media, the internet of things, the automotive industry and healthcare. DL systems in particular provide improved capability in image, speech and motion recognition as well as in natural language processing. In medicine, significant progress of AI and DL systems has been demonstrated in image-centric specialties such as radiology, dermatology, pathology and ophthalmology. New studies, including pre-registered prospective clinical trials, have shown DL systems are accurate and effective in detecting diabetic retinopathy (DR), glaucoma, age-related macular degeneration (AMD), retinopathy of prematurity, refractive error and in identifying cardiovascular risk factors and diseases, from digital fundus photographs. Th

In [11]:
# Split the document into tokens; punctuation ends up as separate tokens
# (visible in the re-joined output further down).
word_tokens = word_tokenize(file_contents)

In [12]:
# Fresh lemmatizer for the document; every token is lemmatized with the verb tag.
wnl = WordNetLemmatizer()
lemmatized_words = [wnl.lemmatize(word, pos="v") for word in word_tokens]

In [13]:
# Re-join the lemmas with single spaces for display; punctuation tokens
# therefore appear surrounded by spaces.
" ".join(lemmatized_words)

'The advent of computer graphic process units , improvement in mathematical model and availability of big data have allow artificial intelligence ( AI ) use machine learn ( ML ) and deep learn ( DL ) techniques to achieve robust performance for broad applications in social-media , the internet of things , the automotive industry and healthcare . DL systems in particular provide improve capability in image , speech and motion recognition as well as in natural language process . In medicine , significant progress of AI and DL systems have be demonstrate in image-centric specialties such as radiology , dermatology , pathology and ophthalmology . New study , include pre-registered prospective clinical trials , have show DL systems be accurate and effective in detect diabetic retinopathy ( DR ) , glaucoma , age-related macular degeneration ( AMD ) , retinopathy of prematurity , refractive error and in identify cardiovascular risk factor and diseases , from digital fundus photograph . There 

# Stopwords

In [14]:
from nltk import word_tokenize
from nltk.corpus import stopwords

import nltk
# Download the stop-word lists read through `nltk.corpus.stopwords` below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [15]:
# List the available stop-word files (one per language).
print(stopwords.fileids())

['arabic', 'azerbaijani', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'slovene', 'spanish', 'swedish', 'tajik', 'turkish']


In [16]:
# Show the full stop-word list for two of the available languages.
for label, language in (("English: ", 'english'),
                        ("Arabic: ", 'arabic')):
    print(label, stopwords.words(language))

English:  ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',

In [17]:
# Sample sentences. Two of them end with a trailing space, which is why the
# joined text below contains double spaces.
text_array = ["A bird in hand is worth two in the bush.",
              "Good things come to those who wait.",
              "These watches cost $1500! ",
              "There are other fish in the sea.",
              "The ball is in your court.",
              "Mr. Smith Goes to Washington ",
              "Doogie Howser M.D."]

# Concatenate the sentences into one string, separated by single spaces.
text = " ".join(text_array)
text

'A bird in hand is worth two in the bush. Good things come to those who wait. These watches cost $1500!  There are other fish in the sea. The ball is in your court. Mr. Smith Goes to Washington  Doogie Howser M.D.'

In [18]:
# Tokenize the joined sentences. This rebinds `word_tokens`, replacing the
# document tokens produced in the lemmatization section.
word_tokens = word_tokenize(text)

word_tokens

['A',
 'bird',
 'in',
 'hand',
 'is',
 'worth',
 'two',
 'in',
 'the',
 'bush',
 '.',
 'Good',
 'things',
 'come',
 'to',
 'those',
 'who',
 'wait',
 '.',
 'These',
 'watches',
 'cost',
 '$',
 '1500',
 '!',
 'There',
 'are',
 'other',
 'fish',
 'in',
 'the',
 'sea',
 '.',
 'The',
 'ball',
 'is',
 'in',
 'your',
 'court',
 '.',
 'Mr.',
 'Smith',
 'Goes',
 'to',
 'Washington',
 'Doogie',
 'Howser',
 'M.D',
 '.']

In [19]:
# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# NOTE(review): the comparison is case-sensitive while the stop-word list is
# lower-case, so capitalised tokens such as 'A', 'These' and 'The' are kept.
filtered_words = [word for word in word_tokens if word not in stop_words]

print(filtered_words)

['A', 'bird', 'hand', 'worth', 'two', 'bush', '.', 'Good', 'things', 'come', 'wait', '.', 'These', 'watches', 'cost', '$', '1500', '!', 'There', 'fish', 'sea', '.', 'The', 'ball', 'court', '.', 'Mr.', 'Smith', 'Goes', 'Washington', 'Doogie', 'Howser', 'M.D', '.']


In [20]:
with open("DLdata.txt", "w") as f:
    for word in filtered_words:
        f.write(word)
        f.write(' ')

In [21]:
with open("DLdata.txt", "r") as f:
    file_contents = f.read()

print(file_contents)

A bird hand worth two bush . Good things come wait . These watches cost $ 1500 ! There fish sea . The ball court . Mr. Smith Goes Washington Doogie Howser M.D . 


In [22]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the stop-word-filtered text, passed in as a
# single document.
count_vectorizer = CountVectorizer()
count_vectorizer.fit([file_contents])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [23]:
# Count the learned vocabulary terms in each original sentence
# (one row per sentence, one column per term).
transformed_vector = count_vectorizer.transform(text_array)

transformed_vector.shape

(7, 25)

In [24]:
# Vocabulary terms in column order. `get_feature_names()` was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer `get_feature_names_out()`
# when it exists and fall back to the legacy method on older releases.
if hasattr(count_vectorizer, "get_feature_names_out"):
    # `.tolist()` turns the returned NumPy array into a plain list of str,
    # matching what the legacy method returned.
    feature_names_nltk = count_vectorizer.get_feature_names_out().tolist()
else:
    feature_names_nltk = count_vectorizer.get_feature_names()

In [25]:
# Mapping from each learned (lower-cased) term to its column index.
count_vectorizer.vocabulary_

{'bird': 2,
 'hand': 11,
 'worth': 24,
 'two': 20,
 'bush': 3,
 'good': 10,
 'things': 19,
 'come': 4,
 'wait': 21,
 'these': 18,
 'watches': 23,
 'cost': 5,
 '1500': 0,
 'there': 17,
 'fish': 8,
 'sea': 14,
 'the': 16,
 'ball': 1,
 'court': 6,
 'mr': 13,
 'smith': 15,
 'goes': 9,
 'washington': 22,
 'doogie': 7,
 'howser': 12}

In [26]:
# Convert the count matrix to a dense array for display.
transformed_vector.toarray()

array([[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
        0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
        0, 0, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
        1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0]], dtype=int64)

In [27]:
# Map each row back to the vocabulary terms that have a non-zero count in it.
count_vectorizer.inverse_transform(transformed_vector)

[array(['bird', 'bush', 'hand', 'the', 'two', 'worth'], dtype='<U10'),
 array(['come', 'good', 'things', 'wait'], dtype='<U10'),
 array(['1500', 'cost', 'these', 'watches'], dtype='<U10'),
 array(['fish', 'sea', 'the', 'there'], dtype='<U10'),
 array(['ball', 'court', 'the'], dtype='<U10'),
 array(['goes', 'mr', 'smith', 'washington'], dtype='<U10'),
 array(['doogie', 'howser'], dtype='<U10')]

# Removing Stopwords Using sklearn

In [28]:
# Let scikit-learn drop English stop words itself and fit directly on the
# unfiltered sentences. This rebinds `count_vectorizer` and
# `transformed_vector` from the previous section.
count_vectorizer = CountVectorizer(stop_words='english')

transformed_vector = count_vectorizer.fit_transform(text_array)

transformed_vector.shape

(7, 21)

In [29]:
# Vocabulary terms in column order. `get_feature_names()` was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer `get_feature_names_out()`
# when it exists and fall back to the legacy method on older releases.
if hasattr(count_vectorizer, "get_feature_names_out"):
    # `.tolist()` turns the returned NumPy array into a plain list of str,
    # matching what the legacy method returned.
    feature_names_sklearn = count_vectorizer.get_feature_names_out().tolist()
else:
    feature_names_sklearn = count_vectorizer.get_feature_names()

In [30]:
# Convert the count matrix to a dense array for display.
transformed_vector.toarray()

array([[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int64)

In [31]:
# Map each row back to the vocabulary terms that have a non-zero count in it.
count_vectorizer.inverse_transform(transformed_vector)

[array(['bird', 'hand', 'worth', 'bush'], dtype='<U10'),
 array(['good', 'things', 'come', 'wait'], dtype='<U10'),
 array(['watches', 'cost', '1500'], dtype='<U10'),
 array(['fish', 'sea'], dtype='<U10'),
 array(['ball', 'court'], dtype='<U10'),
 array(['mr', 'smith', 'goes', 'washington'], dtype='<U10'),
 array(['doogie', 'howser'], dtype='<U10')]

# Set Difference of the NLTK and sklearn Vocabularies


In [32]:
def set_diff(first, second):
    """Return the items of `first` that do not occur in `second`.

    The result is a list that keeps the order (and any duplicates) of
    `first`. `second` is converted to a set for the membership tests, so its
    items must be hashable.
    """
    excluded = set(second)
    remaining = []
    for item in first:
        if item not in excluded:
            remaining.append(item)
    return remaining

In [33]:
# Terms in the sklearn-filtered vocabulary that are missing from the NLTK-filtered one.
set_diff(feature_names_sklearn, feature_names_nltk)

[]

In [34]:
# Terms in the NLTK-filtered vocabulary that are missing from the sklearn-filtered one.
set_diff(feature_names_nltk, feature_names_sklearn)

['the', 'there', 'these', 'two']

# Filtering words based on frequency

In [35]:
from sklearn.datasets import fetch_20newsgroups

# Load the training subset of the 20 Newsgroups corpus (the log below shows
# it being downloaded on this run).
newsgroups = fetch_20newsgroups(subset='train')

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [36]:
# Inspect which fields the returned dataset object exposes.
newsgroups.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [37]:
# Show the first raw post, header lines included.
print(newsgroups.data[0])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [38]:
# Names of the newsgroup categories in the dataset.
newsgroups.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [39]:
# Default vectorizer (no stop-word option set), fitted on every training post.
# This rebinds `count_vectorizer` and `transformed_vector` once more.
count_vectorizer = CountVectorizer()

transformed_vector = count_vectorizer.fit_transform(newsgroups.data)

# (number of posts, vocabulary size)
transformed_vector.shape

(11314, 130107)