# Stop Words Removal

In [1]:
import nltk

In [2]:
from nltk.corpus import stopwords

In [3]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
from nltk.tokenize import word_tokenize
 
example_sent = "The process of converting data to something a computer can understand is referred to as pre-processing. One of the major forms of pre-processing is to filter out useless data."
 
stop_words = set(stopwords.words('english'))
 
word_tokens = word_tokenize(example_sent)
 
filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
 
filtered_sentence = []
 
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)
 
print(word_tokens)
print(filtered_sentence)

['The', 'process', 'of', 'converting', 'data', 'to', 'something', 'a', 'computer', 'can', 'understand', 'is', 'referred', 'to', 'as', 'pre-processing', '.', 'One', 'of', 'the', 'major', 'forms', 'of', 'pre-processing', 'is', 'to', 'filter', 'out', 'useless', 'data', '.']
['The', 'process', 'converting', 'data', 'something', 'computer', 'understand', 'referred', 'pre-processing', '.', 'One', 'major', 'forms', 'pre-processing', 'filter', 'useless', 'data', '.']


# Tokenization

In [5]:
from nltk.tokenize import word_tokenize
 
example_sent = "The process of converting data to something a computer can understand is referred to as pre-processing. One of the major forms of pre-processing is to filter out useless data."
word_tokens = word_tokenize(example_sent)
print(word_tokens)

['The', 'process', 'of', 'converting', 'data', 'to', 'something', 'a', 'computer', 'can', 'understand', 'is', 'referred', 'to', 'as', 'pre-processing', '.', 'One', 'of', 'the', 'major', 'forms', 'of', 'pre-processing', 'is', 'to', 'filter', 'out', 'useless', 'data', '.']


# tokenize tweets

In [6]:
tweet_text = '''
It was an honor to welcome United Kingdom Prime Minister
Boris Johnson to the White House this afternoon. The bond between our two
nations is ironclad and we’re committed to working together on everything
from climate change to COVID-19 in the years ahead.
'''
tweet_tokens = nltk.word_tokenize(tweet_text)
print(tweet_tokens)

['It', 'was', 'an', 'honor', 'to', 'welcome', 'United', 'Kingdom', 'Prime', 'Minister', 'Boris', 'Johnson', 'to', 'the', 'White', 'House', 'this', 'afternoon', '.', 'The', 'bond', 'between', 'our', 'two', 'nations', 'is', 'ironclad', 'and', 'we', '’', 're', 'committed', 'to', 'working', 'together', 'on', 'everything', 'from', 'climate', 'change', 'to', 'COVID-19', 'in', 'the', 'years', 'ahead', '.']


# Similarities

In [6]:
import spacy

## Loading the small model containing tensors.
nlp = spacy.load('en_core_web_md')
  
print("Enter two space-separated words")
words = input()
  
tokens = nlp(words)
  
for token in tokens:
    # Printing the following attributes of each token.
    # text: the word string, has_vector: if it contains
    # a vector representation in the model, 
    # vector_norm: the algebraic norm of the vector,
    # is_oov: if the word is out of vocabulary.
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)
  
token1, token2 = tokens[0], tokens[1]
  
print("Similarity:", token1.similarity(token2))

Enter two space-separated words
man woman
man True 6.352939 False
woman True 6.8987513 False
Similarity: 0.7401744
