In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import spacy
from textblob import TextBlob

In [2]:
# Fetch the NLTK data packages used by the later cells. The value displayed
# for this cell is the return value of the final download() call.
nltk.download('punkt')  # tokenizer data
nltk.download('punkt_tab')  # tokenizer data
nltk.download('stopwords')  # stop-word corpus read via stopwords.words()
nltk.download('wordnet')  # WordNet corpus (presumably for WordNetLemmatizer)
nltk.download('averaged_perceptron_tagger')  # POS tagger model
nltk.download('averaged_perceptron_tagger_eng')  # POS tagger model (English)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bishw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\bishw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bishw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bishw\AppData\Roaming\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\bishw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\bishw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

In [3]:
# Sample sentence reused by the NLTK, spaCy and TextBlob cells below.
sentence = "Apple is looking at buying a U.K. startup for $1 billion."

In [4]:
# Split the sentence into word and punctuation tokens with NLTK.
tokens = word_tokenize(sentence)

In [5]:
# Show the token list.
tokens

['Apple',
 'is',
 'looking',
 'at',
 'buying',
 'a',
 'U.K.',
 'startup',
 'for',
 '$',
 '1',
 'billion',
 '.']

In [6]:
# English stop words from the NLTK corpus, held in a set so the membership
# checks in the filtering cell are cheap.
stop_words = set(stopwords.words('english'))

In [7]:
# Show the whole stop-word set.
# NOTE(review): this prints every entry; a slice such as
# sorted(stop_words)[:10] would keep the output short.
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [8]:
# Number of distinct stop words in the set.
len(stop_words)

179

In [9]:
# Drop stop words. The comparison is done on the lowercased token so that
# capitalised stop words are removed too; surviving tokens keep their casing.
filtered_tokens = [
    tok
    for tok in tokens
    if tok.lower() not in stop_words
]

In [10]:
# Show the tokens that survived stop-word removal.
filtered_tokens

['Apple', 'looking', 'buying', 'U.K.', 'startup', '$', '1', 'billion', '.']

In [12]:
# Porter stemmer instance used to stem the filtered tokens.
stemmer = PorterStemmer()

In [13]:
# Apply the Porter stemmer to every filtered token, preserving order.
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))

In [14]:
# Show the stemmed tokens.
stemmed_tokens

['appl', 'look', 'buy', 'u.k.', 'startup', '$', '1', 'billion', '.']

In [11]:
# WordNet-based lemmatizer instance.
# NOTE(review): this cell's execution count (11) is lower than that of the
# stemming cells above it (12-14), so it was run out of order; confirm the
# notebook still works with Restart & Run All.
lemmatizer = WordNetLemmatizer()

In [15]:
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

In [16]:
lemmatized_tokens

['Apple', 'looking', 'buying', 'U.K.', 'startup', '$', '1', 'billion', '.']

In [17]:
# Part-of-speech tag the full token list (stop words included).
pos_tags = nltk.pos_tag(tokens)

In [18]:
# Show the (token, tag) pairs.
pos_tags

[('Apple', 'NNP'),
 ('is', 'VBZ'),
 ('looking', 'VBG'),
 ('at', 'IN'),
 ('buying', 'VBG'),
 ('a', 'DT'),
 ('U.K.', 'NNP'),
 ('startup', 'NN'),
 ('for', 'IN'),
 ('$', '$'),
 ('1', 'CD'),
 ('billion', 'CD'),
 ('.', '.')]

In [20]:
# Load the spaCy pipeline named 'en_core_web_sm'; the model package must
# already be installed in the environment.
nlp = spacy.load('en_core_web_sm')

In [21]:
# Run the spaCy pipeline on the sample sentence.
doc = nlp(sentence)

In [22]:
# Show the processed document (it displays as the original text).
doc

Apple is looking at buying a U.K. startup for $1 billion.

In [23]:
# Named entities detected in the document.
doc.ents

(Apple, U.K., $1 billion)

In [24]:
# Print each detected entity's text alongside its entity label.
for ent in doc.ents:
    print(f"{ent.text} -> {ent.label_}")

Apple -> ORG
U.K. -> GPE
$1 billion -> MONEY


In [25]:
# Print each token with its dependency label and the text of its head token.
for token in doc:
    print(f"{token.text} ({token.dep_}) <- {token.head.text}")

Apple (nsubj) <- looking
is (aux) <- looking
looking (ROOT) <- looking
at (prep) <- looking
buying (pcomp) <- at
a (det) <- U.K.
U.K. (dobj) <- buying
startup (advcl) <- looking
for (prep) <- startup
$ (quantmod) <- billion
1 (compound) <- billion
billion (pobj) <- for
. (punct) <- looking


In [26]:
# Wrap the sample sentence in a TextBlob for sentiment analysis.
blob = TextBlob(sentence)

In [27]:
# Report TextBlob's polarity and subjectivity scores for the sentence.
print(f"Polarity: {blob.sentiment.polarity}, Subjectivity: {blob.sentiment.subjectivity}")

Polarity: 0.0, Subjectivity: 0.0


In [28]:
# Deliberately misspelled sentence used to demonstrate spelling correction.
incorrect_sen = "certain condtion druing aseverl generaton aree modifition in the samme mannners."

In [29]:
# TextBlob wrapper around the misspelled sentence.
textBlb = TextBlob(incorrect_sen)

In [30]:
# Spell-correct the text and show the result as a plain string. correct()
# is not assigned back here, so textBlb keeps the misspelled text.
textBlb.correct().string

'certain condition during several generation are modification in the same manners.'

In [33]:
# Count occurrences of a single word. `word_counts` is keyed by individual
# lowercased words, so indexing it with the whole sentence always gives 0.
textBlb.word_counts['certain']

1

In [39]:
# Lemmatize each word of the uncorrected text; the recorded output shows the
# misspelled words coming back unchanged.
# NOTE(review): probably meant to run on textBlb.correct().words so the
# lemmatizer sees real words; confirm intent.
textBlb.words.lemmatize()

WordList(['certain', 'condtion', 'druing', 'aseverl', 'generaton', 'aree', 'modifition', 'in', 'the', 'samme', 'mannners'])