In [36]:
from textblob import TextBlob
# Sample sentence wrapped in a TextBlob; the tagging and noun-phrase cells below read it as `s`.
s = TextBlob("I love Natural Language Processing, not you!")

In [3]:
# Part-of-speech (POS) tagging: `tags` gives a list of (word, tag) pairs for the blob.
s.tags

[('I', 'PRP'),
 ('love', 'VBP'),
 ('Natural', 'JJ'),
 ('Language', 'NNP'),
 ('Processing', 'NNP'),
 ('not', 'RB'),
 ('you', 'PRP')]

In [4]:
# Noun phrase extraction: `noun_phrases` gives a WordList of the phrases found in the blob.
s.noun_phrases

WordList(['language processing'])

In [5]:
# Sentiment Analysis
# The sentiment property returns a named tuple of the form Sentiment(polarity, subjectivity). The polarity
# score is a float within the range [-1.0, 1.0]. The subjectivity is a float within the range [0.0, 1.0],
# where 0.0 is very objective and 1.0 is very subjective.
testimonial = TextBlob("Textblob is amazingly simple to use. What great fun!")
# Last expression of the cell, so the Sentiment tuple is shown as the cell output.
testimonial.sentiment

Sentiment(polarity=0.39166666666666666, subjectivity=0.4357142857142857)

In [6]:
# Tokenization
# The adjacent string literals are concatenated into a single three-sentence text.
# NOTE: this rebinds `s`, replacing the blob created in the first cell.
s = TextBlob("Data is a new fuel. "
               "Explicit is better than implicit. "
               "Simple is better than complex. ")
# `words` lists the word tokens without punctuation (see output).
print(s.words)
# `sentences` lists one Sentence object per sentence.
print(s.sentences)

['Data', 'is', 'a', 'new', 'fuel', 'Explicit', 'is', 'better', 'than', 'implicit', 'Simple', 'is', 'better', 'than', 'complex']
[Sentence("Data is a new fuel."), Sentence("Explicit is better than implicit."), Sentence("Simple is better than complex.")]


In [7]:
# Word Inflection
sentence = TextBlob('Use 4 spaces per indentation level')
words = sentence.words
print(words)
# words[2] is 'spaces'; print its singular form.
print(words[2].singularize())
# words[0] is 'Use'; print its plural form.
print(words[0].pluralize())

['Use', '4', 'spaces', 'per', 'indentation', 'level']
space
Uses


In [8]:
# Lemmatization
from textblob import Word
w1 = Word("lions")
# Called without a part-of-speech argument (prints 'lion').
print(w1.lemmatize())
w2 = Word("went")
# "v" requests the verb lemma (prints 'go').
print(w2.lemmatize("v"))

lion
go


In [9]:
# Synsets
# Synset is a simple interface in NLTK for looking up words in WordNet. Synset instances are
# groupings of synonymous words that express the same concept. Some words have only one synset and some have several.
from textblob import Word
from textblob.wordnet import VERB
word = Word("goat")
# All synsets for the word, regardless of part of speech.
print(word.synsets)
# Restrict the lookup to verb synsets only.
print(Word("hack").get_synsets(pos=VERB))

[Synset('goat.n.01'), Synset('butt.n.03'), Synset('capricorn.n.01'), Synset('capricorn.n.03')]
[Synset('chop.v.05'), Synset('hack.v.02'), Synset('hack.v.03'), Synset('hack.v.04'), Synset('hack.v.05'), Synset('hack.v.06'), Synset('hack.v.07'), Synset('hack.v.08')]


In [10]:
# You can access the definitions for each synset via the definitions property or the define() method,
# which can also take an optional part-of-speech (pos) argument.
# The result is a list of definition strings, one per synset (see output).
Word("Height").definitions

['the vertical dimension of extension; distance from the base of something to the top',
 'the highest level or degree attainable; the highest stage of development',
 '(of a standing person) the distance from head to foot',
 "elevation especially above sea level or above the earth's surface"]

In [11]:
# You can also create synsets directly.
from textblob.wordnet import Synset
octopus = Synset("octopus.n.02")
shrimp = Synset("shrimp.n.03")
# Path similarity between the two synsets, returned as a float (see the output below).
octopus.path_similarity(shrimp)

0.1111111111111111

In [13]:
# WordLists
# A WordList is just a Python list with additional methods.
animals = TextBlob("cow sheep octopus")
print(animals.words)
# Calling pluralize() on the WordList pluralizes every word in it.
print(animals.words.pluralize())

['cow', 'sheep', 'octopus']
['kine', 'sheep', 'octopodes']


In [16]:
# Spelling Correction
g = TextBlob('can yooou pronounce czechuslovakia?')
# correct() returns a TextBlob with the suggested spellings applied (see output).
g.correct()

TextBlob("can you pronounce czechoslovakia?")

In [20]:
# Word objects have a spellcheck() method that returns a list of (word, confidence) tuples with spelling suggestions.
from textblob import Word
k = Word("longitude")
# A correctly spelled word comes back as its own only suggestion with confidence 1.0 (see output).
k.spellcheck()

[('longitude', 1.0)]

In [33]:
# Get Word and Noun Phrase Frequencies
# NOTE(review): 'sales' looks like a typo for 'sells'; left as is because it appears in the recorded output.
sent = TextBlob('She sales sea shells at the sea shore.')
# word_counts maps each lowercased word to its number of occurrences (see output).
print(sent.word_counts)
# Case-sensitive count: the capitalized 'Sea' never occurs in the text, so this prints 0.
print(sent.words.count('Sea', case_sensitive=True))

# Each of these methods can also be used with noun phrases.
# Prints 0 here: no extracted noun phrase is exactly 'sea'.
print(sent.noun_phrases.count('sea'))

defaultdict(<class 'int'>, {'she': 1, 'sales': 1, 'sea': 2, 'shells': 1, 'at': 1, 'the': 1, 'shore': 1})
0
0


In [40]:
# Translation and Language Detection
# Only translation (English -> Hindi) is demonstrated here; no language detection call is made.
# NOTE(review): translate() relies on an online translation service, so this cell needs network
# access. It was deprecated in TextBlob 0.16.0 and removed in 0.18.0 — confirm the installed
# version before relying on this cell under Restart & Run All.
blob = TextBlob(u'Something is better than nothing.')
blob.translate(from_lang='en', to='hi')

TextBlob("कुछ नहीं से कुछ भला।")

In [44]:
# Parsing
# By default, TextBlob uses pattern's parser.
b = TextBlob("And now for something completely different.")
# Each token is printed with slash-separated annotations, e.g. "And/CC/O/O" (see output).
print(b.parse())

And/CC/O/O now/RB/B-ADVP/O for/IN/B-PP/B-PNP something/NN/B-NP/I-PNP completely/RB/B-ADJP/O different/JJ/I-ADJP/O ././O/O


In [45]:
# TextBlobs behave like Python strings: slicing returns a TextBlob of the first 15 characters.
b[:15]

TextBlob("And now for som")

In [46]:
# String-style methods such as upper() also return a TextBlob (see output).
b.upper()

TextBlob("AND NOW FOR SOMETHING COMPLETELY DIFFERENT.")

In [47]:
# The text starts with a capitalized "And", so the lowercase "and" is not found and the result is -1.
b.find("and")

-1

In [49]:
# n-grams
# ngrams() returns a list of WordLists, each holding n successive words.
# NOTE: this rebinds `blob`, replacing the one from the translation cell.
blob = TextBlob("Now is better than never.")
blob.ngrams(n=4)

[WordList(['Now', 'is', 'better', 'than']),
 WordList(['is', 'better', 'than', 'never'])]

In [53]:
# Get the start and end indices of each sentence within the full text.
zen = TextBlob("Data is a new fuel. "
               "Explicit is better than implicit. "
               "Simple is better than complex. ")
for line in zen.sentences:
    # NOTE(review): there is no space before "start:", so the output reads "...fuel.start: 0".
    print(line + "start: " + str(line.start) + " end: " + str(line.end))

Data is a new fuel.start: 0 end: 19
Explicit is better than implicit.start: 20 end: 53
Simple is better than complex.start: 54 end: 84


## Text Classification System

The textblob.classifiers module makes it simple to create custom classifiers.

As an example, let’s create a custom sentiment analyzer.

### Loading Data and Creating a Classifier


In [54]:
# First we'll create some training and test data.
# Each example is a (text, label) tuple; the label is either 'pos' or 'neg'.
train = [
    ('I love this sandwich.', 'pos'),
    ('this is an amazing place!', 'pos'),
    ('I feel very good about these beers.', 'pos'),
    ('this is my best work.', 'pos'),
    ("what an awesome view", 'pos'),
    ('I do not like this restaurant', 'neg'),
    ('I am tired of this stuff.', 'neg'),
    ("I can't deal with this", 'neg'),
    ('he is my sworn enemy!', 'neg'),
    ('my boss is horrible.', 'neg')
]
# Held-out examples in the same format, used later to measure accuracy.
test = [
    ('the beer was good.', 'pos'),
    ('I do not enjoy my job', 'neg'),
    ("I ain't feeling dandy today.", 'neg'),
    ("I feel amazing!", 'pos'),
    ('Gary is a friend of mine.', 'pos'),
    ("I can't believe I'm doing this.", 'neg')
]

In [56]:
# Now we’ll create a Naive Bayes classifier, passing the training data into the constructor.
from textblob.classifiers import NaiveBayesClassifier
cl = NaiveBayesClassifier(train)
# Show the classifier's repr, which reports how many instances it was trained on.
cl

<NaiveBayesClassifier trained on 10 instances>

### Loading data from files
<br/>
CSV files should be formatted like so:
<br/>
I love this sandwich.,pos<br/>
This is an amazing place!,pos<br/>
I do not like this restaurant,neg<br>
<br/><br/>
JSON files should be formatted like so:
<br>
[<br>
    {"text": "I love this sandwich.", "label": "pos"},<br>
    {"text": "This is an amazing place!", "label": "pos"},<br>
    {"text": "I do not like this restaurant", "label": "neg"}<br>
]<br><br>
You can then pass the opened file into the constructor.<br>

with open('train.json', 'r') as fp:<br>
    cl = NaiveBayesClassifier(fp, format="json")

### Classifying Text

In [57]:
# Predict the label for a sentence that is not in the training data.
cl.classify("This is an amazing library!")

'pos'

In [59]:
# You can get the label probability distribution with the prob_classify(text) method.
prob_dist = cl.prob_classify("This is an amazing library!")
# max() gives the most probable label.
prob_dist.max()

'pos'

In [60]:
# Probability assigned to the 'pos' label.
prob_dist.prob("pos")

0.980117820324005

In [61]:
# Probability assigned to the 'neg' label.
prob_dist.prob("neg")

0.01988217967599422

### Classifying TextBlobs

In [63]:
from textblob import TextBlob
# Attach the trained classifier to the blob so that classify() uses it.
# NOTE: this rebinds `blob`, which previously held the n-grams example.
blob = TextBlob("The beer is good. But the hangover is horrible.", classifier=cl)
# Classifies the whole two-sentence text as one document.
blob.classify()

'pos'

In [64]:
# The advantage of this approach is that you can classify sentences within a TextBlob.
# NOTE: the loop variable `s` overwrites the TextBlob named `s` from the tokenization cell.
for s in blob.sentences:
    print(s, s.classify())

The beer is good. pos
But the hangover is horrible. neg


### Evaluating Classifiers

In [65]:
# Accuracy of the classifier on the held-out `test` examples.
cl.accuracy(test)

0.8333333333333334

In [66]:
# Use the show_informative_features() method to display a listing of the most informative features.
# Each row shows a feature value and the ratio between the two labels for it (see output).
cl.show_informative_features()

Most Informative Features
            contains(my) = True              neg : pos    =      1.7 : 1.0
            contains(an) = False             neg : pos    =      1.6 : 1.0
             contains(I) = False             pos : neg    =      1.4 : 1.0
             contains(I) = True              neg : pos    =      1.4 : 1.0
            contains(my) = False             pos : neg    =      1.3 : 1.0
         contains(about) = False             neg : pos    =      1.2 : 1.0
            contains(am) = False             pos : neg    =      1.2 : 1.0
       contains(amazing) = False             neg : pos    =      1.2 : 1.0
       contains(awesome) = False             neg : pos    =      1.2 : 1.0
         contains(beers) = False             neg : pos    =      1.2 : 1.0


### Updating Classifiers with new data

In [67]:
# Additional labelled examples, in the same (text, label) format as the training data.
new_data = [('She is my best friend.', 'pos'),
            ("I'm happy to have a new friend.", 'pos'),
            ("Stay thirsty, my friend.", 'pos'),
            ("He ain't from around here.", 'neg')]
# Feed the extra examples to the existing classifier; the call returns True (see output).
cl.update(new_data=new_data)

True

In [69]:
# Re-measure accuracy on the same `test` examples after the update.
cl.accuracy(test)

1.0

### Feature Extractors
By default, the NaiveBayesClassifier uses a simple feature extractor that indicates which words in the training set are contained in a document.
<br><br>
For example, the sentence “I feel happy” might have the features contains(happy): True or contains(angry): False.
<br><br>
You can override this feature extractor by writing your own. A feature extractor is simply a function with document (the text to extract features from) as the first argument. The function may include a second argument, train_set (the training dataset), if necessary.
<br><br>
The function should return a dictionary of features for document.
<br><br>
For example, let’s create a feature extractor that just uses the first and last words of a document as its features.


In [72]:
def end_word_extractor(document):
    """Build features from the first and last whitespace-separated tokens.

    Args:
        document: The text to extract features from. Only its ``split()``
            method is used, so tokens are separated on whitespace and keep
            any attached punctuation.

    Returns:
        dict: ``{"first(<token>)": True, "last(<token>)": False}`` for a
        non-empty document, or an empty dict when the document contains no
        tokens.
    """
    tokens = document.split()
    if not tokens:
        # Empty or whitespace-only text has no first/last word; return no
        # features instead of raising IndexError on tokens[0].
        return {}
    first_word, last_word = tokens[0], tokens[-1]
    return {
        "first({0})".format(first_word): True,
        # The last-word feature is deliberately stored with the value False.
        "last({0})".format(last_word): False,
    }

# Quick check of the extractor on a three-word document.
features = end_word_extractor("I feel happy")
print(features)
# Dict equality ignores key order, so the literal's order need not match the printed order.
assert features == {'last(happy)': False, 'first(I)': True}

{'first(I)': True, 'last(happy)': False}


In [75]:
# We can then use the feature extractor in a classifier by passing it as the feature_extractor argument of the constructor.
# NOTE(review): this classifier is trained on `test`, not `train`. If that is unintended, switch to
# `train` and re-run the cell, since the recorded output below depends on the training data.
cl2 = NaiveBayesClassifier(test, feature_extractor=end_word_extractor)
# NOTE: this rebinds `blob` once more, now with the custom-extractor classifier attached.
blob = TextBlob("I'm excited to try my new classifier.", classifier=cl2)
blob.classify()

'pos'