In [5]:
 from textblob import TextBlob 

In [6]:
wiki = TextBlob("Blake tindol cannot figure out how to pass a csv file to Text blob. This is a problem for blake because he wants to learn")

In [7]:
# Fetch the NLTK 'punkt' package before the tagging/tokenization cells below
# (presumably required by TextBlob's tokenizer - confirm against the TextBlob docs).
# The recorded log shows the call is a no-op when the package is already up to date.
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /home/cloudera/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# part of speech tagging

In [8]:
wiki.tags


[('Blake', 'NNP'),
 ('tindol', 'NN'),
 ('can', 'MD'),
 ('not', 'RB'),
 ('figure', 'VB'),
 ('out', 'RP'),
 ('how', 'WRB'),
 ('to', 'TO'),
 ('pass', 'VB'),
 ('a', 'DT'),
 ('csv', 'NN'),
 ('file', 'NN'),
 ('to', 'TO'),
 ('Text', 'NNP'),
 ('blob', 'NN'),
 ('This', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('problem', 'NN'),
 ('for', 'IN'),
 ('blake', 'NN'),
 ('because', 'IN'),
 ('he', 'PRP'),
 ('wants', 'VBZ'),
 ('to', 'TO'),
 ('learn', 'VB')]

# Extraction of noun phrases

In [12]:
nltk.download('brown')
wiki.noun_phrases

[nltk_data] Downloading package brown to /home/cloudera/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


WordList(['blake', 'csv file', 'text'])

# Sentiment 

In [19]:
# Score a short review. `sentiment` prints as Sentiment(polarity, subjectivity);
# `sentiment_assessments` additionally lists the per-phrase contributions.
testimonial = TextBlob("Textblob is amazingly simple to use. What great fun!")
overall = testimonial.sentiment
polarity, subjectivity = overall[0], overall[1]

print("Sentiment: {}".format(overall))
print("Polarity: {0:.3f}. Subjectivity: {1:.3f}".format(polarity, subjectivity))
print("Polarity: {0:.3f}".format(overall.polarity))
print("Sentiment assessments: {}".format(testimonial.sentiment_assessments))

Sentiment: Sentiment(polarity=0.39166666666666666, subjectivity=0.4357142857142857)
Polarity: 0.392. Subjectivity: 0.436
Polarity: 0.392
Sentiment assessments: Sentiment(polarity=0.39166666666666666, subjectivity=0.4357142857142857, assessments=[(['amazingly', 'simple'], 0.0, 0.35714285714285715, None), (['great'], 0.8, 0.75, None), (['fun', '!'], 0.375, 0.2, None)])


# (1.4) Tokenization


In [22]:
zen = TextBlob("Blake is cool. "
                "Melina is cooler. "
                " complex not cool.")

In [23]:
# splitting words
zen.words

WordList(['Blake', 'is', 'cool', 'Melina', 'is', 'cooler', 'complex', 'not', 'cool'])

In [24]:
# splitting sentences
zen.sentences

[Sentence("Blake is cool."),
 Sentence("Melina is cooler."),
 Sentence("complex not cool.")]

In [25]:
# Sentence-level sentiment.
for sentence in zen.sentences:
    print(sentence.sentiment)

Sentiment(polarity=0.35, subjectivity=0.65)
Sentiment(polarity=0.0, subjectivity=0.0)
Sentiment(polarity=-0.2375, subjectivity=0.525)


# Word inflection, lemmatization, and stemming

Each word in TextBlob.words or Sentence.words is a Word object, which is a subclass of unicode, with useful methods.

Word inflection is the process of modifying a word's base form to express a different grammatical category, such as tense, case, voice, aspect, person, mood, etc. Examples are "bus" --> "buses", "party" --> "parties", "ride" --> "riding", "Paul" --> "Paul's", "Him" --> "Himself", "Smart" --> "Smarter".

Lemmatization reduces the inflected forms of a word to its base dictionary form, known as the lemma. This is slightly different from stemming, which chops off the ends of words in the hope of reducing inflected forms to a common root. This common root may not be a dictionary word. For example, the Porter stemmer reduces "argue", "arguing" and "argues" to "argu".

In [28]:
sentence = TextBlob('The test tomorrow scares me')
sentence.words

WordList(['The', 'test', 'tomorrow', 'scares', 'me'])

In [29]:
# Singularize the 3rd word.
sentence.words[2].singularize()

'tomorrow'

In [30]:
# Pluralize the last word.
sentence.words[-1].pluralize()

'us'

# WordNet Integration

Note:
The synsets property returns a list of Synset objects for the word. Each synset is a set of words or phrases that are roughly synonymous. Synsets include simplex words and collocations, such as "play around" and "car pools".

Each synset name has three parts, `<word>.<pos>.<number>`, where `<word>` is the word, `<pos>` is the part of speech of the word, and `<number>` is the index of the sense. The sense denotes one of the different usages of the word.

In [34]:
nltk.download('wordnet')
from textblob import Word
from textblob.wordnet import VERB
word = Word("octopus")
word.synsets

[nltk_data] Downloading package wordnet to /home/cloudera/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[Synset('octopus.n.01'), Synset('octopus.n.02')]

In [35]:
# NOTE(review): detect_language() appears to depend on an online translation service, and
# newer TextBlob releases reportedly drop it - confirm against the installed version.
word.detect_language()

'en'

In [38]:
# Word definitions
Word("data").definitions

['a collection of facts from which conclusions may be drawn',
 'an item of factual information derived from measurement or research']

In [40]:
Word("hack").get_synsets(pos=VERB)
# Note: the number at the end of a synset name (the sense) indexes one meaning of the word
# for that part of speech. For example, "hack.v.03" is the third verb sense of "hack"; its
# usage example, shown in a later cell, is "he hacked his way through the forest".
# The "tool (as a hoe or pick or mattock) ..." definition describes a noun sense, not this verb sense.

[Synset('chop.v.05'),
 Synset('hack.v.02'),
 Synset('hack.v.03'),
 Synset('hack.v.04'),
 Synset('hack.v.05'),
 Synset('hack.v.06'),
 Synset('hack.v.07'),
 Synset('hack.v.08')]

# Translate

In [44]:
from nltk.corpus import wordnet as wn
wn.synset('hack.v.03').examples()

['he hacked his way through the forest']

In [45]:
Word("他是好人").detect_language()

'zh-CN'

In [46]:
Word("做").get_synsets(pos=VERB)

[]

In [47]:
Word("做").translate(from_lang="zh-CN", to="en")

'do'

In [48]:
Word("他是好人").translate(from_lang="zh-CN", to="en")

'He is a good person'

In [49]:
Word("I have to go poop").translate(from_lang="en", to="zh-CN" )

'我必须大便'

In [50]:
# Word similarity
from textblob.wordnet import Synset
octopus = Synset('octopus.n.02')
shrimp = Synset('shrimp.n.03')
octopus.path_similarity(shrimp)

0.1111111111111111

In [51]:
# Path similarity between two of the verb synsets returned for "hack" in an earlier cell.
chop = Synset('chop.v.05')
hack05 = Synset('hack.v.02')  # NOTE(review): despite its name, this holds sense 02, not 05
chop.path_similarity(hack05)

0.125

In [52]:
# Getting words in text
animals = TextBlob("cat dog octopus")
animals.words

WordList(['cat', 'dog', 'octopus'])

In [53]:
# Pluralize words
animals.words.pluralize()

WordList(['cats', 'dogs', 'octopodes'])

# Correct spelling

In [54]:
# Spelling correction
b = TextBlob("I havv goode speling!")
print(b.correct())

I have good spelling!


In [55]:
# Show a list of possible words
from textblob import Word
w = Word('spirt')
w.spellcheck()

[('spirit', 0.6788617886178862),
 ('shirt', 0.2032520325203252),
 ('skirt', 0.06910569105691057),
 ('sport', 0.028455284552845527),
 ('spit', 0.008130081300813009),
 ('spire', 0.008130081300813009),
 ('spilt', 0.0040650406504065045)]

In [57]:
# Calculate word frequency
# The recorded result is 3: the lookup matched "Ekki" as well as the two lowercase
# "ekki" tokens, i.e. word_counts is not case-sensitive here.
monty = TextBlob("We are no longer the Knights who say Ni. "
                     "We are now the Knights who say Ekki ekki ekki PTANG.")
monty.word_counts['ekki']

3

In [59]:
# Another way to calculate word frequency
monty.words.count('longer')

1

In [60]:
# With case_sensitive=True only the two lowercase "ekki" tokens match; "Ekki" is excluded.
monty.words.count('ekki', case_sensitive=True)

2

In [61]:
# Counts occurrences of 'python' among wiki's noun phrases; the sample text yields
# only 'blake', 'csv file' and 'text', so the result is 0.
wiki.noun_phrases.count('python')

0

# Translation and language detection

In [62]:
en_blob = TextBlob(u'Simple is better than complex.')
en_blob.translate(to = "zh-TW")

TextBlob("簡單勝於復雜。")

In [64]:
en_blob.translate(to = "fr")

TextBlob("Simple, c'est mieux que complexe.")

In [65]:
en_blob.translate(to = "zh-CN")

TextBlob("简单胜于复杂。")

In [66]:
chinese_blob = TextBlob(u"簡單比複雜更好")
chinese_blob.translate(from_lang="zh-TW", to="en")

TextBlob("Simple is better than complex")

# Parsing, indexing and n-grams

In [67]:
b = TextBlob("And now for sometghing completely different.")
print(b.parse())

And/CC/O/O now/RB/B-ADVP/O for/IN/B-PP/B-PNP sometghing/VBG/B-VP/I-PNP completely/RB/B-ADJP/O different/JJ/I-ADJP/O ././O/O


In [68]:
zen[0:19]

TextBlob("Blake is cool. Meli")

In [69]:
zen.find("Simple")

-1

In [70]:
apple_blob = TextBlob('apples')
banana_blob = TextBlob('bananas')
apple_blob + ' and ' + banana_blob

TextBlob("apples and bananas")

In [71]:
"{0} and {1}".format(apple_blob, banana_blob)

'apples and bananas'

In [72]:
# n-grams
blob = TextBlob("Now is better than never.")
blob.ngrams(n=3)

[WordList(['Now', 'is', 'better']),
 WordList(['is', 'better', 'than']),
 WordList(['better', 'than', 'never'])]

In [73]:
TextBlob("I liek to aet tuo.").correct()

TextBlob("I like to aet to.")

# Text Classification

In [74]:
# Training data

train = [
('I love this sandwich.', 'pos'),
('this is an amazing place!', 'pos'),
('I feel very good about these beers.', 'pos'),
('this is my best work.', 'pos'),
("what an awesome view", 'pos'),
('I do not like this restaurant', 'neg'),
('I am tired of this stuff.', 'neg'),
("I can't deal with this", 'neg'),
('he is my sworn enemy!', 'neg'),
('my boss is horrible.', 'neg')
]

In [75]:
# Test data

test = [
('the beer was good.', 'pos'),
('I do not enjoy my job', 'neg'),
("I ain't feeling dandy today.", 'neg'),
("I feel amazing!", 'pos'),
('Gary is a friend of mine.', 'pos'),
("I can't believe I'm doing this.", 'neg')
]

In [76]:
# Build a Naive Bayes classifier
from textblob.classifiers import NaiveBayesClassifier
cl = NaiveBayesClassifier(train)

# Classify

In [77]:
# Classify text
cl.classify("This is an amazing library!")

'pos'

In [78]:
prob_dist = cl.prob_classify("This one's a doozy.")
prob_dist.max()

'pos'

In [79]:
# Probability of the text being positive.
round(prob_dist.prob("pos"), 2)

0.63

In [80]:
# Probability of the text being negative.
round(prob_dist.prob("neg"), 2)

0.37

# Classify another way

In [81]:
from textblob import TextBlob
blob = TextBlob("The beer is good. But the hangover is horrible.", classifier=cl)
blob.classify()

'pos'

In [82]:
for s in blob.sentences:
    print(s)
    print(s.classify())

The beer is good.
pos
But the hangover is horrible.
neg


In [83]:
cl.accuracy(test)

0.8333333333333334

In [84]:
cl.show_informative_features(5)

Most Informative Features
            contains(my) = True              neg : pos    =      1.7 : 1.0
            contains(an) = False             neg : pos    =      1.6 : 1.0
             contains(I) = True              neg : pos    =      1.4 : 1.0
             contains(I) = False             pos : neg    =      1.4 : 1.0
            contains(my) = False             pos : neg    =      1.3 : 1.0


In [85]:
# Update the classifier with new training data.
# update() changes the existing `cl` object in place (it is not reassigned), so the
# accuracy on `test` moves from the earlier 0.83 to 1.0; re-running the earlier
# accuracy cell after this one will not reproduce its recorded output.
new_data = [('She is my best friend.', 'pos'),
("I'm happy to have a new friend.", 'pos'),
("Stay thirsty, my friend.", 'pos'),
("He ain't from around here.", 'neg')]
cl.update(new_data)
cl.accuracy(test)

1.0

# Simplified Text Processing

In [86]:
from textblob import TextBlob

text = '''
The titular threat of The Blob has always struck me as the ultimate movie
monster: an insatiably hungry, amoeba-like mass able to penetrate
virtually any safeguard, capable of--as a doomed doctor chillingly
describes it--"assimilating flesh on contact.
Snide comparisons to gelatin be damned, it's a concept with the most
devastating of potential consequences, not unlike the grey goo scenario
proposed by technological theorists fearful of
artificial intelligence run rampant.
'''

blob = TextBlob(text)

# Bare expressions in the middle of a cell are evaluated but not displayed;
# the trailing comments are illustrative sketches of the values they return.
blob.tags           # [('The', 'DT'), ('titular', 'JJ'),
                    #  ('threat', 'NN'), ('of', 'IN'), ...]

blob.noun_phrases   # WordList(['titular threat', 'blob',
                    #            'ultimate movie monster',
                    #            'amoeba-like mass', ...])

# Report the polarity of each sentence, numbered from 1. enumerate() replaces the
# hand-maintained counter, and the loop variable is named so that it does not
# rebind the `sentence` blob created in an earlier cell.
print("Sentence polarity:")
for number, blob_sentence in enumerate(blob.sentences, start=1):
    print("Sentence %d: %7.3f" % (number, blob_sentence.sentiment.polarity))
# Sentence 1:   0.060
# Sentence 2:  -0.342

# Last expression of the cell, so its value is displayed as the cell result.
blob.translate(to="zh-TW")  # TextBlob holding the Traditional Chinese translation

Sentence polarity:
Sentence 1:   0.060
Sentence 2:  -0.342


TextBlob("The Blob的名義威脅一直令我震驚，因為它是終極電影
怪物：一種無法滿足的飢餓，變形蟲般的物質，能夠穿透
幾乎任何有保障的能力-就像一位注定要死的醫生一樣
形容它“在接觸時吸收肉。
該死的與明膠的比較是該死的，這是最多的概念
毀滅性的潛在後果，與灰色的情況不謀而合
技術理論家提出的擔心
人工智能猖ramp。")