# Chapter 6 Handling Text

In [1]:
# Notebook display setup: show the value of every bare top-level expression in
# a cell, not only the last one. Several cells below rely on this to display
# two or three results from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import re
import sys
import unicodedata
import numpy as np
from bs4 import BeautifulSoup
# natural language toolkit
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag
from sklearn.preprocessing import MultiLabelBinarizer

## 6.1 Cleaning Text

In [2]:
# Raw titles, some with stray leading/trailing whitespace
text_data = [
    "   Interrobang. By Aishwarya Henriette     ",
    "Parking And Going. By Karl Gautier",
    "   Today Is The night. By Jarek Prakash",
]
# Each stage feeds the next: trim -> drop periods -> upper-case -> mask letters
strip_whitespace = list(map(str.strip, text_data))
remove_punctuation = [title.replace(".", "") for title in strip_whitespace]
capitalized = list(map(str.upper, remove_punctuation))
letter_pattern = re.compile(r"[a-zA-Z]")
replace_with_X = [letter_pattern.sub("X", title) for title in capitalized]
# Show the result of every stage, one list per line
for stage in (strip_whitespace, remove_punctuation, capitalized, replace_with_X):
    print(stage)

['Interrobang. By Aishwarya Henriette', 'Parking And Going. By Karl Gautier', 'Today Is The night. By Jarek Prakash']
['Interrobang By Aishwarya Henriette', 'Parking And Going By Karl Gautier', 'Today Is The night By Jarek Prakash']
['INTERROBANG BY AISHWARYA HENRIETTE', 'PARKING AND GOING BY KARL GAUTIER', 'TODAY IS THE NIGHT BY JAREK PRAKASH']
['XXXXXXXXXXX XX XXXXXXXXX XXXXXXXXX', 'XXXXXXX XXX XXXXX XX XXXX XXXXXXX', 'XXXXX XX XXX XXXXX XX XXXXX XXXXXXX']


## 6.2 Parsing and Cleaning HTML

In [3]:
html = """<div class='full_name'><span style='font-weight:bold'>Masego</span> Azra</div>"""
# Build the parse tree using the lxml parser
soup = BeautifulSoup(html, "lxml")
# Locate the <div> whose class is 'full_name', then display its text content
full_name_div = soup.find("div", attrs={"class": "full_name"})
full_name_div.text

'Masego Azra'

## 6.3 Removing Punctuation

In [4]:
text_data = ["Hi!!! I. love. This. Song....",
             "10000% Agree!!!! #Love IT",
             "Right?!?!"]
# Translation table for str.translate: every code point whose Unicode general
# category starts with 'P' (punctuation) maps to None, which deletes it.
# range() excludes its upper bound, so use sys.maxunicode + 1 to make sure the
# highest code point is examined as well.
punctuation = dict.fromkeys(
    code_point for code_point in range(sys.maxunicode + 1)
    if unicodedata.category(chr(code_point)).startswith('P'))
# Strip the punctuation characters from every string
[string.translate(punctuation) for string in text_data]

['Hi I love This Song', '10000 Agree Love IT', 'Right']

## 6.4 Tokenizing Text

In [5]:
# Break a single sentence into word tokens
string = "The science of today is the technology of tomorrow"
word_tokenize(string, language="english")
# Break a two-sentence text into its sentences
string = "The science of today is the technology of tomorrow. Tomorrow is today"
sent_tokenize(string, language="english")

['The', 'science', 'of', 'today', 'is', 'the', 'technology', 'of', 'tomorrow']

['The science of today is the technology of tomorrow.', 'Tomorrow is today']

## 6.5 Removing Stop Words

In [6]:
tokenized_word = word_tokenize("i am going to go to the store and park")
# The stop-word list is in lower case, so the tokens must be lower case too
stop_words = stopwords.words('english')
# Look tokens up in a set: testing membership against the list scans the
# whole list for every token. stop_words itself is kept as the original list.
stop_word_set = set(stop_words)
# Keep only the tokens that are not stop words
[word for word in tokenized_word if word not in stop_word_set]

['going', 'go', 'store', 'park']

## 6.6 Stemming Words

In [7]:
# Tokenize the sentence, then reduce each token to its Porter stem
tokenized_word = word_tokenize("i am humbled by this traditional meeting")
porter = PorterStemmer()
list(map(porter.stem, tokenized_word))

['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet']

## 6.7 Tagging Parts of Speech

In [8]:
text_data = "Chris loved outdoor running"
# Tokenize first, then attach a part-of-speech tag to every token
tokens = word_tokenize(text_data)
text_tagged = pos_tag(tokens)
print(text_tagged)
# Some common tags:
# Tag  Part of speech
# NNP  Proper noun, singular
# NN   Noun, singular or mass
# RB   Adverb
# VBD  Verb, past tense
# VBG  Verb, gerund or present participle
# JJ   Adjective
# PRP  Personal pronoun
# Keep only the words carrying one of the noun tags
noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
noun = [word for word, tag in text_tagged if tag in noun_tags]
print(noun)

[('Chris', 'NNP'), ('loved', 'VBD'), ('outdoor', 'RP'), ('running', 'VBG')]
['Chris']


In [9]:
tweets = [
    "I am eating a burrito for breakfast",
    "Political science is an amazing field",
    "SanFrancisco is an awesome city",
]
# For every tweet keep only the tag of each (word, tag) pair
tagged_tweets = [
    [tag for _word, tag in nltk.pos_tag(word_tokenize(tweet))]
    for tweet in tweets
]

# One-hot encode the tags: one column per distinct tag, one row per tweet
one_hot_multi = MultiLabelBinarizer()
one_hot_multi.fit_transform(tagged_tweets)
# Column order of the matrix above
one_hot_multi.classes_

array([[1, 1, 0, 1, 0, 1, 1, 1, 0],
       [1, 0, 1, 1, 0, 0, 0, 0, 1],
       [1, 0, 1, 1, 1, 0, 0, 0, 1]])

array(['DT', 'IN', 'JJ', 'NN', 'NNP', 'PRP', 'VBG', 'VBP', 'VBZ'],
      dtype=object)

In [10]:
# train a part-of-speech tagger
from nltk.corpus import brown
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

# Tagged sentences from the 'news' category of the Brown Corpus
sentences = brown.tagged_sents(categories='news')
# The first 4000 sentences train the taggers; the remainder is held out for testing
split_at = 4000
train, test = sentences[:split_at], sentences[split_at:]
# Backoff chain: the trigram tagger falls back to the bigram tagger,
# which in turn falls back to the unigram tagger
unigram = UnigramTagger(train)
bigram = BigramTagger(train, backoff=unigram)
trigram = TrigramTagger(train, backoff=bigram)
# Score the full chain on the held-out sentences
trigram.evaluate(test)

0.8174734002697437

## 6.8 Encoding Text as a Bag of Words

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

text_data = np.array(["I love Brazil. Brazil!",
                      "Sweden is best",
                      "Germany beats both"])

# create the bag of words feature matrix
count = CountVectorizer()
# fit_transform returns a sparse matrix: one row per document, one column per word
bag_of_words = count.fit_transform(text_data)
# Feature (word) names in column order. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() (scikit-learn >= 1.0) replaces it
# and returns an ndarray, so convert it to a plain list for display.
count.get_feature_names_out().tolist()
# Dense view of the per-document word counts
bag_of_words.toarray()

['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden']

array([[0, 0, 0, 2, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 0, 0, 0]], dtype=int64)

In [12]:
# Count unigrams and bigrams, drop English stop words, and restrict the
# feature columns to a fixed vocabulary
count_2gram = CountVectorizer(ngram_range=(1, 2),
                              stop_words="english",
                              vocabulary=['brazil'])
bow = count_2gram.fit_transform(text_data)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() (scikit-learn >= 1.0) and show it as a plain list.
count_2gram.get_feature_names_out().tolist()
# Mapping of each term to its column index
count_2gram.vocabulary_
bow.toarray()

['brazil']

{'brazil': 0}

array([[2],
       [0],
       [0]], dtype=int64)

## 6.9 Weighting Word Importance

In [13]:
# term frequency-inverse document frequency (tf-idf)
from sklearn.feature_extraction.text import TfidfVectorizer

text_data = np.array([
    "I love Brazil. Brazil!",
    "Sweden is best",
    "Germany beats both",
])
# Learn the vocabulary and build the tf-idf weighted document-term matrix
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)
# Mapping of each word to its column index in the matrix
tfidf.vocabulary_
# Dense view of the tf-idf weights, one row per document
feature_matrix.toarray()

{'love': 6,
 'brazil': 3,
 'sweden': 7,
 'is': 5,
 'best': 1,
 'germany': 4,
 'beats': 0,
 'both': 2}

array([[0.        , 0.        , 0.        , 0.89442719, 0.        ,
        0.        , 0.4472136 , 0.        ],
       [0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.57735027, 0.        , 0.57735027],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027,
        0.        , 0.        , 0.        ]])