### Text Processing
Text processing is extracting useful features out of the text that's given to us. 
### Why do we do this?


In [36]:
import nltk

In [37]:
nltk.download('brown')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [38]:
# `brown` is the NLTK corpus reader for the Brown corpus downloaded above.
# No earlier cell in this notebook imports it, so import it here to keep
# "Restart Kernel -> Run All" working.
from nltk.corpus import brown

# List the genre categories available in the corpus.
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [39]:
brown.words()

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [40]:
data = brown.sents(categories = ["adventure"])

In [41]:
len(data)

4637

In [42]:
data[0]

['Dan',
 'Morgan',
 'told',
 'himself',
 'he',
 'would',
 'forget',
 'Ann',
 'Turner',
 '.']

In [43]:
" ".join(data[0])

'Dan Morgan told himself he would forget Ann Turner .'

In [44]:
" ".join(data[1])

'He was well rid of her .'

In [45]:
" ".join(data[3])

"If he had married her , he'd have been asking for trouble ."

In [46]:
len(brown.words())

1161192

## Tokenization

In [47]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [16]:
document = """ It was a very good movie. The cast was amazing and I liked the story.
I went to the movie hall to see it.
"""

In [48]:
sentence = "Help this noob with his programming skills, @ee20b037"

In [49]:
import nltk
nltk.download('punkt')
  

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [50]:
sents = sent_tokenize(document)
print(sents)
len(sents)

[' It was a very good movie.', 'The cast was amazing and I liked the story.', 'I went to the movie hall to see it.']


3

In [51]:
words = word_tokenize(sentence) # also break down special characters
print(words)
print(len(words))

['Help', 'this', 'noob', 'with', 'his', 'programming', 'skills', ',', '@', 'ee20b037']
10


## Stopword Removal

In [52]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [53]:
# `stopwords` is not imported by any earlier cell in this notebook, so bring
# it into scope here; otherwise this cell fails on a fresh kernel.
from nltk.corpus import stopwords

# Keep the English stop words in a set so membership checks are fast.
sw = set(stopwords.words('english'))

In [54]:
text = "i am not a very good data scientist".split()
print(text)

['i', 'am', 'not', 'a', 'very', 'good', 'data', 'scientist']


In [55]:
def remove_stoprwords(text, stopwords):
    """Filter stop words out of a sequence of tokens.

    Args:
        text: iterable of tokens (e.g. a list of words).
        stopwords: container of tokens to discard; anything supporting
            the ``in`` operator works.

    Returns:
        A new list with the tokens of ``text`` that are not in
        ``stopwords``, in their original order.
    """
    kept = []
    for token in text:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

In [56]:
useful_words = remove_stoprwords(text, sw)
useful_words


['good', 'data', 'scientist']

In [57]:
##Tokenization using regex
sent = "My email is ee20b037@smail.iitm.ac.in, please don't spam my inbox"

In [58]:
from nltk.tokenize import RegexpTokenizer

In [59]:
# Tokens are runs of ASCII letters, '@' and '.'; every other character
# (digits, commas, apostrophes, whitespace) acts as a separator.
tokenizer = RegexpTokenizer('[a-zA-Z@.]+')
# Tokenize `sent`, the e-mail example defined for this section, rather than
# the earlier `sentence` variable.
useful = tokenizer.tokenize(sent)
print(useful)

['My', 'email', 'is', 'ee', 'b', '@smail.iitm.ac.in', 'please', 'don', 't', 'spam', 'my', 'inbox']


In [60]:
##NLTK also provides us with stemmers like Porter, Snowball, Lancaster stemmers
from nltk.stem import SnowballStemmer, PorterStemmer, LancasterStemmer
ps = PorterStemmer()
ps.stem('fapping')

'fap'

In [61]:
# SnowballStemmer = Multilingual, supports other langs also.
corpus = [
    'Dan Morgan told himself he would forget Ann Turner.',
    'Sometimes he woke up in the middle of the night thinking of Ann , and then could not get back to sleep .',
    'His plans and dreams had revolved around her so much and for so long that now he felt as if he had nothing .',
    'He found that if he was tired enough at night , he went to sleep simply because he was too exhausted to stay awake .'
]

In [62]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer with scikit-learn's default settings.
cv = CountVectorizer()

# Learn the vocabulary from `corpus` and encode each sentence as a row of
# per-token counts; the next cell converts the result with .toarray().
vc = cv.fit_transform(corpus)



In [63]:
vc = vc.toarray()
print(vc)
print(cv.vocabulary_)

[[0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1]
 [1 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 1 1 0 0 2 0 0
  0 1 0 1 0 0 2 1 1 0 1 0 0 0 1 0 0 1 0]
 [2 0 1 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 2 2 1 0 1 1 0 1 0 0 1 0 0 1 1 0 1 1
  0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 1 0 1 0 0 0 1 1 0 0 0 1 0 0 4 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0
  1 1 0 0 1 1 0 0 0 1 2 0 1 0 0 2 1 0 0]]
{'dan': 9, 'morgan': 27, 'told': 47, 'himself': 21, 'he': 19, 'would': 54, 'forget': 15, 'ann': 1, 'turner': 49, 'sometimes': 39, 'woke': 53, 'up': 50, 'in': 24, 'the': 42, 'middle': 26, 'of': 33, 'night': 29, 'thinking': 44, 'and': 0, 'then': 43, 'could': 8, 'not': 30, 'get': 17, 'back': 6, 'to': 46, 'sleep': 37, 'his': 22, 'plans': 34, 'dreams': 10, 'had': 18, 'revolved': 35, 'around': 2, 'her': 20, 'so': 38, 'much': 28, 'for': 14, 'long': 25, 'that': 41, 'now': 32, 'felt': 13, 'as': 3, 'if': 23, 'nothing': 31, 'found': 16, 'was': 

In [64]:
print(len(cv.vocabulary_))

55


In [65]:
def myTokenizer(document):
    """Tokenize a document and drop English stop words.

    The text is lower-cased, split with the notebook-level regex
    ``tokenizer``, and then filtered through ``remove_stoprwords`` using
    the notebook-level stop-word set ``sw``.

    Args:
        document: the raw text to tokenize.

    Returns:
        The list of remaining tokens, in order of appearance.
    """
    lowered = document.lower()
    tokens = tokenizer.tokenize(lowered)
    # Filter out the stop words before handing the tokens back.
    return remove_stoprwords(tokens, sw)

In [66]:
myTokenizer('this is a random text')

['random', 'text']

In [67]:
cv = CountVectorizer(tokenizer=myTokenizer)

In [68]:
vc = cv.fit_transform(corpus).toarray()
print(vc)

[[0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1]
 [1 1 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 1 0]
 [1 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 1 0 1 0 0 1 0 0]]


In [69]:
# Only the last expression in a cell is displayed, so this row length is
# evaluated but not shown; the vocabulary mapping below is the cell's output.
len(vc[0])
cv.vocabulary_

{'.': 0,
 'ann': 1,
 'around': 2,
 'awake': 3,
 'back': 4,
 'could': 5,
 'dan': 6,
 'dreams': 7,
 'enough': 8,
 'exhausted': 9,
 'felt': 10,
 'forget': 11,
 'found': 12,
 'get': 13,
 'long': 14,
 'middle': 15,
 'morgan': 16,
 'much': 17,
 'night': 18,
 'nothing': 19,
 'plans': 20,
 'revolved': 21,
 'simply': 22,
 'sleep': 23,
 'sometimes': 24,
 'stay': 25,
 'thinking': 26,
 'tired': 27,
 'told': 28,
 'turner.': 29,
 'went': 30,
 'woke': 31,
 'would': 32}

In [70]:
len(cv.transform([sent]).toarray()[0])

33