In [5]:
import nltk
import numpy as np
import pandas as pd

In [4]:
# Fetch only the NLTK resources this notebook uses. Calling nltk.download()
# with no arguments opens the interactive downloader, which blocks a
# "Restart & Run All" and makes the result depend on manual selections.
#   punkt / punkt_tab - sentence/word tokenizer models used by word_tokenize
#                       and sent_tokenize (which one is loaded depends on the
#                       installed NLTK version)
#   stopwords         - word lists used by nltk.corpus.stopwords
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

## Tokenizing and Stop Words


In [6]:
# Read the whole speech into a single string. The context manager closes the
# file even if read() raises, which the manual open()/close() pair did not.
# NOTE(review): this is an absolute, user-specific path; consider a
# configurable data directory so the notebook runs on other machines.
with open("/home/user/crs1278/data/Eisenhower_1957.txt") as sou_file:
    sou_text = sou_file.read()

In [9]:
# Peek at the first 150 characters to confirm the speech loaded as expected.
sou_text[:150]

'To the Congress of the United States:\n\nI appear before the Congress today to report on the State of the Union and\nthe relationships of the Union to th'

In [10]:
# Typically we would do this at the top of the notebook but it is here just so the reader
# knows where this package is used
from nltk.tokenize import word_tokenize, sent_tokenize

You can use words as the tokens

In [11]:
# Split the full speech into word-level tokens. Punctuation marks come back as
# their own tokens (see the ':' in the preview further down).
sou_word_tokens = word_tokenize(sou_text)

Let's check what kind of structure we get

In [12]:
# Confirm the container type returned by word_tokenize.
type(sou_word_tokens)

list

In [14]:
# Show the first 25 word tokens.
sou_word_tokens[:25]

['To',
 'the',
 'Congress',
 'of',
 'the',
 'United',
 'States',
 ':',
 'I',
 'appear',
 'before',
 'the',
 'Congress',
 'today',
 'to',
 'report',
 'on',
 'the',
 'State',
 'of',
 'the',
 'Union',
 'and',
 'the',
 'relationships']

You could also use sentences as tokens

In [16]:
# Split the same text into sentence-level tokens instead of words.
sou_sent_tokens = sent_tokenize(sou_text)

In [17]:
# Show the first 25 sentences; the embedded \n characters are line breaks
# carried over from the source text file.
sou_sent_tokens[:25]

['To the Congress of the United States:\n\nI appear before the Congress today to report on the State of the Union and\nthe relationships of the Union to the other nations of the world.',
 'I come\nhere, firmly convinced that at no time in the history of the Republic have\ncircumstances more emphatically underscored the need, in all echelons of\ngovernment, for vision and wisdom and resolution.',
 'You meet in a season of stress that is testing the fitness of political\nsystems and the validity of political philosophies.',
 'Each stress stems in\npart from causes peculiar to itself.',
 'But every stress is a reflection of a\nuniversal phenomenon.',
 'In the world today, the surging and understandable tide of nationalism is\nmarked by widespread revulsion and revolt against tyranny, injustice,\ninequality and poverty.',
 'As individuals, joined in a common hunger for\nfreedom, men and women and even children pit their spirit against guns and\ntanks.',
 'On a larger scale, in an ever more

We shall use the words as tokens

## Removing the stop words

In [19]:
from nltk.tokenize import RegexpTokenizer
# The r prefix makes the pattern a raw string, so the backslash in \w does not
# have to be escaped. \w+ matches runs of word characters, which drops
# punctuation-only tokens such as ':'.
tokenizer = RegexpTokenizer(r'\w+')
# Tokenize the raw text rather than str(sou_word_tokens). The latter is the
# printed repr of a Python list (brackets, quotes, commas, backslash escapes),
# so the result depended on repr formatting instead of on the speech itself.
# Reading from sou_text also means this cell no longer feeds on its own
# previous output when it is re-run.
sou_word_tokens = tokenizer.tokenize(sou_text)

In [21]:
# First 25 tokens after the regexp pass; punctuation tokens are no longer present.
sou_word_tokens[:25]

['To',
 'the',
 'Congress',
 'of',
 'the',
 'United',
 'States',
 'I',
 'appear',
 'before',
 'the',
 'Congress',
 'today',
 'to',
 'report',
 'on',
 'the',
 'State',
 'of',
 'the',
 'Union',
 'and',
 'the',
 'relationships',
 'of']

Lower case all the tokens

In [22]:
# Normalise every token to lower case so that later comparisons are
# case-insensitive.
sou_word_tokens = list(map(str.lower, sou_word_tokens))

In [24]:
# First 25 tokens again, now all in lower case.
sou_word_tokens[:25]

['to',
 'the',
 'congress',
 'of',
 'the',
 'united',
 'states',
 'i',
 'appear',
 'before',
 'the',
 'congress',
 'today',
 'to',
 'report',
 'on',
 'the',
 'state',
 'of',
 'the',
 'union',
 'and',
 'the',
 'relationships',
 'of']

We remove the stop words

In [34]:
from nltk.corpus import stopwords
# Load NLTK's English stop-word list (requires the 'stopwords' corpus to have
# been downloaded) and preview its first 20 entries.
stop_words = stopwords.words('english')
stop_words[:20]

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his']

Note that the stop words are in lower case, which is why the tokens were lower-cased before filtering.

In [37]:
# Drop every token that appears in the stop-word list. Testing membership
# against a list rescans it for each token; a set returns the same answers
# with constant-time lookups.
stop_word_set = set(stop_words)
word_tokens = [word for word in sou_word_tokens if word not in stop_word_set]
word_tokens[:20]

['congress',
 'united',
 'states',
 'appear',
 'congress',
 'today',
 'report',
 'state',
 'union',
 'relationships',
 'union',
 'nations',
 'world',
 'come',
 'firmly',
 'convinced',
 'time',
 'history',
 'republic',
 'circumstances']

## Bag of Words and CountVectorizer

In [55]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: uses scikit-learn's built-in English stop-word list
# and treats every run of word characters as a token.
# Optional: add max_features=50 to cap the vocabulary at the 50 most frequent
# terms.
c = CountVectorizer(
    stop_words='english',
    token_pattern=r'\w+',
)


In [56]:
# Learn the vocabulary from the sentence tokens and count term occurrences
# per sentence, then expand the result into a dense matrix
# (one row per sentence, one column per vocabulary term).
sentence_term_counts = c.fit_transform(sou_sent_tokens)
converted_data = sentence_term_counts.todense()
print(converted_data.shape)

(187, 1112)


In [57]:
# Display the dense count matrix; large matrices are abbreviated with "..."
# in the printed output.
converted_data

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 1, 0, 0]])

In [58]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
if hasattr(c, "get_feature_names_out"):
    feature_names = c.get_feature_names_out()
else:
    feature_names = c.get_feature_names()
# list() gives the same plain-list display for either return type.
list(feature_names[:25])

['1',
 '170',
 '2',
 '3',
 '4',
 '85th',
 'able',
 'abroad',
 'absolute',
 'accentuate',
 'accomplished',
 'account',
 'achieve',
 'action',
 'actions',
 'activities',
 'adapted',
 'additional',
 'adequacy',
 'adequate',
 'administration',
 'administrative',
 'advance',
 'advantages',
 'aerial']

This data comes from
https://archive.ics.uci.edu/ml/datasets/Spambase