In [1]:
# This code assumes you have installed nltk (pip install nltk) and that you have downloaded the nltk data.

# To download nltk data, open python and do the following:

# import nltk
# nltk.download()

# This produces a box that lets you download the necessary corpora.

In [2]:
import nltk
from nltk.corpus import brown

In [3]:
# `brown` is a lazily loaded stand-in for the Brown corpus reader: the type
# printed below is LazyCorpusLoader, not a reader class itself.
# Corpus reader API: http://www.nltk.org/api/nltk.corpus.reader.html
# print(...) with a single argument gives the same output on Python 2 and 3.
print(type(brown))

<class 'nltk.corpus.util.LazyCorpusLoader'>


In [4]:
# The Brown corpus is made up of many files, each identified by a file id.
# We can ignore this for the most part if we want to work with the whole
# corpus and not with individual files.
files = brown.fileids()
# print(...) with a single argument gives the same output on Python 2 and 3.
print(files)

[u'ca01', u'ca02', u'ca03', u'ca04', u'ca05', u'ca06', u'ca07', u'ca08', u'ca09', u'ca10', u'ca11', u'ca12', u'ca13', u'ca14', u'ca15', u'ca16', u'ca17', u'ca18', u'ca19', u'ca20', u'ca21', u'ca22', u'ca23', u'ca24', u'ca25', u'ca26', u'ca27', u'ca28', u'ca29', u'ca30', u'ca31', u'ca32', u'ca33', u'ca34', u'ca35', u'ca36', u'ca37', u'ca38', u'ca39', u'ca40', u'ca41', u'ca42', u'ca43', u'ca44', u'cb01', u'cb02', u'cb03', u'cb04', u'cb05', u'cb06', u'cb07', u'cb08', u'cb09', u'cb10', u'cb11', u'cb12', u'cb13', u'cb14', u'cb15', u'cb16', u'cb17', u'cb18', u'cb19', u'cb20', u'cb21', u'cb22', u'cb23', u'cb24', u'cb25', u'cb26', u'cb27', u'cc01', u'cc02', u'cc03', u'cc04', u'cc05', u'cc06', u'cc07', u'cc08', u'cc09', u'cc10', u'cc11', u'cc12', u'cc13', u'cc14', u'cc15', u'cc16', u'cc17', u'cd01', u'cd02', u'cd03', u'cd04', u'cd05', u'cd06', u'cd07', u'cd08', u'cd09', u'cd10', u'cd11', u'cd12', u'cd13', u'cd14', u'cd15', u'cd16', u'cd17', u'ce01', u'ce02', u'ce03', u'ce04', u'ce05', u'ce06', 

In [5]:
# Both accessors take an optional fileids argument. Calling sents() with no
# arguments gives you sentences from all files in the corpus; words() is
# called the same way here.

# words(fileids=None) returns the given files as one flat list of words and
# punctuation symbols.
words = brown.words()

# sents(fileids=None) returns the given files as a list of sentences, each of
# which is a list of word strings.
# return type: list(list(str))
sentences = brown.sents()

In [6]:
# Size of the corpus: number of sentences and number of word/punctuation tokens.
num_sentences = len(sentences)
num_words = len(words)
# Each line is built as one string: print(a, b) would display a tuple on
# Python 2, whereas a single formatted argument prints identically on 2 and 3.
print("# sentences: %d" % num_sentences)
print("# words %d" % num_words)

# sentences: 57340
# words 1161192


In [7]:
# The u in u'somestring' just means the string is a unicode string (this is
# how Python 2 displays them). Be careful when mixing them with plain byte
# strings: Python 2 will not always say that 'mark' == u'mark' -- the
# comparison only succeeds when the byte string can be decoded as ASCII.
# print(...) with a single argument gives the same output on Python 2 and 3.
print(sentences[201])

[u'``', u'The', u'statements', u'may', u'be', u'highly', u'prejudicial', u'to', u'my', u'client', u"''", u',', u'Bellows', u'told', u'the', u'court', u'.']


In [8]:
# Tokens 10 through 29 of the flat word list.
# print(...) with a single argument gives the same output on Python 2 and 3.
print(words[10:30])

[u"Atlanta's", u'recent', u'primary', u'election', u'produced', u'``', u'no', u'evidence', u"''", u'that', u'any', u'irregularities', u'took', u'place', u'.', u'The', u'jury', u'further', u'said', u'in']


In [9]:
# raw(fileids=None) gives you a single string in which each word is followed
# by its tag, written as word/tag (see the output below).
raw = brown.raw()
# print(...) with a single argument gives the same output on Python 2 and 3.
print(raw[:200])



	The/at Fulton/np-tl County/nn-tl Grand/jj-tl Jury/nn-tl said/vbd Friday/nr an/at investigation/nn of/in Atlanta's/np$ recent/jj primary/nn election/nn produced/vbd ``/`` no/at evidence/nn ''/'' tha


In [10]:
# Join the big stream of words into one single string, with a space between
# consecutive tokens. This is different from "raw" because it has no tags, and
# differs from "sentences" and "words" because it is a single string and not a list.
whole_corpus = ' '.join(words)
# print(...) with a single argument gives the same output on Python 2 and 3.
print(whole_corpus[:1000])

The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place . The jury further said in term-end presentments that the City Executive Committee , which had over-all charge of the election , `` deserves the praise and thanks of the City of Atlanta '' for the manner in which the election was conducted . The September-October term jury had been charged by Fulton Superior Court Judge Durwood Pye to investigate reports of possible `` irregularities '' in the hard-fought primary which was won by Mayor-nominate Ivan Allen Jr. . `` Only a relative handful of such reports was received '' , the jury said , `` considering the widespread interest in the election , the number of voters and the size of this city '' . The jury said it did find that many of Georgia's registration and election laws `` are outmoded or inadequate and often ambiguous '' . It recommended that Fulton legislators act `` to have th

In [11]:
# Lower-case the entire corpus string; the phrase searched for further down
# is written in lower case as well.
whole_corpus = whole_corpus.lower()
# print(...) with a single argument gives the same output on Python 2 and 3.
print(whole_corpus[:1000])

the fulton county grand jury said friday an investigation of atlanta's recent primary election produced `` no evidence '' that any irregularities took place . the jury further said in term-end presentments that the city executive committee , which had over-all charge of the election , `` deserves the praise and thanks of the city of atlanta '' for the manner in which the election was conducted . the september-october term jury had been charged by fulton superior court judge durwood pye to investigate reports of possible `` irregularities '' in the hard-fought primary which was won by mayor-nominate ivan allen jr. . `` only a relative handful of such reports was received '' , the jury said , `` considering the widespread interest in the election , the number of voters and the size of this city '' . the jury said it did find that many of georgia's registration and election laws `` are outmoded or inadequate and often ambiguous '' . it recommended that fulton legislators act `` to have th

In [12]:
# Now you have the whole corpus in a format independent of NLTK: one long, lower-cased string (whole_corpus).
# You can now define any kind of regex you want and search over it.
# But I don't know much about regex!

In [13]:
import re

def find_matches(pattern,text_to_search):
    """Print every match of `pattern` found in `text_to_search`.

    pattern: a regular expression (a plain literal string works too).
    text_to_search: the string to scan.

    One line is printed per match, showing the matched text together with the
    start offset and the (exclusive) end offset of the match. Returns None.
    """
    for match in re.finditer(pattern, text_to_search):
        start, end = match.span()
        # A single pre-formatted argument inside parentheses prints the same
        # text on Python 2 and Python 3 (the bare print statement is a
        # SyntaxError on Python 3).
        print('Found "%s" at symbol [%d] through symbol [%d]' % (text_to_search[start:end], start, end))

In [14]:
# Look for an exact phrase: a pattern without special regex characters
# simply matches itself.
find_matches(pattern="recent primary election", text_to_search=whole_corpus)

Found "recent primary election" at symbol [71] through symbol [94]


In [15]:
# Great, the positions were correct: the match was reported at [71:94]. The
# slice below runs 10 characters past the end of the match (up to 104) so that
# some of the text following the phrase is shown too.
# print(...) with a single argument gives the same output on Python 2 and 3.
print(whole_corpus[71:104])

recent primary election produced 


In [16]:
# instead of printing, you could modify find_matches() to return a list of matches,
# each of which is a tuple of (start_position,end_position)
def find_matches(pattern,text_to_search):
    """Return the position of every match of `pattern` in `text_to_search`.

    The result is a list of (start_position, end_position) tuples, one per
    match, in the order the matches occur in the text. It is an empty list
    when nothing matches.
    """
    positions = []
    for hit in re.finditer(pattern, text_to_search):
        # span() is the (start, end) pair of the whole match.
        positions.append(hit.span())
    return positions

In [17]:
# Collect the (start, end) positions of the phrase instead of printing them.
list_of_matches = find_matches(pattern="recent primary election", text_to_search=whole_corpus)

In [18]:
# Name the pattern once so the search and the report below cannot drift apart.
pattern = "recent primary election"
list_of_matches = find_matches(pattern, whole_corpus)

# Every line is printed as one pre-formatted string: print(a, b) would display
# a tuple on Python 2, whereas a single argument prints identically on 2 and 3.
print("pattern searched: '%s'" % pattern)
print("nummber of matches: %d" % len(list_of_matches))

# It's a list of tuples, so you can grab each part with this construction:
# for i,j in [(i1,j1),(i2,j2),(i3,j3),...]
for start,end in list_of_matches:
    print("start position: %d" % start)
    print("end position: %d" % end)
    print("")

pattern searched: 'recent primary election'
nummber of matches: 1
start position: 71
end position: 94

