In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import nltk, re, pprint
from nltk import word_tokenize # for every chapter from now

In [3]:
# a lot of basic python i will skip here
# cutting data into a sequence of two or more parts
text = nltk.corpus.nps_chat.words()
cut = int(0.9 * len(text))
training_data, test_data = text[:cut], text[cut:]
text == training_data + test_data
len(training_data)/len(test_data)

True

9.0

In [5]:
# Combining different sequence types
words = 'I turned off the spectroroute'.split() # using a list comprehension to form a list of tuples
wordlens = [(len(word), word) for word in words] 
wordlens.sort()
" ".join(w for (_, w) in wordlens)

'I off the turned spectroroute'

In [12]:
# Generator Expressions
text = '''"When I use a word," Humpty Dumpty said in rather a scornful tone,
"it means just what I choose it to mean - neither more nor less."'''
print([w.lower() for w in word_tokenize(text)])

['``', 'when', 'i', 'use', 'a', 'word', ',', "''", 'humpty', 'dumpty', 'said', 'in', 'rather', 'a', 'scornful', 'tone', ',', "''", 'it', 'means', 'just', 'what', 'i', 'choose', 'it', 'to', 'mean', '-', 'neither', 'more', 'nor', 'less', '.', "''"]


In [16]:
# we can process the words like we did by putting them inside the print statement. python let's us omit the brackets
max([w.lower() for w in word_tokenize(text)]) 
max(w.lower() for w in word_tokenize(text)) # this uses a generator expression

# in many language processing tasks, generator expressions are useful as in the first code, storage of the object must be 
# allocated before the value of max is calculated

'word'

'word'

In [18]:
# looping using enumerate
fd = nltk.FreqDist(nltk.corpus.brown.words())
cumulative = 0.0
most_common_words = [word for (word, count) in fd.most_common()]
for rank, word in enumerate(most_common_words):
    cumulative += fd.freq(word)
    print("%3d %6.2f%% %s" % (rank+1, cumulative*100, word))
    if cumulative > 0.25:
        break

  1   5.40% the
  2  10.42% ,
  3  14.67% .
  4  17.78% of
  5  20.19% and
  6  22.40% to
  7  24.29% a
  8  25.97% in


In [25]:
# Looping to get successive overlapping n-grams from a list
sent = ['The', 'dog', 'gave', 'John', 'the', 'newspaper']
n = 3
[sent[i:i+n] for i in range(len(sent)-n+1)] # NLTK has functions bigrams, and trigrams and ngrams(text, n)

[['The', 'dog', 'gave'],
 ['dog', 'gave', 'John'],
 ['gave', 'John', 'the'],
 ['John', 'the', 'newspaper']]

In [29]:
# Functions as arguments: python lets us pass functions as arguments to another funntions
sent = ['Take', 'care', 'of', 'the', 'sense', ',', 'and', 'the',
        'sounds', 'will', 'take', 'care', 'of', 'themselves', '.']

def extract_property(prop):
    return [prop(word) for word in sent]

extract_property(len)

def last_letter(word):
    return word[-1]

extract_property(last_letter)

[4, 4, 2, 3, 5, 1, 3, 3, 6, 4, 4, 4, 2, 10, 1]

['e', 'e', 'f', 'e', 'e', ',', 'd', 'e', 's', 'l', 'e', 'e', 'f', 's', '.']

In [31]:
# Accumulative functions
def search1(substring, words):
    result = []                 # accumulative functions start by initializing some storage and iterate over the input to build
    for word in words:          # it up
        if substring in word:
            result.append(word)
    return result

def search2(substring, words): # search2 is a generator
    for word in words:
        if substring in word:
            yield word
# the first time search2 is called, it gets as far as the yield statement and pauses.
# the calling program gets the first word and does any necessary processing. once the calling program is ready for another word, 
# execution of the function is continued from where it stopped, until the next time it encounters a yield statement

In [34]:
def permutations(seq): # a more sophisticated example of a generator which produces all permutations of a list of words
    if len(seq) <= 1:
        yield seq
    else:
        for perm in permutations(seq[1:]):
            for i in range(len(perm) + 1):
                yield perm[:i] + seq[0:1] + perm[i:]

list(permutations(["police", "fish", "buffalo"]))

[['police', 'fish', 'buffalo'],
 ['fish', 'police', 'buffalo'],
 ['fish', 'buffalo', 'police'],
 ['police', 'buffalo', 'fish'],
 ['buffalo', 'police', 'fish'],
 ['buffalo', 'fish', 'police']]

In [37]:
["police", "fish", "buffalo"][1:]
["police", "fish", "buffalo"][:1]
["police", "fish", "buffalo"][0:1]

['fish', 'buffalo']

['police']

['police']

In [40]:
# Higher order functions
def is_content_word(word): # we use this function as the first parameter to filter() which applies the function to each item of
    return word.lower() not in ['a', 'of', 'the', 'and', 'will', ',', '.'] # the second parameter and only retains the items that
# return true

sent = ['Take', 'care', 'of', 'the', 'sense', ',', 'and', 'the', 'sounds', 'will', 'take', 'care', 'of', 'themselves', '.']

list(filter(is_content_word, sent))

['Take', 'care', 'sense', 'sounds', 'take', 'care', 'themselves']

In [42]:
# Another higher order function is map(), which applies a function to every item in a sequence.
# here is a simple way to find the average length of a sentence in the news section of the brown corpus
lengths = list(map(len, nltk.corpus.brown.sents(categories="news")))
sum(lengths)/len(lengths)

21.75081116158339

In [48]:
# we can also provide a lambda expression. here is a pair of equivalent examples which count the number of vowels in each word
list(map(lambda w: len(filter(lambda c: c.lower() in "aeiou", w)), sent))

# generally, solutions based on list comprehensions are more readable than solutions based on higher order functions

TypeError: object of type 'filter' has no len()