## Word frequency, unigrams, bigrams and trigrams

In [3]:
import nltk
from nltk.util import ngrams
# Fetch the 'alpino' data package before the corpus is read below.
nltk.download('alpino')
from nltk.corpus import alpino
# Last expression of the cell: the notebook displays a preview of the corpus tokens.
alpino.words()

[nltk_data] Downloading package alpino to
[nltk_data]     /Users/alextanhongpin/nltk_data...
[nltk_data]   Package alpino is already up-to-date!


['De', 'verzekeringsmaatschappijen', 'verhelen', ...]

In [11]:
from itertools import islice

# Build the 1-grams (single-token tuples) of the Alpino corpus.
unigrams = ngrams(alpino.words(), 1)

# Preview the first 11 unigrams. islice stops after the requested number of
# items, so no manual counter/break is needed and the remainder of the corpus
# is never consumed.
for gram in islice(unigrams, 11):
    print(gram)

('De',)
('verzekeringsmaatschappijen',)
('verhelen',)
('niet',)
('dat',)
('ook',)
('de',)
('rentegrondslag',)
('van',)
('vier',)
('procent',)


In [12]:
from itertools import islice

# Build the 4-grams (tuples of four consecutive tokens) of the Alpino corpus.
quadgrams = ngrams(alpino.words(), 4)

# Preview the first 11 quadgrams. islice stops after the requested number of
# items, so no manual counter/break is needed and the remainder of the corpus
# is never consumed.
for gram in islice(quadgrams, 11):
    print(gram)

('De', 'verzekeringsmaatschappijen', 'verhelen', 'niet')
('verzekeringsmaatschappijen', 'verhelen', 'niet', 'dat')
('verhelen', 'niet', 'dat', 'ook')
('niet', 'dat', 'ook', 'de')
('dat', 'ook', 'de', 'rentegrondslag')
('ook', 'de', 'rentegrondslag', 'van')
('de', 'rentegrondslag', 'van', 'vier')
('rentegrondslag', 'van', 'vier', 'procent')
('van', 'vier', 'procent', 'nog')
('vier', 'procent', 'nog', 'een')
('procent', 'nog', 'een', 'ruime')


In [17]:
import nltk
from nltk.collocations import BigramCollocationFinder
# Fetch the 'webtext' data package before the corpus is read in later cells.
nltk.download('webtext')
from nltk.corpus import webtext
# Association measures used to score bigrams in the cells that follow.
from nltk.metrics import BigramAssocMeasures

[nltk_data] Downloading package webtext to
[nltk_data]     /Users/alextanhongpin/nltk_data...
[nltk_data]   Package webtext is already up-to-date!


In [20]:
# Normalise every token of 'grail.txt' to lower case, then hand the token
# stream to the bigram collocation finder.
tokens = list(map(str.lower, webtext.words('grail.txt')))
words = BigramCollocationFinder.from_words(tokens)

In [21]:
# Rank the bigrams by likelihood ratio and display the ten best-scoring pairs.
# No word filter has been applied to this finder.
likelihood = BigramAssocMeasures.likelihood_ratio
words.nbest(likelihood, 10)

[("'", 's'),
 ('arthur', ':'),
 ('#', '1'),
 ("'", 't'),
 ('villager', '#'),
 ('#', '2'),
 (']', '['),
 ('1', ':'),
 ('oh', ','),
 ('black', 'knight')]

In [25]:
from nltk.corpus import stopwords
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder 
from nltk.metrics import BigramAssocMeasures

In [38]:
import nltk

# The stop-word list is its own NLTK data package. Fetch it here (quietly) so
# this cell also runs on a machine where it has not been downloaded yet.
nltk.download('stopwords', quiet=True)

stopwordsset = set(stopwords.words('english'))


def stops_filter(w):
    """Return True when token ``w`` must be excluded from the bigram counts.

    A token is excluded if it is shorter than three characters or if it is
    an English stop word.
    """
    return len(w) < 3 or w in stopwordsset


# Lowercase all the words first.
tokens = [t.lower() for t in webtext.words('grail.txt')]
words = BigramCollocationFinder.from_words(tokens)

# Drop the candidate bigrams whose words are rejected by the filter.
words.apply_word_filter(stops_filter)

# Number of top-scoring bigrams to display (a result count, not a frequency).
top_n = 10
words.nbest(BigramAssocMeasures.likelihood_ratio, top_n)

[('black', 'knight'),
 ('clop', 'clop'),
 ('head', 'knight'),
 ('mumble', 'mumble'),
 ('squeak', 'squeak'),
 ('saw', 'saw'),
 ('holy', 'grail'),
 ('run', 'away'),
 ('french', 'guard'),
 ('cartoon', 'character')]

## Generating bigrams from a text using collocation finders.

In [36]:
import nltk
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures

# Tokenise a short sample text, score every adjacent token pair by its raw
# frequency, and display the bigrams (scores discarded) in sorted order.
text = 'Hardwork is the key to success. Never give up!'
finder = BigramCollocationFinder.from_words(nltk.wordpunct_tokenize(text))
measures = BigramAssocMeasures()
scored_bigrams = finder.score_ngrams(measures.raw_freq)
sorted(pair for pair, _score in scored_bigrams)

[('.', 'Never'),
 ('Hardwork', 'is'),
 ('Never', 'give'),
 ('give', 'up'),
 ('is', 'the'),
 ('key', 'to'),
 ('success', '.'),
 ('the', 'key'),
 ('to', 'success'),
 ('up', '!')]

## Generating bigrams using ngrams

In [39]:
from nltk.util import ngrams
from nltk.corpus import alpino
from itertools import islice

# Build the 2-grams (pairs of consecutive tokens) of the Alpino corpus.
bigrams = ngrams(alpino.words(), 2)

# Preview the first 11 bigrams. islice stops after the requested number of
# items, so no manual counter/break is needed and the remainder of the corpus
# is never consumed.
for gram in islice(bigrams, 11):
    print(gram)

('De', 'verzekeringsmaatschappijen')
('verzekeringsmaatschappijen', 'verhelen')
('verhelen', 'niet')
('niet', 'dat')
('dat', 'ook')
('ook', 'de')
('de', 'rentegrondslag')
('rentegrondslag', 'van')
('van', 'vier')
('vier', 'procent')
('procent', 'nog')


## Generating quadgram frequencies

In [41]:
import nltk
from nltk import wordpunct_tokenize
from nltk.collocations import QuadgramCollocationFinder

In [42]:
# Count every run of four consecutive tokens in a sample sentence and print
# each quadgram followed by the number of times it occurs.
text = "Hello how are you doing ? I hope you find the book interesting"
tokens = wordpunct_tokenize(text)
quad_finder = QuadgramCollocationFinder.from_words(tokens)
for entry in quad_finder.ngram_fd.items():
    # entry is a (quadgram, count) pair; unpack it into print's arguments.
    print(*entry)

('Hello', 'how', 'are', 'you') 1
('how', 'are', 'you', 'doing') 1
('are', 'you', 'doing', '?') 1
('you', 'doing', '?', 'I') 1
('doing', '?', 'I', 'hope') 1
('?', 'I', 'hope', 'you') 1
('I', 'hope', 'you', 'find') 1
('hope', 'you', 'find', 'the') 1
('you', 'find', 'the', 'book') 1
('find', 'the', 'book', 'interesting') 1
