# Collocations and ngrams revisited

## Getting n-grams

In [1]:
# Toy sentence, split on whitespace into a list of word tokens.
txt = "now is the time for all good people to come to the aid of their country".split()

In [None]:
import nltk
# nltk.bigrams yields pairs lazily; list() materializes them so the notebook shows them.
list(nltk.bigrams(txt))

In [None]:
# Same idea for a chosen n: here every run of 3 consecutive tokens (trigrams).
list(nltk.ngrams(txt, 3))

We **could** include bigrams, or even trigrams, as dimensions in our vectors, and redo all of the analyses we have already done. But there are a couple of problems:

* There will be a lot of ngrams.
* Most of them won't appear all that often.

That means we generally want to be strategic about which ones to include. We want only the good ones (whatever that means).

Good ngrams are also interesting for other reasons.

## Load the civil war dataset

In [4]:
import wikipediaapi
# Titles of the Wikipedia articles that make up the corpus.
pages = [
    "American Civil War",
    "Abraham Lincoln",
    "Slavery in the United States",
    "Slave states and free states",
    "Emancipation Proclamation",
    "Robert E. Lee",
    "Ulysses S. Grant",
    "Conclusion of the American Civil War",
    "Origins of the American Civil War",
    "Issues of the American Civil War"
]
import re

def underscorize(pagename):
    """Return ``pagename`` with every space replaced by an underscore."""
    return pagename.replace(" ", "_")

# Client for English Wikipedia, configured for the WIKI extract format.
# NOTE(review): newer Wikipedia-API releases reportedly require a user_agent
# argument here -- confirm against the installed version.
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    extract_format=wikipediaapi.ExtractFormat.WIKI,
)

# Maps each underscored page name to the list of its paragraphs.
page_dict = {}
for page in pages:
    pagename = underscorize(page)
    print(pagename)
    p_wiki = wiki_wiki.page(pagename)
    # Split on newlines and keep only lines longer than one character.
    page_paras = [para for para in p_wiki.text.split("\n") if len(para) > 1]
    page_dict[pagename] = page_paras

American_Civil_War
Abraham_Lincoln
Slavery_in_the_United_States
Slave_states_and_free_states
Emancipation_Proclamation
Robert_E._Lee
Ulysses_S._Grant
Conclusion_of_the_American_Civil_War
Origins_of_the_American_Civil_War
Issues_of_the_American_Civil_War


### Tokenize

In [6]:
def process_para(para):
    """Split a paragraph into sentences and tokenize each one.

    Args:
        para: Paragraph text as a single string.

    Returns:
        A list with one entry per sentence found by ``nltk.sent_tokenize``;
        each entry is the token list ``nltk.word_tokenize`` produced for it.
    """
    return [nltk.word_tokenize(sentence) for sentence in nltk.sent_tokenize(para)]

In [7]:
# Flatten the corpus into one list of tokenized sentences; page and paragraph
# boundaries are dropped, sentence boundaries are kept.
tokenized_sentences = []
for page_paras in page_dict.values():
    for para in page_paras:
        tokenized_sentences.extend(process_para(para))

### Create a bigram frequency distribution

In [9]:
# Count every adjacent token pair. Pairs are taken sentence by sentence,
# so no bigram spans two sentences.
bigram_fdist = nltk.FreqDist(
    pair
    for sentence in tokenized_sentences
    for pair in nltk.bigrams(sentence)
)

In [None]:
# The 25 most frequent bigrams with their counts.
bigram_fdist.most_common(25)

## Finding meaningful collocations

Generally, we'd like to find collocations that appear more often than would be expected by chance, given how frequent each of the two words is on its own.

In [12]:
# Object exposing NLTK's bigram association scoring functions.
bigram_measures = nltk.collocations.BigramAssocMeasures()

# The four measures used in this notebook. These are bare references, not calls.
bigram_measures.raw_freq          # relative frequency of the bigram
bigram_measures.chi_sq            # chi-squared association score
bigram_measures.likelihood_ratio  # likelihood-ratio association score
bigram_measures.pmi               # pointwise mutual information

In [16]:
# Chain the tokenized sentences into one token list, right-padded per sentence for
# window size 2 (presumably so that bigrams do not straddle sentence boundaries).
# NOTE(review): _build_new_documents is a private NLTK helper (leading underscore);
# BigramCollocationFinder.from_documents(tokenized_sentences) looks like the public
# route to an equivalent finder -- confirm before relying on this across NLTK versions.
new_documents = list(nltk.BigramCollocationFinder._build_new_documents(tokenized_sentences, 2, pad_right=True))

In [37]:
# Build a collocation finder over the padded token list, window size 2 (adjacent pairs).
finder = nltk.BigramCollocationFinder.from_words(new_documents, 2)

### First, raw frequency

In [38]:
# Top 20 bigrams ranked by raw frequency (bigrams only, no scores).
finder.nbest(bigram_measures.raw_freq, 20)

[('of', 'the'),
 ('in', 'the'),
 (',', 'and'),
 (',', 'the'),
 ('to', 'the'),
 ('and', 'the'),
 ('the', 'Union'),
 ('Civil', 'War'),
 (',', 'but'),
 ('the', 'South'),
 ('the', 'war'),
 ('for', 'the'),
 ('as', 'a'),
 ('that', 'the'),
 ('United', 'States'),
 ('on', 'the'),
 (',', 'which'),
 ('by', 'the'),
 (',', 'Grant'),
 ('at', 'the')]

In [39]:
# Same ranking, but as (bigram, score) pairs; keep the first 20.
finder.score_ngrams(bigram_measures.raw_freq)[:20]

[(('of', 'the'), 0.00985164242618555),
 (('in', 'the'), 0.006820367833513073),
 ((',', 'and'), 0.006660060042554432),
 ((',', 'the'), 0.0033227433035063684),
 (('to', 'the'), 0.0030822816170684077),
 (('and', 'the'), 0.002477484042088082),
 (('the', 'Union'), 0.0020038473869830073),
 (('Civil', 'War'), 0.0019455536448162289),
 ((',', 'but'), 0.0017415255472325046),
 (('the', 'South'), 0.001508350578565391),
 (('the', 'war'), 0.0014792037074820017),
 (('for', 'the'), 0.0014646302719403073),
 (('as', 'a'), 0.00145734355416946),
 (('that', 'the'), 0.0014500568363986126),
 (('United', 'States'), 0.0014136232475443762),
 (('on', 'the'), 0.0014136232475443762),
 ((',', 'which'), 0.0013553295053775978),
 (('by', 'the'), 0.0012606021743565828),
 ((',', 'Grant'), 0.0012241685855023463),
 (('at', 'the'), 0.0011585881255647206)]

### Let's try filtering out stop words

In [45]:
# Build the stop list: the entries of the stop-word file (one per line), plus
# punctuation characters, single lowercase letters, and digits.
# The with-block closes the file as soon as it has been read, even on error.
with open("lists/stop-words_english_1_en.txt") as f:
    stop_list = f.read().split("\n")
stop_list += list('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~’')
stop_list += list("abcdefghijklmnopqrstuvwxyz0123456789")
# Convert to a set so membership tests are constant time.
stop_list = set(stop_list)

In [41]:
def filter_function(w):
    """Return True when token ``w`` is a member of ``stop_list``."""
    is_stop_word = w in stop_list
    return is_stop_word

In [48]:
# Start from a fresh finder, then remove every bigram in which any word makes
# filter_function return True (i.e. any word is in the stop list).
# apply_word_filter mutates the finder in place and returns None, so this cell
# shows no output; rank the filtered finder in a following cell.
finder = nltk.BigramCollocationFinder.from_words(new_documents, 2)
finder.apply_word_filter(filter_function)

In [49]:
# Top 20 bigrams by raw frequency from the current state of `finder`.
finder.nbest(bigram_measures.raw_freq, 20)

[('Civil', 'War'),
 ('United', 'States'),
 ('Grant', "'s"),
 ('Lee', "'s"),
 ('Lincoln', "'s"),
 ('Emancipation', 'Proclamation'),
 ('South', 'Carolina'),
 ('New', 'York'),
 ('Abraham', 'Lincoln'),
 ('American', 'Civil'),
 ('E.', 'Lee'),
 ('Robert', 'E.'),
 ('``', 'The'),
 ('slave', 'states'),
 ("'s", 'army'),
 ('Republican', 'Party'),
 ('slave', 'trade'),
 ('Major', 'General'),
 ('New', 'Orleans'),
 ('free', 'states')]

### Other measures

In [50]:
# Fresh, unfiltered finder, ranked by chi-squared instead of raw frequency.
finder = nltk.BigramCollocationFinder.from_words(new_documents, 2)
finder.nbest(bigram_measures.chi_sq, 20)

[("'30s", 'dozens'),
 ("'ll", 'hit'),
 ("'their", 'Moses'),
 ('--', '+Jacobs.pdf'),
 ('//amerautobiofa14.wiki.uml.edu/file/view/Excerpts+from+Incidents+in+the+Life+of+a+Slave+Girl+',
  '--'),
 ('10:13', 'pm'),
 ('15,741', 'other/unknown'),
 ('1526.The', 'ill-fated'),
 ('181', 'high-slavery'),
 ('190,000', 'volunteered'),
 ('205', 'low-slavery'),
 ('29th', 'Ulto'),
 ('3,200', 'kg'),
 ('364—went', 'bankrupt.Congress'),
 ('43rd', 'Battalion'),
 ('533,000', 'battle-ready'),
 ('642-ton', 'iron-hulled'),
 ('728', 'acknowledging'),
 ('9780300192001.Scholarly', 'articlesTurner'),
 ('Adalberto', 'Aguirre')]

In [53]:
# Fresh finder again, this time with a minimum-frequency filter of 10 applied
# before ranking by chi-squared.
finder = nltk.BigramCollocationFinder.from_words(new_documents, 2)
finder.apply_freq_filter(10)
finder.nbest(bigram_measures.chi_sq, 20)

[('habeas', 'corpus'),
 ('Wilmot', 'Proviso'),
 ('Founding', 'Fathers'),
 ('’', 's'),
 ('Lost', 'Cause'),
 ('United', 'States'),
 ('Civil', 'War'),
 ('Supreme', 'Court'),
 ('Emancipation', 'Proclamation'),
 ('inaugural', 'address'),
 ('Dred', 'Scott'),
 ('Thirteenth', 'Amendment'),
 ('Ulysses', 'S.'),
 ('St.', 'Louis'),
 ('New', 'York'),
 ('Maj.', 'Gen.'),
 ('Eric', 'Foner'),
 ('Robert', 'E.'),
 ('Chief', 'Justice'),
 ('Fugitive', 'Slave')]

In [54]:
# Same minimum-frequency filter of 10, ranked by likelihood ratio for comparison.
finder = nltk.BigramCollocationFinder.from_words(new_documents, 2)
finder.apply_freq_filter(10)
finder.nbest(bigram_measures.likelihood_ratio, 20)

[('Civil', 'War'),
 ('United', 'States'),
 ('of', 'the'),
 ('in', 'the'),
 (',', 'and'),
 ('Emancipation', 'Proclamation'),
 ('New', 'York'),
 (',', 'but'),
 ('did', 'not'),
 ('the', 'Union'),
 ('Robert', 'E.'),
 ('South', 'Carolina'),
 ('as', 'a'),
 ('Grant', "'s"),
 ('Lee', "'s"),
 ('Abraham', 'Lincoln'),
 ('the', 'war'),
 ('the', 'United'),
 ('had', 'been'),
 (',', 'which')]

## Using part of speech

### Tagging a sentence

In [55]:
import nltk
# Split the toy sentence on whitespace and print each token with its part-of-speech tag.
sentence = "now is the time for all good people to come to the aid of their country"
print(nltk.pos_tag(sentence.split()))

[('now', 'RB'), ('is', 'VBZ'), ('the', 'DT'), ('time', 'NN'), ('for', 'IN'), ('all', 'DT'), ('good', 'JJ'), ('people', 'NNS'), ('to', 'TO'), ('come', 'VB'), ('to', 'TO'), ('the', 'DT'), ('aid', 'NN'), ('of', 'IN'), ('their', 'PRP$'), ('country', 'NN')]


In [None]:
# Print NLTK's built-in reference for the Penn Treebank tag set.
nltk.help.upenn_tagset()

   Note that everything we did with word vectors, 
   you can do with tagged words.

### Tag the entire civil war corpus

Note that we have to separate it into sentences before tokenizing

In [60]:
import nltk
def process_para2(para):
    """Split a paragraph into sentences, tokenize them, and POS-tag every token.

    Args:
        para: Paragraph text as a single string.

    Returns:
        A list with one entry per sentence found by ``nltk.sent_tokenize``;
        each entry is a list of ``(token, tag)`` pairs.
    """
    tokenized = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(para)]
    # Tag all sentences of the paragraph in one batched call rather than calling
    # nltk.pos_tag once per sentence; the tags produced are the same.
    return nltk.pos_tag_sents(tokenized)

In [61]:
# Flatten the corpus into one list of POS-tagged sentences.
tagged_sentences = []
for page_paras in page_dict.values():
    for para in page_paras:
        tagged_sentences.extend(process_para2(para))

Let's again get the most frequent bigrams.

In [65]:
# Count adjacent pairs of (word, tag) tuples, sentence by sentence.
bigram_fdist2 = nltk.FreqDist(
    pair
    for sentence in tagged_sentences
    for pair in nltk.bigrams(sentence)
)

In [66]:
# The 25 most frequent tagged bigrams with their counts.
bigram_fdist2.most_common(25)

[((('of', 'IN'), ('the', 'DT')), 1352),
 ((('in', 'IN'), ('the', 'DT')), 936),
 (((',', ','), ('and', 'CC')), 914),
 (((',', ','), ('the', 'DT')), 456),
 ((('to', 'TO'), ('the', 'DT')), 423),
 ((('and', 'CC'), ('the', 'DT')), 340),
 ((('the', 'DT'), ('Union', 'NNP')), 275),
 ((('Civil', 'NNP'), ('War', 'NNP')), 267),
 (((',', ','), ('but', 'CC')), 239),
 ((('the', 'DT'), ('South', 'NNP')), 207),
 ((('the', 'DT'), ('war', 'NN')), 203),
 ((('for', 'IN'), ('the', 'DT')), 201),
 ((('as', 'IN'), ('a', 'DT')), 200),
 ((('that', 'IN'), ('the', 'DT')), 199),
 ((('on', 'IN'), ('the', 'DT')), 194),
 ((('United', 'NNP'), ('States', 'NNPS')), 192),
 (((',', ','), ('which', 'WDT')), 186),
 ((('by', 'IN'), ('the', 'DT')), 173),
 (((',', ','), ('Grant', 'NNP')), 168),
 ((('at', 'IN'), ('the', 'DT')), 159),
 ((('the', 'DT'), ('United', 'NNP')), 154),
 ((('.', '.'), ("''", "''")), 152),
 ((('the', 'DT'), ('North', 'NNP')), 151),
 ((('as', 'IN'), ('the', 'DT')), 151),
 ((('of', 'IN'), ('slavery', 'NN'))

### Now let's filter on part of speech

We want the first tag to be a noun or adjective (a tag starting with `N` or `J`), and the second tag to be a noun (a tag starting with `N`).

In [69]:
def filter_on_pos(bg):
    """Tell whether a tagged bigram is (noun or adjective) followed by a noun.

    Args:
        bg: One ``(bigram, count)`` item, where ``bigram`` is
            ``((word1, tag1), (word2, tag2))``.

    Returns:
        True when ``tag2`` starts with ``N`` and ``tag1`` starts with ``N``
        or ``J``; False otherwise.
    """
    tag1 = bg[0][0][1]
    tag2 = bg[0][1][1]
    # Plain prefix tests: no regex needed, and the result is a real bool.
    return tag2.startswith("N") and tag1.startswith(("N", "J"))

In [70]:
# Of the 100 most frequent tagged bigrams, print those that pass the POS filter.
for bg in filter(filter_on_pos, bigram_fdist2.most_common(100)):
    print(bg)

((('Civil', 'NNP'), ('War', 'NNP')), 267)
((('United', 'NNP'), ('States', 'NNPS')), 192)
((('Emancipation', 'NNP'), ('Proclamation', 'NNP')), 80)
((('South', 'NNP'), ('Carolina', 'NNP')), 78)
((('New', 'NNP'), ('York', 'NNP')), 77)
((('Abraham', 'NNP'), ('Lincoln', 'NNP')), 70)
((('American', 'NNP'), ('Civil', 'NNP')), 66)
((('E.', 'NNP'), ('Lee', 'NNP')), 65)
((('Robert', 'NNP'), ('E.', 'NNP')), 64)
