# Collocations and ngrams revisited

## Getting n-grams

In [None]:
# Toy example: split a sentence on whitespace into a list of word tokens.
txt = "now is the time for all good people to come to the aid of their country".split()

In [None]:
import nltk
# Pairs of adjacent tokens; list() materializes them so the notebook displays the pairs.
list(nltk.bigrams(txt))

In [None]:
# Same idea with n=3: every run of three consecutive tokens (trigrams).
list(nltk.ngrams(txt, 3))

We **could** include bigrams, or even trigrams, as dimensions in our vectors, and redo all of the analyses we have already done. But there are a couple of complications:

* There will be a lot of ngrams.
* Most of them won't appear all that often.

That means we generally want to be strategic about which ones to include. We want only the good ones (whatever that means).

Good ngrams are also interesting for other reasons.

## Load the civil war dataset

In [None]:
import wikipediaapi
# Wikipedia article titles that together make up the civil war corpus.
pages = [
    "American Civil War",
    "Abraham Lincoln",
    "Slavery in the United States",
    "Slave states and free states",
    "Emancipation Proclamation",
    "Robert E. Lee",
    "Ulysses S. Grant",
    "Conclusion of the American Civil War",
    "Origins of the American Civil War",
    "Issues of the American Civil War",
]
import re

def underscorize(pagename):
    """Return *pagename* with every space replaced by an underscore."""
    return pagename.replace(" ", "_")

# Client for English Wikipedia, configured with the WIKI extract format.
# NOTE(review): recent Wikipedia-API releases appear to require a user_agent
# argument here — confirm against the installed version.
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    extract_format=wikipediaapi.ExtractFormat.WIKI,
)

# page_dict maps each underscored page title to that page's paragraphs.
page_dict = {}
for page in pages:
    pagename = underscorize(page)
    print(pagename)  # progress output: one title per page requested
    p_wiki = wiki_wiki.page(pagename)
    page_text = p_wiki.text.split("\n")
    # Keep only lines longer than one character (drops blank lines).
    page_paras = [line for line in page_text if len(line) > 1]
    page_dict[pagename] = page_paras

### Wordify

In [None]:
def process_para(para):
    """Tokenize a paragraph into sentences, each a list of word tokens.

    Args:
        para: A paragraph of raw text.

    Returns:
        A list with one entry per sentence found by ``nltk.sent_tokenize``;
        each entry is the list of tokens from ``nltk.word_tokenize``.
    """
    return [
        nltk.word_tokenize(sentence)
        for sentence in nltk.sent_tokenize(para)
    ]

In [None]:
# Flatten the corpus: one tokenized sentence per element, across all pages.
wordified_sentences = []
for pagename, page_paras in page_dict.items():
    for para in page_paras:
        wordified_sentences.extend(process_para(para))

In [None]:
# Sanity check: look at the first tokenized sentence.
print(wordified_sentences[0])

### Create a bigram frequency distribution

In [None]:
# Count adjacent token pairs one sentence at a time, so that no bigram
# spans a sentence boundary.
bigram_fdist = nltk.FreqDist(
    pair
    for sent in wordified_sentences
    for pair in nltk.bigrams(sent)
)

In [None]:
# Display the 25 most frequent bigrams together with their counts.
bigram_fdist.most_common(25)

## Using part of speech

### Tagging a sentence

In [None]:
import nltk
# Tag a whitespace-tokenized toy sentence and show the (word, tag) pairs.
sentence = "now is the time for all good people to come to the aid of their country"
tagged = nltk.pos_tag(sentence.split())
print(tagged)

In [None]:
# Show NLTK's help text for the Penn Treebank ("upenn") tagset.
nltk.help.upenn_tagset()

Note that everything we did with word vectors, you can also do with tagged words.

### Tag the entire civil war corpus

Note that we have to split each paragraph into sentences before wordifying (tokenizing) and tagging them.

In [None]:
import nltk
def process_para2(para):
    """Split a paragraph into sentences and part-of-speech tag each one.

    Args:
        para: A paragraph of raw text.

    Returns:
        A list with one entry per sentence; each entry is a list of
        ``(token, tag)`` pairs.
    """
    wordified = [
        nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(para)
    ]
    # NLTK documents pos_tag_sents as the efficient way to tag more than one
    # sentence; it yields the same per-sentence output as calling pos_tag
    # on each sentence in turn.
    return nltk.pos_tag_sents(wordified)

In [None]:
# Flatten the corpus again, this time keeping a POS tag on every token.
tagged_sentences = []
for pagename, page_paras in page_dict.items():
    for para in page_paras:
        tagged_sentences.extend(process_para2(para))

In [None]:
# Sanity check: look at the first tagged sentence.
tagged_sentences[0]

Let's again get the most frequent bigrams.

In [None]:
# Same bigram count as before, but each element of a pair is now a tagged
# token taken from tagged_sentences rather than a bare word.
bigram_fdist2 = nltk.FreqDist(
    pair
    for sent in tagged_sentences
    for pair in nltk.bigrams(sent)
)

In [None]:
# Display the 25 most frequent bigrams of tagged words with their counts.
bigram_fdist2.most_common(25)

### Now let's filter on part of speech

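We want the first tag to be a noun or adjective, and the second tag to be a noun. In the Penn Treebank tagset, noun tags start with `N` (`NN`, `NNS`, `NNP`, `NNPS`) and adjective tags start with `J` (`JJ`, `JJR`, `JJS`), so checking the first letter of each tag is enough.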

In [None]:
def filter_on_pos(bg):
    """Return True if a counted, tagged bigram is (noun|adjective) + noun.

    Args:
        bg: One item from ``FreqDist.most_common`` over tagged bigrams,
            shaped ``(((word1, tag1), (word2, tag2)), count)``.

    Returns:
        bool: True when ``tag2`` starts with ``N`` and ``tag1`` starts with
        ``N`` or ``J``; False otherwise.
    """
    # bg[0] is the bigram itself; bg[1] (the count) is not needed here.
    tag1 = bg[0][0][1]
    tag2 = bg[0][1][1]
    # A plain prefix test gives a real boolean and needs no regex.
    return tag2.startswith("N") and tag1.startswith(("N", "J"))

In [None]:
# Among the 100 most frequent tagged bigrams, print only those that pass
# the part-of-speech filter.
for bg in bigram_fdist2.most_common(100):
    if not filter_on_pos(bg):
        continue
    print(bg)