# Working with N-Grams

In [1]:
import nltk
import re
import os
from pathlib import Path
from nltk.lm.preprocessing import pad_both_ends, padded_everygrams
from nltk.util import everygrams
from nltk.util import ngrams
from itertools import groupby
from nltk.lm import MLE
from nltk.tag.hmm import HiddenMarkovModelTagger
import numpy as np


In [2]:
# navigate folders: move up to the project root, i.e. the folder that holds "assets/".
# The guard keeps this cell idempotent: once "assets/" is visible from the current
# directory, re-running the cell no longer climbs one more level each time.
p = Path(os.getcwd())
if not (p / "assets").is_dir():
    os.chdir(p.parent)

In [3]:
# show the current working directory to confirm where relative paths will resolve
os.getcwd()

'/Users/Chris/Documents/00.Data_science/00.MADS/GitHub_MADS/portfolio'

In [4]:
# import data: add the project-local NLTK resource folder to NLTK's search path.
# The membership check keeps the path list free of duplicates when the cell is re-run.
nltk_data_path = "assets/nltk_data"
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

## Create a few functions to help parse the data

In [5]:
def load_data(path='assets/gutenberg/THE_SONNETS.txt'):
    """
    Load text data from a file and produce a list of token lists

    Parameters
    ----------
    path : str or path-like
        Text file to read. Defaults to the sonnets corpus used in this notebook,
        so calling ``load_data()`` with no argument behaves as before.

    Returns
    -------
    list of list of str
        One token list per non-empty line of the file. Digits are removed and the
        text is lower-cased before it is tokenized with ``nltk.word_tokenize``.
    """
    # explicit encoding so the result does not depend on the platform default
    with open(path, "r", encoding="utf-8") as file_object:
        raw_lines = file_object.readlines()

    sentences = []
    for raw_line in raw_lines:
        text = re.sub(r'\d+', '', raw_line)  # removing numbers
        text = text.lstrip().strip('\n')  # removing leading space and the line break
        if not text:
            # nothing left on this line, skip it
            continue
        sentences.append(nltk.word_tokenize(text.lower()))  # tokenizing words

    return sentences

In [6]:
def build_vocab(sentences):
    """
    Take a list of sentences and return a vocab

    Parameters
    ----------
    sentences : iterable of iterable of hashable
        Tokenized sentences (token lists).

    Returns
    -------
    list
        Every distinct token exactly once, in order of first appearance, followed
        by the padding symbols '<s>' and '</s>' when they are not already present.
        The order is deterministic, so the vocabulary is identical across kernel
        restarts (a plain ``list(set(...))`` is not).
    """
    # dict keys act as an insertion-ordered set: de-duplicate, keep first-seen order
    ordered = dict.fromkeys(token for sent in sentences for token in sent)
    for pad_symbol in ('<s>', '</s>'):
        ordered.setdefault(pad_symbol)

    return list(ordered)

In [7]:
def build_ngrams(n, sentences):
    """
    Pad every (unpadded) sentence on both ends for order "n" and return, per sentence,
    the list of its n-grams. The result has one inner list for each input sentence.
    """
    all_ngrams = []
    for sent in sentences:
        # padding first, so that n-grams at the sentence boundaries are produced too
        padded_sent = list(pad_both_ends(sent, n))
        all_ngrams.append(list(ngrams(padded_sent, n)))

    return all_ngrams

In [8]:
# run functions: tokenize the corpus line by line, collect its vocabulary,
# and build the padded bigrams (n=2) of every line
sentence_data = load_data()
vocab = build_vocab(sentence_data)
bigrams = build_ngrams(2, sentence_data)

## Let's now show that we can guess the next token 
### First-Order Markov

In [9]:
# first bigram of every line; with the padding this is the ("<s>", first_word) pair
bigram_occurences = [line_bigrams[0] for line_bigrams in bigrams]

# number of line starts observed (one per line)
start_count = len(bigram_occurences)
# distinct ("<s>", first_word) pairs
bigram_set = {*bigram_occurences}
# empty dict, meant to hold bigram -> probability entries
bi_dict = dict()

In [10]:
def bigram_next_token(start_tokens=("<s>", ) * 3, all_bigrams=None):
    """
    Take some starting tokens and produce the most likely token that follows under a bi-gram model

    Under a bi-gram model only the last starting token matters, so the prediction is
    conditioned on ``start_tokens[-1]`` (an empty sequence is treated as "<s>").

    Parameters
    ----------
    start_tokens : sequence of str
        Context tokens; only the final one is used.
    all_bigrams : list of list of (str, str), optional
        Bigrams per sentence. When omitted, the notebook-level ``bigrams`` variable
        is used.

    Returns
    -------
    (next_token, prob)
        The most frequent follower of the context token and its relative frequency
        among all followers of that token. ``(None, 0.0)`` when the context token
        never starts a bigram. Ties go to the follower seen first.
    """
    if all_bigrams is None:
        all_bigrams = bigrams

    context = start_tokens[-1] if start_tokens else "<s>"

    # single pass over the data: count what follows the context token
    follower_counts = {}
    for sentence_bigrams in all_bigrams:
        for first, second in sentence_bigrams:
            if first == context:
                follower_counts[second] = follower_counts.get(second, 0) + 1

    if not follower_counts:
        return None, 0.0

    total = sum(follower_counts.values())
    next_token = max(follower_counts, key=follower_counts.get)
    prob = follower_counts[next_token] / total

    return next_token, prob

In [11]:
# okay let's test this function: which token most likely follows the start-of-line padding?
bigram_next_token(start_tokens=("<s>", ) * 3)

('and', 0.1122969837587007)

## Train an N-Gram language model
Now we are well positioned to start training an $n$-gram language model. We can fit a language model using the `MLE` class from `nltk.lm`. It requires two inputs: a list of all $n$-grams for each sentence and a vocabulary, and we have already written functions to build both. Now it's time to put them to work together.

In [12]:
def train_ngram_lm(n):
    """
    Fit and return a maximum-likelihood n-gram language model of order "n".
    Reads the global sentence_data (training text) and vocab (vocabulary) variables.
    """
    lm = MLE(n)
    # named so that it does not shadow the imported nltk.util.ngrams function
    training_ngrams = build_ngrams(n, sentence_data)
    lm.fit(training_ngrams, vocabulary_text=vocab)

    return lm

## Okay now let's have some fun and generate a Shakespearean poem from a corpus of his work

In [13]:
# Generation is random, so every run of this cell writes a different sonnet.
n = 3
num_lines = 14
num_words_per_line = 8
text_seed = ["<s>"] * (n - 1)

lm = train_ngram_lm(n)

sonnet = []
while len(sonnet) < num_lines:
    try:
        line = lm.generate(num_words_per_line, text_seed=text_seed)
    except ValueError:
        # the generation is not always successful; simply try this line again
        continue
    # drop the padding symbols before joining the tokens into a line of text
    line = [x for x in line if x not in ("<s>", "</s>")]
    sonnet.append(" ".join(line))

# pretty-print your sonnet
print("\n".join(sonnet))

to tie up envy , evermore enlarged ,
my love is a babe , then she
and to flatterer stopped are :
make sweet some vial ; treasure thou some
and you but one hour mine ,
is lust in action , lust
when your countenance filled up his burning head
you are you whose worthiness gives scope ,
eat up thy charge ? is this ,
and even thence thou wilt leave me ,
he of tall building , and my discourse
that i was thy will ,
spending again what is had , having ,


## Now let's train a Hidden Markov Model (HMM) that is able to tag words with their part-of-speech (POS)

In [14]:
def tag_sent(data_file, label=True, data_dir="assets/conll03", min_tokens=10):
    """
    Load tokens (and labels, if allowed) from a data_file

    The file is expected to hold one token per line as whitespace-separated fields
    (token first, POS tag second), with blank lines between sentences.

    Parameters
    ----------
    data_file : str
        File name inside ``data_dir``.
    label : bool
        If true, return (token, POS) pairs; otherwise return bare tokens.
    data_dir : str or path-like
        Folder that contains ``data_file``. Defaults to the notebook's CoNLL-2003 folder.
    min_tokens : int
        Sentences with fewer tokens than this are dropped. Defaults to 10.

    Returns
    -------
    list of list
        One inner list per kept sentence, holding (token, POS) tuples when ``label``
        is true and token strings otherwise.
    """
    with open(Path(data_dir) / data_file, "r", encoding="utf-8") as file_object:
        # split every row once; document-boundary markers are not tokens, so drop them
        rows = [row.split() for row in file_object if not row.startswith("-DOCSTART-")]

    # a row without fields is a blank (or whitespace-only) line, i.e. a sentence break
    sentences = [list(group) for has_fields, group in groupby(rows, key=bool) if has_fields]
    # keep only the "substantial" sentences
    kept_sentences = [sent for sent in sentences if len(sent) >= min_tokens]

    # extract POS if label is True
    if label:
        tagged_sents = [[(fields[0], fields[1]) for fields in sent] for sent in kept_sentences]
    else:
        tagged_sents = [[fields[0] for fields in sent] for sent in kept_sentences]

    return tagged_sents

The data below comes from the [CoNLL-2003 shared task](https://www.clips.uantwerpen.be/conll2003/ner/). The shared task was originally held as a competition for Named Entity Recognition (NER), but the data it provides also includes POS tags, which makes POS tagging possible. NER and POS tagging are very similar tasks, and HMMs are capable of handling both of them.

In [15]:
# Build the training, development and testing sets with tag_sent above.
# In all three data files only "substantial" sentences (at least 10 tokens) are kept;
# the test split is loaded without labels.
dataset = {
    "train": tag_sent('eng.train', label=True),
    "dev": tag_sent('eng.testa', label=True),
    "test": tag_sent('eng.testb', label=False),
}

In [16]:
# Train an HMM tagger on the labelled training sentences, which takes a while.
# The labelled dev sentences are passed as test_sequence
# (presumably what produces the accuracy line printed below - confirm in the NLTK docs).
hmm_tagger = HiddenMarkovModelTagger.train(dataset["train"], test_sequence = dataset["dev"])

accuracy over 43319 tokens: 89.13


Now that we have trained an HMM tagger, let's apply it to the testing set to see how it does. For example, the line of code below tags the first sentence in the testing set.

In [17]:
# tag the first sentence of the test split (loaded as bare tokens, without labels)
hmm_tagger.tag(dataset["test"][0])

[('SOCCER', 'NN'),
 ('-', ':'),
 ('JAPAN', 'NNP'),
 ('GET', '.'),
 ('LUCKY', '"'),
 ('WIN', 'NNP'),
 (',', ','),
 ('CHINA', 'NNP'),
 ('IN', 'IN'),
 ('SURPRISE', 'NNP'),
 ('DEFEAT', 'NNP'),
 ('.', '.')]