# Hidden Markov Model

In [1]:
import collections
import itertools
import random
import math
from sys import float_info
from collections import defaultdict
from pathlib import Path
from operator import itemgetter
from pprint import pprint
import numpy as np

In [2]:
# Render every float inside printed NumPy arrays with exactly three decimal places
np.set_printoptions(formatter={"float": lambda value: f"{value:0.3f}"})

## Constants

In [3]:
# Root directory of the datasets
DATA_ROOT = Path("data")
# Names of the datasets to process, in processing order
DATASETS = [
    "SG",
    "CN",
    "EN",
    "AL",
]

## Helper Functions

In [4]:
def load_dataset(path, split=True, shuffle=False):
    """
    Load a dataset from a specified path.

    Sentences are separated by blank lines, with one entry per line.

    Args:
        path: The path to read the data from
        split (bool): Whether to split labels from each line of data
        shuffle (bool): Whether to shuffle the sentences (with `random.shuffle`) before any splitting

    Returns:
        If `split` is False, a list of sentences, each being the list of its raw lines.
        If `split` is True, a two-element list `[features, labels]` holding, per sentence,
        the first / second whitespace-separated field of every line.
    """
    with open(path) as f:
        blocks = f.read().split("\n\n")

    # Keep every non-empty block rather than blindly dropping the last one, so the
    # final sentence survives when the file does not end with a blank line. Blank
    # lines inside a block are skipped because they carry no token.
    sequences = [[line for line in block.split("\n") if line.strip()] for block in blocks]
    sequences = [sequence for sequence in sequences if sequence]

    if shuffle:
        random.shuffle(sequences)
    if split:
        pairs = [[line.split() for line in sequence] for sequence in sequences]
        # Transpose into [all first fields, all second fields]
        sequences = [[[pair[i] for pair in sequence] for sequence in pairs] for i in [0, 1]]
    return sequences


def pairwise(sequence, include_start_stop=True):
    """
    Rolling window over iterable (with offset=1 and window_size=2)

    Works on any iterable (including one-shot iterators) and returns a lazy
    iterator of 2-tuples.

    Args:
        sequence: The iterable to window over
        include_start_stop (bool): If True, "START" and "STOP" are added to either end of the output.
            An empty iterable then yields the single pair ("START", "STOP").

    Examples:
        >>> list(pairwise([1, 2, 3], include_start_stop=True))
        [('START', 1), (1, 2), (2, 3), (3, 'STOP')]

        >>> list(pairwise([1, 2, 3, 4], include_start_stop=False))
        [(1, 2), (2, 3), (3, 4)]

        >>> list(pairwise([], include_start_stop=False))
        []
    """
    items = sequence
    if include_start_stop:
        # Padding the stream itself avoids indexing, so plain iterators work too
        items = itertools.chain(["START"], sequence, ["STOP"])
    a, b = itertools.tee(items)
    next(b, None)  # Default prevents StopIteration on an empty iterable
    return zip(a, b)


def flatten(sequences):
    """
    Lazily iterate over the items of every sub-sequence (one level of nesting is removed)
    """
    return (item for sequence in sequences for item in sequence)


def unique(sequences, sort=True):
    """
    Collect the distinct items found across a collection of sequences.

    Args:
        sequences: The collection of sequences to scan
        sort (bool): If True, return a sorted list; otherwise return an unordered set
    """
    distinct = set(flatten(sequences))
    return sorted(distinct) if sort else distinct


def count(sequences, as_probability=False):
    """
    Get a dictionary of word-count pairs in a dataset.

    Args:
        sequences: The collection of sequences of words to count
        as_probability (bool): Whether to return the counts as probabilities (over the entire dataset)

    Returns:
        A dict mapping each word to its count, or to its relative frequency
        when `as_probability` is True
    """
    counts = dict(collections.Counter(itertools.chain.from_iterable(sequences)))
    if as_probability:
        total = sum(counts.values())  # Computed once, not once per key
        counts = {word: occurrences / total for word, occurrences in counts.items()}
    return counts


def smooth(inputs, thresh):
    """
    Replace rare tokens with the "#UNK#" placeholder.

    Args:
        inputs: The collection of sequences to smooth
        thresh (int): Tokens occurring fewer than this many times across all sequences are replaced
    """
    inputs = list(inputs)
    frequencies = count(inputs, as_probability=False)
    rare = {token for token, occurrences in frequencies.items() if occurrences < thresh}

    smoothed = []
    for sequence in inputs:
        smoothed.append(["#UNK#" if token in rare else token for token in sequence])
    return smoothed


def smooth_dev(sequences, train_sequences):
    """
    Replace every token that does not occur in the training corpus with "#UNK#".

    Args:
        sequences: The collection of sequences to smooth
        train_sequences: The collection of training sequences defining the known tokens
    """
    known_tokens = unique(train_sequences, sort=False)
    smoothed = []
    for sequence in sequences:
        smoothed.append(["#UNK#" if token not in known_tokens else token for token in sequence])
    return smoothed


def get_token_map(sequences):
    """
    Build a token -> integer ID map from a collection of sequences.

    IDs are assigned by enumerating the tokens returned by `unique(sequences)`.
    """
    token_map = {}
    for token_id, token in enumerate(unique(sequences)):
        token_map[token] = token_id
    return token_map


def encode_numeric(sequences, token_map=None):
    """
    Encode a collection of token sequences as numerical values

    Args:
        sequences: The collection of token sequences to encode
        token_map (dict): Token -> ID map to encode with. It is only computed from
            `sequences` when omitted (None); an explicitly supplied map, even an
            empty one, is always used as given.

    Returns:
        An `(encoded_sequences, token_map)` tuple

    Raises:
        KeyError: If a token is missing from `token_map`
    """
    # `is None` rather than truthiness: an empty map passed on purpose must not be
    # silently replaced by one derived from the data being encoded.
    if token_map is None:
        token_map = get_token_map(sequences)
    encoded = [[token_map[token] for token in sequence] for sequence in sequences]
    return encoded, token_map


def decode_numeric(sequences, token_map):
    """
    Turn sequences of token IDs back into sequences of tokens.

    Args:
        sequences: The collection of ID sequences to decode
        token_map (dict): The token -> ID map to invert for decoding
    """
    id_to_token = {}
    for token, token_id in token_map.items():
        id_to_token[token_id] = token

    decoded = []
    for sequence in sequences:
        decoded.append([id_to_token[token_id] for token_id in sequence])
    return decoded


def pprint_dict(d, max_entires=40):
    """
    Pretty-print at most the first `max_entires` items of a dictionary.
    """
    leading_items = itertools.islice(d.items(), max_entires)
    pprint(dict(leading_items))

## Part 3

### Emission Parameters

In [5]:
def get_emission_parameters(observations, states):
    """
    Estimate emission parameters from a collection of observation-state pairs

    Args:
        observations: Collection of observation ID sequences
        states: Collection of state ID sequences, aligned element-wise with `observations`

    Returns:
        An array of shape (n_states, n_observations) whose entry [s, o] is
        count(s emits o) / count(s). Every row of a state that occurs in the data
        sums to 1; rows of state IDs that never occur stay all-zero.

    Raises:
        ValueError: If `observations` or `states` contains no items at all
    """
    n_observations = max(itertools.chain.from_iterable(observations)) + 1  # Observation space size
    n_states = max(itertools.chain.from_iterable(states)) + 1  # State space size
    emission_matrix = np.zeros((n_states, n_observations))

    for state_sequence, observation_sequence in zip(states, observations):
        for s, o in zip(state_sequence, observation_sequence):
            emission_matrix[s, o] += 1

    # Normalise per state (row): P(observation | state). Summing over axis 0 would
    # instead normalise per observation and yield P(state | observation).
    state_totals = emission_matrix.sum(axis=1, keepdims=True)
    # `where` leaves rows without any count untouched (all zeros) instead of producing NaN
    np.divide(emission_matrix, state_totals, out=emission_matrix, where=state_totals > 0)

    return emission_matrix

In [6]:
# Estimate the emission parameters of every dataset and report the matrix size
for dataset in DATASETS:
    print(f"Dataset: {dataset}")
    # Build the path from DATA_ROOT so the data location is configured in one place
    features, labels = load_dataset(DATA_ROOT / dataset / "train")
    features = smooth(features, 3)  # Input feature smoothing

    # Numerically encode dataset
    feature_ids, feature_map = encode_numeric(features)
    label_ids, label_map = encode_numeric(labels)

    # Calculate Emission Parameters
    emission_matrix = get_emission_parameters(feature_ids, label_ids)
    print(f"Emission Matrix: ({len(emission_matrix)} States -> {len(emission_matrix[0])} Observations)")

Dataset: SG
Emission Matrix: (7 States -> 10733 Observations)
Dataset: CN
Emission Matrix: (7 States -> 7364 Observations)
Dataset: EN
Emission Matrix: (21 States -> 6187 Observations)
Dataset: AL
Emission Matrix: (42 States -> 2698 Observations)


### Transition Parameters

In [18]:
def get_transition_parameters(state_sequences):
    """
    Estimate transition parameters from a collection of state sequences

    Args:
        state_sequences: Collection of state ID sequences. Empty sequences are ignored.

    Returns:
        An array of shape (n_states + 1, n_states + 1). Row 0 is START and row i + 1
        is state i; column j is state j and the last column is STOP. Every row that
        has at least one counted transition sums to 1; other rows stay all-zero.

    Raises:
        ValueError: If `state_sequences` contains no states at all
    """
    n_states = max(itertools.chain.from_iterable(state_sequences)) + 1  # State space size (Excluding START and STOP)
    transition_matrix = np.zeros((n_states + 1, n_states + 1))
    stop = n_states  # Column index of STOP

    for state_sequence in state_sequences:
        if len(state_sequence) == 0:
            continue
        transition_matrix[0, state_sequence[0]] += 1
        # Pair each state with its successor only; starting the index at 0 would wrap
        # around and count a spurious last -> first transition.
        for previous, current in zip(state_sequence, state_sequence[1:]):
            transition_matrix[previous + 1, current] += 1
        transition_matrix[state_sequence[-1] + 1, stop] += 1

    row_totals = transition_matrix.sum(axis=1, keepdims=True)
    # `where` leaves rows without any count untouched (all zeros) instead of producing NaN
    np.divide(transition_matrix, row_totals, out=transition_matrix, where=row_totals > 0)
    return transition_matrix

In [44]:
# Estimate the transition parameters of every dataset and report the matrix size
for dataset in DATASETS:
    print(f"Dataset: {dataset}")
    # Build the path from DATA_ROOT so the data location is configured in one place
    _, labels = load_dataset(DATA_ROOT / dataset / "train")  # Load dataset
    label_ids, label_map = encode_numeric(labels)  # Numerically encode dataset
    transition_matrix = get_transition_parameters(label_ids)
    print(f"Transition Parameters: ({len(transition_matrix)} States -> {len(transition_matrix[0])} States)")

Dataset: SG
Transition Parameters: (8 States -> 8 States)
Dataset: CN
Transition Parameters: (8 States -> 8 States)
Dataset: EN
Transition Parameters: (22 States -> 22 States)
Dataset: AL
Transition Parameters: (43 States -> 43 States)


### Viterbi Algorithm

In [62]:
def log(x):
    """
    Natural logarithm with a finite stand-in for log(0).

    Returns -100 for a falsy `x` (e.g. 0 or 0.0) so that impossible events are
    heavily penalised while scores stay finite and comparable.
    """
    return math.log(x) if x else -100


def viterbi(observations, transition_matrix, emission_matrix):
    """
    Find the most likely state sequence for a sequence of observations.

    Args:
        observations: Sequence of observation IDs (second index into `emission_matrix`)
        transition_matrix: Row 0 holds the START -> state probabilities and row i + 1 the
            state i -> state probabilities. If the rows have a column with index
            `n_states`, it is read as the state -> STOP probability and applied when
            choosing the final state; without it the final state is chosen from the
            path scores alone.
        emission_matrix: `emission_matrix[state][observation]` is the probability of
            `state` emitting `observation`

    Returns:
        A list with one state ID per observation ([] for empty `observations`)
    """
    n_observations = len(observations)
    if n_observations == 0:
        return []

    start_p = transition_matrix[0]
    transition_matrix = transition_matrix[1:]
    n_states = len(transition_matrix)
    states = range(n_states)

    # Transition scores do not depend on the time step, so take their logs once
    log_transition = [[log(transition_matrix[prev][cur]) for cur in states] for prev in states]

    # scores[s]: best log-probability of any path that ends in state s at the current step
    scores = [log(start_p[s]) + log(emission_matrix[s][observations[0]]) for s in states]
    # backpointers[t - 1][s]: predecessor of state s at step t on that best path
    backpointers = []

    for t in range(1, n_observations):
        step_scores = []
        step_pointers = []
        for cur in states:
            log_emission = log(emission_matrix[cur][observations[t]])
            # Tuples compare by score first, then by state ID, so exact ties are
            # resolved in favour of the highest predecessor ID.
            best_score, best_prev = max(
                (scores[prev] + log_transition[prev][cur] + log_emission, prev) for prev in states
            )
            step_scores.append(best_score)
            step_pointers.append(best_prev)
        scores = step_scores
        backpointers.append(step_pointers)

    # Termination: account for the transition into STOP when the matrix provides it
    if len(start_p) > n_states:
        final_scores = [scores[s] + log(transition_matrix[s][n_states]) for s in states]
    else:
        final_scores = scores

    state = final_scores.index(max(final_scores))  # First state reaching the maximum
    path = [state]
    # Walk the back-pointers from the last step to the first, then restore order
    for step_pointers in reversed(backpointers):
        state = step_pointers[state]
        path.append(state)
    path.reverse()
    return path

In [80]:
def predict_and_save(train_file, test_file, output_file):
    """
    Fit the HMM parameters on a training file, label a test file and save the result.

    Args:
        train_file: Path of the labelled training data
        test_file: Path of the unlabelled data to predict labels for
        output_file: Path the predictions are written to, one "token label" pair per
            line with an empty line after every sequence
    """
    train_features, train_labels = load_dataset(train_file)
    train_features = smooth(train_features, 3)  # Replace rare training tokens

    # Map tokens and labels to integer IDs
    feature_ids, feature_map = encode_numeric(train_features)
    label_ids, label_map = encode_numeric(train_labels)

    # Estimate the model parameters
    emission_matrix = get_emission_parameters(feature_ids, label_ids)
    transition_matrix = get_transition_parameters(label_ids)

    # Prepare the test data with the training vocabulary and the training token map
    dev_features = load_dataset(test_file, split=False)
    known_dev_features = smooth_dev(dev_features, train_features)
    dev_feature_ids, _ = encode_numeric(known_dev_features, token_map=feature_map)

    # Decode the most likely label sequence of every test sequence
    predicted_ids = [viterbi(ids, transition_matrix, emission_matrix) for ids in dev_feature_ids]
    predicted_labels = decode_numeric(predicted_ids, label_map)

    # The original (unsmoothed) tokens are written next to their predicted labels
    with open(output_file, "w") as outfile:
        for tokens, tags in zip(dev_features, predicted_labels):
            for token, tag in zip(tokens, tags):
                print(token, tag, file=outfile)
            print(file=outfile)

In [82]:
# Train on, predict for and save the dev-set labels of every dataset
for dataset in DATASETS:
    print(f"Dataset: {dataset}")
    # Build the paths from DATA_ROOT so the data location is configured in one place
    dataset_dir = DATA_ROOT / dataset
    predict_and_save(dataset_dir / "train", dataset_dir / "dev.in", dataset_dir / "dev.p3.out")

Dataset: SG
Dataset: CN
Dataset: EN
Dataset: AL
