# Hidden Markov Model

In [1]:
import collections
import itertools
import random
import math
from sys import float_info
from collections import defaultdict
from pathlib import Path
from operator import itemgetter
from pprint import pprint

## Constants

In [2]:
DATA_ROOT = Path("data")
DATASETS = ["SG", "CN", "EN", "AL"]

## Helper Functions

In [3]:
def load_dataset(path, split=True, shuffle=False):
    """
    Load a dataset from a specified path.
    
    Args:
        path: The path to read the data from
        split (bool): Whether to split labels from each line of data
    """
    with open(path) as f:
        sequences = [sent.split("\n") for sent in f.read().split("\n\n")][:-1]
    if shuffle:
        random.shuffle(sequences)
    if split:
        sequences = [[pair.split() for pair in seq] for seq in sequences]
        sequences = [[[pair[i] for pair in s] for s in sequences] for i in [0, 1]]
    return sequences


def pairwise(sequence, include_start_stop=True):
    """
    Rolling window over iterable (with offset=1 and window_size=2)

    Args:
        sequence: The iterable to window over
        include_start_stop (bool): If True, adds START & STOP are added to either end of output
        
    Examples:
        >>> pairwise([1, 2, 3], include_start_stop=True)
        [("START", 1), (1, 2), (2, 3), (3, "STOP")]

        >>> pairwise([1, 2, 3, 4], include_start_stop=False)
        [(1, 2), (2, 3), (3, 4)]
    """
    a, b = itertools.tee(sequence)
    next(b)
    pairs = zip(a, b)
    if include_start_stop:
        pairs = itertools.chain([("START", sequence[0])], pairs, [(sequence[-1], "STOP")])
    return pairs


def flatten(sequences):
    """
    Flatten a nested sequence
    """
    return itertools.chain.from_iterable(sequences)


def count(sequences, as_probability=False):
    """
    Get a dictionary of word-count pairs in a dataset.

    Args:
        sequences: The sequence (or collection of sequences) of words to count
        as_probability (bool): Whether to return the counts as probabilties (over the entire dataset)
    """
    counts = dict(collections.Counter(flatten(sequences)))
    if as_probability:
        counts = {k: v / sum(counts.values()) for k, v in counts.items()}
    return counts


def smooth(inputs, thresh):
    """
    Replace tokens appearing less than `thresh` times with a "#UNK#" token.

    Args:
        inputs: The collection of sequences to smooth
        thresh (bool): The minimum number of occurrences required for a word to not be replaced
    """
    inputs = list(inputs)
    to_replace = {k for k, v in count(inputs, as_probability=False).items() if v < thresh}
    return [["#UNK#" if x in to_replace else x for x in sub] for sub in inputs]


def clean_inputs(inputs, emission):
    """
    For each token in the given inputs, replace it with "#UNK#" if it doesn't appear in the emission probability list.
    """
    return [[x if x in emission else "#UNK#" for x in sub] for sub in inputs]


def get_token_map(sequences):
    """
    Get token_to_id and id_to_token maps from a collection of sequences
    """
    tokens = sorted(set(flatten(sequences)))
    return {token: i for i, token in enumerate(tokens)}


def encode_numeric(sequences):
    """
    Encode a collection of token sequences as numerical values
    """
    token_map = get_token_map(sequences)
    return [[token_map[token] for token in sequence] for sequence in sequences], token_map


def decode_numeric(sequences, token_map):
    """
    Decode a collection of token ID sequences to tokens
    """
    token_map = {i: token for i, token in token_map.items()}  # Reverse token map
    return [[token_map[val] for val in sequence] for sequence in sequences]


def pprint_dict(d, max_entires=40):
    pprint(dict(itertools.islice(d.items(), max_entires)))

## Part 3

### Emission Parameters

In [4]:
def get_emission_parameters(observations, states):
    """
    Estimate emission paramters from a collection of observation-state pairs
    """
    n_observations = max(flatten(observations)) + 1  # Observation space size
    n_states = max(flatten(states)) + 1  # State space size
    emission_matrix = [[0 for _ in range(n_observations)] for _ in range(n_states)]

    for state, obs in zip(states, observations):
        for s, o in zip(state, obs):
            emission_matrix[s][o] += 1

    for i in range(n_states):
        row_sum = sum(emission_matrix[i])
        for j in range(n_observations):
            emission_matrix[i][j] /= row_sum

    return emission_matrix

In [5]:
for dataset in DATASETS:
    print(f"Dataset: {dataset}")
    features, labels = load_dataset(f"data/{dataset}/train")
    features = smooth(features, 3)  # Input feature smoothing

    # Numerically encode dataset
    feature_ids, feature_map = encode_numeric(features)
    label_ids, label_map = encode_numeric(labels)

    # Calculate Emission Parameters
    emission_matrix = get_emission_parameters(feature_ids, label_ids)
    print(f"Emission Matrix: ({len(emission_matrix)} States x {len(emission_matrix[0])} Observations)")

Dataset: SG
Emission Matrix: (7 States x 10733 Observations)
Dataset: CN
Emission Matrix: (7 States x 7364 Observations)
Dataset: EN
Emission Matrix: (21 States x 6187 Observations)
Dataset: AL
Emission Matrix: (42 States x 2698 Observations)


### Transition Parameters

In [12]:
def get_transition_parameters(state_sequences):
    """
    Estimate transition paramters from a collection of state sequences
    """
    n_states = max(flatten(state_sequences)) + 1  # State space size (Excluding START and STOP)
    transition_matrix = [[0 for _ in range(n_states + 2)] for _ in range(n_states + 1)]  # Transition matrix does not include (STOP -> other) transitions
    # transition_matrix[0] represents q(Y_i=S | Y_i-1=START)
    # transition_matrix[-1] represents q(Y_i=STOP | Y_i-1=S)
    for states in state_sequences:
        for i in range(len(states) - 1):
            if i == 0:
                transition_matrix[0][states[1] + 1] += 1
            elif i == len(states) - 2:
                transition_matrix[states[-1] + 1][-1] += 1
            else:
                transition_matrix[states[i] + 1][states[i + 1] + 1] += 1
    for row in transition_matrix:
        row_sum = sum(row)
        if row_sum:
            for i in range(len(row)):
                row[i] /= row_sum

    return transition_matrix

In [11]:
for dataset in DATASETS:
    print(f"Dataset: {dataset}")
    _, labels = load_dataset(f"data/{dataset}/train")  # Load dataset
    label_ids, label_map = encode_numeric(labels)  # Numerically encode dataset
    transition_matrix = get_transition_parameters(label_ids)
    print(f"Transition Parameters: ({len(transition_matrix)} States x {len(transition_matrix[0])} States)")
    print(transition_matrix)

Dataset: SG
Transition Parameters: (9 States x 9 States)
[[0.0, 0.014390142981931866, 0.035262746540388946, 0.020780653763045377, 0.008413406280171027, 0.029148085145510552, 0.021148452944692198, 0.87085651234426, 0.0], [0.0, 0.004098360655737705, 0.0028373266078184113, 0.0, 0.2808953341740227, 0.0, 0.0, 0.6793820933165196, 0.03278688524590164], [0.0, 0.0002959981056121241, 0.00947193937958797, 0.0011839924224484964, 0.0, 0.5175230878522378, 0.0, 0.44766753492777644, 0.0238574473123372], [0.0, 0.0, 0.001949317738791423, 0.005523066926575698, 0.0, 0.0, 0.41455490578297594, 0.543859649122807, 0.0341130604288499], [0.0, 0.00211118930330753, 0.0007037297677691766, 0.0, 0.27867698803659396, 0.0, 0.0, 0.6910626319493315, 0.02744546094299789], [0.0, 5.957700327673518e-05, 0.0032767351802204347, 0.0005361930294906167, 0.0, 0.5537682454572534, 0.0, 0.4291331546023235, 0.01322609472743521], [0.0, 0.0, 0.0011061946902654867, 0.0017699115044247787, 0.0, 0.0, 0.39557522123893807, 0.5796460176991151

### Viterbi Algorithm

In [105]:
def log(x):
    return math.log(x) if x else -float('inf')


def viterbi(observations, transition, emission):
    label_space = transition.keys()
    n = len(observations)
    path = defaultdict(lambda: defaultdict(tuple))

    # Base case
    for label in label_space:
        path[0][label] = (log(0), None)
        emit_prob = emission[observations[0]][label]
        trans_prob = transition["START"][label]
        path[1][label] = (log(trans_prob) + log(emit_prob), "START")
    path[0]["START"] = (log(1), None)

    path = {origin: {dest: count for dest, count in dests.items()} for origin, dests in path.items()}
    print(path)

In [106]:
for dataset in DATASETS:
    print(f"Dataset: {dataset}")
    features, labels = load_dataset(f"data/{dataset}/train")
    feature_counts = count(features)
    label_counts = count(labels)
    emission_parameters = get_emission_parameters(features, labels)
    transition_parameters = get_transition_parameters(labels)

    dev_features = load_dataset(f"data/{dataset}/dev.in", split=False)
    smoothed_dev_features = smooth(dev_features)
    print(dev_features[0])
    predicted_dev_labels = viterbi(dev_features[0], transition_parameters, emission_parameters)

Dataset: SG
['Tour', 'Scotland', 'followers', 'to', 'visit', '@beley_crysta2', ',', '@MyExpatJob', ',', ',', '@paridise15', ',', '@JCBlakeney76', ',', '@UnwearyWorld', ',', '@szilviade', ',', '@garrywatts1231']
{0: {'START': (0.0, None), 'O': (-inf, None), 'B-positive': (-inf, None), 'I-positive': (-inf, None), 'B-neutral': (-inf, None), 'I-neutral': (-inf, None), 'B-negative': (-inf, None), 'I-negative': (-inf, None)}, 1: {'START': (-inf, 'START'), 'O': (-0.3760144579374646, 'START'), 'B-positive': (-5.808641770632523, 'START'), 'I-positive': (-inf, 'START'), 'B-neutral': (-inf, 'START'), 'I-neutral': (-inf, 'START'), 'B-negative': (-inf, 'START'), 'I-negative': (-inf, 'START')}}
Dataset: CN
['一', '觉醒', '来天', '都', '黑', '了', '。', '梦到', '了', '继科', '哥哥', '，', '给', '他', '做饭', '忙', '得', '团团转', '，', '继科', '哥哥', '很乖', '，', '不怎么', '爱', '说话', '。']
{0: {'START': (0.0, None), 'O': (-inf, None), 'B-neutral': (-inf, None), 'I-neutral': (-inf, None), 'B-positive': (-inf, None), 'I-positive': (-inf,