# Hidden Markov Model

In [107]:
import collections
import itertools
import random
import math
from sys import float_info
from collections import defaultdict
from pathlib import Path
from operator import itemgetter
from pprint import pprint

## Constants

In [48]:
DATA_ROOT = Path("data")
DATASETS = ["SG", "CN", "EN", "AL"]

## Helper Functions

In [55]:
def load_dataset(path, split=True, shuffle=False):
    """
    Load a dataset from a specified path.
    
    Args:
        path: The path to read the data from
        split (bool): Whether to split labels from each line of data
    """
    with open(path) as f:
        sequences = [sent.split("\n") for sent in f.read().split("\n\n")][:-1]
    if shuffle:
        random.shuffle(sequences)
    if split:
        sequences = [[pair.split() for pair in seq] for seq in sequences]
        sequences = [[[pair[i] for pair in s] for s in sequences] for i in [0, 1]]
    return sequences


def pairwise(sequence, include_start_stop=True):
    """
    Rolling window over iterable (with offset=1 and window_size=2)

    Args:
        sequence: The iterable to window over
        include_start_stop (bool): If True, adds START & STOP are added to either end of output
        
    Examples:
        >>> pairwise([1, 2, 3], include_start_stop=True)
        [("START", 1), (1, 2), (2, 3), (3, "STOP")]

        >>> pairwise([1, 2, 3, 4], include_start_stop=False)
        [(1, 2), (2, 3), (3, 4)]
    """
    a, b = itertools.tee(sequence)
    next(b)
    pairs = zip(a, b)
    if include_start_stop:
        pairs = itertools.chain([("START", sequence[0])], pairs, [(sequence[-1], "STOP")])
    return pairs


def count(sequences, flatten=True, as_probability=False):
    if flatten:
        sequences = itertools.chain.from_iterable(sequences)
    counts = dict(collections.Counter(sequences))
    if as_probability:
        counts = {k: v / sum(counts.values()) for k, v in counts.items()}
    return counts


def pprint_dict(d, max_entires=20):
    pprint(dict(itertools.islice(d.items(), max_entires)))

## Part 3

### Emission Parameters

In [51]:
def get_emission_parameters(sequences):
    """
    Estimate emission paramters from a collection of input-output sequence pairs
    """
    emissions = defaultdict(lambda: defaultdict(int))
    for inputs, outputs in sequences:
        for inp, out in zip(inputs, outputs):
            emissions[inp][out] += 1
    emission_parameters = {
        inp: {
            out: count / sum(outs.values()) for out, count in outs.items()
        } for inp, outs in emissions.items()
    }
    emission_parameters = {k: emission_parameters[k] for k in sorted(emission_parameters)}
    return emission_parameters

In [52]:
for dataset in DATASETS:
    print(f"Dataset: {dataset}")
    features, labels = load_dataset(f"data/{dataset}/train")
    emission_parameters = get_emission_parameters(zip(features, labels))
    pprint_dict(emission_parameters)

Dataset: SG
{'!': {'I-negative': 0.0005693680015183147,
       'I-neutral': 0.0007591573353577528,
       'I-positive': 0.001708104004554944,
       'O': 0.996963370658569},
 '"': {'B-negative': 0.001976284584980237,
       'B-neutral': 0.001976284584980237,
       'B-positive': 0.002635046113306983,
       'I-negative': 0.001976284584980237,
       'I-neutral': 0.006587615283267457,
       'I-positive': 0.005270092226613966,
       'O': 0.9795783926218709},
 '#': {'B-positive': 0.019230769230769232, 'O': 0.9807692307692307},
 '###DVDjoseph': {'O': 1.0},
 '##speed': {'O': 1.0},
 '#010110arch': {'O': 1.0},
 '#10U': {'O': 1.0},
 '#10shots': {'O': 1.0},
 '#13': {'O': 1.0},
 '#130': {'I-neutral': 1.0},
 '#13TH': {'O': 1.0},
 '#13thjbarts': {'O': 1.0},
 '#14': {'O': 1.0},
 '#160729': {'O': 1.0},
 '#17': {'O': 1.0},
 '#1800GEYLANG': {'O': 1.0},
 '#1800s': {'O': 1.0},
 '#1835': {'O': 1.0},
 '#19': {'O': 1.0},
 '#1KONNIVERSARY2016': {'O': 1.0}}
Dataset: CN
{'!': {'I-neutral': 0.044943820224719

### Transition Parameters

In [42]:
def get_transition_parameters(sequences):
    """
    Estimate transition paramters from a collection of sequences
    """
    transitions = defaultdict(lambda: defaultdict(int))  # Number of transitions along each edge
    for sequence in sequences:
        for origin, dest in pairwise(sequence, include_start_stop=True):
            transitions[origin][dest] += 1
    transition_parameters = {
        origin: {
            dest: count / sum(dests.values()) for dest, count in dests.items()
        } for origin, dests in transitions.items()
    }
    transition_parameters = {k: transition_parameters[k] for k in sorted(transition_parameters)}
    return transition_parameters

*Not sure if I should pass `features` or `labels` to `get_transition_probabilities`...*

In [50]:
for dataset in DATASETS:
    print(f"Dataset: {dataset}")
    features, labels = load_dataset(f"data/{dataset}/train")
    transition_parameters = get_transition_parameters(labels)
    pprint_dict(transition_parameters)

Dataset: SG
{'B-negative': {'B-negative': 0.004140786749482402,
                'B-neutral': 0.002329192546583851,
                'I-negative': 0.28442028985507245,
                'O': 0.681935817805383,
                'STOP': 0.02717391304347826},
 'B-neutral': {'B-negative': 0.00024764735017335313,
               'B-neutral': 0.010599306587419515,
               'B-positive': 0.0011391778107974245,
               'I-neutral': 0.47206537890044575,
               'O': 0.4958395245170877,
               'STOP': 0.020108964834076277},
 'B-positive': {'B-neutral': 0.002011802575107296,
                'B-positive': 0.00536480686695279,
                'I-positive': 0.4134924892703863,
                'O': 0.5508315450643777,
                'STOP': 0.028299356223175965},
 'I-negative': {'B-negative': 0.0019880715705765406,
                'B-neutral': 0.0013253810470510272,
                'I-negative': 0.2717031146454606,
                'O': 0.6991385023194169,
                'STOP'

### Viterbi Algorithm

In [112]:
def log(x):
    return math.log(x) if x else math.log(float_info.min)


def viterbi(sequence, transition, emission):
    label_space = transition.keys()
    feature_space = emission.keys()
    n = len(sequence)
    path = defaultdict(lambda: defaultdict(list))
    
    # Base case
    path[0]["STOP"] = (0, None)
    path[0]["START"] = (1, None)
    for label in label_space:
        path[0][label] = (0, None)
        emit_prob = emission[sequence[0]].get(label, 0)
        trans_prob = transition["START"].get(label, 0)
        path[1][label] = (
            log(trans_prob) + log(emit_prob),
            "START"
        )

    for i in range(2, n + 1):
        for label in label_space:
            ith_word = sequence[i - 1]
            probs = [(
                path[i - 1][u][0] + log(transition[u].get(label)) + log(emission[u].get(label)),
                u
            ) for u in label_space]
            path[i][label] = max(probs, key=itemgetter(0))

    print(path)

In [113]:
for dataset in DATASETS:
    print(f"Dataset: {dataset}")
    features, labels = load_dataset(f"data/{dataset}/train")
    feature_counts = count(features)
    label_counts = count(labels)
    emission_parameters = get_emission_parameters(zip(features, labels))
    transition_parameters = get_transition_parameters(labels)

    dev_features = load_dataset(f"data/{dataset}/dev.in", split=False)
    print(dev_features[0])
    predicted_dev_labels = viterbi(dev_features[0], transition_parameters, emission_parameters)

Dataset: SG
['Tour', 'Scotland', 'followers', 'to', 'visit', '@beley_crysta2', ',', '@MyExpatJob', ',', ',', '@paridise15', ',', '@JCBlakeney76', ',', '@UnwearyWorld', ',', '@szilviade', ',', '@garrywatts1231']


KeyError: 'B-negative'