# Hidden Markov Model

In [70]:
import itertools
import math
import random
from collections import defaultdict
from pathlib import Path
from pprint import pprint

## Constants

In [71]:
# Directory that contains the dataset folders.
DATA_ROOT = Path("data")
# Dataset identifiers; each one is used as a folder name when building
# paths of the form data/<name>/train.
DATASETS = ["SG", "EN", "CN", "AL"]

## Part 3

### Transition Parameters

In [93]:
def load_dataset(path, split=True, shuffle=True):
    """
    Load a dataset from a specified path.

    The file is read as one item per line, with sequences separated by blank
    lines. Empty lines and empty sequences are ignored, so the result does not
    depend on whether the file ends with a trailing blank line.

    Args:
        path: The path to read the data from
        split (bool): Whether to split labels from each line of data
        shuffle (bool): Whether to shuffle the order of the sequences with
            `random.shuffle` (uses the global `random` state, so seed it
            beforehand for reproducible runs)

    Returns:
        If `split` is false, a list of sequences, each a list of raw lines.
        If `split` is true, a two-element list `[first_fields, second_fields]`;
        each element is a list of sequences holding, respectively, the first
        and the second whitespace-separated field of every line.

    Raises:
        IndexError: If `split` is true and a line has fewer than two fields.
    """
    # NOTE(review): UTF-8 is assumed so multilingual data is read the same way
    # on every platform — confirm the dataset files are UTF-8 encoded.
    with open(path, encoding="utf-8") as f:
        blocks = f.read().split("\n\n")
    # Filter out blank lines and empty blocks rather than blindly dropping the
    # last block: a file without a trailing blank line keeps its final sequence.
    sequences = [[line for line in block.split("\n") if line.strip()] for block in blocks]
    sequences = [seq for seq in sequences if seq]
    if shuffle:
        random.shuffle(sequences)
    if split:
        sequences = [[pair.split() for pair in seq] for seq in sequences]
        # Transpose: index 0 collects the first field of every line, index 1 the second.
        sequences = [[[pair[i] for pair in s] for s in sequences] for i in [0, 1]]
    return sequences


def pairwise(sequence, include_start_stop=True):
    """
    Iterate over consecutive pairs of items in a sequence.

    Args:
        sequence: Iterable of items (lists, tuples and one-shot iterators all work)
        include_start_stop (bool): Whether to add a leading `("START", first)`
            pair and a trailing `(last, "STOP")` pair

    Returns:
        An iterator of 2-tuples `(item, next_item)`. An empty input yields no
        pairs, or the single pair `("START", "STOP")` when
        `include_start_stop` is true.
    """
    # Materialise once so that iterators/generators are supported and the
    # first/last items can be looked up safely.
    items = list(sequence)
    pairs = zip(items, items[1:])
    if include_start_stop:
        if not items:
            # An empty sequence goes straight from START to STOP.
            return iter([("START", "STOP")])
        pairs = itertools.chain([("START", items[0])], pairs, [(items[-1], "STOP")])
    return pairs


def get_transition_probabilities(sequences):
    """
    Estimate transition parameters from a collection of sequences.

    Each sequence is walked with `pairwise(..., include_start_stop=True)`, so
    "START" appears as an origin and "STOP" as a destination.

    Args:
        sequences: Iterable of sequences of states

    Returns:
        dict: `{origin: {dest: probability}}`, where the probability is the
        number of observed `origin -> dest` transitions divided by the number
        of all transitions leaving `origin`. Only observed transitions are
        present. An empty input gives `{}`.
    """
    transitions = defaultdict(lambda: defaultdict(int))  # Number of transitions along each edge
    for sequence in sequences:
        for origin, dest in pairwise(sequence, include_start_stop=True):
            transitions[origin][dest] += 1

    # Normalise once, after every sequence has been counted; rebuilding the
    # table inside the loop above would repeat the same work per sequence.
    transition_probabilities = {}
    for origin, dests in transitions.items():
        total = sum(dests.values())  # Row total, computed once per origin
        transition_probabilities[origin] = {
            dest: count / total for dest, count in dests.items()
        }
    return transition_probabilities

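*TODO (confirm): should `features` or `labels` be passed to `get_transition_probabilities`? Transition parameters model moves from one tag (state) to the next, so `labels` is used below.*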

In [96]:
for dataset in DATASETS:
    print(f"Dataset: {dataset}")
    # Build the path from DATA_ROOT instead of repeating the "data" literal,
    # so the data location is configured in a single place.
    # shuffle=False keeps the sequences in file order.
    features, labels = load_dataset(DATA_ROOT / dataset / "train", shuffle=False)
    # Transitions are estimated over the label sequences (the second field of
    # each line), not over the features.
    transition_probabilities = get_transition_probabilities(labels)
    pprint(transition_probabilities, indent=2)

Dataset: SG
{ 'B-negative': { 'B-negative': 0.004140786749482402,
                  'B-neutral': 0.002329192546583851,
                  'I-negative': 0.28442028985507245,
                  'O': 0.681935817805383,
                  'STOP': 0.02717391304347826},
  'B-neutral': { 'B-negative': 0.00024764735017335313,
                 'B-neutral': 0.010599306587419515,
                 'B-positive': 0.0011391778107974245,
                 'I-neutral': 0.47206537890044575,
                 'O': 0.4958395245170877,
                 'STOP': 0.020108964834076277},
  'B-positive': { 'B-neutral': 0.002011802575107296,
                  'B-positive': 0.00536480686695279,
                  'I-positive': 0.4134924892703863,
                  'O': 0.5508315450643777,
                  'STOP': 0.028299356223175965},
  'I-negative': { 'B-negative': 0.0019880715705765406,
                  'B-neutral': 0.0013253810470510272,
                  'I-negative': 0.2717031146454606,
                  'O': 0.

### Viterbi Algorithm

In [78]:
def viterbi(sequence, states, x, y):
    """
    Placeholder for the Viterbi decoder; not implemented yet.

    All arguments are currently ignored and the function returns None.

    NOTE(review): the intended meaning of `x` and `y` is not shown in this
    notebook — document them when the implementation is added.
    """
    return None

In [81]:
for dataset in DATASETS:
    # Build the path from DATA_ROOT instead of repeating the "data" literal.
    # NOTE(review): with the default split=True, load_dataset returns a
    # two-element list [first fields, second fields], so the inner loop below
    # iterates over those two lists rather than over individual sentences —
    # confirm whether unpacking into two names was intended. The default
    # shuffle=True also randomises the order without a fixed seed.
    sequences = load_dataset(DATA_ROOT / dataset / "train")

    states = [[]]
    for sequence in sequences:
        pass  # Work in progress: nothing is computed per sequence yet