# Hidden Markov Model

In [70]:
import itertools
import math
import random
from collections import defaultdict
from pathlib import Path
from pprint import pprint

## Constants

In [71]:
DATA_ROOT = Path("data")
DATASETS = ["SG", "EN", "CN", "AL"]

## Part 3

### Transition Parameters

In [72]:
def load_dataset(path, split=True, shuffle=True):
    """
    Load a dataset from a specified path.
    
    Args:
        path: The path to read the data from
        split (bool): Whether to split labels from each line of data
    """
    with open(path) as f:
        sequences = [sent.split("\n") for sent in f.read().split("\n\n")][:-1]
    if shuffle:
        random.shuffle(sequences)
    if split:
        sequences = [[pair.split() for pair in seq] for seq in sequences]
        sequences = [[[pair[i] for pair in s] for s in sequences] for i in [0, 1]]
    return sequences


def pairwise(sequence, include_start_stop=True):
    a, b = itertools.tee(sequence)
    next(b)
    pairs = zip(a, b)
    if include_start_stop:
        pairs = itertools.chain([("START", sequence[0])], pairs, [(sequence[-1], "STOP")])
    return pairs


def get_transition_probabilities(sequences):
    """
    Estimate transition paramters
    """
    transitions = defaultdict(lambda: defaultdict(int))  # Number of transitions along each edge
    for sequence in sequences:
        for origin, dest in pairwise(sequence, include_start_stop=True):
            transitions[origin][dest] += 1
    transitions = {k: dict(v) for k, v in transitions.items()}
    return transitions

*Not sure if I should pass `features` or `labels` to `get_transition_probabilities`...*

In [74]:
for dataset in DATASETS:
    print(f"Dataset: {dataset}")
    features, labels = load_dataset(f"data/{dataset}/train", shuffle=False)
    transition_probabilities = get_transition_probabilities(labels)
    pprint(transition_probabilities, indent=2)

Dataset: SG
{ 'B-negative': { 'B-negative': 16,
                  'B-neutral': 9,
                  'I-negative': 1099,
                  'O': 2635,
                  'STOP': 105},
  'B-neutral': { 'B-negative': 5,
                 'B-neutral': 214,
                 'B-positive': 23,
                 'I-neutral': 9531,
                 'O': 10011,
                 'STOP': 406},
  'B-positive': { 'B-neutral': 15,
                  'B-positive': 40,
                  'I-positive': 3083,
                  'O': 4107,
                  'STOP': 211},
  'I-negative': { 'B-negative': 3,
                  'B-neutral': 2,
                  'I-negative': 410,
                  'O': 1055,
                  'STOP': 39},
  'I-neutral': { 'B-negative': 1,
                 'B-neutral': 55,
                 'B-positive': 9,
                 'I-neutral': 9362,
                 'O': 9243,
                 'STOP': 223},
  'I-positive': { 'B-neutral': 5,
                  'B-positive': 9,
                 

### Viterbi Algorithm

In [78]:
def viterbi(sequence, states, x, y):
    pass

In [80]:
for dataset in DATASETS:
    sequences = load_dataset(f"data/{dataset}/train")

    states = [[]]
    for sequence in sequences:
        pass