In [1]:
import re
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import plotly.express as px
from collections import defaultdict, Counter
from unidecode import unidecode
from typing import List, Tuple, Dict, Union
import log
import mynlputils as nu

In [2]:
logger = log.get_logger(__name__)

In [3]:
def load_data(raw_txt_train_path: str, raw_txt_test_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    df_train = pd.read_excel(raw_txt_train_path)
    df_test = pd.read_excel(raw_txt_test_path)
    logger.info(f"df_train.shape: {df_train.shape}")
    logger.info(f"df_train unique tokens: {df_train['Token'].nunique()}")
    logger.info(f"df_train unique POS: {df_train['POS'].nunique()}")
    logger.info(f"df_test.shape: {df_test.shape}")
    logger.info(f"df_test unique tokens: {df_test['Token'].nunique()}")
    logger.info(f"df_test unique POS: {df_test['POS'].nunique()}")
    return df_train, df_test


def remove_punctuation(data: pd.DataFrame):
    """
    Removes rows with 'PUNCT' in the 'POS' column from the dataset.

    Args:
    data (pd.DataFrame): DataFrame containing the tokenized isiZulu data with 'Token' and 'POS' columns.

    Returns:
    pd.DataFrame: DataFrame with rows containing 'PUNCT' removed.
    """
    return data[data['POS'] != 'PUNCT']


def split_sentences(data: pd.DataFrame):
    """
    Splits the dataset into sentences based on rows with NaN values.

    Args:
    data (pd.DataFrame): DataFrame containing the tokenized isiZulu data with 'Token' and 'POS' columns.

    Returns:
    list: List of sentences, where each sentence is a list of tuples (Token, POS).
    """
    sentences = []
    sentence = []
    for _, row in data.iterrows():
        if pd.isnull(row['Token']) and pd.isnull(row['POS']):
            if sentence:
                sentences.append(sentence)
                sentence = []
        else:
            sentence.append((row['Token'], row['POS']))
    if sentence:
        sentences.append(sentence)
    return sentences

def compute_transition_probabilities(sentences: List[Tuple[str, str]], smoothing: float = 0.0) -> dict:
    """
    Computes transition probabilities for the Hidden Markov Model (HMM) based on the given sentences.

    Args:
    sentences (List[Tuple[str, str]]): List of sentences where each sentence is a tuple of (token, POS).
    smoothing (float): Smoothing parameter for Laplace smoothing. Default is 0.0 for no smoothing.

    Returns:
    dict: Transition probabilities for the HMM.
    """
    transition_counts = {}
    transition_probabilities = {}

    for sentence in sentences:
        for i in range(len(sentence) - 1):
            current_token, current_pos = sentence[i]
            next_token, next_pos = sentence[i + 1]

            if current_pos not in transition_counts:
                transition_counts[current_pos] = {}

            if next_pos not in transition_counts[current_pos]:
                transition_counts[current_pos][next_pos] = smoothing

            transition_counts[current_pos][next_pos] += 1

    for current_pos, next_pos_counts in transition_counts.items():
        total_count = sum(next_pos_counts.values())
        transition_probabilities[current_pos] = {}

        for next_pos, count in next_pos_counts.items():
            if smoothing == 0.0:
                transition_probabilities[current_pos][next_pos] = count / total_count
            else:
                transition_probabilities[current_pos][next_pos] = count / (total_count + smoothing * len(transition_counts[current_pos]))

    return transition_probabilities


def compute_emission_probabilities(sentences: List[Tuple[str, str]], smoothing: float = 1.0) -> dict:
    """
    Computes emission probabilities for the Hidden Markov Model (HMM) based on the given sentences.

    Args:
    sentences (List[Tuple[str, str]]): List of sentences where each sentence is a tuple of (token, POS).
    smoothing (float): Smoothing parameter for Laplace smoothing. Default is 0.0 for no smoothing.

    Returns:
    dict: Emission probabilities for the HMM.
    """
    emission_counts = {}
    emission_probabilities = {}

    for sentence in sentences:
        for token, pos in sentence:
            if pos not in emission_counts:
                emission_counts[pos] = {}

            if token not in emission_counts[pos]:
                emission_counts[pos][token] = smoothing

            emission_counts[pos][token] += 1

    for pos, token_counts in emission_counts.items():
        total_count = sum(token_counts.values())
        emission_probabilities[pos] = {}

        for token, count in token_counts.items():
            if smoothing == 0.0:
                emission_probabilities[pos][token] = count / total_count
            else:
                emission_probabilities[pos][token] = count / (total_count + smoothing * len(emission_counts[pos]))

    return emission_probabilities


def train(sentences_train, smoothing):
    transitions = compute_transition_probabilities(sentences_train, smoothing) # Compute transition probabilities
    emissions = compute_emission_probabilities(sentences_train, smoothing) # Compute emission probabilities
    return {'transitions': transitions, 'emissions': emissions} # Create and return the trained HMM model


def evaluate_hmm_model(test_sentences: List[Tuple[str, str]], transition_probabilities: dict, emission_probabilities: dict) -> float:
    """
    Evaluates the Hidden Markov Model (HMM) on the test sentences and returns the accuracy.

    Args:
    test_sentences (List[Tuple[str, str]]): List of test sentences where each sentence is a tuple of (token, POS).
    transition_probabilities (dict): Transition probabilities for the HMM.
    emission_probabilities (dict): Emission probabilities for the HMM.

    Returns:
    float: Accuracy of the HMM on the test sentences.
    """
    total_tokens = 0
    correct_predictions = 0

    for sentence in test_sentences:
        tokens = [token for token, _ in sentence]
        true_pos_tags = [pos for _, pos in sentence]
        predicted_pos_tags = tag_sentence(tokens, transition_probabilities, emission_probabilities)

        total_tokens += len(tokens)
        correct_predictions += sum(1 for true_pos, predicted_pos in zip(true_pos_tags, predicted_pos_tags) if true_pos == predicted_pos)

    accuracy = correct_predictions / total_tokens
    return round(accuracy * 100, 2)


def tag_sentence(tokens: List[str], transition_probabilities: dict, emission_probabilities: dict) -> List[str]:
    """
    Tags a sentence with part-of-speech (POS) tags using the Hidden Markov Model (HMM).

    Args:
    tokens (List[str]): List of tokens in the sentence.
    transition_probabilities (dict): Transition probabilities for the HMM.
    emission_probabilities (dict): Emission probabilities for the HMM.

    Returns:
    List[str]: List of predicted POS tags for the sentence.
    """
    n = len(tokens)
    viterbi = []

    # Initialization
    viterbi.append({})

    for pos, emission_prob in emission_probabilities.items():
        transition_prob = transition_probabilities.get('START', {}).get(pos, 0)
        viterbi[0][pos] = transition_prob * emission_prob.get(tokens[0], 0)

    # Recursion
    for t in range(1, n):
        viterbi.append({})
        for pos, emission_prob in emission_probabilities.items():
            max_prob = max(
                viterbi[t - 1][prev_pos] * transition_probabilities[prev_pos].get(pos, 0) * emission_prob.get(tokens[t], 0)
                for prev_pos in transition_probabilities)
            viterbi[t][pos] = max_prob

    # Termination
    final_pos = max(viterbi[-1], key=viterbi[-1].get)
    predicted_pos_tags = [final_pos]

    # Backtracking
    for t in range(n - 2, -1, -1):
        predicted_pos = max(
            transition_probabilities[prev_pos].get(predicted_pos_tags[0], 0) * viterbi[t][prev_pos]
            for prev_pos in transition_probabilities)
        predicted_pos_tags.insert(0, predicted_pos)

    return predicted_pos_tags

In [4]:
conf = nu.load_config("a2")
df_train, df_test = load_data(conf.paths.raw_txt_train, conf.paths.raw_txt_test)
df_train = remove_punctuation(df_train)
df_test = remove_punctuation(df_test)
# Split sentences for training and test sets
sentences_train, sentences_valid = train_test_split(split_sentences(df_train), test_size=0.15, random_state=42)
sentences_test = split_sentences(df_test)
logger.info(f"Number of training sentences: {len(sentences_train)}")
logger.info(f"Number of validation sentences: {len(sentences_valid)}")
logger.info(f"Number of test sentences: {len(sentences_test)}")
model = train(sentences_train, conf.model.smoothing)
accuracy = evaluate_hmm_model(sentences_valid, model['transitions'], model['emissions'])
accuracy # = round(accuracy * 100, 2)

14-Jul-23 23:53:08 - INFO - Starting 'load_config'.
14-Jul-23 23:53:08 - INFO - Finished 'load_config' in 0.0101 secs.
14-Jul-23 23:53:08 - INFO - df_train.shape: (58096, 2)
14-Jul-23 23:53:08 - INFO - df_train unique tokens: 7108
14-Jul-23 23:53:08 - INFO - df_train unique POS: 97
14-Jul-23 23:53:08 - INFO - df_test.shape: (6163, 2)
14-Jul-23 23:53:08 - INFO - df_test unique tokens: 1675
14-Jul-23 23:53:08 - INFO - df_test unique POS: 86
14-Jul-23 23:53:09 - INFO - Number of training sentences: 2221
14-Jul-23 23:53:09 - INFO - Number of validation sentences: 392
14-Jul-23 23:53:09 - INFO - Number of test sentences: 329


0.0

In [7]:
df_train

Unnamed: 0,Token,POS
0,Die,LB
1,doel,NSE
2,van,SVS
3,die,LB
4,webtuiste,NSE
...,...,...
58091,daarvoor,PB
58092,moet,VTUOM
58093,betaal,VTHOG
58094,.,ZE


In [11]:
model['transitions']

{'ASA': {'NSE': 0.38330757341576505,
  nan: 0.10896445131375579,
  'NSM': 0.30139103554868624,
  'NM': 0.05795981452859351,
  'ZM': 0.012751159196290572,
  'KN': 0.040571870170015456,
  'ASA': 0.05409582689335394,
  'THAB': 0.002704791344667697,
  'KO': 0.0030911901081916537,
  'RWD': 0.00463678516228748,
  'NSED': 0.00115919629057187,
  'RS': 0.0038639876352395673,
  'NEE': 0.005023183925811438,
  'THPB': 0.0019319938176197836,
  'RK': 0.0007727975270479134,
  'ZPL': 0.0015455950540958269,
  'THAO': 0.00231839258114374,
  'ZPR': 0.00115919629057187,
  'TRAB': 0.0007727975270479134,
  'VTHOG': 0.0007727975270479134,
  'SVS': 0.0007727975270479134,
  'RL': 0.0007727975270479134,
  'LB': 0.0007727975270479134},
 'NSE': {'ZM': 0.06476881925869316,
  'KN': 0.061138708444784105,
  'SVS': 0.284486052732136,
  'VTHOO': 0.013565150936186474,
  'PB': 0.05215896064195644,
  'VTHOG': 0.08674054260603745,
  'KO': 0.014520443255636225,
  'LB': 0.010890332441727168,
  'VVHOG': 0.043943446694688575,


In [8]:
sentences_valid[0]

[('Hierdie', 'PA'),
 ('ryk', 'ASA'),
 ('erfenis', 'NSE'),
 ('dui', 'VTHOO'),
 ('op', 'SVS'),
 ('Afrika', 'NEE'),
 ('as', 'KO'),
 ('die', 'LB'),
 ('baken', 'NSE'),
 ('van', 'SVS'),
 ('wetenskap', nan),
 ('en', 'KN'),
 ('letterkunde', nan),
 (',', 'ZM'),
 ('filosofie', nan),
 ('en', 'KN'),
 ('handel', nan),
 (',', 'ZM'),
 ('wat', 'PB'),
 ('verbreek', 'VTHOG'),
 ('is', 'VVUOP'),
 ('deur', 'SVS'),
 ('die', 'LB'),
 ('slawehandel', nan),
 ('en', 'KN'),
 ("'n", 'LO'),
 ('gejaag', nan),
 ('na', 'SVS'),
 ('Afrika', 'NEE'),
 ('se', 'UPS'),
 ('rykdom', 'NM'),
 ('.', 'ZE')]

In [6]:
sentences_train[0]

[('Verlede', 'ASA'),
 ('maand', 'NSE'),
 (',', 'ZM'),
 ('tydens', 'SVS'),
 ('sy', 'PDHEB'),
 ('jaarlikse', 'ASA'),
 ('lekgotla', 'NSE'),
 ('of', 'KN'),
 ('bosberaad', 'NSE'),
 ('in', 'SVS'),
 ('Januarie', 'NEE'),
 (',', 'ZM'),
 ('het', 'VUOT'),
 ('die', 'LB'),
 ('nasionale', 'ASA'),
 ('Kabinet', 'NSE'),
 (',', 'ZM'),
 ('wat', 'PB'),
 ('aan', 'SVS'),
 ('die', 'LB'),
 ('spits', 'NSE'),
 ('staan', 'VTHOO'),
 ('van', 'SVS'),
 ('die', 'LB'),
 ('regeringstelsel', 'NSE'),
 ('wat', 'PB'),
 ('ons', 'PEMP'),
 ('die', 'LB'),
 ('voorreg', 'NSE'),
 ('het', 'VTHOG'),
 ('om', 'SVS'),
 ('te', 'UPI'),
 ('bestuur', 'VTHOG'),
 (',', 'ZM'),
 ('besin', 'VTHOO'),
 ('oor', 'SVS'),
 ('die', 'LB'),
 ('feit', 'NSE'),
 ('dat', 'KO'),
 ('daardie', 'PA'),
 ('ontmoeting', 'NSE'),
 ('die', 'LB'),
 ('halfpadmerk', 'NSE'),
 ('verteenwoordig', 'VTHOG'),
 ('het', 'VUOT'),
 ('in', 'SVS'),
 ('die', 'LB'),
 ('bestaan', nan),
 ('van', 'SVS'),
 ('die', 'LB'),
 ('Regering', 'NSE'),
 ('wat', 'PB'),
 ('tydens', 'SVS'),
 ('ons',