In [1]:
import re
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import plotly.express as px
from collections import defaultdict, Counter
from unidecode import unidecode
from typing import List, Tuple, Dict, Union
import log
import mynlputils as nu

In [2]:
logger = log.get_logger(__name__)

In [3]:
def load_data(raw_txt_train_path: str, raw_txt_test_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    df_train = pd.read_excel(raw_txt_train_path)
    df_test = pd.read_excel(raw_txt_test_path)
    logger.info(f"df_train.shape: {df_train.shape}")
    logger.info(f"df_train unique tokens: {df_train['Token'].nunique()}")
    logger.info(f"df_train unique POS: {df_train['POS'].nunique()}")
    logger.info(f"df_test.shape: {df_test.shape}")
    logger.info(f"df_test unique tokens: {df_test['Token'].nunique()}")
    logger.info(f"df_test unique POS: {df_test['POS'].nunique()}")
    return df_train, df_test


def remove_punctuation(data: pd.DataFrame):
    """
    Removes rows with 'PUNCT' in the 'POS' column from the dataset.

    Args:
    data (pd.DataFrame): DataFrame containing the tokenized isiZulu data with 'Token' and 'POS' columns.

    Returns:
    pd.DataFrame: DataFrame with rows containing 'PUNCT' removed.
    """
    return data[data['POS'] != 'PUNCT']


def split_into_sentences(df: pd.DataFrame) -> List[pd.DataFrame]:
    """
    Splits a DataFrame into a list of DataFrames each representing a sentence.
    Adds start and stop tokens to each sentence.

    Args:
    df (pd.DataFrame): DataFrame containing the tokenized isiZulu data with 'Token' and 'POS' columns.

    Returns:
    List[pd.DataFrame]: List of DataFrames each representing a sentence.
    """
    df['Sentence'] = (df['Token'].isna().cumsum())
    df = df.dropna()
    sentences = [group for _, group in df.groupby('Sentence')]
    for i in range(len(sentences)):
        start_token = pd.DataFrame([['<s>', 'START']], columns=['Token', 'POS'])
        stop_token = pd.DataFrame([['<\s>', 'STOP']], columns=['Token', 'POS'])
        sentences[i] = pd.concat([start_token, sentences[i], stop_token], ignore_index=True)
    return sentences

def create_validation_set(sentences: List[pd.DataFrame], valid_size: float = 0.2) -> Tuple[List[pd.DataFrame], List[pd.DataFrame]]:
    """
    Creates a validation set from a list of DataFrames each representing a sentence.

    Args:
    sentences (List[pd.DataFrame]): List of DataFrames each representing a sentence.
    valid_size (float): Proportion of sentences to include in the validation set.

    Returns:
    Tuple[List[pd.DataFrame], List[pd.DataFrame]]: Training and validation sets.
    """
    train_sentences, valid_sentences = train_test_split(sentences, test_size=valid_size, random_state=1)
    return train_sentences, valid_sentences


class HMM:
    def __init__(self):
        self.transition_probs = dict()
        self.emission_probs = dict()
        self.start_probs = dict()

@nu.timer
def train_hmm(sentences: List[pd.DataFrame]) -> HMM:
    """
    Trains a Hidden Markov Model (HMM) given a list of sentences. Applies Laplace smoothing when calculating probabilities.

    Args:
    sentences (List[pd.DataFrame]): List of DataFrames, each representing a sentence.

    Returns:
    HMM: Trained HMM model.
    """
    model = HMM()

    transition_counts = defaultdict(int)
    emission_counts = defaultdict(int)
    start_counts = defaultdict(int)

    for sentence in sentences:
        prev_tag = None
        for _, row in sentence.iterrows():
            token, tag = row['Token'], row['POS']
            if prev_tag is None:
                start_counts[tag] += 1
            else:
                transition_counts[(prev_tag, tag)] += 1
            emission_counts[(tag, token)] += 1
            prev_tag = tag

    unique_transitions = len(transition_counts)
    unique_emissions = len(emission_counts)

    total_transitions = sum(transition_counts.values())
    total_emissions = sum(emission_counts.values())
    total_starts = sum(start_counts.values())

    model.transition_probs = {k: (v + 1) / (total_transitions + unique_transitions) for k, v in transition_counts.items()}
    model.emission_probs = {k: (v + 1) / (total_emissions + unique_emissions) for k, v in emission_counts.items()}
    model.start_probs = {k: v / total_starts for k, v in start_counts.items()}

    return model

@nu.timer
def viterbi_algorithm(model: HMM, sentence: pd.DataFrame) -> pd.DataFrame:
    """
    Uses the Viterbi algorithm to find the most probable sequence of hidden states (POS tags).
    Handles unknown words by assigning a small constant probability for every state.

    Args:
    model (HMM): Trained HMM model.
    sentence (pd.DataFrame): DataFrame representing a sentence.

    Returns:
    pd.DataFrame: DataFrame containing the tokens, actual tags, and predicted tags for each sentence.
    """
    tokens = sentence['Token'].tolist()
    actual_tags = sentence['POS'].tolist()
    states = list(set([state for state, _ in model.emission_probs.keys()]))
    n_states = len(states)
    n_tokens = len(tokens)

    dp = np.zeros((n_states, n_tokens))
    ptr = np.zeros((n_states, n_tokens), dtype=int)
    unknown_word_prob = 1e-6  # Small constant probability for unknown words

    # Initialization
    for i, state in enumerate(states):
        dp[i, 0] = model.start_probs.get(state, 0) * model.emission_probs.get((state, tokens[0]), unknown_word_prob)

    # Recursion
    for t in range(1, n_tokens):
        for j, state in enumerate(states):
            max_prob = 0
            max_state = 0
            for i, prev_state in enumerate(states):
                prob = dp[i, t-1] * model.transition_probs.get((prev_state, state), 0) * model.emission_probs.get((state, tokens[t]), unknown_word_prob)
                if prob > max_prob:
                    max_prob = prob
                    max_state = i
            dp[j, t] = max_prob
            ptr[j, t] = max_state

    # Traceback
    best_path = [np.argmax(dp[:, -1])]
    for t in range(n_tokens-1, 0, -1):
        best_path.append(ptr[best_path[-1], t])
    best_path = [states[i] for i in reversed(best_path)]

    result = pd.DataFrame({
        'Token': tokens,
        'Actual_POS': actual_tags,
        'Predicted_POS': best_path
    })
    return result

@nu.timer
def evaluate_hmm(model: HMM, sentences: List[pd.DataFrame]) -> float:
    """
    Evaluates the performance of the HMM model by calculating the accuracy of POS tagging.

    Args:
    model (HMM): Trained HMM model.
    sentences (List[pd.DataFrame]): List of DataFrames, each representing a sentence.

    Returns:
    float: Accuracy of POS tagging.
    df_results (pd.DataFrame): DataFrame containing the 'Token', 'Actual_POS', and 'Predicted_POS' for each sentence.

    """
    results = []
    for sentence in sentences:
        result = viterbi_algorithm(model, sentence)
        results.append(result)
    df_results = pd.concat(results, ignore_index=True)
    correct_tags = np.sum(df_results['Actual_POS'] == df_results['Predicted_POS'])
    total_tags = len(df_results)
    accuracy = correct_tags / total_tags
    print(f'Accuracy: {accuracy:.4f}')
    return accuracy, df_results

In [4]:
conf = nu.load_config("a2")
df_train, df_test = load_data(conf.paths.raw_txt_train, conf.paths.raw_txt_test)
df_train = remove_punctuation(df_train)
df_test = remove_punctuation(df_test)
sentences_train = split_into_sentences(df_train)
sentences_test = split_into_sentences(df_test)
sentences_train, sentences_valid = create_validation_set(sentences_train)
hmm_model = train_hmm(sentences_train)
# Evaluate model on the validation set
accuracy, df_results = evaluate_hmm(hmm_model, sentences_valid)

16-Jul-23 13:12:32 - INFO - Starting 'load_config'.
16-Jul-23 13:12:32 - INFO - Finished 'load_config' in 0.0080 secs.
16-Jul-23 13:12:32 - INFO - df_train.shape: (44324, 2)
16-Jul-23 13:12:32 - INFO - df_train unique tokens: 14125
16-Jul-23 13:12:32 - INFO - df_train unique POS: 99
16-Jul-23 13:12:32 - INFO - df_test.shape: (4676, 2)
16-Jul-23 13:12:32 - INFO - df_test unique tokens: 2415
16-Jul-23 13:12:32 - INFO - df_test unique POS: 77


Accuracy: 0.7066
