In [30]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import defaultdict
import plotly.express as px
from typing import List, Tuple, Dict, Union
import log
import mynlputils as nu

In [2]:
logger = log.get_logger(__name__)

In [3]:
def load_data(raw_txt_train_path: str, raw_txt_test_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    df_train = pd.read_excel(raw_txt_train_path)
    df_test = pd.read_excel(raw_txt_test_path)
    logger.info(f"df_train.shape: {df_train.shape}")
    logger.info(f"df_train unique tokens: {df_train['Token'].nunique()}")
    logger.info(f"df_train unique POS: {df_train['POS'].nunique()}")
    logger.info(f"df_test.shape: {df_test.shape}")
    logger.info(f"df_test unique tokens: {df_test['Token'].nunique()}")
    logger.info(f"df_test unique POS: {df_test['POS'].nunique()}")
    return df_train, df_test


def remove_punctuation(data: pd.DataFrame):
    """
    Removes rows with 'PUNCT' in the 'POS' column from the dataset.

    Args:
    data (pd.DataFrame): DataFrame containing the tokenized isiZulu data with 'Token' and 'POS' columns.

    Returns:
    pd.DataFrame: DataFrame with rows containing 'PUNCT' removed.
    """
    return data[data['POS'] != 'PUNCT']


def split_into_sentences(df: pd.DataFrame) -> List[pd.DataFrame]:
    """
    Splits a DataFrame into a list of DataFrames each representing a sentence.
    Adds start and stop tokens to each sentence.

    Args:
    df (pd.DataFrame): DataFrame containing the tokenized isiZulu data with 'Token' and 'POS' columns.

    Returns:
    List[pd.DataFrame]: List of DataFrames each representing a sentence.
    """
    df['Sentence'] = (df['Token'].isna().cumsum())
    df = df.dropna()
    sentences = [group for _, group in df.groupby('Sentence')]
    for i in range(len(sentences)):
        start_token = pd.DataFrame([['<s>', 'START']], columns=['Token', 'POS'])
        stop_token = pd.DataFrame([['<\s>', 'STOP']], columns=['Token', 'POS'])
        sentences[i] = pd.concat([start_token, sentences[i], stop_token], ignore_index=True)
    return sentences

def create_validation_set(sentences: List[pd.DataFrame], valid_size: float = 0.2) -> Tuple[List[pd.DataFrame], List[pd.DataFrame]]:
    """
    Creates a validation set from a list of DataFrames each representing a sentence.

    Args:
    sentences (List[pd.DataFrame]): List of DataFrames each representing a sentence.
    valid_size (float): Proportion of sentences to include in the validation set.

    Returns:
    Tuple[List[pd.DataFrame], List[pd.DataFrame]]: Training and validation sets.
    """
    train_sentences, valid_sentences = train_test_split(sentences, test_size=valid_size, random_state=1)
    return train_sentences, valid_sentences


class HMM:
    def __init__(self):
        self.transition_probs = dict()
        self.emission_probs = dict()
        self.start_probs = dict()

@nu.timer
def train_hmm(sentences: List[pd.DataFrame], smooth_value: float = 1.0) -> HMM:
    """
    Trains a Hidden Markov Model (HMM) given a list of sentences. 
    Applies Laplace smoothing when calculating probabilities.

    Args:
    sentences (List[pd.DataFrame]): List of DataFrames, each representing a sentence.

    Returns:
    HMM: Trained HMM model.
    """
    model = HMM()

    transition_counts = defaultdict(int)
    emission_counts = defaultdict(int)
    start_counts = defaultdict(int)

    for sentence in sentences:
        prev_tag = None
        for _, row in sentence.iterrows():
            token, tag = row['Token'], row['POS']
            if prev_tag is None:
                start_counts[tag] += 1
            else:
                transition_counts[(prev_tag, tag)] += 1
            emission_counts[(tag, token)] += 1
            prev_tag = tag

    unique_transitions = len(transition_counts)
    unique_emissions = len(emission_counts)

    total_transitions = sum(transition_counts.values())
    total_emissions = sum(emission_counts.values())
    total_starts = sum(start_counts.values())

    model.transition_probs = {k: (v + smooth_value) / (total_transitions + smooth_value*unique_transitions) for k, v in transition_counts.items()}
    model.emission_probs = {k: (v + smooth_value) / (total_emissions + smooth_value*unique_emissions) for k, v in emission_counts.items()}
    model.start_probs = {k: v / total_starts for k, v in start_counts.items()}
    return model


def viterbi_algorithm(model: HMM, sentence: pd.DataFrame) -> pd.DataFrame:
    """
    Uses the Viterbi algorithm to find the most probable sequence of hidden states (POS tags).
    Handles unknown words by assigning a small constant probability for every state.

    Args:
    model (HMM): Trained HMM model.
    sentence (pd.DataFrame): DataFrame representing a sentence.

    Returns:
    pd.DataFrame: DataFrame containing the tokens, actual tags, and predicted tags for each sentence.
    """
    tokens = sentence['Token'].tolist()
    actual_tags = sentence['POS'].tolist()
    states = list(set([state for state, _ in model.emission_probs.keys()]))
    n_states = len(states)
    n_tokens = len(tokens)

    dp = np.zeros((n_states, n_tokens))
    ptr = np.zeros((n_states, n_tokens), dtype=int)
    unknown_word_prob = 1e-6  # Small constant probability for unknown words

    # Initialization
    for i, state in enumerate(states):
        dp[i, 0] = model.start_probs.get(state, 0) * model.emission_probs.get((state, tokens[0]), unknown_word_prob)

    # Recursion
    for t in range(1, n_tokens):
        for j, state in enumerate(states):
            max_prob = 0
            max_state = 0
            for i, prev_state in enumerate(states):
                prob = dp[i, t-1] * model.transition_probs.get((prev_state, state), 0) * model.emission_probs.get((state, tokens[t]), unknown_word_prob)
                if prob > max_prob:
                    max_prob = prob
                    max_state = i
            dp[j, t] = max_prob
            ptr[j, t] = max_state

    # Traceback
    best_path = [np.argmax(dp[:, -1])]
    for t in range(n_tokens-1, 0, -1):
        best_path.append(ptr[best_path[-1], t])
    best_path = [states[i] for i in reversed(best_path)]

    result = pd.DataFrame({
        'Token': tokens,
        'Actual_POS': actual_tags,
        'Predicted_POS': best_path
    })
    return result

@nu.timer
def evaluate_hmm(model: HMM, sentences: List[pd.DataFrame]) -> float:
    """
    Evaluates the performance of the HMM model by calculating the accuracy of POS tagging.

    Args:
    model (HMM): Trained HMM model.
    sentences (List[pd.DataFrame]): List of DataFrames, each representing a sentence.

    Returns:
    float: Accuracy of POS tagging.
    df_results (pd.DataFrame): DataFrame containing the 'Token', 'Actual_POS', and 'Predicted_POS' for each sentence.

    """
    results = []
    for sentence in sentences:
        result = viterbi_algorithm(model, sentence)
        results.append(result)
    df_results = pd.concat(results, ignore_index=True)
    correct_tags = np.sum(df_results['Actual_POS'] == df_results['Predicted_POS'])
    total_tags = len(df_results)
    accuracy = correct_tags / total_tags
    print(f'Accuracy: {accuracy:.4f}')
    return accuracy, df_results

@nu.timer
def cross_validation(sentences: List[pd.DataFrame], smooth_values: List[float], k: int) -> float:
    """
    Performs k-fold cross-validation to choose the best smoothing value.

    Args:
    sentences (List[pd.DataFrame]): List of DataFrames, each representing a sentence.
    smooth_values (List[float]): The values to try for Laplace smoothing.
    k (int): The number of folds for cross-validation.

    Returns:
    float: The smoothing value that resulted in the highest average accuracy.
    """
    n = len(sentences)
    fold_size = n // k

    best_smooth_value = None
    best_accuracy = 0

    for smooth_value in smooth_values:
        accuracies = []
        for i in range(k):
            validation_sentences = sentences[i*fold_size:(i+1)*fold_size]
            training_sentences = sentences[:i*fold_size] + sentences[(i+1)*fold_size:]
            model = train_hmm(training_sentences, smooth_value)
            accuracy, _ = evaluate_hmm(model, validation_sentences)
            accuracies.append(accuracy)
        average_accuracy = sum(accuracies) / len(accuracies)
        if average_accuracy > best_accuracy:
            best_accuracy = average_accuracy
            best_smooth_value = smooth_value

    return best_smooth_value

In [4]:
conf = nu.load_config("a2")
df_train, df_test = load_data(conf.paths.raw_txt_train, conf.paths.raw_txt_test)
df_train = remove_punctuation(df_train)
df_test = remove_punctuation(df_test)
sentences_train = split_into_sentences(df_train)
sentences_test = split_into_sentences(df_test)
sentences_train, sentences_valid = create_validation_set(sentences_train)
hmm_model = train_hmm(sentences_train)
# Evaluate model on the validation set
valid_accuracy, valid_results = evaluate_hmm(hmm_model, sentences_valid)
test_accuracy, test_results = evaluate_hmm(hmm_model, sentences_test)
print(f'Validation Set Accuracy: {valid_accuracy * 100:.2f}%')
print(f'Test Set Accuracy: {test_accuracy * 100:.2f}%')

16-Jul-23 14:08:14 - INFO - Starting 'load_config'.
16-Jul-23 14:08:14 - INFO - Finished 'load_config' in 0.0080 secs.
16-Jul-23 14:08:15 - INFO - df_train.shape: (44324, 2)
16-Jul-23 14:08:15 - INFO - df_train unique tokens: 14125
16-Jul-23 14:08:15 - INFO - df_train unique POS: 99
16-Jul-23 14:08:15 - INFO - df_test.shape: (4676, 2)
16-Jul-23 14:08:15 - INFO - df_test unique tokens: 2415
16-Jul-23 14:08:15 - INFO - df_test unique POS: 77
16-Jul-23 14:08:16 - INFO - Starting 'train_hmm'.
16-Jul-23 14:08:16 - INFO - Finished 'train_hmm' in 0.5132 secs.
16-Jul-23 14:08:16 - INFO - Starting 'evaluate_hmm'.
16-Jul-23 14:08:48 - INFO - Finished 'evaluate_hmm' in 31.2593 secs.
16-Jul-23 14:08:48 - INFO - Starting 'evaluate_hmm'.


Accuracy: 0.7066


16-Jul-23 14:09:04 - INFO - Finished 'evaluate_hmm' in 16.3073 secs.


Accuracy: 0.6969
Validation Set Accuracy: 70.66%
Test Set Accuracy: 69.69%


## Let's try and understand some more insights:

### How many new words are there in the validation set and test set that were not there in the train set?

In [7]:
def new_words(train_set: pd.DataFrame, test_set: pd.DataFrame):
    train_words = set(train_set['Token'].unique())
    test_words = set(test_set['Token'].unique())
    new_words = test_words - train_words
    return new_words

In [8]:
new_words = new_words(df_train, df_test)
len(new_words)

1248

### What were the most common correct POS for those unknown words?

In [9]:
def common_pos_for_new_words(new_words: set, test_set: pd.DataFrame):
    new_word_rows = test_set[test_set['Token'].isin(new_words)]
    return new_word_rows['POS'].value_counts()

In [10]:
common_pos = common_pos_for_new_words(new_words, df_test)
common_pos

V          804
N           93
P           92
POS         47
N09         45
N07         40
N05         34
N06         32
REL         29
N03         28
N10         24
N01         23
LOC         20
N02         17
ADV         17
N11         16
N08         14
COP         13
N04         10
DEM01        5
N14          3
ADJ04        2
ADJ02        2
PRON01       2
ADJ05        1
ADJ00        1
M            1
ADJ07        1
QUANT01      1
POS09        1
REL02        1
DEM          1
ADJ08        1
N00          1
DEM02        1
ADJ01        1
Name: POS, dtype: int64

### How many of these did the model misclassify?

In [11]:
def misclassified_new_words(new_words: set, results: pd.DataFrame):
    new_word_rows = results[results['Token'].isin(new_words)]
    return sum(new_word_rows['Actual_POS'] != new_word_rows['Predicted_POS'])

In [12]:
misclassified_new_words(new_words, test_results)

620

### What are some of the other traits of the words that the model is misclassifying?

In [20]:
# def misclassified_traits(results: pd.DataFrame):
misclassified_rows = test_results[test_results['Actual_POS'] != test_results['Predicted_POS']]
word_lengths = misclassified_rows['Token'].str.len()
print("Average word length:", word_lengths.mean())
print("Median word length:", word_lengths.median())
print("Number of words with digits:", misclassified_rows['Token'].str.contains('\d').sum())
print("Number of words with special characters:", misclassified_rows['Token'].str.contains('\W').sum())
print("Word frequency:", misclassified_rows['Token'].value_counts())


Average word length: 8.331574981160513
Median word length: 8.0
Number of words with digits: 54
Number of words with special characters: 77
Word frequency: ngcono            15
khona              9
kwethu             8
lwethu             7
yethu              7
                  ..
yingxenye          1
yalo               1
ngogqozi           1
Zingamakhulu       1
ngoyimpumelelo     1
Name: Token, Length: 986, dtype: int64


In [26]:
misclassified_rows[misclassified_rows['Token'] == 'yethu']

Unnamed: 0,Token,Actual_POS,Predicted_POS
1989,yethu,POS09,V
1997,yethu,POS09,V
2612,yethu,POS09,V
3374,yethu,POS09,V
3433,yethu,POS09,V
3591,yethu,POS09,V
3731,yethu,POS09,V


In [27]:
test_results

Unnamed: 0,Token,Actual_POS,Predicted_POS
0,<s>,START,START
1,Thola,V,V
2,ifomu,N05,N05
3,lesicelo,N07,N07
4,kwi-inthanethi,COP,V
...,...,...,...
4373,kube,V,V
4374,ngoyimpumelelo,N09,V
4375,ezweni,N05,N05
4376,lethu,POS05,POS05


In [18]:
misclassified_traits(test_results)

Average word length: 8.331574981160513
Median word length: 8.0
Number of words with digits: 54
Number of words with special characters: 77
Word frequency: ngcono            15
khona              9
kwethu             8
lwethu             7
yethu              7
                  ..
yingxenye          1
yalo               1
ngogqozi           1
Zingamakhulu       1
ngoyimpumelelo     1
Name: Token, Length: 986, dtype: int64


In [29]:
misclassified_rows[misclassified_rows['Token'].str.contains('\d')]

Unnamed: 0,Token,Actual_POS,Predicted_POS
88,10,POS,V
541,08600,REL,V
542,10,P,V
604,08600,REL,V
605,10,P,V
720,angu-24,REL,V
751,ye-NI2,ADV,V
773,enga-20,REL,V
1134,engama-20,REL,V
1264,we-1910,P,V


In [42]:
# df_train[df_train['Token'] == 'ngcono']

In [43]:
# test_results[test_results['Actual_POS'] == 'STOP']

In [50]:
def plot_comparison(df_results: pd.DataFrame):
    df_melted = df_results.melt(id_vars="Token", value_vars=["Actual_POS", "Predicted_POS"], var_name="Tag Type", value_name="POS")

    # create a DataFrame with the count for each POS for each Tag Type
    df_count = df_melted.groupby(["Tag Type", "POS"]).size().reset_index(name='Count')

    # sort values by Count in descending order
    df_count = df_count.sort_values(by="Count", ascending=True)

    fig = px.bar(df_count, y="POS", x="Count", color="Tag Type", barmode='group', log_x=True, orientation='h',
                 title='Comparison of Actual and Predicted POS tags', labels={'Count': 'Log(Count)'})
    fig.update_layout(height=600, width=600)  # Adjusting the dimensions of the plot
    fig.show()

In [51]:
plot_comparison(test_results)

In [35]:
def get_pos_distribution(df: pd.DataFrame) -> pd.DataFrame:
    pos_counts = df['POS'].value_counts(normalize=True).reset_index()
    pos_counts.columns = ['POS', 'Frequency']
    return pos_counts

In [36]:
train_pos_distribution = get_pos_distribution(df_train)
test_pos_distribution = get_pos_distribution(df_test)
train_pos_distribution['Dataset'] = 'Train'
test_pos_distribution['Dataset'] = 'Test'
pos_distribution = pd.concat([train_pos_distribution, test_pos_distribution])

In [38]:
fig = px.bar(pos_distribution, x='POS', y='Frequency', color='Dataset', log_y=True,
             title='Normalized POS Tag Distributions in Training and Testing Data', barmode='group')
fig.show()
