In [17]:
import re
import string
import numpy as np
import pandas as pd
import nltk
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize
import plotly.express as px
from collections import defaultdict, Counter
from unidecode import unidecode
from typing import List, Tuple, Dict, Union

import mynlputils as nu

In [2]:
conf = nu.load_config("a1")

03-Jul-23 23:20:55 - INFO - Starting 'load_config'.
03-Jul-23 23:20:55 - INFO - Finished 'load_config' in 0.0070 secs.


In [18]:
conf.langs[0]

'af'

In [3]:
train_af = nu.load_text_data(f"{conf.paths.raw_txt}train.{conf.langs[0]}.txt")
train_en = nu.load_text_data(f"{conf.paths.raw_txt}train.{conf.langs[1]}.txt")
train_nl = nu.load_text_data(f"{conf.paths.raw_txt}train.{conf.langs[2]}.txt")
train_xh = nu.load_text_data(f"{conf.paths.raw_txt}train.{conf.langs[3]}.txt")
train_zu = nu.load_text_data(f"{conf.paths.raw_txt}train.{conf.langs[4]}.txt")

03-Jul-23 23:20:55 - INFO - Starting 'load_text_data'.
03-Jul-23 23:20:55 - INFO - Finished 'load_text_data' in 0.0013 secs.
03-Jul-23 23:20:55 - INFO - Starting 'load_text_data'.
03-Jul-23 23:20:55 - INFO - Finished 'load_text_data' in 0.0018 secs.
03-Jul-23 23:20:55 - INFO - Starting 'load_text_data'.
03-Jul-23 23:20:55 - INFO - Finished 'load_text_data' in 0.0010 secs.
03-Jul-23 23:20:55 - INFO - Starting 'load_text_data'.
03-Jul-23 23:20:55 - INFO - Finished 'load_text_data' in 0.0007 secs.
03-Jul-23 23:20:55 - INFO - Starting 'load_text_data'.
03-Jul-23 23:20:55 - INFO - Finished 'load_text_data' in 0.0009 secs.


In [4]:
train_af

'Afrika\nAfrika\nAfrika is die wêreld se tweede grootste kontinent (na Asië) in sowel oppervlakte as bevolking. Saam met die eilande beslaan dit ongeveer 30\xa0221\xa0532\xa0km² wat 20,3% van die totale landoppervlakte van die aarde is en dit word bewoon deur meer as 1 miljard mense – ongeveer \'n sewende van die wêreldbevolking.\nTale.\nDie meeste amptelike tale van Afrika is Indo-Europese tale soos Frans, Engels, Portugees, Spaans en Afrikaans.\nNaas hierdie Indo-Europese tale word daar ook verskillende Afrikatale gepraat. Hulle is onderverdeel in Afro-Asiaties, Niger-Kongo, Khoisan, Nilo-Sahariese en Austronesies.\nGeografie.\nDie hoogste punt van Afrika is Kilimandjaro, 5895\xa0m bo seevlak en die laagste punt is die Assalmeer, 155\xa0m onder seevlak. Die Saharawoestyn is in die noorde van Afrika terwyl die Groot Skeurvallei in Oos-Afrika is. Die grootste rivier is die Nylrivier en die grootste meer is die Victoriameer. Die grootste land volgens oppervlakte is Algerië en volgens be

In [5]:
def normalize_text(text: str) -> str:
    """
    Function to normalize the text data.

    Args:
    text (str): Text data.

    Returns:
    str: Normalized text data.
    """
    text = text.lower() # Lowercase the text
    text = re.sub(r'\d', '0', text) # Remove digits
    text = text.translate(str.maketrans('', '', string.punctuation)) # Remove punctuation
    sentences = sent_tokenize(text) # Tokenize into sentences
    normalized_sentences = []
    for sentence in sentences:
        sentence = re.sub(r"[^\w\s]", '', sentence) # Remove punctuation
        normalized_sentences.append(sentence)
    # Remove extra spaces
    normalized_sentences = normalized_sentences.strip()
    return normalized_sentences

In [6]:
norm_train_en = normalize_text(train_en)

In [7]:
norm_train_en



In [8]:
nu.write_text_data(f"{conf.paths.normalized_txt}norm_train_en.txt", norm_train_en)

In [9]:
def plot_character_frequency(data: str) -> None:
    """
    Function to plot character frequency.

    Args:
    data (str): Text data.

    Returns:
    None
    """
    # Count the frequency of each character
    character_counts = Counter(data)

    # Convert the counter to a DataFrame for plotting
    df = pd.DataFrame.from_dict(character_counts, orient='index').reset_index()

    # Plot character frequency
    fig = px.bar(df, x='index', y=0, labels={'index': 'Characters', '0': 'Count'}, title='Character Frequency',
                 log_y=True)
    fig.show()


def plot_word_length(data: str) -> None:
    """
    Function to plot word length.

    Args:
    data (str): Text data.
    """
    # Get the length of each word
    word_lengths = [len(word) for word in data.split()]

    # Convert the list to a DataFrame for plotting
    df = pd.DataFrame(word_lengths, columns=['Word Length'])

    # Plot word length
    fig = px.histogram(df, x='Word Length', nbins=50, title='Word Length Distribution', log_y=True)
    fig.show()


def plot_zipfs_law(data: str) -> None:
    """
    Function to plot Zipf's law.

    Args:
    data (str): Text data.

    Returns:
    None
    """
    # Count the frequency of each word
    word_counts = Counter(data.split())

    # Sort words by frequency
    words, counts = zip(*word_counts.most_common())

    # Convert the lists to a DataFrame for plotting
    df = pd.DataFrame({'Word': words, 'Frequency': counts})

    # Add a column for rank
    df['Rank'] = df['Frequency'].rank(method='min', ascending=False)

    # Plot Zipf's law
    fig = px.scatter(df, x='Rank', y='Frequency', title="Zipf's Law", log_x=True, log_y=True)
    # Add annotations for top 5 words
    for i in range(5):
        fig.add_annotation(
            x=np.log10(df.loc[i, 'Rank']),  # Apply log transformation
            y=np.log10(df.loc[i, 'Frequency']),  # Apply log transformation
            text=df.loc[i, 'Word'],
            # showarrow=False
        )

    fig.show()

In [16]:
# character_counts = Counter(norm_train_en)
# character_counts

In [10]:
plot_character_frequency(norm_train_en)
plot_word_length(norm_train_en)
plot_zipfs_law(norm_train_en)

In [None]:
def initial_eda(data: str) -> None:
    """
    Function to perform initial exploratory data analysis.

    Args:
    data (str): Text data.

    Returns:
    None
    """
    num_chars = len(data)
    num_words = len(data.split())
    num_sentences = data.count('.')

    print(f'Number of characters: {num_chars}')
    print(f'Number of words: {num_words}')
    print(f'Number of sentences: {num_sentences}')

### 1. Tokenize the text at the character level and create trigrams

In [None]:
def generate_trigrams(text: str) -> List[Tuple[str, str, str]]:
    """
    Function to generate character-level trigrams from text.

    Args:
    text (str): Text data.

    Returns:
    List[Tuple[str, str, str]]: List of character-level trigrams.
    """
    return list(ngrams(text, 3, pad_left=True, pad_right=True, left_pad_symbol='<s> <s>', right_pad_symbol='</s>'))

### 2. Build a language model for each language

In [12]:
def build_language_model(trigrams: List[Tuple[str, str, str]]) -> Dict[Tuple[str, str], Counter]:
    """
    Function to build a character-level trigram language model.

    Args:
    trigrams (List[Tuple[str, str, str]]): List of character-level trigrams.

    Returns:
    Dict[Tuple[str, str], Counter]: Language model.
    """
    model = defaultdict(Counter)

    for t1, t2, t3 in trigrams:
        model[(t1, t2)][t3] += 1

    return model

### 3. Function to generate text based on these models

In [13]:
def generate_text(model: Dict[Tuple[str, str], Counter], max_length: int = 200) -> str:
    """
    Function to generate text from a character-level trigram language model.

    Args:
    model (Dict[Tuple[str, str], Counter]): Language model.
    max_length (int, optional): Maximum length of the generated text. Defaults to 200.

    Returns:
    str: Generated text.
    """
    text = ['<s>', '<s>']
    while len(text) < max_length:
        t1, t2 = text[-2], text[-1]
        next_char = model[(t1, t2)].most_common(1)[0][0]
        text.append(next_char)
    return ''.join(text)

### 4. Compute the perplexity of each language model on a validation set

In [14]:
def calculate_perplexity(model: Dict[Tuple[str, str], Counter], text: str) -> float:
    """
    Function to calculate the perplexity of a language model on a given text.

    Args:
    model (Dict[Tuple[str, str], Counter]): Language model.
    text (str): Text data.

    Returns:
    float: Perplexity of the language model on the text.
    """
    trigrams = generate_trigrams(text)
    N = len(trigrams)
    log_prob = 0
    for t1, t2, t3 in trigrams:
        prob = model[(t1, t2)][t3] / sum(model[(t1, t2)].values())
        log_prob += np.log2(prob) if prob > 0 else 0
    return np.power(2, -log_prob/N)