# 1. All necessary libraries

In [15]:
import numpy as np
from typing import List, Tuple, Union
from collections import defaultdict, Counter
import random

import unittest
import timeit
from memory_profiler import memory_usage
import time

import sys
import os

# Resolve the directory three levels above the current working directory and
# append it to the module search path. Presumably this is the project root that
# contains the `utils` package imported below -- it depends on where the
# notebook/script is launched from, so confirm the working directory if the
# import fails.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..', '..'))
sys.path.append(project_root)

# Project-local helper used by the performance-test section; it must come after
# the sys.path change above.
from utils.performance_test import PerformanceTest

# Cell-level confirmation that every import above succeeded.
print('Successfully imported all necessary libraries.')

Successfully imported all necessary libraries.


# 2. Define the Solution class

In [22]:
class Solution:
    """
    Count-based n-gram model for next-word prediction.

    Training records, for every run of N-1 consecutive tokens (the context),
    how many times each token follows it. Prediction samples a follower with
    probability proportional to those counts.
    """

    def __init__(self, N: int):
        """
        Initialize the n-gram model with a specified n-gram size.

        :param N: integer representing the size of n-gram (e.g., 2 for bigram, 3 for trigram);
                  it is validated when train() is called, not here
        """
        self.N = N
        # Maps a context tuple (N-1 tokens) to a Counter of the tokens seen right after it.
        self.ngram_counts = defaultdict(Counter)

    def train(self, tokens: List[str]) -> None:
        """
        Train the n-gram model by counting occurrences of n-grams in the input tokens.

        Counts are accumulated: calling train() again adds to the existing
        statistics instead of replacing them.

        :param tokens: List of tokens (words) representing the text
        :raises ValueError: if N is not positive or is larger than the number of tokens
        """
        if self.N <= 0 or self.N > len(tokens):
            raise ValueError(f"Invalid n-gram size: {self.N}. It must be between 1 and the number of tokens.")

        context_len = self.N - 1
        # Slide a window of N tokens over the text: the first N-1 tokens form the
        # context and the last one is the word that followed it. For N == 1 the
        # context is the empty tuple.
        for start in range(len(tokens) - context_len):
            context = tuple(tokens[start:start + context_len])
            next_word = tokens[start + context_len]
            self.ngram_counts[context][next_word] += 1

    def predict(self, context: Union[Tuple[str, ...], List[str]]) -> str:
        """
        Predict the next word based on the given context (previous n-1 words).

        The word is sampled at random, weighted by how often it followed the
        context during training, so repeated calls may return different words.

        :param context: Tuple (or list) of strings representing the n-1 words context
        :return: The predicted next word, or the message
                 "Your context is not found in the training data." when the
                 context was never seen during training
        """
        # Contexts are stored as tuples; a list is unhashable and would make the
        # lookup raise TypeError, so normalise it first.
        if isinstance(context, list):
            context = tuple(context)

        # .get() does not trigger the defaultdict factory, so an unknown context
        # is not inserted into the model as a side effect of predicting.
        followers = self.ngram_counts.get(context)

        # An empty Counter can exist if ngram_counts[context] was merely read
        # from outside; treat it as unseen instead of letting random.choices
        # fail on an empty population.
        if not followers:
            return "Your context is not found in the training data."

        candidates = list(followers.keys())
        weights = list(followers.values())
        # random.choices returns a one-element list; unwrap it.
        return random.choices(candidates, weights=weights)[0]

# Cell-level confirmation that the Solution class definition above executed.
print('Successfully defined the Solution class.')

Successfully defined the Solution class.


# 3. Unit test

In [23]:
class TestSolution(unittest.TestCase):
    """Unit tests for the n-gram Solution model (training counts and prediction)."""

    # Message predict() returns for a context it has never seen.
    MISSING_CONTEXT_MESSAGE = "Your context is not found in the training data."

    def setUp(self):
        # Every test starts from a fresh, untrained bigram model.
        self.solution = Solution(N=2)

    # ----- Type 1: Dimensions and Corner Cases -----

    def test_invalid_n_gram_size(self):
        """
        Test with invalid n-gram size (greater than token length).
        A bigram model cannot be trained on a single token, so train() must raise ValueError.
        """
        single_token = ["I"]
        self.assertRaises(ValueError, self.solution.train, single_token)

    def test_context_not_in_training_data(self):
        """
        Test predict with context not found in the training data.
        The not-found message must come back instead of a word.
        """
        self.solution.train(["hello", "world"])
        unseen_context = ("missing", "context")

        outcome = self.solution.predict(unseen_context)

        self.assertEqual(outcome, self.MISSING_CONTEXT_MESSAGE)

    # ----- Type 2: Input Types -----

    def test_valid_input_train(self):
        """
        Test train with a valid token list.
        Each adjacent word pair must be counted exactly once.
        """
        counts_wanted = {
            ("I",): Counter({"love": 1}),
            ("love",): Counter({"deep": 1}),
            ("deep",): Counter({"learning": 1}),
        }

        self.solution.train(["I", "love", "deep", "learning"])

        self.assertEqual(self.solution.ngram_counts, counts_wanted)

    def test_predict_valid_context(self):
        """
        Test predict with a valid context that is in the n-grams.
        "I" is only ever followed by "enjoy", so that is the only acceptable answer.
        """
        self.solution.train(["I", "enjoy", "NLP", "and", "ML"])
        acceptable = ["enjoy"]

        outcome = self.solution.predict(("I",))

        self.assertIn(outcome, acceptable)

    # ----- Type 3: Extreme Values -----

    def test_large_n_gram_size(self):
        """
        Test with large n-gram size close to the length of tokens.
        With N equal to the token count, exactly one n-gram is recorded.
        """
        # Replace the default bigram fixture with a 4-gram model.
        self.solution = Solution(N=4)
        counts_wanted = {("machine", "learning", "is"): Counter({"fun": 1})}

        self.solution.train(["machine", "learning", "is", "fun"])

        self.assertEqual(self.solution.ngram_counts, counts_wanted)

    def test_empty_token_list(self):
        """
        Test train with an empty token list.
        There is nothing to count, so train() must raise ValueError.
        """
        no_tokens = []
        self.assertRaises(ValueError, self.solution.train, no_tokens)

    def test_predict_on_empty_model(self):
        """
        Test predict on a model without any training data.
        An untrained model knows no context, so the not-found message is returned.
        """
        outcome = self.solution.predict(("I", "love"))

        self.assertEqual(outcome, self.MISSING_CONTEXT_MESSAGE)

def run_unit_tests():
    """
    Execute every test case of TestSolution and print a per-test report.

    The runner is a TextTestRunner with verbosity 2, so each test is listed
    on its own line together with its outcome.
    """
    report_runner = unittest.TextTestRunner(verbosity=2)
    collected = unittest.TestLoader().loadTestsFromTestCase(TestSolution)
    report_runner.run(collected)

# Cell-level confirmation that the test class and its runner were defined.
print('Successfully defined the TestSolution class.')
# Runs the whole suite as soon as this cell (or the module top level) executes,
# independently of the __main__ guard further down.
run_unit_tests()


test_context_not_in_training_data (__main__.TestSolution)
Test predict with context not found in the training data. ... ok
test_empty_token_list (__main__.TestSolution)
Test train with an empty token list. ... ok
test_invalid_n_gram_size (__main__.TestSolution)
Test with invalid n-gram size (greater than token length). ... ok
test_large_n_gram_size (__main__.TestSolution)
Test with large n-gram size close to the length of tokens. ... ok
test_predict_on_empty_model (__main__.TestSolution)
Test predict on a model without any training data. ... ok
test_predict_valid_context (__main__.TestSolution)
Test predict with a valid context that is in the n-grams. ... ok
test_valid_input_train (__main__.TestSolution)
Test train with a valid token list. ... 

Successfully defined the TestSolution class.


ok

----------------------------------------------------------------------
Ran 7 tests in 0.006s

OK


# 4. Performance test

In [26]:
def run_performance_tests():
    """
    Measure time and memory use of training and prediction on a small trigram example.

    Both measurements are delegated to the project's PerformanceTest helper:
    first the time estimates for train() and predict(), then the space
    estimates in the same order.
    """
    model = Solution(N=3)
    sample_tokens = ["I", "love", "natural", "language", "processing", "and", "I", "love", "machine", "learning"]
    sample_context = ("I", "love")

    # (callable, argument) pairs, measured in this order by each estimator.
    targets = [
        (model.train, sample_tokens),
        (model.predict, sample_context),
    ]

    # Estimate time complexity
    for func, arg in targets:
        PerformanceTest.estimate_time_complexity(func, arg)

    # Estimate space complexity
    for func, arg in targets:
        PerformanceTest.estimate_space_complexity(func, arg)
# Cell-level confirmation. The cell above defines the run_performance_tests
# function; the PerformanceTest class itself is imported from utils, so the
# message names the function rather than a class.
print('Successfully defined the run_performance_tests function.')

Successfully defined the PerformanceTest class.


# 5. Main

In [27]:

if __name__ == "__main__":
    # Step 1: unit tests, through the custom verbose runner.
    # (Alternative: unittest.main(argv=['first-arg-is-ignored'], exit=False))
    run_unit_tests()

    # Step 2: print a 100-dash divider and pause for a second -- presumably so
    # the test report is fully displayed before the benchmark output starts.
    divider = "-" * 100
    print("\n" + divider + "\n")
    time.sleep(1)

    # Step 3: timing and memory measurements.
    run_performance_tests()


test_context_not_in_training_data (__main__.TestSolution)
Test predict with context not found in the training data. ... ok
test_empty_token_list (__main__.TestSolution)
Test train with an empty token list. ... ok
test_invalid_n_gram_size (__main__.TestSolution)
Test with invalid n-gram size (greater than token length). ... ok
test_large_n_gram_size (__main__.TestSolution)
Test with large n-gram size close to the length of tokens. ... ok
test_predict_on_empty_model (__main__.TestSolution)
Test predict on a model without any training data. ... ok
test_predict_valid_context (__main__.TestSolution)
Test predict with a valid context that is in the n-grams. ... ok
test_valid_input_train (__main__.TestSolution)
Test train with a valid token list. ... ok

----------------------------------------------------------------------
Ran 7 tests in 0.004s

OK



----------------------------------------------------------------------------------------------------

Average execution time over 10 runs: 0.000010 seconds
Average execution time over 10 runs: 0.000005 seconds
Peak memory usage: 0.179688 MiB
Peak memory usage: 0.218750 MiB
