### CountVectorizer Implementation & Comparison with Sklearn's CountVectorizer

In [10]:
class CountVectorizer:
    """
    Character n-gram count vectorizer.

    Each sequence is lower-cased and split into overlapping character
    n-grams of a fixed size. ``fit`` learns the vocabulary of n-grams and
    ``transform`` turns sequences into lists of per-n-gram counts.

    Input:
        ngram_size (int): The size of the ngrams to be used for tokenization.

    Attributes:
        ngram_size (int): The size of the ngrams used for tokenization.
        vocabulary (dict): Maps each n-gram seen during fitting to its
            column index (indices follow the sorted order of the n-grams).
            ``None`` until ``fit`` has been called successfully.
    """

    def __init__(self, ngram_size):
        """
        Creates an unfitted vectorizer.

        Input:
            ngram_size (int): The size of the ngrams to be used for
            tokenization.

        Raises:
            ValueError: If ngram_size is not a positive integer.
        """
        if not isinstance(ngram_size, int) or ngram_size <= 0:
            raise ValueError("ngram_size must be a positive integer.")
        self.ngram_size = ngram_size
        self.vocabulary = None

    def _ngrams(self, sequence):
        """Yields the lower-cased character n-grams of a sequence, left to right."""
        text = sequence.lower()
        # A sequence shorter than ngram_size gives an empty range, i.e. no n-grams.
        for start in range(len(text) - self.ngram_size + 1):
            yield text[start : start + self.ngram_size]

    def fit(self, corpus):
        """
        Fits the CountVectorizer on the provided corpus.

        The vocabulary is only replaced when validation succeeds, so a
        failed call leaves any previously fitted vocabulary untouched.

        Input:
            corpus (list): A list of strings where each string represents
            a sequence.

        Raises:
            ValueError: If the corpus is empty, if any sequence in the
            corpus is not a string, or if no sequence in the corpus has
            length at least ngram_size. Individual sequences shorter than
            ngram_size are accepted; they contribute no n-grams.
        """
        if not corpus:
            raise ValueError("The corpus can't be empty.")

        # Types are checked before any len() call so that a non-string entry
        # (e.g. an int or None) surfaces as the documented ValueError rather
        # than a TypeError raised while measuring it.
        has_non_string = any(not isinstance(sequence, str) for sequence in corpus)
        if has_non_string or all(
            len(sequence) < self.ngram_size for sequence in corpus
        ):
            raise ValueError("Sequences must be strings of length at least ngram_size.")

        tokens = set()
        for sequence in corpus:
            tokens.update(self._ngrams(sequence))
        # Sorting makes the n-gram -> column mapping deterministic.
        self.vocabulary = {token: index for index, token in enumerate(sorted(tokens))}

    def transform(self, corpus):
        """
        Transforms the provided corpus into numerical vectors based on
        the fitted vocabulary.

        N-grams that are not in the vocabulary are ignored, and a sequence
        shorter than ngram_size yields a vector of zeros.

        Input:
            corpus (list): A list of strings where each string represents
            a sequence.

        Output:
            transformed_corpus (list): A list of numerical vectors
            representing the sequences. Each vector is a list of ints with
            one entry per vocabulary n-gram.

        Raises:
            ValueError: If the corpus has not been fitted yet.
        """
        if self.vocabulary is None:
            raise ValueError("The corpus hasn't been fitted yet.")

        vocabulary_size = len(self.vocabulary)
        transformed_corpus = []
        for sequence in corpus:
            token_counter = [0] * vocabulary_size
            for token in self._ngrams(sequence):
                index = self.vocabulary.get(token)
                if index is not None:
                    token_counter[index] += 1
            transformed_corpus.append(token_counter)
        return transformed_corpus

    def fit_transform(self, corpus):
        """
        Fits the CountVectorizer on the provided corpus and then transforms
        the corpus into numerical vectors.

        Input:
            corpus (list): A list of strings where each string represents
            a sequence.

        Output:
            transformed_corpus (list): A list of numerical vectors
            representing the sequences.

        Raises:
            ValueError: Under the same conditions as ``fit``.
        """
        self.fit(corpus)
        return self.transform(corpus)

In [11]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer as SklearnCountVectorizer

# Two short documents shared by both vectorizers
corpus = ["Hello, world!", "This is a test."]

# Hand-written implementation: learn the character trigrams, then count them
vectorizer = CountVectorizer(ngram_size=3)
vectorizer.fit(corpus)
transformed_corpus = vectorizer.transform(corpus)
# The trailing "" reproduces the blank separator line between the two reports
print("Custom CountVectorizer output:", np.array(transformed_corpus), "", sep="\n")

# Reference implementation: scikit-learn configured for the same character trigrams
sklearn_vectorizer = SklearnCountVectorizer(
    analyzer="char",
    ngram_range=(3, 3),
)
sklearn_transformed_corpus = sklearn_vectorizer.fit_transform(corpus).toarray()
print("Sklearn's CountVectorizer output:", sklearn_transformed_corpus, sep="\n")

Custom CountVectorizer output:
[[0 0 0 1 1 0 1 0 1 0 0 1 1 1 1 1 1 0 0 0 0 0 1]
 [1 1 1 0 0 1 0 1 0 1 2 0 0 0 0 0 0 1 1 1 1 1 0]]

Sklearn's CountVectorizer output:
[[0 0 0 1 1 0 1 0 1 0 0 1 1 1 1 1 1 0 0 0 0 0 1]
 [1 1 1 0 0 1 0 1 0 1 2 0 0 0 0 0 0 1 1 1 1 1 0]]


In [12]:
# Verify that the custom implementation reproduces scikit-learn's counts exactly
outputs_are_equal = np.array_equal(transformed_corpus, sklearn_transformed_corpus)
print("Are the outputs the same?")
print(outputs_are_equal)
# Fail loudly on Restart & Run All if the two implementations ever diverge
assert outputs_are_equal, "Custom CountVectorizer output differs from sklearn's"

Are the outputs the same?
True
