In [7]:
from datasets import load_dataset

In [9]:
ds = load_dataset("uonlp/CulturaX",
                  "fa")

Downloading data:   0%|          | 0/96 [18:15<?, ?files/s]


KeyboardInterrupt: 

In [1]:
import numpy as np
import re
from collections import defaultdict

class SimpleCountVectorizer:
    def __init__(self):
        self.vocabulary = {}
    
    def fit(self, corpus):
        # Build the vocabulary
        unique_words = set()
        for document in corpus:
            words = self.tokenize(document)
            unique_words.update(words)
        
        # Assign an index to each word
        self.vocabulary = {word: idx for idx, word in enumerate(unique_words)}
    
    def transform(self, corpus):
        # Create the document-term matrix
        dtm = np.zeros((len(corpus), len(self.vocabulary)), dtype=int)
        
        for doc_idx, document in enumerate(corpus):
            words = self.tokenize(document)
            for word in words:
                if word in self.vocabulary:
                    dtm[doc_idx, self.vocabulary[word]] += 1
        
        return dtm
    
    def fit_transform(self, corpus):
        self.fit(corpus)
        return self.transform(corpus)
    
    def tokenize(self, document):
        # Basic tokenization: lowercasing and splitting on non-word characters
        return re.findall(r'\w+', document.lower())

    def get_feature_names_out(self):
        # Return the words in the vocabulary
        return np.array(list(self.vocabulary.keys()))

# Sample corpus
corpus = [
    "The cat sat on the mat.",
    "Dogs are great pets.",
    "Cats and dogs are both popular pets.",
    "The dog chased the cat."
]

# Using the SimpleCountVectorizer
vectorizer = SimpleCountVectorizer()
embedding_matrix = vectorizer.fit_transform(corpus)

# Display the embedding matrix
print("Embedding Matrix:\n", embedding_matrix)
print("\nWords:\n", vectorizer.get_feature_names_out())

# Example: Access the embedding for the first sentence
first_sentence_embedding = embedding_matrix[0]
print("\nEmbedding for the first sentence:", first_sentence_embedding)


Embedding Matrix:
 [[0 0 0 1 0 0 0 0 1 0 1 1 0 0 2]
 [0 1 1 0 1 0 0 0 0 0 0 0 0 1 0]
 [0 1 1 0 0 1 0 1 0 1 0 0 1 1 0]
 [1 0 0 1 0 0 1 0 0 0 0 0 0 0 2]]

Words:
 ['dog' 'are' 'pets' 'cat' 'great' 'and' 'chased' 'both' 'on' 'popular'
 'sat' 'mat' 'cats' 'dogs' 'the']

Embedding for the first sentence: [0 0 0 1 0 0 0 0 1 0 1 1 0 0 2]
