<a href="https://colab.research.google.com/github/cannedhedgehog/Saturday/blob/main/Saharnova_lab2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [54]:
import re
import nltk
import string
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

Загружаем необходимые ресурсы NLTK

In [55]:
# Fetch the NLTK data packages this notebook relies on: the Punkt tokenizer
# tables (for word_tokenize), the stop-word lists and the WordNet corpus
# (for WordNetLemmatizer).
nltk.download('punkt_tab')
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:
import nltk

# Request the tokenizer, stop-word and WordNet packages again with force=True.
# NOTE(review): presumably force=True re-downloads even when a local copy
# exists -- confirm against the NLTK downloader documentation.
nltk.download("punkt", force=True)
nltk.download("stopwords", force=True)
nltk.download("wordnet", force=True)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

Очистка текста: приведение к нижнему регистру, удаление знаков препинания

In [56]:
def preprocess_text(text: str) -> str:
    """Lower-case *text* and delete every punctuation character.

    Anything that is neither a word character nor whitespace is removed
    outright (not replaced by a space), so ``"soft,free"`` becomes
    ``"softfree"``. Underscores are removed as well.

    Args:
        text: The raw input string.

    Returns:
        The lower-cased string with punctuation stripped; whitespace is
        left untouched.
    """
    lowered = text.lower()
    # "[^\w\s]" on its own keeps "_" because \w matches the underscore,
    # so it is listed explicitly as an extra alternative.
    return re.sub(r"[^\w\s]|_", "", lowered)


Токенизация, удаление стоп-слов и лемматизация

In [57]:
def tokenize_and_lemmatize(text: str) -> list[str]:
    """Tokenize *text*, drop English stop words and lemmatize the rest.

    Stop-word filtering is case-insensitive: each token is lower-cased
    before it is looked up, so capitalised stop words such as "The" are
    removed even when the caller passes text that was not lower-cased
    first. Surviving tokens keep their original casing and are passed to
    ``WordNetLemmatizer.lemmatize`` without a part-of-speech argument.

    Args:
        text: The string to tokenize.

    Returns:
        The lemmatized tokens, in their original order.
    """
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        # Compare the lower-cased form so "The" / "And" are filtered too.
        if token.lower() not in stop_words
    ]

Реализация Bag of Words

In [58]:
def bag_of_words(corpus: list[str]) -> np.ndarray:
    """Build a Bag-of-Words count matrix for *corpus* and print it.

    A fresh ``CountVectorizer`` is fitted on the documents; the matrix
    (converted with ``.toarray()`` for display) and the learned feature
    names are printed, and the fitted matrix is returned.

    NOTE(review): the returned object is whatever ``fit_transform``
    yields, which is presumably a sparse matrix rather than the annotated
    ``np.ndarray`` -- confirm against the scikit-learn documentation.
    """
    counter = CountVectorizer()
    counts = counter.fit_transform(corpus)

    print("\nBag of Words (BoW) Matrix:")
    dense_counts = counts.toarray()
    print(dense_counts)

    vocabulary = counter.get_feature_names_out()
    print("\nFeature Names:", vocabulary)

    return counts


Реализация TF-IDF

In [59]:
def tf_idf(corpus: list[str]) -> np.ndarray:
    """Build a TF-IDF matrix for *corpus* and print it.

    A fresh ``TfidfVectorizer`` is fitted on the documents; the matrix
    (converted with ``.toarray()`` for display) and the learned feature
    names are printed, and the fitted matrix is returned.

    NOTE(review): the returned object is whatever ``fit_transform``
    yields, which is presumably a sparse matrix rather than the annotated
    ``np.ndarray`` -- confirm against the scikit-learn documentation.
    """
    weigher = TfidfVectorizer()
    weights = weigher.fit_transform(corpus)

    print("\nTF-IDF Matrix:")
    dense_weights = weights.toarray()
    print(dense_weights)

    vocabulary = weigher.get_feature_names_out()
    print("\nFeature Names:", vocabulary)

    return weights

In [60]:
def ascii_tokenizer(text: str) -> list[str]:
    """Split *text* into characters, keeping those in ``string.printable``.

    Characters that are not part of ``string.printable`` are skipped; the
    remaining ones are returned one per list element, in order.
    """
    kept = []
    for symbol in text:
        if symbol in string.printable:
            kept.append(symbol)
    return kept


In [61]:
def ascii_vectorizer(text: str) -> list[int]:
    """Map the characters of *text* to their code points.

    Only characters found in ``string.printable`` contribute; each one is
    converted with ``ord`` and the codes are returned in input order.
    """
    codes = []
    for symbol in text:
        if symbol not in string.printable:
            continue
        codes.append(ord(symbol))
    return codes


In [62]:
def vectorize(tokens: list[str]) -> list[int]:
    """Encode *tokens* as integer ids assigned in order of first appearance.

    The first distinct token gets id 0, the next new one id 1, and so on;
    repeated tokens reuse the id they were given the first time.

    Args:
        tokens: The tokens to encode.

    Returns:
        One id per input token, in the same order as the input.
    """
    token_ids: dict[str, int] = {}
    # setdefault stores len(token_ids) only for unseen tokens; the argument
    # is evaluated before insertion, so new ids are 0, 1, 2, ...
    return [token_ids.setdefault(token, len(token_ids)) for token in tokens]

In [67]:
def main():
    """Run the demo pipeline on a short eight-line poem.

    Cleans and lemmatizes every line, prints the Bag-of-Words and TF-IDF
    matrices for the result, then shows character-level tokenization and
    vectorization plus index-based token vectorization for the first line.
    """
    documents = [
        "Beneath the sky so vast and bright,",
        "A silver river sings at night.",
        "The whispering leaves in twilight dance,",
        "Spinning dreams in moonlight’s glance.",
        "A distant echo, soft and free,",
        "Drifts across the endless sea.",
        "The stars ignite, then fade away—",
        "A fleeting spark, a breath, a day."
    ]

    cleaned_lines = [preprocess_text(line) for line in documents]
    lemmatized_lines = []
    for line in cleaned_lines:
        lemmatized_lines.append(" ".join(tokenize_and_lemmatize(line)))

    print("\nPreprocessed Documents:", lemmatized_lines)

    # Both helpers print their matrices themselves; the return values are
    # not needed here.
    bag_of_words(lemmatized_lines)
    tf_idf(lemmatized_lines)

    first_line = documents[0]
    print("\nASCII Tokenization:", ascii_tokenizer(first_line))
    print("\nASCII Vectorization:", ascii_vectorizer(first_line))

    # The lemmatized variant works on the raw first line, while the
    # preprocessed variant splits the cleaned line on whitespace.
    lemma_ids = vectorize(tokenize_and_lemmatize(first_line))
    print("\nVectorized Tokens (Lemmatized):", lemma_ids)
    word_ids = vectorize(cleaned_lines[0].split())
    print("\nVectorized Tokens (Preprocessed):", word_ids)


if __name__ == "__main__":
    main()


Preprocessed Documents: ['beneath sky vast bright', 'silver river sings night', 'whispering leaf twilight dance', 'spinning dream moonlight glance', 'distant echo soft free', 'drift across endless sea', 'star ignite fade away', 'fleeting spark breath day']

Bag of Words (BoW) Matrix:
[[0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]]

Feature Names: ['across' 'away' 'beneath' 'breath' 'bright' 'dance' 'day' 'distant'
 'dream' 'drift' 'echo' 'endless' 'fade' 'fleeting' 'free' 'glance'
 'ignite' 'leaf' 'moonli