In [2]:
# This notebook was run in the Google Colab environment.
%pip install catboost fasttext -q

In [3]:
import torch
# Report whether PyTorch can see a CUDA device and, if it can, the name of device 0.
print("CUDA available:", torch.cuda.is_available())
print("Device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

CUDA available: True
Device: Tesla T4


# Домашнее задание "NLP. Часть 1"

In [4]:
import math
import re
import os
import random
import json
from collections import Counter, defaultdict
from typing import List, Dict, Tuple, Any

import torch
import numpy as np
import datasets
import fasttext
import fasttext.util
from transformers import BertTokenizer, BertModel

In [5]:
def seed_everything(seed: int):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Exported for child processes; it does not change the hash seed of the
    # interpreter that is already running.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; a silent no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode autotunes convolution algorithms per run, which defeats
    # the deterministic flag above, so it must be switched off.
    torch.backends.cudnn.benchmark = False

seed_everything(42)

In [6]:
def normalize_pretokenize_text(text: str) -> List[str]:
    """Lower-case ``text`` and split it into word tokens.

    A token is a maximal run of ``\\w`` characters; punctuation and
    whitespace act as separators and are dropped.

    Args:
        text: Raw input string.

    Returns:
        The tokens in order of appearance.
    """
    return re.findall(r'\b\w+\b', text.lower())

In [7]:
# This block is for tests only
test_corpus = [
    "the quick brown fox jumps over the lazy dog",
    "never jump over the lazy dog quickly",
    "brown foxes are quick and dogs are lazy"
]

def build_vocab(texts: List[str]) -> Tuple[List[str], Dict[str, int]]:
    """Build an alphabetically sorted vocabulary from a collection of texts.

    Args:
        texts: Raw documents; each is tokenized with
            ``normalize_pretokenize_text``.

    Returns:
        A pair ``(vocab, vocab_index)`` where ``vocab`` is the sorted list of
        distinct tokens and ``vocab_index`` maps each token to its position
        in ``vocab``.
    """
    distinct_tokens = set()
    for document in texts:
        distinct_tokens.update(normalize_pretokenize_text(document))
    ordered_tokens = sorted(distinct_tokens)
    positions = {token: position for position, token in enumerate(ordered_tokens)}
    return ordered_tokens, positions

vocab, vocab_index = build_vocab(test_corpus)

## Задание 1 (0.5 балла)
Реализовать One-Hot векторизацию текстов

In [8]:
def one_hot_vectorization(
    text: str,
    vocab: List[str] = None,
    vocab_index: Dict[str, int] = None
) -> List[List[int]]:
    """Encode every token of ``text`` as a one-hot vector over ``vocab``.

    Args:
        text: Raw input string; tokenized with ``normalize_pretokenize_text``.
        vocab: Vocabulary; its length is the dimension of each vector.
        vocab_index: Mapping from token to its position in ``vocab``.

    Returns:
        One list of ``len(vocab)`` integers per token, in token order. Tokens
        missing from ``vocab_index`` produce an all-zero vector.

    Raises:
        ValueError: If ``vocab`` or ``vocab_index`` is not provided.
    """
    if vocab is None or vocab_index is None:
        raise ValueError("vocab и vocab_index должны быть переданы")

    words = normalize_pretokenize_text(text)
    width = len(vocab)

    def encode(word: str) -> List[int]:
        # Out-of-vocabulary words keep the all-zero row.
        row = [0] * width
        if word in vocab_index:
            row[vocab_index[word]] = 1
        return row

    return [encode(word) for word in words]


def test_one_hot_vectorization(
    vocab: List[str],
    vocab_index: Dict[str, int]
) -> bool:
    try:
        text = "the quick brown fox"
        result = one_hot_vectorization(text, vocab, vocab_index)

        if not isinstance(result, list):
            return False

        expected_length = len(vocab)
        if len(result[0]) != expected_length:
            return False

        words_in_text = normalize_pretokenize_text(text)
        for i, word in enumerate(words_in_text):
            if word in vocab_index:
                idx = vocab_index[word]
                if result[i][idx] != 1:
                    return False

        print("One-Hot-Vectors test PASSED")

        return True
    except Exception as e:
        print(f"One-Hot-Vectors test FAILED: {e}")
        return False

In [9]:
assert test_one_hot_vectorization(vocab, vocab_index)

One-Hot-Vectors test PASSED


## Задание 2 (0.5 балла)
Реализовать Bag-of-Words

In [10]:
def bag_of_words_vectorization(text: str) -> Dict[str, int]:
    """Count how many times each token occurs in ``text``.

    Args:
        text: Raw input string; tokenized with ``normalize_pretokenize_text``.

    Returns:
        A plain dict mapping token to occurrence count, keyed in order of
        first appearance. Tokens that do not occur are absent.
    """
    frequencies: Dict[str, int] = {}
    for token in normalize_pretokenize_text(text):
        frequencies[token] = frequencies.get(token, 0) + 1
    return frequencies


def test_bag_of_words_vectorization() -> bool:
    """Smoke-test ``bag_of_words_vectorization`` on a fixed sentence.

    Checks that the result is a dict and that the counts of a few known
    tokens (including one absent token) are as expected.

    Returns:
        True when all checks pass, False otherwise. Any exception is caught,
        reported on stdout and treated as a failure.
    """
    try:
        text = "the the quick brown brown brown"
        result = bag_of_words_vectorization(text)

        if not isinstance(result, dict):
            return False

        # 'nonexistent' verifies that absent tokens are not reported.
        expected_counts = {'the': 2, 'quick': 1, 'brown': 3, 'nonexistent': 0}
        for word, count in expected_counts.items():
            if result.get(word, 0) != count:
                return False

        # The original message was misspelled "Bad-of-Words".
        print("Bag-of-Words test PASSED")
        return True
    except Exception as e:
        print(f"Bag-of-Words test FAILED: {e}")
        return False

In [11]:
assert test_bag_of_words_vectorization()

Bad-of-Words test PASSED


## Задание 3 (0.5 балла)
Реализовать TF-IDF

In [12]:
def tf_idf_vectorization(text: str, corpus: List[str] = None, vocab: List[str] = None, vocab_index: Dict[str, int] = None) -> List[float]:
    """Compute a smoothed TF-IDF vector for ``text`` over ``vocab``.

    Term frequency is the token count divided by the number of tokens in
    ``text``. Inverse document frequency is
    ``log((N + 1) / (df + 1)) + 1`` where ``N`` is the corpus size and
    ``df`` the number of corpus documents containing the term.

    Args:
        text: Document to vectorize.
        corpus: Reference documents used for document frequencies.
        vocab: Vocabulary; fixes the order and length of the output.
        vocab_index: Required for interface symmetry; not read here.

    Returns:
        One float per vocabulary entry, in ``vocab`` order.

    Raises:
        ValueError: If ``corpus``, ``vocab`` or ``vocab_index`` is missing.
    """
    if corpus is None or vocab is None or vocab_index is None:
        raise ValueError("corpus, vocab и vocab_index обязательны")

    n_documents = len(corpus)
    document_frequency = Counter()
    for document in corpus:
        # A set so that each document contributes at most once per term.
        document_frequency.update(set(normalize_pretokenize_text(document)))

    words = normalize_pretokenize_text(text)
    # Avoid division by zero for empty texts.
    denominator = len(words) if len(words) > 0 else 1
    term_counts = Counter(words)

    weights = []
    for term in vocab:
        term_frequency = term_counts.get(term, 0) / denominator
        inverse_df = math.log((n_documents + 1) / (document_frequency.get(term, 0) + 1)) + 1.0
        weights.append(float(term_frequency * inverse_df))
    return weights


def test_tf_idf_vectorization(corpus, vocab, vocab_index) -> bool:
    try:
        text = "the quick brown"
        result = tf_idf_vectorization(text, corpus, vocab, vocab_index)

        if not isinstance(result, list):
            return False

        expected_length = len(vocab)
        if len(result) != expected_length:
            return False

        for val in result:
            if not isinstance(val, float):
                return False

        print("TF-IDF test PASSED")
        return True
    except Exception as e:
        print(f"TF-IDF test FAILED: {e}")
        return False

In [13]:
assert test_tf_idf_vectorization(test_corpus, vocab, vocab_index)

TF-IDF test PASSED


## Задание 4 (1 балл)
Реализовать Positive Pointwise Mutual Information (PPMI).  
https://en.wikipedia.org/wiki/Pointwise_mutual_information
$$PPMI(word, context) = \max(0, PMI(word, context))$$
$$PMI(word, context) = \log \frac{P(word, context)}{P(word) P(context)} = \log \frac{N(word, context) \cdot |(word, context)|}{N(word) N(context)}$$
где $N(word, context)$ -- число вхождений слова $word$ в окно $context$ (размер окна -- гиперпараметр), а $|(word, context)|$ -- общее число пар (слово, контекст) в корпусе

In [14]:
def ppmi_vectorization(
    text: str,
    corpus: List[str] = None,
    vocab: List[str] = None,
    vocab_index: Dict[str, int] = None,
    window_size: int = 2
) -> List[float]:
    """Represent ``text`` by its average PPMI with every vocabulary word.

    Unigram and windowed co-occurrence counts are collected from ``corpus``.
    For each vocabulary word ``v`` the output holds the mean, over the tokens
    ``t`` of ``text``, of ``max(0, log(P(t, v) / (P(t) * P(v))))``.

    Args:
        text: Document to vectorize.
        corpus: Reference documents used for the counts.
        vocab: Vocabulary; fixes the order and length of the output.
        vocab_index: Required for interface symmetry; not read here.
        window_size: Number of neighbours considered on each side of a token.

    Returns:
        One float per vocabulary entry, in ``vocab`` order (all zeros when
        ``text`` has no tokens).

    Raises:
        ValueError: If ``corpus``, ``vocab`` or ``vocab_index`` is missing.
    """
    if corpus is None or vocab is None or vocab_index is None:
        raise ValueError("corpus, vocab и vocab_index обязательны")

    documents = [normalize_pretokenize_text(doc) for doc in corpus]
    n_tokens = sum(len(doc) for doc in documents)

    unigram_counts = Counter()
    pair_counts = Counter()
    for doc in documents:
        doc_length = len(doc)
        for centre, word in enumerate(doc):
            unigram_counts[word] += 1
            lower = max(0, centre - window_size)
            upper = min(doc_length, centre + window_size + 1)
            for neighbour in range(lower, upper):
                # A token is not its own context.
                if neighbour != centre:
                    pair_counts[(word, doc[neighbour])] += 1
    n_pairs = sum(pair_counts.values()) if len(pair_counts) > 0 else 1

    def unigram_probability(word):
        if n_tokens > 0:
            return unigram_counts.get(word, 0) / n_tokens
        return 0.0

    def pair_probability(word, context):
        if n_pairs > 0:
            return pair_counts.get((word, context), 0) / n_pairs
        return 0.0

    def ppmi(word, context):
        p_word = unigram_probability(word)
        p_context = unigram_probability(context)
        p_joint = pair_probability(word, context)
        # Unseen words or pairs carry no association signal.
        if p_joint <= 0 or p_word <= 0 or p_context <= 0:
            return 0.0
        pmi = math.log(p_joint / (p_word * p_context) + 1e-12)
        return max(pmi, 0.0)

    query_tokens = normalize_pretokenize_text(text)
    vector = []
    for context in vocab:
        scores = [ppmi(word, context) for word in query_tokens]
        vector.append(float(np.mean(scores)) if len(scores) > 0 else 0.0)
    return vector

def test_ppmi_vectorization(corpus, vocab, vocab_index) -> bool:
    try:
        text = "quick brown fox"
        result = ppmi_vectorization(text, corpus, vocab, vocab_index)

        if not isinstance(result, list):
            return False

        expected_length = len(vocab)
        if len(result) != expected_length:
            return False

        for val in result:
            if not isinstance(val, float):
                return False

        print("PPMI test PASSED")
        return True
    except Exception as e:
        print(f"PPMI test FAILED: {e}")
        return False

In [15]:
assert test_ppmi_vectorization(test_corpus, vocab, vocab_index)

PPMI test PASSED


## Задание 5 (1 балл)
Реализовать получение эмбеддингов из fasttext и bert (для bert лучше использовать CLS токен)

In [16]:
def get_fasttext_embeddings(text: str, model_path: str = None, model: Any = None) -> List[np.ndarray]:
    """Look up a fastText vector for every token of ``text``.

    Args:
        text: Raw input string; tokenized with ``normalize_pretokenize_text``.
        model_path: Path to a fastText binary, loaded with
            ``fasttext.load_model`` when ``model`` is not given. The model is
            loaded on every call, so prefer passing ``model`` in loops.
        model: An already loaded model exposing ``get_word_vector(token)``.
            Takes precedence over ``model_path``.

    Returns:
        One float array per token, in token order. An empty list when neither
        ``model`` nor ``model_path`` is supplied, or when ``text`` has no
        tokens.
    """
    if model is not None:
        ft_model = model
    elif model_path is not None:
        ft_model = fasttext.load_model(model_path)
    else:
        # Nothing to embed with; skip tokenization entirely.
        return []

    return [
        np.array(ft_model.get_word_vector(token), dtype=float)
        for token in normalize_pretokenize_text(text)
    ]


In [17]:
# Loaded (tokenizer, model) pairs keyed by model name, so that weights are
# fetched and moved to the GPU only once per kernel session.
_bert_cache = {}
def get_bert_embeddings(
    text: str,
    model_name: str = 'bert-base-uncased',
    pool_method: str = 'cls'
) -> np.ndarray:
    """Embed ``text`` with a BERT encoder and pool it into a single vector.

    Args:
        text: Raw input string; truncated to 512 word-piece tokens.
        model_name: Hugging Face model identifier.
        pool_method: ``'cls'`` returns the hidden state of the first
            (``[CLS]``) token; ``'mean'`` returns the attention-mask-weighted
            average of all token hidden states.

    Returns:
        A 1-D float array with the pooled hidden state.

    Raises:
        ValueError: If ``pool_method`` is not ``'cls'`` or ``'mean'``.
    """
    # Validate before the (expensive) model load.
    if pool_method not in ('cls', 'mean'):
        raise ValueError(f"Unknown pool_method: {pool_method!r}; expected 'cls' or 'mean'")

    if model_name not in _bert_cache:
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertModel.from_pretrained(model_name)
        model.eval()
        if torch.cuda.is_available():
            model.to('cuda')
        _bert_cache[model_name] = (tokenizer, model)
    tokenizer, model = _bert_cache[model_name]

    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    if torch.cuda.is_available():
        inputs = {k: v.to('cuda') for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    if pool_method == 'cls':
        pooled = hidden[:, 0, :]
    else:
        # Weight by the attention mask so masked positions do not contribute.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.cpu().numpy()[0].astype(float)

## Задание 6 (1.5 балла)
Реализовать обучение так, чтобы можно было поверх эмбеддингов, реализованных в предыдущих заданиях, обучить какую-то модель (вероятно неглубокую, например, CatBoost) на задаче классификации текстов ([IMDB](https://huggingface.co/datasets/stanfordnlp/imdb)).

In [1]:
import numpy as np
import datasets
import fasttext
import fasttext.util
from typing import List, Dict, Tuple, Any
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split

def vectorize_dataset(
    dataset_name: str = "imdb",
    vectorizer_type: str = "bow",
    split: str = "train",
    sample_size: int = 50,
    vocab: List[str] = None,
    vocab_index: Dict[str, int] = None
) -> Tuple[List[str], List[Any], List[int], Dict[str, int]]:
    """Load a text-classification dataset split and vectorize every document.

    Args:
        dataset_name: Name passed to ``datasets.load_dataset``.
        vectorizer_type: One of ``"one_hot"``, ``"bow"``, ``"tfidf"``,
            ``"ppmi"``, ``"fasttext"`` or ``"bert"``.
        split: Dataset split to load.
        sample_size: If truthy, the split is shuffled with a fixed seed and
            cut to at most this many examples.
        vocab: Existing vocabulary to reuse (e.g. the training one when
            vectorizing the test split). Built from the loaded texts when
            either ``vocab`` or ``vocab_index`` is missing.
        vocab_index: Token-to-position mapping matching ``vocab``.

    Returns:
        ``(vocab, vectorized_data, labels, vocab_index)``. The entries of
        ``vectorized_data`` depend on the vectorizer: a list of per-token
        vectors for ``"one_hot"``, a flat list of numbers otherwise, or
        ``None`` for ``"fasttext"`` when the model could not be loaded or the
        document has no tokens.

    Raises:
        ValueError: If ``vectorizer_type`` is not supported. Raised before
            any data is downloaded.

    Note:
        For ``"tfidf"`` and ``"ppmi"`` the corpus statistics are taken from
        the texts of the split being vectorized, and are recomputed for every
        document, so the cost grows quadratically with ``sample_size``.
    """
    supported_vectorizers = ("one_hot", "bow", "tfidf", "ppmi", "fasttext", "bert")
    # Fail fast: do not load a dataset only to reject the request afterwards.
    if vectorizer_type not in supported_vectorizers:
        raise ValueError(f"Unknown vectorizer type: {vectorizer_type}")

    dataset = datasets.load_dataset(dataset_name, split=split)

    if sample_size:
        dataset = dataset.shuffle(seed=42)
        dataset = dataset.select(range(min(sample_size, len(dataset))))

    texts: List[str] = []
    labels: List[int] = []
    for item in dataset:
        text = item.get('text', None)
        label = item.get('label', None)
        # Skip examples without usable text or without a label.
        if not isinstance(text, str) or not text.strip():
            continue
        if label is None:
            continue
        texts.append(text)
        labels.append(label)

    unique, counts = np.unique(labels, return_counts=True)
    print(f"Collected {len(texts)} texts, label distribution: {dict(zip(unique, counts))}")

    if vocab is None or vocab_index is None:
        vocab, vocab_index = build_vocab(texts)

    ft_model = None
    if vectorizer_type == "fasttext":
        # Best effort: without the model every document becomes None and the
        # caller decides how to proceed.
        try:
            fasttext.util.download_model('en', if_exists='ignore')
            ft_model = fasttext.load_model('cc.en.300.bin')
        except Exception as e:
            print("Warning: fasttext model not available:", e)
            ft_model = None

    vectorized_data: List[Any] = []
    for text in texts:
        if vectorizer_type == "one_hot":
            vectorized_data.append(one_hot_vectorization(text, vocab, vocab_index))
        elif vectorizer_type == "bow":
            bow_dict = bag_of_words_vectorization(text)
            vectorized_data.append([bow_dict.get(word, 0) for word in vocab])
        elif vectorizer_type == "tfidf":
            vectorized_data.append(tf_idf_vectorization(text, texts, vocab, vocab_index))
        elif vectorizer_type == "ppmi":
            vectorized_data.append(ppmi_vectorization(text, texts, vocab, vocab_index))
        elif vectorizer_type == "fasttext":
            # Returns [] when the model is missing or the text has no tokens.
            token_embeddings = get_fasttext_embeddings(text, model=ft_model)
            if token_embeddings:
                vectorized_data.append(np.mean(token_embeddings, axis=0).tolist())
            else:
                vectorized_data.append(None)
        else:  # "bert" -- the only remaining supported type
            vectorized_data.append(get_bert_embeddings(text).tolist())

    return vocab, vectorized_data, labels, vocab_index


In [None]:
def _to_fixed_vector(x):
    """Coerce one document representation into a NumPy feature vector.

    Arrays pass through unchanged, a list of per-token vectors is averaged
    over tokens, a flat list of numbers is converted as is, and ``None`` or
    anything unconvertible becomes an empty vector.
    """
    if x is None:
        return np.zeros(0, dtype=float)
    if isinstance(x, np.ndarray):
        return x
    if isinstance(x, list):
        if len(x) == 0:
            return np.zeros(0, dtype=float)
        if all(isinstance(el, (list, np.ndarray)) for el in x):
            token_rows = np.array([np.array(el, dtype=float) for el in x])
            return np.mean(token_rows, axis=0)
    try:
        return np.array(x, dtype=float)
    except Exception:
        return np.array([], dtype=float)


def _stack_features(vectors, dim=None):
    """Stack 1-D vectors into a ``(len(vectors), dim)`` float matrix.

    Shorter vectors are zero-padded on the right and longer ones are
    truncated. ``dim`` defaults to the longest vector in ``vectors``.
    """
    if dim is None:
        dim = max((v.shape[0] for v in vectors), default=0)
    matrix = np.zeros((len(vectors), dim), dtype=float)
    for row, vector in enumerate(vectors):
        width = min(vector.shape[0], dim)
        matrix[row, :width] = vector[:width]
    return matrix


def train(
    embeddings_method="bow",
    test_size=0.2,
    val_size=0.2,
    cv_folds=5,
    sample_size=50
):
    """Train and evaluate a CatBoost classifier on vectorized IMDB reviews.

    The training split is vectorized with ``embeddings_method`` and divided
    into train/validation parts; the test split is vectorized with the
    training vocabulary. Accuracy, weighted F1 and a classification report
    are printed.

    Args:
        embeddings_method: Vectorizer name understood by
            ``vectorize_dataset``.
        test_size: Accepted for backward compatibility; currently unused
            because the dataset's own test split is evaluated.
        val_size: Fraction of the training sample held out for validation.
        cv_folds: Accepted for backward compatibility; currently unused.
        sample_size: Number of documents drawn from each split.

    Returns:
        The fitted ``CatBoostClassifier``, or ``None`` when fastText
        embeddings are unavailable.

    Raises:
        RuntimeError: If features and labels are misaligned, the training
            labels contain a single class, or all training features are
            constant.
    """
    vocab, X, y, vocab_index = vectorize_dataset("imdb", embeddings_method, "train", sample_size)
    _, X_test, y_test, _ = vectorize_dataset("imdb", embeddings_method, "test", sample_size, vocab=vocab, vocab_index=vocab_index)

    if embeddings_method == "fasttext":
        if any(x is None for x in X) or any(x is None for x in X_test):
            print("FastText embeddings not available (None entries). Make sure fasttext model is downloaded and accessible. Skipping fasttext.")
            return None

    print(f"Method={embeddings_method}  train_docs={len(X)}  train_labels={len(y)}  test_docs={len(X_test)}  test_labels={len(y_test)}")
    print("Train label distribution:", np.unique(y, return_counts=True))
    print("Test label distribution: ", np.unique(y_test, return_counts=True))

    X_mat = _stack_features([_to_fixed_vector(xi) for xi in X])
    # The test matrix must have exactly the feature count the model is fitted
    # on, so it is padded or truncated to the training width rather than to
    # its own maximum.
    X_test_mat = _stack_features(
        [_to_fixed_vector(xi) for xi in X_test], dim=X_mat.shape[1]
    )
    y = np.array(y)
    y_test = np.array(y_test)

    # Trimming here would silently misalign documents and labels.
    if X_mat.shape[0] != y.shape[0]:
        raise RuntimeError(f"Train size mismatch: {X_mat.shape[0]} feature rows vs {y.shape[0]} labels")
    if X_test_mat.shape[0] != y_test.shape[0]:
        raise RuntimeError(f"Test size mismatch: {X_test_mat.shape[0]} feature rows vs {y_test.shape[0]} labels")

    unique_train = np.unique(y)
    if unique_train.shape[0] < 2:
        raise RuntimeError(f"В y_train обнаружен только один уникальный класс: {unique_train}. Проверьте сбор меток и фильтрацию данных.")

    # At least two classes are guaranteed above, so stratification is always
    # possible in principle.
    X_train, X_val, y_train, y_val = train_test_split(
        X_mat, y, test_size=val_size, random_state=42, stratify=y
    )

    print("After split train distribution:", np.unique(y_train, return_counts=True))
    print("After split val   distribution:", np.unique(y_val, return_counts=True))

    if np.all(X_train.var(axis=0) == 0):
        raise RuntimeError("All features in X_train are constant.")

    model = CatBoostClassifier(
        iterations=200,
        learning_rate=0.1,
        depth=6,
        loss_function='Logloss',
        verbose=False,
        random_seed=42
    )
    model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=False)

    y_val_pred = model.predict(X_val)
    y_test_pred = model.predict(X_test_mat)

    val_acc = accuracy_score(y_val, y_val_pred)
    test_acc = accuracy_score(y_test, y_test_pred)
    val_f1 = f1_score(y_val, y_val_pred, average='weighted')
    test_f1 = f1_score(y_test, y_test_pred, average='weighted')

    print(f"Embeddings method: {embeddings_method}")
    print(f"Validation Accuracy: {val_acc:.4f}  F1: {val_f1:.4f}")
    print(f"Test Accuracy:       {test_acc:.4f}  F1: {test_f1:.4f}")
    print("Classification report (test):")
    print(classification_report(y_test, y_test_pred, digits=4))

    return model

In [19]:
# Train and evaluate one classifier per vectorization method; each call prints its own metrics.
for embeddings_method in ["bow", "one_hot", "tfidf", "ppmi", "fasttext", "bert"]:
    train(embeddings_method=embeddings_method)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Collected 50 texts, label distribution: {np.int64(0): np.int64(28), np.int64(1): np.int64(22)}
Collected 50 texts, label distribution: {np.int64(0): np.int64(28), np.int64(1): np.int64(22)}
Method=bow  train_docs=50  train_labels=50  test_docs=50  test_labels=50
Train label distribution: (array([0, 1]), array([28, 22]))
Test label distribution:  (array([0, 1]), array([28, 22]))
After split train distribution: (array([0, 1]), array([22, 18]))
After split val   distribution: (array([0, 1]), array([6, 4]))
Embeddings method: bow
Validation Accuracy: 0.8000  F1: 0.8000
Test Accuracy:       0.6000  F1: 0.5936
Classification report (test):
              precision    recall  f1-score   support

           0     0.7222    0.4643    0.5652        28
           1     0.5312    0.7727    0.6296        22

    accuracy                         0.6000        50
   macro avg     0.6267    0.6185    0.5974        50
weighted avg     0.6382    0.6000    0.5936        50

Collected 50 texts, label distr

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Collected 50 texts, label distribution: {np.int64(0): np.int64(28), np.int64(1): np.int64(22)}
Method=bert  train_docs=50  train_labels=50  test_docs=50  test_labels=50
Train label distribution: (array([0, 1]), array([28, 22]))
Test label distribution:  (array([0, 1]), array([28, 22]))
After split train distribution: (array([0, 1]), array([22, 18]))
After split val   distribution: (array([0, 1]), array([6, 4]))
Embeddings method: bert
Validation Accuracy: 0.5000  F1: 0.4000
Test Accuracy:       0.6800  F1: 0.6810
Classification report (test):
              precision    recall  f1-score   support

           0     0.7500    0.6429    0.6923        28
           1     0.6154    0.7273    0.6667        22

    accuracy                         0.6800        50
   macro avg     0.6827    0.6851    0.6795        50
weighted avg     0.6908    0.6800    0.6810        50

