In [None]:
#bow

import numpy as np

def build_vocabulary(corpus):
  vocab = set()
  for text in corpus:
    vocab.update(text.split())
  return {word: idx for idx, word in enumerate(sorted(vocab))}

def bow_vectorize(corpus, vocab):
  vectors = np.zeros((len(corpus), len(vocab)))
  for i, text in enumerate(corpus):
    for word in text.split():
      if word in vocab:
        vectors[i, vocab[word]] += 1
  return vectors

corpus_bow = ["hello word", "hello there", "hello hello hello word"]
vocab_bow = build_vocabulary(corpus_bow)
print(vocab_bow)
bow_vectors = bow_vectorize(corpus_bow, vocab_bow)
print(bow_vectors)

{'hello': 0, 'there': 1, 'word': 2}
[[1. 0. 1.]
 [1. 1. 0.]
 [3. 0. 1.]]


In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import re

newsgroups = fetch_20newsgroups(subset='all', categories=['sci.space', 'rec.autos', 'talk.politics.mideast'], remove=('headers', 'footers', 'quotes'))
corpus = newsgroups.data[:1000]
labels = newsgroups.target[:1000]

from collections import Counter

category_counts = Counter(labels)
category_names = {i: newsgroups.target_names[i] for i in category_counts}

for category_id, count in category_counts.items():
    print(f"{category_names[category_id]}: {count} новин")

sci.space: 338 новин
rec.autos: 347 новин
talk.politics.mideast: 315 новин


In [None]:
from sklearn.utils import shuffle

# Мінімальна кількість новин серед усіх категорій
min_count = min(category_counts.values())

# Вибираємо рівну кількість новин для кожної категорії
balanced_corpus = []
balanced_labels = []

for category_id in category_counts:
    category_indices = [i for i, label in enumerate(labels) if label == category_id]
    selected_indices = category_indices[:min_count]  # Вибираємо тільки min_count елементів

    balanced_corpus.extend([corpus[i] for i in selected_indices])
    balanced_labels.extend([labels[i] for i in selected_indices])

# Перемішуємо, щоб уникнути порядкового впливу
balanced_corpus, balanced_labels = shuffle(balanced_corpus, balanced_labels, random_state=42)

# Нові збалансовані категорії
new_counts = Counter(balanced_labels)
for category_id, count in new_counts.items():
    print(f"{category_names[category_id]}: {count} новин")

talk.politics.mideast: 315 новин
rec.autos: 315 новин
sci.space: 315 новин


In [None]:
X_train, X_test, y_train, y_test = train_test_split(balanced_corpus, balanced_labels, test_size=0.2, random_state=42)

def clean_text(text):
    text = text.lower()  
    text = re.sub(r'[^\w\s]', '', text) 
vocab = build_vocabulary([clean_text(text) for text in balanced_corpus])

In [None]:
from sklearn.metrics import f1_score

X_train_bow = bow_vectorize(X_train, vocab)
X_test_bow = bow_vectorize(X_test, vocab)

clf_bow = LogisticRegression(max_iter=1000)
clf_bow.fit(X_train_bow, y_train)

y_pred_bow = clf_bow.predict(X_test_bow)

f1 = f1_score(y_test, y_pred_bow, average='weighted') 
print(f"BoW F1-score: {f1:.4f}")

BoW F1-score: 0.7071


In [None]:
#Tf-Idf

def compute_tf(corpus, vocab):
  tf_matrix = np.zeros((len(corpus), len(vocab)))
  for i, text in enumerate(corpus):
    words = text.split()
    if len(words) == 0:
      continue
    for word in words:
      if word in vocab:
         tf_matrix[i, vocab[word]] += 1
    tf_matrix[i] /= len(words)
  return tf_matrix

def compute_idf(corpus, vocab):
    doc_count = len(corpus)
    idf = np.zeros(len(vocab))  
    for word, idx in vocab.items():
        df = sum(1 for text in corpus if word in text.split())
        idf[idx] = np.log(doc_count / (df + 1)) 
    return idf

def compute_tfidf(corpus, vocab):
  tf_matrix = compute_tf(corpus, vocab)
  idx_vector = compute_idf(corpus, vocab)

  return tf_matrix * idx_vector

In [None]:
tfidf_vectors_train = compute_tfidf(X_train, vocab)
tfidf_vectors_test = compute_tfidf(X_test, vocab)

clf_tfidf = LogisticRegression(max_iter=1000)
clf_tfidf.fit(tfidf_vectors_train, y_train)

y_pred_tfidf = clf_tfidf.predict(tfidf_vectors_test)

f1 = f1_score(y_test, y_pred_tfidf, average='weighted')  
print(f"BoW F1-score: {f1:.4f}")

BoW F1-score: 0.7597


In [None]:
#Word2Vec (Skip-Gram)

import torch
import torch.nn as nn
import torch.optim as optim
from collections import defaultdict
import random

class SkipGramModel(nn.Module):
  def __init__(self, vocab_size, embedding_dim):
    super(SkipGramModel, self).__init__()
    self.embeddings = nn.Embedding(vocab_size, embedding_dim)
    self.output_layer = nn.Linear(embedding_dim, vocab_size)

  def forward(self, center_word):
    embed = self.embeddings(center_word)
    output = self.output_layer(embed)
    return output

def generate_skip_gram_pairs(corpus, vocab, window_size=2):
    pairs = []
    for text in corpus:
        words = text.split()
        for i, center in enumerate(words):
            if center not in vocab:
                continue
            context_words = words[max(0, i-window_size):i] + words[i+1:i+1+window_size]
            pairs.extend([(vocab[center], vocab[w]) for w in context_words if w in vocab])
    return pairs

def create_batches(pairs, batch_size):
    random.shuffle(pairs)
    batches = [pairs[i:i+batch_size] for i in range(0, len(pairs), batch_size)]
    return batches

# Навчання моделі
def train_skip_gram(corpus, vocab, embedding_dim=100, epochs=50, lr=0.01, batch_size=256):
    pairs = generate_skip_gram_pairs(corpus, vocab)
    batches = create_batches(pairs, batch_size)
    model = SkipGramModel(len(vocab), embedding_dim)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_function = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        total_loss = 0
        for batch in batches:
            center_words = torch.tensor([pair[0] for pair in batch], dtype=torch.long)
            context_words = torch.tensor([pair[1] for pair in batch], dtype=torch.long)

            optimizer.zero_grad()
            output = model(center_words)
            loss = loss_function(output, context_words)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        average_loss = total_loss / len(batches)

        print(f"Epoch {epoch}, Loss: {average_loss:.4f}")

    return model

In [None]:
skip_gram_train = train_skip_gram(balanced_corpus, vocab)

Epoch 0, Loss: 7.0003
Epoch 1, Loss: 6.3559
Epoch 2, Loss: 6.1200
Epoch 3, Loss: 5.9846
Epoch 4, Loss: 5.9055
Epoch 5, Loss: 5.8549
Epoch 6, Loss: 5.8240
Epoch 7, Loss: 5.8020
Epoch 8, Loss: 5.7875
Epoch 9, Loss: 5.7765
Epoch 10, Loss: 5.7683
Epoch 11, Loss: 5.7619
Epoch 12, Loss: 5.7574
Epoch 13, Loss: 5.7535
Epoch 14, Loss: 5.7506
Epoch 15, Loss: 5.7474
Epoch 16, Loss: 5.7459
Epoch 17, Loss: 5.7440
Epoch 18, Loss: 5.7427
Epoch 19, Loss: 5.7421
Epoch 20, Loss: 5.7411
Epoch 21, Loss: 5.7396
Epoch 22, Loss: 5.7391
Epoch 23, Loss: 5.7386
Epoch 24, Loss: 5.7383
Epoch 25, Loss: 5.7379
Epoch 26, Loss: 5.7375
Epoch 27, Loss: 5.7371
Epoch 28, Loss: 5.7364
Epoch 29, Loss: 5.7359
Epoch 30, Loss: 5.7352
Epoch 31, Loss: 5.7349
Epoch 32, Loss: 5.7346
Epoch 33, Loss: 5.7344
Epoch 34, Loss: 5.7341
Epoch 35, Loss: 5.7337
Epoch 36, Loss: 5.7338
Epoch 37, Loss: 5.7341
Epoch 38, Loss: 5.7325
Epoch 39, Loss: 5.7328
Epoch 40, Loss: 5.7329
Epoch 41, Loss: 5.7331
Epoch 42, Loss: 5.7321
Epoch 43, Loss: 5.733

In [None]:
from sklearn.linear_model import LogisticRegression

# Функція для отримання середнього векторного представлення тексту
def get_text_embedding(text, model, vocab):
    words = text.split()
    vectors = [model.embeddings.weight[vocab[word]].detach().numpy() for word in words if word in vocab]
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.embeddings.embedding_dim)

X_train_vectors = np.array([get_text_embedding(clean_text(text), skip_gram_train, vocab) for text in X_train])
X_test_vectors = np.array([get_text_embedding(clean_text(text), skip_gram_train, vocab) for text in X_test])

classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_vectors, y_train)

# Оцінка моделі
y_pred = classifier.predict(X_test_vectors)

f1 = f1_score(y_test, y_pred, average='weighted') 
print(f"BoW F1-score: {f1:.4f}")


BoW F1-score: 0.7253


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import defaultdict
import random
import re

#Word2Vec (CBOW)

class CBOWModel(nn.Module):
  def __init__(self, vocab_size, embedding_dim):
    super(CBOWModel, self).__init__()
    self.embeddings = nn.Embedding(vocab_size, embedding_dim)
    self.linear = nn.Linear(embedding_dim, vocab_size)

  def forward(self, center_word):
    embeds = self.embeddings(center_word)
    avg_embeds = torch.mean(embeds, dim=1)
    output = self.linear(avg_embeds)
    return output

def preprocess_text(text):
    """Lowercases text and removes punctuation."""
    return re.sub(r'[^\w\s]', '', text.lower())

def generate_cbow_pairs(corpus, vocab, window_size=2):
    pairs = []
    for text in corpus:
        words = preprocess_text(text).split()
        for i, center in enumerate(words):
            context_words = words[max(0, i-window_size):i] + words[i+1:i+1+window_size]
            if len(context_words) == 2 * window_size:
              pairs.append(([vocab[w] for w in context_words if w in vocab], vocab[center]))
    return pairs

def create_batches(pairs, batch_size):
    random.shuffle(pairs)
    batches = [pairs[i:i+batch_size] for i in range(0, len(pairs), batch_size)]
    return batches

# Навчання моделі
def train_cbow(corpus, vocab, embedding_dim=100, epochs=50, lr=0.01, batch_size=256):
    pairs = generate_cbow_pairs(corpus, vocab)
    batches = create_batches(pairs, batch_size)
    model = CBOWModel(len(vocab), embedding_dim)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_function = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        total_loss = 0
        for batch in batches:
            center_words = torch.tensor([pair[0] for pair in batch], dtype=torch.long)
            context_words = torch.tensor([pair[1] for pair in batch], dtype=torch.long)

            optimizer.zero_grad()
            output = model(center_words)
            loss = loss_function(output, context_words)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        average_loss = total_loss / len(batches)

        print(f"Epoch {epoch}, Loss: {average_loss:.4f}")

    return model

In [None]:
cbow_train = train_cbow(balanced_corpus, vocab)

Epoch 0, Loss: 7.1699
Epoch 1, Loss: 5.3017
Epoch 2, Loss: 4.1452
Epoch 3, Loss: 3.3306
Epoch 4, Loss: 2.7977
Epoch 5, Loss: 2.4532
Epoch 6, Loss: 2.2201
Epoch 7, Loss: 2.0573
Epoch 8, Loss: 1.9355
Epoch 9, Loss: 1.8436
Epoch 10, Loss: 1.7687
Epoch 11, Loss: 1.7082
Epoch 12, Loss: 1.6585
Epoch 13, Loss: 1.6128
Epoch 14, Loss: 1.5738
Epoch 15, Loss: 1.5409
Epoch 16, Loss: 1.5134
Epoch 17, Loss: 1.4873
Epoch 18, Loss: 1.4641
Epoch 19, Loss: 1.4451
Epoch 20, Loss: 1.4252
Epoch 21, Loss: 1.4102
Epoch 22, Loss: 1.3946
Epoch 23, Loss: 1.3825
Epoch 24, Loss: 1.3703
Epoch 25, Loss: 1.3562
Epoch 26, Loss: 1.3427
Epoch 27, Loss: 1.3359
Epoch 28, Loss: 1.3268
Epoch 29, Loss: 1.3207
Epoch 30, Loss: 1.3128
Epoch 31, Loss: 1.3019
Epoch 32, Loss: 1.2935
Epoch 33, Loss: 1.2861
Epoch 34, Loss: 1.2836
Epoch 35, Loss: 1.2756
Epoch 36, Loss: 1.2700
Epoch 37, Loss: 1.2638
Epoch 38, Loss: 1.2609
Epoch 39, Loss: 1.2538
Epoch 40, Loss: 1.2517
Epoch 41, Loss: 1.2474
Epoch 42, Loss: 1.2424
Epoch 43, Loss: 1.238

In [None]:
from sklearn.linear_model import LogisticRegression

def get_text_embedding(text, model, vocab):
    words = text.split()
    vectors = [model.embeddings.weight[vocab[word]].detach().numpy() for word in words if word in vocab]
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.embeddings.embedding_dim)

X_train_vectors = np.array([get_text_embedding(clean_text(text), cbow_train, vocab) for text in X_train])
X_test_vectors = np.array([get_text_embedding(clean_text(text), cbow_train, vocab) for text in X_test])

classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_vectors, y_train)

# Оцінка моделі
y_pred_cbow = classifier.predict(X_test_vectors)

f1 = f1_score(y_test, y_pred_cbow, average='weighted') 
print(f"CBOW F1-score: {f1:.4f}")

CBOW F1-score: 0.7829
