# Контекстно-независимое векторное представление слов

In [1]:
import os
# Resolve the corpus archive relative to the directory the notebook is run from.
current_directory = os.getcwd()
_corpus_relative_parts = ('data', 'news.txt.gz')
file_path = os.path.join(current_directory, *_corpus_relative_parts)

In [6]:
import numpy as np

In [38]:
import gzip
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

import gensim

from gensim.models import Word2Vec

from dataclasses import dataclass
from typing import Iterator, List

@dataclass
class Text:
    """One corpus record: a category label plus the title and body of a document.

    Instances are built positionally from the three tab-separated columns of
    a data-file line, in the order ``label``, ``title``, ``text``.
    """

    # Category name; used as the classification target further down.
    label: str
    # Headline of the document.
    title: str
    # Body of the document.
    text: str

# Read the data file
def read_texts(fn: str=file_path) -> Iterator[Text]:
    """Lazily yield one ``Text`` per non-blank line of a gzipped UTF-8 file.

    Every line is expected to hold ``label<TAB>title<TAB>text``.

    Args:
        fn: Path to the gzip-compressed, tab-separated corpus file.

    Yields:
        A ``Text`` built from the three columns of each line.

    Raises:
        TypeError: If a non-blank line has fewer than three tab-separated
            fields (``Text`` is then called with too few arguments).
    """
    with gzip.open(fn, "rt", encoding="utf-8") as f:
        for line in f:
            record = line.strip()
            # Blank or whitespace-only lines carry no record; skip them
            # instead of failing on Text('').
            if not record:
                continue
            # Split on the first two tabs only, so a stray tab inside the
            # body stays part of `text` rather than producing a fourth field.
            yield Text(*record.split("\t", 2))
                    
# Split a document into normalized word stems
# Runs of Latin/Cyrillic letters; every other character acts as a separator.
_WORD_RE = re.compile(r'[a-zA-Zа-яА-ЯёЁ]+')
# Filled on first use so the NLTK resources are built once, not per document.
_TOKENIZER_RESOURCES = {}


def _russian_resources():
    """Return the cached (stop-word set, stemmer) pair, creating it on first use."""
    if not _TOKENIZER_RESOURCES:
        _TOKENIZER_RESOURCES["stopwords"] = frozenset(stopwords.words("russian"))
        _TOKENIZER_RESOURCES["stemmer"] = SnowballStemmer("russian")
    return _TOKENIZER_RESOURCES["stopwords"], _TOKENIZER_RESOURCES["stemmer"]


def tokenize_text(title: str, text: str) -> List[str]:
    """Turn a document's title and body into a list of stemmed word tokens.

    The title and body are joined, lower-cased and split into runs of
    Latin/Cyrillic letters. Words of one or two letters and Russian stop
    words are dropped; the remaining words are stemmed with the Russian
    Snowball stemmer.

    Args:
        title: Document headline (may be empty).
        text: Document body.

    Returns:
        The stems of the kept words, in order of appearance.
    """
    russian_stopwords, stemmer = _russian_resources()
    # The space keeps the last title word from fusing with the first body
    # word (plain concatenation turned "... cup" + "The ..." into "cupthe").
    combined = f"{title} {text}".lower()
    return [
        stemmer.stem(word)
        for word in _WORD_RE.findall(combined)
        if len(word) > 2 and word not in russian_stopwords
    ]

# Text without punctuation: the stems joined by single spaces (needed for gensim)
def normalize_text(text: str) -> str:
    """Return ``text`` as a space-separated string of its normalized tokens.

    Args:
        text: Raw text to normalize.

    Returns:
        The tokens produced by ``tokenize_text`` joined with single spaces.
    """
    # tokenize_text requires both a title and a body; there is no title here,
    # so pass an empty one (the one-argument call raised TypeError).
    return ' '.join(tokenize_text("", text))

In [None]:
## Train word2vec
# The corpus is materialized once because it is traversed twice below.
texts = list(read_texts())
labels = [doc.label for doc in texts]
# Each document becomes one training "sentence": the list of its stemmed tokens.
sentences = [tokenize_text(doc.title, doc.text) for doc in texts]

w2v = Word2Vec(sentences=sentences)

# NOTE(review): no `binary=True` is passed, so despite the .bin name this
# presumably writes gensim's default (text) word2vec format — confirm.
w2v.wv.save_word2vec_format('w2v_vectors.bin')
# Example: nearest neighbours of one stem
w2v.wv.most_similar("новост")

[('жанейр', 0.7212162613868713),
 ('марака', 0.619356095790863),
 ('итар', 0.5389225482940674),
 ('интерфакс', 0.5348697900772095),
 ('прайм', 0.5296918749809265),
 ('белт', 0.5146872401237488),
 ('агентств', 0.5137584209442139),
 ('собеседник', 0.5066440105438232),
 ('тасс', 0.506440281867981),
 ('корреспондент', 0.5062633752822876)]

Усреднение векторов слов

In [None]:
def vector_mean(tokens, model):
    """Average the word vectors of the tokens the model knows.

    Args:
        tokens: Iterable of token strings for one document.
        model: Object exposing ``wv`` (supports ``in`` and item lookup by
            token) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv`` (repeated tokens count each time), or a zero vector of
        length ``model.vector_size`` when none of the tokens is known.
    """
    known_vectors = [model.wv[tok] for tok in tokens if tok in model.wv]
    if not known_vectors:
        # No token is in the vocabulary: fall back to an all-zero vector.
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# Feature matrix: one averaged word vector per document, in corpus order.
X_mean = np.array([vector_mean(doc_tokens, w2v) for doc_tokens in sentences])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Hold out 20% of the documents for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_mean, labels, test_size=0.2, random_state=42
)

# Linear SVM trained on the averaged word vectors.
clf = SVC(kernel='linear', C=1)
clf.fit(X_train, y_train)

# Per-class precision/recall/F1 on the held-out part.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

    business       0.56      0.18      0.27        79
     culture       0.86      0.84      0.85       279
   economics       0.73      0.91      0.81       266
      forces       0.72      0.83      0.77       149
        life       0.77      0.78      0.78       288
       media       0.80      0.77      0.79       299
     science       0.85      0.82      0.83       288
       sport       0.95      0.97      0.96       276
       style       0.93      0.74      0.82        38
      travel       0.63      0.45      0.52        38

    accuracy                           0.81      2000
   macro avg       0.78      0.73      0.74      2000
weighted avg       0.81      0.81      0.80      2000



Усреднение векторов слов с весами TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer takes strings, so glue each token list back together with spaces.
texts_for_tfidf = list(map(" ".join, sentences))

# Fit the vectorizer on the whole corpus and keep the document-term weight matrix.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(texts_for_tfidf)
# Mapping token -> column index in tfidf_matrix.
vocab = tfidf.vocabulary_

def vector_tfidf_mean(tokens, model, tfidf, vocab, doc_id, matrix=None):
    """Return the TF-IDF-weighted average of a document's word vectors.

    Every token occurrence that is known to both ``model.wv`` and ``vocab``
    contributes its word vector scaled by the document's TF-IDF weight for
    that token; the sum is divided by the sum of the weights used.

    Args:
        tokens: Iterable of token strings for one document.
        model: Object exposing ``wv`` (supports ``in`` and item lookup by
            token) and ``vector_size``.
        tfidf: Unused; kept so existing positional calls keep working.
        vocab: Mapping from token to its column index in the weight matrix.
        doc_id: Row index of this document in the weight matrix.
        matrix: Weight matrix indexable as ``matrix[doc_id, column]``. When
            omitted, the module-level ``tfidf_matrix`` is used, as before.

    Returns:
        The weighted mean vector, or a zero vector of length
        ``model.vector_size`` when no token qualifies or the total weight
        is zero.
    """
    if matrix is None:
        # Backward-compatible fallback to the notebook-level matrix.
        matrix = tfidf_matrix

    vectors = []
    weights = []
    # The weight of a token is fixed within one document, so look each one
    # up only once even if the token repeats.
    weight_by_token = {}
    for token in tokens:
        if token not in model.wv or token not in vocab:
            continue
        if token not in weight_by_token:
            weight_by_token[token] = matrix[doc_id, vocab[token]]
        tfidf_weight = weight_by_token[token]
        vectors.append(model.wv[token] * tfidf_weight)
        weights.append(tfidf_weight)

    if not vectors:
        return np.zeros(model.vector_size)
    total_weight = np.sum(weights)
    if total_weight == 0:
        # All weights are zero (e.g. doc_id does not match tokens): avoid
        # dividing by zero and producing a NaN vector.
        return np.zeros(model.vector_size)
    return np.sum(vectors, axis=0) / total_weight

# Feature matrix: one TF-IDF-weighted mean vector per document; the position of a
# document in `sentences` is also its row index in the TF-IDF matrix.
X_tfidf_mean = np.array([
    vector_tfidf_mean(doc_tokens, w2v, tfidf, vocab, row)
    for row, doc_tokens in enumerate(sentences)
])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Same 80/20 split and seed as for the plain averaging, now on the TF-IDF-weighted features.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf_mean, labels, test_size=0.2, random_state=42
)

# Linear SVM trained on the weighted document vectors.
clf = SVC(kernel='linear', C=1)
clf.fit(X_train, y_train)

# Per-class precision/recall/F1 on the held-out part.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

    business       0.50      0.18      0.26        79
     culture       0.86      0.84      0.85       279
   economics       0.72      0.89      0.79       266
      forces       0.73      0.81      0.77       149
        life       0.75      0.77      0.76       288
       media       0.80      0.74      0.77       299
     science       0.82      0.83      0.82       288
       sport       0.95      0.96      0.95       276
       style       0.80      0.74      0.77        38
      travel       0.67      0.47      0.55        38

    accuracy                           0.80      2000
   macro avg       0.76      0.72      0.73      2000
weighted avg       0.79      0.80      0.79      2000

