Контекстно-независимое векторное представление слов


In [1]:
import functools
import gzip
import re
from typing import List, Tuple

import nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from tqdm import tqdm

nltk.download('punkt')

# Loading texts
def read_texts(fn: str) -> Tuple[List[str], List[str]]:
    """Read labels and texts from a gzip-compressed, tab-separated file.

    Every non-blank line is stripped of surrounding whitespace and split on
    tabs; the first field is used as the label and the third as the text.

    Args:
        fn: Path to the gzip file; its content is decoded as UTF-8.

    Returns:
        A ``(labels, texts)`` pair of equally long lists, in file order.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    labels = []
    texts = []
    with gzip.open(fn, "rt", encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            parts = stripped.split("\t")
            # Index both fields before appending so the two lists can never
            # end up with different lengths.
            label, text = parts[0], parts[2]
            labels.append(label)
            texts.append(text)
    return labels, texts

# Load the corpus; labels and texts are parallel lists in file order.
labels, texts = read_texts("news.txt.gz")  

# Text preprocessing
_PARENTHESIZED_RE = re.compile(r'\(.*?\)')
_NON_WORD_RE = re.compile(r'\W')


@functools.lru_cache(maxsize=None)
def _russian_stop_words() -> frozenset:
    """Return the NLTK Russian stop words as a set, loading them only once."""
    return frozenset(stopwords.words('russian'))


def preprocess_text(text: str) -> List[str]:
    """Turn a raw text into a list of lowercase, alphabetic, non-stop words.

    Steps, in order: lowercase the text, remove parenthesized fragments,
    replace every non-word character with a space, tokenize with NLTK, and
    keep only tokens that are purely alphabetic and are not Russian stop
    words.

    Args:
        text: The raw document text.

    Returns:
        The remaining tokens, in their original order.
    """
    text = text.lower()
    text = _PARENTHESIZED_RE.sub('', text)
    text = _NON_WORD_RE.sub(' ', text)
    # The stop-word set is cached: rebuilding it from the corpus on every
    # call is loop-invariant work when preprocessing a whole dataset.
    stop_words = _russian_stop_words()
    return [
        word
        for word in word_tokenize(text)
        if word.isalpha() and word not in stop_words
    ]

# Preprocess every document (the tqdm progress bar is explicitly disabled).
processed_texts = list(map(preprocess_text, tqdm(texts, disable=True)))

# Hold out 20% of the documents for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    processed_texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Train the word embeddings on the training documents only.
model = Word2Vec(
    sentences=X_train,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

def document_vector(model, words):
    """Average the vectors of the words that ``model.wv`` contains.

    Args:
        model: Object whose ``wv`` attribute supports ``in`` tests and
            indexing by word.
        words: Iterable of tokens making up one document.

    Returns:
        The sum of the found vectors divided by their count, or ``None``
        when none of the words is present in ``model.wv``.
    """
    found = []
    for token in words:
        if token in model.wv:
            found.append(model.wv[token])

    # No known word at all: signal it so the caller can drop the document.
    if len(found) == 0:
        return None

    total = sum(found)
    return total / len(found)

# Represent every document as the mean of its word vectors
# (None when no word of the document is known to the model).
X_train_vectors = [document_vector(model, words) for words in X_train]
X_test_vectors = [document_vector(model, words) for words in X_test]

# Drop documents without a vector. Vectors and labels are filtered together
# as pairs: filtering the vectors first and zipping them with the labels
# afterwards would attach labels to the wrong documents.
train_pairs = [
    (vec, label)
    for vec, label in zip(X_train_vectors, y_train)
    if vec is not None
]
X_train_vectors = [vec for vec, _ in train_pairs]
y_train = [label for _, label in train_pairs]

test_pairs = [
    (vec, label)
    for vec, label in zip(X_test_vectors, y_test)
    if vec is not None
]
X_test_vectors = [vec for vec, _ in test_pairs]
y_test = [label for _, label in test_pairs]

# Fit a linear-kernel SVM on the document vectors (fit returns the estimator).
classifier = SVC(kernel='linear').fit(X_train_vectors, y_train)

# Predict the held-out documents and print per-class metrics.
y_pred = classifier.predict(X_test_vectors)
print(classification_report(y_true=y_test, y_pred=y_pred))




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\张佳\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


              precision    recall  f1-score   support

    business       0.00      0.00      0.00        79
     culture       0.59      0.61      0.60       279
   economics       0.62      0.86      0.72       266
      forces       0.54      0.51      0.53       149
        life       0.68      0.55      0.61       288
       media       0.55      0.68      0.61       299
     science       0.60      0.66      0.63       288
       sport       0.86      0.88      0.87       276
       style       0.89      0.21      0.34        38
      travel       0.00      0.00      0.00        38

    accuracy                           0.64      2000
   macro avg       0.53      0.49      0.49      2000
weighted avg       0.61      0.64      0.61      2000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
