In [9]:
import pandas as pd
from sklearn.datasets import load_files

from sklearn.model_selection import train_test_split

# Load the IMDB reviews under aclImdb/train, restricted to the 'pos' and 'neg'
# categories; files are decoded as UTF-8 and undecodable bytes are ignored.
data = load_files(
    'aclImdb/train',
    categories=['pos', 'neg'],
    encoding='utf-8',
    decode_error='ignore',
)
texts = data.data
labels = data.target

# Hold out 20% of the loaded reviews for evaluation; the fixed random_state
# keeps the split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)


In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

def _fit_and_report(vectorizer, title, train_texts, test_texts, train_labels, test_labels):
    """Vectorize texts, fit a logistic regression on them and print its test report.

    Parameters
    ----------
    vectorizer
        Unfitted text vectorizer exposing ``fit_transform`` and ``transform``.
    title : str
        Heading printed right before the classification report.
    train_texts, test_texts
        Raw documents used for fitting and for evaluation respectively.
    train_labels, test_labels
        Target labels aligned with ``train_texts`` / ``test_texts``.

    Returns
    -------
    tuple
        ``(train_matrix, test_matrix, classifier, predictions)`` so the caller
        can keep every intermediate result under its own name.
    """
    # The vectorizer is fitted on the training texts only; the test texts are
    # merely transformed with the already-fitted vectorizer.
    train_matrix = vectorizer.fit_transform(train_texts)
    test_matrix = vectorizer.transform(test_texts)

    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(train_matrix, train_labels)

    predictions = classifier.predict(test_matrix)
    print(title)
    print(classification_report(test_labels, predictions))
    return train_matrix, test_matrix, classifier, predictions


# Bag-of-words features (raw term counts)
count_vectorizer = CountVectorizer()
X_train_counts, X_test_counts, clf_count, y_pred_count = _fit_and_report(
    count_vectorizer, "CountVectorizer:", X_train, X_test, y_train, y_test
)

# TF-IDF weighted features
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf, X_test_tfidf, clf_tfidf, y_pred_tfidf = _fit_and_report(
    tfidf_vectorizer, "TfidfVectorizer:", X_train, X_test, y_train, y_test
)


CountVectorizer:
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      2482
           1       0.88      0.88      0.88      2518

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000

TfidfVectorizer:
              precision    recall  f1-score   support

           0       0.88      0.87      0.88      2482
           1       0.87      0.88      0.88      2518

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000



In [12]:
import gensim
from gensim.models import Word2Vec
import numpy as np

# Токенизация текстов
def tokenize_text(text):
    """Tokenize one raw text with ``gensim.utils.simple_preprocess``.

    Every ``simple_preprocess`` option is left at its library default.
    Returns the token sequence produced by that call.
    """
    tokens = gensim.utils.simple_preprocess(text)
    return tokens

# Tokenize both splits with the same preprocessing.
X_train_tokens = [tokenize_text(text) for text in X_train]
X_test_tokens = [tokenize_text(text) for text in X_test]

# Train Word2Vec on the training reviews only.
# Passing `sentences` to the constructor already builds the vocabulary and
# trains the model, so the number of passes is set here through `epochs`.
# The former follow-up `model.train(..., epochs=10)` call trained the already
# trained model a second time and is therefore dropped.
# `seed` fixes the initialisation; NOTE(review): per the gensim docs, fully
# deterministic runs additionally need workers=1 and a fixed PYTHONHASHSEED.
model = Word2Vec(
    sentences=X_train_tokens,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    epochs=10,
    seed=42,
)

# Преобразование текстов в эмбеддинги
def get_embedding(tokens, model):
    """Average the word vectors of ``tokens`` into a single vector.

    Tokens that are not present in ``model.wv`` are skipped. When no token has
    a vector (including an empty ``tokens``), a zero vector of length
    ``model.vector_size`` is returned instead.
    """
    known_vectors = []
    for token in tokens:
        if token in model.wv:
            known_vectors.append(model.wv[token])

    # Nothing to average: fall back to an all-zero vector.
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# One averaged Word2Vec vector per review, stacked into a 2-D feature array.
X_train_vecs = np.array(
    [get_embedding(doc_tokens, model) for doc_tokens in X_train_tokens]
)
X_test_vecs = np.array(
    [get_embedding(doc_tokens, model) for doc_tokens in X_test_tokens]
)

# Fit the logistic regression on the averaged embeddings
# (fit() returns the estimator itself, so the call can be chained).
clf_w2v = LogisticRegression(max_iter=1000).fit(X_train_vecs, y_train)

# Predict on the held-out reviews and print the report.
y_pred_w2v = clf_w2v.predict(X_test_vecs)
w2v_report = classification_report(y_test, y_pred_w2v)
print("Word2Vec:")
print(w2v_report)


Word2Vec:
              precision    recall  f1-score   support

           0       0.85      0.84      0.84      2482
           1       0.85      0.85      0.85      2518

    accuracy                           0.85      5000
   macro avg       0.85      0.85      0.85      5000
weighted avg       0.85      0.85      0.85      5000

