In [1]:
import pandas as pd
from sklearn.datasets import load_files

# Загрузка датасета IMDB Movie Reviews
def load_imdb_dataset():
    dataset = load_files('aclImdb/train/', shuffle=True)
    texts, labels = dataset.data, dataset.target
    return texts, labels

texts, labels = load_imdb_dataset()


In [2]:
import numpy as np
from sklearn.model_selection import train_test_split

# Преобразование байтов в строки
texts = [text.decode('utf-8') for text in texts]

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)


In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Векторизация с использованием CountVectorizer
count_vectorizer = CountVectorizer(max_features=5000)
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

# Векторизация с использованием TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [4]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# GradientBoostingClassifier с CountVectorizer
gbc_count = GradientBoostingClassifier()
gbc_count.fit(X_train_count, y_train)
y_pred_gbc_count = gbc_count.predict(X_test_count)
accuracy_gbc_count = accuracy_score(y_test, y_pred_gbc_count)
report_gbc_count = classification_report(y_test, y_pred_gbc_count)

# GradientBoostingClassifier с TfidfVectorizer
gbc_tfidf = GradientBoostingClassifier()
gbc_tfidf.fit(X_train_tfidf, y_train)
y_pred_gbc_tfidf = gbc_tfidf.predict(X_test_tfidf)
accuracy_gbc_tfidf = accuracy_score(y_test, y_pred_gbc_tfidf)
report_gbc_tfidf = classification_report(y_test, y_pred_gbc_tfidf)

# LogisticRegression с CountVectorizer
lr_count = LogisticRegression(max_iter=1000)
lr_count.fit(X_train_count, y_train)
y_pred_lr_count = lr_count.predict(X_test_count)
accuracy_lr_count = accuracy_score(y_test, y_pred_lr_count)
report_lr_count = classification_report(y_test, y_pred_lr_count)

# LogisticRegression с TfidfVectorizer
lr_tfidf = LogisticRegression(max_iter=1000)
lr_tfidf.fit(X_train_tfidf, y_train)
y_pred_lr_tfidf = lr_tfidf.predict(X_test_tfidf)
accuracy_lr_tfidf = accuracy_score(y_test, y_pred_lr_tfidf)
report_lr_tfidf = classification_report(y_test, y_pred_lr_tfidf)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
print("GradientBoostingClassifier с CountVectorizer")
print("Accuracy:", accuracy_gbc_count)
print(report_gbc_count)

print("GradientBoostingClassifier с TfidfVectorizer")
print("Accuracy:", accuracy_gbc_tfidf)
print(report_gbc_tfidf)

print("LogisticRegression с CountVectorizer")
print("Accuracy:", accuracy_lr_count)
print(report_lr_count)

print("LogisticRegression с TfidfVectorizer")
print("Accuracy:", accuracy_lr_tfidf)
print(report_lr_tfidf)


GradientBoostingClassifier с CountVectorizer
Accuracy: 0.6325644098262433
              precision    recall  f1-score   support

           0       0.74      0.76      0.75      2620
           1       0.55      0.86      0.67      2429
           2       0.66      0.10      0.17      1627

    accuracy                           0.63      6676
   macro avg       0.65      0.57      0.53      6676
weighted avg       0.65      0.63      0.58      6676

GradientBoostingClassifier с TfidfVectorizer
Accuracy: 0.6331635710005992
              precision    recall  f1-score   support

           0       0.74      0.76      0.75      2620
           1       0.56      0.85      0.67      2429
           2       0.67      0.11      0.19      1627

    accuracy                           0.63      6676
   macro avg       0.65      0.57      0.54      6676
weighted avg       0.65      0.63      0.58      6676

LogisticRegression с CountVectorizer
Accuracy: 0.6503894547633313
              precision 

In [6]:
results = {
    "GBC + Count": accuracy_gbc_count,
    "GBC + Tfidf": accuracy_gbc_tfidf,
    "LR + Count": accuracy_lr_count,
    "LR + Tfidf": accuracy_lr_tfidf
}

best_method = max(results, key=results.get)
print(f"Лучший метод: {best_method} с точностью {results[best_method]:.4f}")


Лучший метод: LR + Tfidf с точностью 0.6980
