Задание 2. На примере задачи классификации текстов определить, насколько предобработка текста (стемминг, лемматизация, стоп-слова и т.д.) влияет на качество обучения модели. Сделать выводы.

Датасет:

from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train')

Темы: alt.atheism, misc.forsale, soc.religion.christian, talk.politics.mideast

Алгоритм МО: логистическая регрессия

In [2]:
import re
from functools import lru_cache

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fetch the NLTK resources used below: the 'punkt' tokenizer models (needed by
# word_tokenize) and the 'stopwords' corpus (needed by stopwords.words).
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab' for
# word_tokenize - confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

@lru_cache(maxsize=None)
def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, loaded once."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=None)
def _english_stemmer():
    """Return a shared English Snowball stemmer, constructed once."""
    return SnowballStemmer('english')


def preprocess_text(text, use_stemming=True, use_stopwords=True):
    """Normalize a raw document into a space-separated string of tokens.

    The text is lower-cased, every character outside ``a-z``/``A-Z`` is
    replaced by a space, and the result is tokenized with ``word_tokenize``.

    Args:
        text: Raw document text.
        use_stemming: If true, reduce each token to its Snowball (English)
            stem.
        use_stopwords: If true, drop tokens found in the NLTK English
            stop-word list. Filtering happens before stemming.

    Returns:
        The remaining tokens joined by single spaces.
    """
    text = re.sub(r"[^a-zA-Z]", " ", text.lower())

    words = word_tokenize(text)

    if use_stopwords:
        # Cached: the corpus file is read only on the first call instead of
        # once per document.
        stop_words = _english_stop_words()
        words = [word for word in words if word not in stop_words]

    if use_stemming:
        # Cached: one stemmer instance is reused across all documents.
        stem = _english_stemmer().stem
        words = [stem(word) for word in words]

    return ' '.join(words)

# Restrict the 20 Newsgroups corpus to the four topics required by the task.
categories = ['alt.atheism', 'misc.forsale', 'soc.religion.christian', 'talk.politics.mideast']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

# Labels are shared by both experiment variants.
y_train = newsgroups_train.target
y_test = newsgroups_test.target

# Variant 1: documents passed through preprocess_text with its defaults.
texts_preprocessed = list(map(preprocess_text, newsgroups_train.data))
test_texts_preprocessed = list(map(preprocess_text, newsgroups_test.data))

vectorizer_preprocessed = CountVectorizer()
X_train_preprocessed = vectorizer_preprocessed.fit_transform(texts_preprocessed)
X_test_preprocessed = vectorizer_preprocessed.transform(test_texts_preprocessed)

model_preprocessed = LogisticRegression(max_iter=5000)
model_preprocessed.fit(X_train_preprocessed, y_train)

# Variant 2: documents fed to the vectorizer as-is.
vectorizer_raw = CountVectorizer()
X_train_raw = vectorizer_raw.fit_transform(newsgroups_train.data)
X_test_raw = vectorizer_raw.transform(newsgroups_test.data)

model_raw = LogisticRegression(max_iter=5000)
model_raw.fit(X_train_raw, y_train)

# Score both models on the held-out test split.
predictions_preprocessed = model_preprocessed.predict(X_test_preprocessed)
predictions_raw = model_raw.predict(X_test_raw)

report_preprocessed = classification_report(
    y_test,
    predictions_preprocessed,
    target_names=newsgroups_test.target_names,
)
report_raw = classification_report(
    y_test,
    predictions_raw,
    target_names=newsgroups_test.target_names,
)

print("Отчет о классификации с предобработкой:\n", report_preprocessed)
print("Отчет о классификации без предобработки:\n", report_raw)


[nltk_data] Downloading package punkt to /home/qtr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/qtr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Отчет о классификации с предобработкой:
                         precision    recall  f1-score   support

           alt.atheism       0.90      0.82      0.86       319
          misc.forsale       0.94      0.99      0.97       390
soc.religion.christian       0.88      0.96      0.92       398
 talk.politics.mideast       0.98      0.90      0.94       376

              accuracy                           0.92      1483
             macro avg       0.93      0.92      0.92      1483
          weighted avg       0.93      0.92      0.92      1483

Отчет о классификации без предобработки:
                         precision    recall  f1-score   support

           alt.atheism       0.89      0.84      0.87       319
          misc.forsale       0.95      0.99      0.97       390
soc.religion.christian       0.89      0.97      0.93       398
 talk.politics.mideast       0.97      0.88      0.92       376

              accuracy                           0.93      1483
             mac

# Вывод

Ожидалось, что с предобработкой текста качество будет лучше, но в нашем случае существенной разницы нет: accuracy 0.92 с предобработкой против 0.93 без неё.