# Классификация отзывов на фильмы IMDB

#### **Задача:** научиться отличать позитивные отзывы от негативных

Для начала импортируем все, что нам понадобиться

In [137]:
import pandas as pd
import numpy as np
import nltk
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from nltk import pos_tag, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from catboost import CatBoostClassifier, Pool, cv
import random
np.random.seed(42)
random.seed(42)

Для предобработки текстов для алгоритмов классического ML, будем пользоваться библиотекой NLTK. Подгрузим необходимые данные:

In [2]:
nltk.download('punkt') # для токенизации
nltk.download('averaged_perceptron_tagger') # для разметки частей речи
nltk.download('wordnet') # для лемматизации
nltk.download('stopwords') # для удаления ненужных слов

Загрузим обучающие данные:

In [125]:
with open('train.texts') as file:
    train_texts = [line.strip() for line in file.readlines()]

In [126]:
with open('train.labels') as file:
    train_labels = [line.strip() for line in file.readlines()]

In [127]:
train_texts[0]

'If the myth regarding broken mirrors would be accurate, everybody involved in this production would now face approximately 170 years of bad luck, because there are a lot of mirrors falling to little pieces here. If only the script was as shattering as the glass, then "The Broken" would have been a brilliant film. Now it\'s sadly just an overlong, derivative and dull movie with only just a handful of remarkable ideas and memorable sequences. Sean Ellis made a very stylish and elegantly photographed movie, but the story is lackluster and the total absence of logic and explanation is really frustrating. I got into a discussion with a friend regarding the basic concept and "meaning" of the film. He thinks Ellis found inspiration in an old legend claiming that spotting your doppelganger is a foreboding of how you\'re going to die. Interesting theory, but I\'m not familiar with this legend and couldn\'t find anything on the Internet about this, neither. Personally, I just think "The Broken"

Напишем 2 вспомогательные функции: для препроцессинга и для перевода обозначения частей речи к формату wordnet.

In [179]:
def get_wordnet_pos(treebank_tag):
    '''
    Transform parts of speech to wordnet format.
    '''
    switch = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    for key, item in switch.items():
        if treebank_tag.startswith(key):
            return item
    return wordnet.NOUN

def preprocess(sent, stop_words):
    '''
    Preprocessing from raw text to lemmatized text.
    '''
    lemmatizer = WordNetLemmatizer()
    sanitized_sent = ' '.join(re.findall(r"\b[A-Za-z]+\b", sent)) # удалим все, что не слова английского языка
    tokenized_sent = word_tokenize(sanitized_sent) # токенизируем
    pos_tagged = [(word, get_wordnet_pos(tag)) # разметим части речи
                 for word, tag in pos_tag(tokenized_sent)]
    lemmatized_sent = [lemmatizer.lemmatize(word, tag) # приведем к одной форме (лемматизируем)
                    for word, tag in pos_tagged]
    reduced_sent = [word for word in lemmatized_sent if word not in stop_words] # уберем стоп-слова
    return ' '.join(reduced_sent)

Преобразуем текст, выкидываем слова, не несущие смысловой нагрузки, делаем TF-IDF векторизацию. При этом будем брать н-граммы от 1 до 3 и ограничим число итоговых признаков до 20000 (подбиралось вручную).

In [None]:
stop_words = set(stopwords.words("english"))
processed = [preprocess(sent, stop_words) for sent in train_texts]
labels = [0 if label == 'neg' else 1 for label in train_labels]
vectorizer = TfidfVectorizer(ngram_range=(1,3), max_features=20000)
vectorizer.fit(processed)
vectorized = vectorizer.transform(processed)

Протестируем логистическую регрессию по кросс-валидации, подберем параметр l2 регуляризации, а затем обучим на всем датасете.

In [13]:
lr = LogisticRegression(C=3, max_iter=500)
print((np.mean(cross_val_score(lr, vectorized, labels, scoring="accuracy", cv=5))))

0.8871333333333332


In [None]:
lr = LogisticRegression(C=3, max_iter=500)
lr.fit(vectorized, labels)

test_df = pd.read_csv('texts.csv')
test_texts = test_df['texts'].to_list()
test_texts = [preprocess(sent, stop_words) for sent in test_texts]
test = vectorizer.transform(test_texts)
lr_predictions = lr.predict(test)
test_df['labels'] = ['neg' if pred == 0 else 'pos' for pred in  cat_predictions]
test_df[['id', 'labels']].to_csv('y_pred.csv', index=False)

#### Собственно, все мое SOTA решение)

## Кратко о том, что не сработало

Хотелось на тех же TF-IDF векторах обучить Catboost, но почему-то сработало хуже линейной модели. Была идея подобрать гиперпараметры с помощью Optuna, но времени не хватило. 

In [47]:
params = {'verbose': False, 'iterations': 2000, 'loss_function': 'CrossEntropy', 'od_wait': 200, 'eval_metric': 'Accuracy'}
pool = Pool(data=vectorized, label=labels)
cv(pool, params, fold_count=5, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]

bestTest = 0.8673333333
bestIteration = 1355

Training on fold [1/5]

bestTest = 0.8503333333
bestIteration = 1338

Training on fold [2/5]

bestTest = 0.8706666667
bestIteration = 1954

Training on fold [3/5]

bestTest = 0.8543333333
bestIteration = 889

Training on fold [4/5]

bestTest = 0.8603333333
bestIteration = 1707



Unnamed: 0,iterations,test-Accuracy-mean,test-Accuracy-std,train-Accuracy-mean,train-Accuracy-std,test-CrossEntropy-mean,test-CrossEntropy-std,train-CrossEntropy-mean,train-CrossEntropy-std
0,0,0.704867,0.013952,0.707350,0.008670,0.684720,0.000463,0.684483,0.000314
1,1,0.709600,0.013041,0.713967,0.008048,0.677199,0.000672,0.676759,0.000369
2,2,0.715667,0.012730,0.716250,0.003764,0.669943,0.001088,0.669454,0.000547
3,3,0.715200,0.013201,0.716533,0.005375,0.663185,0.001750,0.662546,0.001088
4,4,0.717933,0.011658,0.720383,0.004667,0.655975,0.001788,0.655220,0.000853
...,...,...,...,...,...,...,...,...,...
1995,1995,0.859267,0.008877,0.974767,0.017623,0.332684,0.006487,0.182775,0.029206
1996,1996,0.859267,0.008877,0.974783,0.017642,0.332688,0.006483,0.182761,0.029224
1997,1997,0.859267,0.008877,0.974783,0.017642,0.332682,0.006491,0.182745,0.029244
1998,1998,0.859200,0.008774,0.974800,0.017661,0.332683,0.006489,0.182732,0.029260


Далее пробуем подход с предобученным трансформером - distilled BERT

Импортируем все необходимое

In [28]:
from datasets import Dataset
from transformers import AutoTokenizer, pipeline, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import evaluate

In [59]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

Загрузим необходимые объекты: tokenizer для токенизации, collator для составления батчей и саму модель.

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
accuracy = evaluate.load("accuracy")
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Напишем вспомогательные функции для токенизации и рассчета accuracy.

In [60]:
def preprocessing(examples):
    '''
    Tokenizes text.
    '''
    return tokenizer(examples["text"], truncation=True)
    
def compute_metrics(eval_pred):
    '''
    Computes accuracy.
    '''
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

Переведем данные в формат для transformers.

In [61]:
ds = Dataset.from_dict({'label': labels, 'text': train_texts})
ds = ds.train_test_split(test_size=0.2)

In [64]:
tokenized_ds = ds.map(preprocessing, batched=True)

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

И, наконец, зафайнтюним наш BERT

In [72]:
training_args = TrainingArguments(
    output_dir="distbert1",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2801,0.230696,0.916
2,0.1649,0.347833,0.914667


TrainOutput(global_step=3000, training_loss=0.23519743347167968, metrics={'train_runtime': 1840.7675, 'train_samples_per_second': 13.038, 'train_steps_per_second': 1.63, 'total_flos': 2994117145956384.0, 'train_loss': 0.23519743347167968, 'epoch': 2.0})

Качество выглядит многообещающим, но на тесте ничего хорошего не вышло.

Далее применяем модель к тестовым данным.

In [119]:
bert_preds = []
bad_counter = 0
tokenizer = AutoTokenizer.from_pretrained("distbert1/checkpoint-3000")
model = AutoModelForSequenceClassification.from_pretrained("distbert1/checkpoint-3000")
for txt in tqdm(test_texts):
    tokenized_txt = tokenizer(txt, return_tensors="pt")
    if tokenized_txt['input_ids'].shape[1] > 512: # в тесте оказались примеры длиннее, чем может обработать модель, они просто обрезались
        tokenized_txt['input_ids'] = tokenized_txt['input_ids'][:, :512]
        tokenized_txt['attention_mask'] = tokenized_txt['attention_mask'][:, :512]
        bad_counter += 1
    with torch.no_grad():
        logits = model(**tokenized_txt).logits
        predicted_class_id = logits.argmax().item()
        ans = model.config.id2label[predicted_class_id]
    if ans == 'NEGATIVE':
        bert_preds.append('neg')
    else:
        bert_preds.append('pos')

  0%|                                        | 10/10000 [00:00<06:25, 25.95it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (542 > 512). Running this sequence through the model will result in indexing errors
100%|█████████████████████████████████████| 10000/10000 [06:58<00:00, 23.89it/s]


In [122]:
test_df = pd.read_csv('texts.csv')
test_df['labels'] = bert_preds
test_df[['id', 'labels']].to_csv('y_pred.csv', index=False)

Напоследок попробуем построить **стэкинг** из разных базовых моделей, в качетсве финальной - логистическая регрессия.

In [170]:
l2 = LogisticRegression(penalty='l2', C=3, max_iter=500) # логрег с l2 регуляризацией и коэффициентом 3
l2_2 = LogisticRegression(penalty='l2', C=1, max_iter=500) # логрег с l2 регуляризацией и коэффициентом 1
l2_3 = LogisticRegression(penalty='l2', C=5, max_iter=500) # логрег с l2 регуляризацией и коэффициентом 5
fin = LogisticRegression(penalty='l2', C=3, max_iter=500) # финальный логрег
l1 = LogisticRegression(penalty='l1', C=3, solver='saga', max_iter=500) # логрег с l1 регуляризацией и коэффициентом 3
lr = LogisticRegression(penalty=None, max_iter=500) # логрег без регуляризации
naive = MultinomialNB() # наивный байесовский классификатор
rf = RandomForestClassifier(n_estimators=300, max_depth=7, random_state=42, n_jobs=8) # случайный лес
estimators = [('l2', l2), ('l2_2', l2_2), ('l2_3', l2_3), ('l1', l1), ('lr', lr), ('naive', naive),
('rf_simple', rf_simple)]
cls = StackingClassifier(estimators=estimators, final_estimator=fin, n_jobs=8, cv=3)

In [None]:
cls.fit(vectorized, labels)

In [173]:
stack_predictions = cls.predict(test)
test_df['labels'] = ['neg' if pred == 0 else 'pos' for pred in  stack_predictions]
test_df[['id', 'labels']].to_csv('y_pred.csv', index=False)

Тоже ничего прорывного на тесте из этого не вышло