<a href="https://colab.research.google.com/github/aovolkov/sentiment_analysis/blob/main/notebooks/twitter_sentiment_analysis_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Подбор baseline для задачи определения тональности твитов

## Установка библиотек

In [1]:
!pip install pymorphy2

Collecting pymorphy2
[?25l  Downloading https://files.pythonhosted.org/packages/07/57/b2ff2fae3376d4f3c697b9886b64a54b476e1a332c67eee9f88e7f1ae8c9/pymorphy2-0.9.1-py3-none-any.whl (55kB)
[K     |██████                          | 10kB 16.9MB/s eta 0:00:01[K     |███████████▉                    | 20kB 11.9MB/s eta 0:00:01[K     |█████████████████▊              | 30kB 9.7MB/s eta 0:00:01[K     |███████████████████████▋        | 40kB 8.9MB/s eta 0:00:01[K     |█████████████████████████████▌  | 51kB 5.5MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 3.7MB/s 
Collecting pymorphy2-dicts-ru<3.0,>=2.4
[?25l  Downloading https://files.pythonhosted.org/packages/3a/79/bea0021eeb7eeefde22ef9e96badf174068a2dd20264b9a378f2be1cdd9e/pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2MB)
[K     |████████████████████████████████| 8.2MB 4.7MB/s 
[?25hCollecting dawg-python>=0.7.1
  Downloading https://files.pythonhosted.org/packages/6a/84/ff1ce2071d4c650ec8574576

In [2]:
from IPython.display import clear_output
import numpy as np
import pandas as pd
import warnings
import random
from tqdm import tqdm
from string import punctuation
import re

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
from sklearn.utils import shuffle
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, f1_score, roc_auc_score

SEED = 42

random.seed(SEED)
np.random.seed(SEED)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Загрузка данных


Для данной задачи был выбран датасет с разметкой сентимента русскоязычных твитов (подробнее про него в [статье](http://www.swsys.ru/index.php?page=article&id=3962&lang=)). Корпус твитов содержит 114,911 положительных и 111,923 отрицательных записей. Загрузить его можно [тут](https://study.mokoron.com/).

Так как данный датасет не принадлежит к какой-то конкретной категории (отзывы на фильмы, отзывы на продукты и т.д.), то он должен прекрасно подходить под выявление неких общих закономерностей свойственных задаче sentiment analysis.




In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
negative_texts = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Projects/Sentiment/negative_twitter.csv', encoding='utf8', sep=';', header=None)
positive_texts = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Projects/Sentiment/positive_twitter.csv', encoding='utf8', sep=';', header=None)

In [5]:
positive_texts.sample(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
45225,409983419545055232,1386582639,Mer1990S,Уходит № 5 / Герой не армянин )))) !!! Следую...,1,0,0,0,50822,53,0,0
41554,409932763077554176,1386570561,AnatoliyKurg,В Госпитале Ветеранов защита окон 80 Lvl - реш...,1,0,0,0,876,73,123,0
88893,410826246520442881,1386783584,VictoriaGoldSha,Завтра день самоуправления-_-5 уроков и все со...,1,0,0,0,661,364,627,0
58058,410128512259862528,1386617231,L_toutprix,"@Denis_Shvedak ой, господи, кому чего, выражен...",1,0,0,0,5359,57,33,1
38606,409886942520950784,1386559637,cikivyjakic,Возвключим же православный Grinder в честь т...,1,0,0,0,133,134,129,0


In [6]:
sentences = np.concatenate([positive_texts[3].values, negative_texts[3].values])

labels = [[1] for _ in range(positive_texts.shape[0])] + [[0] for _ in range(negative_texts.shape[0])]

# проверка на длину 
assert len(sentences) == len(labels) == positive_texts.shape[0] + negative_texts.shape[0]

In [7]:
sentences[210000]

'Шрамы от сладкого ;( p.s слабонервным не смотреть! http://t.co/kMoYf4vXqk'

In [8]:
train_sentences, test_sentences, train_labels, test_labels = train_test_split(sentences, labels, test_size=0.2)
train_sentences, valid_sentences, train_labels, valid_labels = train_test_split(train_sentences, train_labels, test_size=0.25)

print('Размер тренировочной выборки:', len(train_labels))
print('Размер валидационной выборки:', len(valid_labels))
print('Размер тестовой выборки:', len(test_labels))

Размер тренировочной выборки: 136100
Размер валидационной выборки: 45367
Размер тестовой выборки: 45367


## Подбор Baseline 

### Подбор классификатора и векторайзера

In [9]:
def create_pipeline(vectorizer, classifier):
    return Pipeline([('vectorizer', vectorizer), 
                     ('classifier', classifier)])

In [10]:
%%time
warnings.filterwarnings('ignore')
list_of_algos = [LogisticRegression, LinearSVC, SGDClassifier,
                 MultinomialNB, GradientBoostingClassifier]
list_of_algos_str = ['LogisticRegression', 'LinearSVC', 'SGDClassifier',
                 'MultinomialNB', ' GradientBoostingClassifier']

tfidf_results = []
countvect_results = []

for clf, clf_str  in tqdm(zip(list_of_algos, list_of_algos_str)):
    countvect_results.append(round(cross_val_score(create_pipeline(CountVectorizer(), clf()), valid_sentences, valid_labels).mean(), 5))
    tfidf_results.append(round(cross_val_score(create_pipeline(TfidfVectorizer(), clf()), valid_sentences, valid_labels).mean(), 5))

5it [04:20, 52.06s/it]

CPU times: user 4min 22s, sys: 14.3 s, total: 4min 36s
Wall time: 4min 20s





In [11]:
results_table = pd.DataFrame({'CountVectorizer': countvect_results, 'TfidfVectorizer': tfidf_results}, index=list_of_algos_str)
results_table

Unnamed: 0,CountVectorizer,TfidfVectorizer
LogisticRegression,0.73192,0.72844
LinearSVC,0.71153,0.72687
SGDClassifier,0.72837,0.71907
MultinomialNB,0.72608,0.72301
GradientBoostingClassifier,0.64935,0.64476


### Лемматизация + стоп-слова

In [12]:
# в контексте задачи анализа тнальности частица 'не' и слово 'хорошо'
# являются достаточно важными

russian_stopwords = stopwords.words("russian")
del russian_stopwords[135]
del russian_stopwords[3]

# убираем обращения по @ и все символы, кроме скобочек))
TOKENIZE_RE = re.compile(r'@+[\w\d]*|[\.\^\$\*\?\{\}\[\]\|]+', re.I)

morph = MorphAnalyzer()


def preprocess_text(text):
    prep_text = []
    
    for txt in text:
        txt = txt.lower()
        tokens = [morph.parse(token)[0].normal_form for token in txt.strip().split()\
          if token not in russian_stopwords\
          and TOKENIZE_RE.findall(token) == []]
        txt = ' '.join(tokens)
        prep_text.append(txt)
    
    return np.array(prep_text)

In [13]:
valid_sentences_preprocesed = preprocess_text(valid_sentences) 

In [14]:
warnings.filterwarnings('ignore')

tfidf_prepr_results = []
countvect_prepr_results = []

for clf, clf_str  in zip(list_of_algos, list_of_algos_str):
    countvect_prepr_results.append(round(cross_val_score(create_pipeline(CountVectorizer(), clf()), valid_sentences_preprocesed, valid_labels).mean(), 5))
    tfidf_prepr_results.append(round(cross_val_score(create_pipeline(TfidfVectorizer(), clf()), valid_sentences_preprocesed, valid_labels).mean(), 5))

In [15]:
results_table['CountVectorizer (preprocessed)'] = countvect_prepr_results
results_table['TfidfVectorizer (preprocessed)'] = tfidf_prepr_results
results_table


Unnamed: 0,CountVectorizer,TfidfVectorizer,CountVectorizer (preprocessed),TfidfVectorizer (preprocessed)
LogisticRegression,0.73192,0.72844,0.69985,0.70199
LinearSVC,0.71153,0.72687,0.68043,0.69149
SGDClassifier,0.72837,0.71907,0.69861,0.69672
MultinomialNB,0.72608,0.72301,0.69875,0.69606
GradientBoostingClassifier,0.64935,0.64476,0.62947,0.63328


Качество после препроцессинга только ухудшилось. 

Наилучшее качество среди всех классификаторов показали LinearSVC и LogisticRegression. Остановимся на них.


## Подбор параметров для Baseline

In [16]:
def estimate(classifier, params_grid, scorer, data, labels):
    pipeline = create_pipeline(TfidfVectorizer(), classifier)
    grid_cv = RandomizedSearchCV(pipeline, params_grid, scoring=scorer, cv=5, 
                                 random_state=SEED, n_iter=100, verbose=1, n_jobs=-1)
    grid_cv.fit(data, labels)
    return grid_cv

In [17]:
# сетка для перебора параметров

params_grid_vectorizer = {
    'vectorizer__max_df': [0.85, 0.9, 0.95, 1.0],
    'vectorizer__min_df': [1, 10, 20],
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6),
                               (2, 2), (2, 3), (2, 4), (3, 3), (3, 4)],
    'vectorizer__stop_words': [russian_stopwords, None],
    'vectorizer__norm': ['l1', 'l2'],
    'vectorizer__smooth_idf': [True, False],
    'vectorizer__use_idf': [True, False],
    'vectorizer__sublinear_tf': [True, False]
}

params_grid_log_regr = {
    'classifier__penalty': ['l1', 'l2', 'elasticnet'],
    'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'classifier__max_iter': np.arange(100, 1000, 100),
    'classifier__tol': [1e-5, 1e-4, 1e-3],
    'classifier__C': np.arange(0.5, 5, 0.1)
}

params_grid_lsvc = {
    'classifier__loss': ['hinge', 'squared_hinge'],
    'classifier__max_iter': np.arange(100, 1000, 100),
    'classifier__tol': [1e-5, 1e-4, 1e-3],
    'classifier__C': np.arange(0.5, 1.2, 0.1)
}

Подберем параметры для линейного SVM.

In [18]:
%%time
grid_search_lsvc = estimate(LinearSVC(random_state=SEED), 
                                  {**params_grid_vectorizer, **params_grid_lsvc}, 'accuracy', valid_sentences, valid_labels)
print("LinearSVC:")
print(f"Лучшее качество - {grid_search_lsvc.best_score_}")
print(f"Параметры - {grid_search_lsvc.best_params_}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed: 11.8min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed: 26.2min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 29.3min finished


LinearSVC:
Лучшее качество - 0.7377389518625662
Параметры - {'vectorizer__use_idf': False, 'vectorizer__sublinear_tf': True, 'vectorizer__stop_words': None, 'vectorizer__smooth_idf': False, 'vectorizer__norm': 'l2', 'vectorizer__ngram_range': (1, 2), 'vectorizer__min_df': 1, 'vectorizer__max_df': 0.85, 'classifier__tol': 1e-05, 'classifier__max_iter': 400, 'classifier__loss': 'hinge', 'classifier__C': 0.9999999999999999}
CPU times: user 4min 45s, sys: 1.76 s, total: 4min 47s
Wall time: 29min 26s


Подберем параметры для линейного LogisticRegression.

In [19]:
%%time
grid_search_log_regr = estimate(LogisticRegression(random_state=SEED), 
                                  {**params_grid_vectorizer, **params_grid_log_regr}, 'accuracy', valid_sentences, valid_labels)
print("LogisticRegression:")
print(f"Лучшее качество - {grid_search_log_regr.best_score_}")
print(f"Параметры - {grid_search_log_regr.best_params_}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 19.8min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed: 29.9min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed: 57.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 64.2min finished


LogisticRegression:
Лучшее качество - 0.7331982734220932
Параметры - {'vectorizer__use_idf': True, 'vectorizer__sublinear_tf': False, 'vectorizer__stop_words': None, 'vectorizer__smooth_idf': True, 'vectorizer__norm': 'l2', 'vectorizer__ngram_range': (1, 1), 'vectorizer__min_df': 1, 'vectorizer__max_df': 0.95, 'classifier__tol': 0.001, 'classifier__solver': 'saga', 'classifier__penalty': 'l2', 'classifier__max_iter': 600, 'classifier__C': 1.9999999999999996}
CPU times: user 4min 37s, sys: 1.85 s, total: 4min 39s
Wall time: 1h 4min 16s


## Тренировка и тестирование модели

Linear SVC

In [20]:
model_lvc = grid_search_lsvc.best_estimator_

model_lvc.fit(train_sentences, train_labels)

pred_labels_lvc = model_lvc.predict(test_sentences)

print('Accuracy:', round(accuracy_score(pred_labels_lvc, test_labels), 4))
print('F1 score:', round(f1_score(test_labels, pred_labels_lvc), 4))
print('ROC AUC score:', round(roc_auc_score(test_labels, pred_labels_lvc), 4))

Accuracy: 0.7705
F1 score: 0.7761
ROC AUC score: 0.7705


LogisticRegression

In [21]:
model_log_regr = grid_search_log_regr.best_estimator_

model_log_regr.fit(train_sentences, train_labels)

pred_labels_log_regr = model_log_regr.predict(test_sentences)

print('Accuracy:', round(accuracy_score(pred_labels_log_regr, test_labels), 4))
print('F1 score:', round(f1_score(test_labels, pred_labels_log_regr), 4))
print('ROC AUC score:', round(roc_auc_score(test_labels, pred_labels_log_regr), 4))

Accuracy: 0.7586
F1 score: 0.7647
ROC AUC score: 0.7585


### Примеры предсказаний моделей

Пример негативного твита

In [49]:
example_neg = ['Очень неприятный фильм!']
print('Метка LVC:', model_lvc.predict(example_neg)[0])
print('Вероятность LogisticRegression:', max(model_log_regr.predict_proba(example_neg)[0]))

Метка LVC: 0
Вероятность LogisticRegression: 0.6304333918924427


Пример неоднозначного твита

In [50]:
example_neutral = ['Хорошо поиграли, жаль Витя ушел рано:(((']
print('Метка LVC:', model_lvc.predict(example_neutral)[0])
print('Метка LogisticRegression:', max(model_log_regr.predict_proba(example_neutral)[0]))

Метка LVC: 0
Метка LogisticRegression: 0.7509092722396051


Пример позитивного твита

In [51]:
example_pos = ['Отлично провели время вместе!))']
print('Метка LVC:', model_lvc.predict(example_pos)[0])
print('Метка LogisticRegression:', max(model_log_regr.predict_proba(example_pos)[0]))

Метка LVC: 1
Метка LogisticRegression: 0.9066715394614961
