In [31]:
from sklearn.exceptions import UndefinedMetricWarning
import warnings
warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)


import numpy as np
import pandas as pd
import json

from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

from nltk.corpus import stopwords
from many_stop_words import get_stop_words



from tqdm import tqdm_notebook

from data.utils.preprocessing import Preprocessing, preprocess
from data.utils.processing import Processing


In [32]:
# Sentiment classes; one binary problem (this class vs. everything else) is
# built per entry further down.
POLARITIES = ['negative', 'neutral', 'positive']

# Preprocessing variant handed to preprocess() for every sentence text.
preprocessing = Preprocessing.PREPROCESSING_1

# polarity name -> (preprocessed texts, 0/1 labels); filled in by the loop below.
train_batch, test_batch = {}, {}

# NOTE(review): TrainSentence / TestSentence are not imported in any visible
# cell — presumably model classes loaded elsewhere; confirm and import them
# explicitly so the notebook survives Restart & Run All.
train_sentences = list(TrainSentence.objects.all())
test_sentences = list(TestSentence.objects.all())

def get_ys_polarity(sentences, target_polarity):
    """Turn sentences into 0/1 labels for a single target polarity.

    A sentence is labelled 1 when the first entry of its ``polarities``
    matches ``target_polarity`` (case-insensitive). Sentences whose
    ``polarities`` is empty/falsy are labelled 0.

    Args:
        sentences: iterable of objects exposing a ``polarities`` sequence
            of strings.
        target_polarity: polarity name to treat as the positive class.

    Returns:
        list of ints (0 or 1), one per sentence, in input order.
    """
    def _label(item):
        # Only the first listed polarity is taken into account.
        if not item.polarities:
            return 0
        return int(target_polarity.upper() == item.polarities[0].upper())

    return [_label(item) for item in sentences]

def get_sentence_batch(sentence_type, polarity):
    """Build the (preprocessed texts, binary labels) pair for one split.

    Args:
        sentence_type: ``'train'`` selects ``train_sentences``; any other
            value selects ``test_sentences``.
        polarity: polarity name treated as the positive class.

    Returns:
        tuple ``(X_preproc, y)``: the output of ``preprocess`` on the
        sentence texts and the matching list of 0/1 labels.
    """
    if sentence_type == 'train':
        pool = train_sentences
    else:
        pool = test_sentences
    texts = preprocess([item.text for item in pool], preprocessing)
    labels = get_ys_polarity(pool, polarity)
    return texts, labels


# Build one (texts, labels) pair per polarity for both splits.
# Iterating POLARITIES directly drops the unused enumerate index and lets the
# progress bar pick up its total from len(POLARITIES).
# NOTE(review): get_sentence_batch re-runs preprocess() on the same texts for
# every polarity; only the labels differ between iterations.
for polarity in tqdm_notebook(POLARITIES):
    train_batch[polarity] = get_sentence_batch('train', polarity)
    test_batch[polarity] = get_sentence_batch('test', polarity)





# Исследование: выбор алгоритма классификации и способа векторизации

In [33]:
# Bar colours, assigned to classifiers by position (first classifier gets the
# first colour, and so on).
colors = [
    '#109618',
    '#FF9900',
    '#3366CC',
    '#DC3912',
    '#0099C6',
    '#DD4477',
    '#AAAA11',
    '#994499',
    '#22AA99',
    '#6633CC',
    '#E67300',
    '#8B0707',
    '#329262',
    '#5574A6',
    '#3B3EAC',
]

In [34]:
def test(clfs, vectorizers, metric='Accuracy'):
    """Benchmark every classifier/vectorizer pair and return a bar-chart config.

    For each classifier and each vectorizer, one binary model per entry of
    ``POLARITIES`` is fitted on ``train_batch[polarity]`` and scored on
    ``test_batch[polarity]``. The selected metric is averaged over the
    polarities and stored as one bar of the chart.

    Args:
        clfs: iterable of dicts with keys ``'name'`` (legend label) and
            ``'clf'`` (estimator exposing ``fit``/``predict``).
        vectorizers: iterable of dicts with keys ``'name'`` (x-axis label)
            and ``'vect'`` (object exposing ``fit_transform``/``transform``).
        metric: column to average and plot; one of ``'Precision'``,
            ``'Recall'``, ``'F1-score'`` or ``'Accuracy'``. The default
            reproduces the original accuracy chart.

    Returns:
        dict with ``'data'`` (x-axis labels plus one dataset per classifier)
        and ``'options'`` keys. The layout looks like a Chart.js bar-chart
        configuration (assumption based on the key names).

    Raises:
        ValueError: if ``metric`` is not one of the supported columns.

    Note:
        The estimator and vectorizer instances passed in are refitted in
        place, so after the call they hold the state of their last fit.
        Bar colours are read from the module-level ``colors`` list.
    """
    metric_columns = ['Precision', 'Recall', 'F1-score', 'Accuracy']
    if metric not in metric_columns:
        raise ValueError(
            "metric must be one of %r, got %r" % (metric_columns, metric)
        )

    # Materialise the inputs: `vectorizers` is walked once for the labels and
    # again for every classifier, so a one-shot iterator would silently yield
    # empty datasets.
    clfs = list(clfs)
    vectorizers = list(vectorizers)

    # Estimators that are fed a dense array instead of the sparse matrix the
    # vectorizers produce; matched on the display name.
    dense_only = ('GaussianNB', 'QuadraticDiscriminantAnalysis')

    chart_data = {
        'data': {
            'labels': [vectorizer['name'] for vectorizer in vectorizers],
            'datasets': []
        },
        'options': {
            'title': {
                'text': "Классификаторы тональностей",
                'display': True,
                'fontSize': 18
            },
            'scales': {
                'yAxes': [{
                    'scaleLabel': {
                        'display': True,
                        'labelString': metric
                    }
                }]
            },
            'responsive': True
        }
    }

    for i, clf in tqdm_notebook(enumerate(clfs), total=len(clfs)):
        needs_dense = clf['name'] in dense_only
        dataset = {
            'label': clf['name'],
            'data': [],
            # Wrap around so a long classifier list cannot overrun the palette.
            'backgroundColor': colors[i % len(colors)]
        }
        for vectorizer in vectorizers:
            rows = []
            for polarity in POLARITIES:
                polarity_clf = clf['clf']
                cv = vectorizer['vect']

                X_train_raw, y_train = train_batch[polarity]
                X_train = cv.fit_transform(X_train_raw)
                if needs_dense:
                    X_train = X_train.toarray()
                polarity_clf.fit(X_train, y_train)

                # The vocabulary fitted on the training texts is reused here.
                X_test_raw, y_test = test_batch[polarity]
                X_test = cv.transform(X_test_raw)
                if needs_dense:
                    X_test = X_test.toarray()
                y_pred = polarity_clf.predict(X_test)

                precision, recall, f1_score, _ = metrics.precision_recall_fscore_support(
                    y_true=y_test, y_pred=y_pred, beta=1, average='binary'
                )
                accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
                rows.append([polarity, precision, recall, f1_score, accuracy])

            # Build the per-polarity table once instead of growing it row by row.
            result = pd.DataFrame(rows, columns=['Polarity'] + metric_columns)
            dataset['data'].append(np.mean(result[metric]))
        chart_data['data']['datasets'].append(dataset)
    return chart_data

In [35]:
# Fixed seed for the estimators that use randomness. Without it the averaged
# scores drift between runs: the recorded outputs of this notebook show
# GradientBoostingClassifier scoring differently on identical vectorizer
# settings in the two experiments.
SEED = 42

# Classifiers under comparison; 'name' is the chart legend label.
clfs = [
    {
        'name': 'MultinomialNB',
        'clf': MultinomialNB()
    },
    {
        'name': 'GaussianNB',
        'clf': GaussianNB()
    },
    {
        'name': 'LinearSVC',
        'clf': LinearSVC(C=2, random_state=SEED)
    },
    {
        'name': 'GradientBoostingClassifier',
        'clf': GradientBoostingClassifier(random_state=SEED)
    },
    {
        'name': 'RandomForestClassifier',
        'clf': RandomForestClassifier(random_state=SEED)
    },
    {
        'name': 'AdaBoostClassifier',
        'clf': AdaBoostClassifier(random_state=SEED)
    },
    {
        'name': 'DecisionTreeClassifier',
        'clf': DecisionTreeClassifier(random_state=SEED)
    },
    # MLPClassifier is currently disabled:
#     {
#         'name': 'MLPClassifier',
#         'clf': MLPClassifier(verbose=True, early_stopping=True)
#     }
]

# Text representations under comparison; 'name' is the x-axis label.
vectorizers = [
    {
        # Binary presence/absence of each term.
        'name': 'Булевские вектора',
        'vect': CountVectorizer(max_features=2000, binary=True)
    },
    {
        # Raw term counts.
        'name': 'Частотные вектора',
        'vect': CountVectorizer(max_features=2000)
    },
    {
        # Normalised term frequencies without IDF weighting.
        'name': 'Норм. частотные вектора',
        'vect': TfidfVectorizer(max_features=2000, use_idf=False)
    },
    {
        'name': 'TF_IDF',
        'vect': TfidfVectorizer(max_features=2000, use_idf=True)
    }
]

chart_data = test(clfs, vectorizers)




In [38]:
# `json` is already imported in the notebook's import cell, so it is not
# re-imported here.
# ensure_ascii=False keeps the Cyrillic labels readable instead of emitting
# \uXXXX escapes.
json.dumps(chart_data, ensure_ascii=False)

'{"data": {"labels": ["Булевские вектора", "Частотные вектора", "Норм. частотные вектора", "TF_IDF"], "datasets": [{"label": "MultinomialNB", "data": [0.847808105872622, 0.8467052660601048, 0.83264405845051, 0.8351254480286738], "backgroundColor": "#109618"}, {"label": "GaussianNB", "data": [0.6247587537910119, 0.6242073338847532, 0.6368899917287014, 0.6385442514474774], "backgroundColor": "#FF9900"}, {"label": "LinearSVC", "data": [0.8227185001378551, 0.8229942100909843, 0.847808105872622, 0.841466776950648], "backgroundColor": "#3366CC"}, {"label": "GradientBoostingClassifier", "data": [0.8309897987317342, 0.8331954783567687, 0.8362282878411911, 0.830714088778605], "backgroundColor": "#DC3912"}, {"label": "RandomForestClassifier", "data": [0.8243727598566308, 0.8240970499035015, 0.8235456299972429, 0.830714088778605], "backgroundColor": "#0099C6"}, {"label": "AdaBoostClassifier", "data": [0.8263027295285359, 0.8274055693410531, 0.8251998897160187, 0.8274055693410531], "backgroundColo

# Исследование: удаление стоп-слов

In [39]:
## Stop-word lists compared in the experiment below

# TEST 1: the stock NLTK Russian list.
STOP_WORDS_1 = stopwords.words('russian')

# TEST 2: the NLTK list plus a few hand-picked extras.
# NOTE(review): with the vectorizers' default token pattern (tokens of two or
# more word characters) the single-character entries 'в', 'к' and '—' can
# never match a token, so they have no effect — confirm whether that matters.
STOP_WORDS_2 = stopwords.words('russian') + [
    'что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на'
]

# TEST 3: the many-stop-words Russian list.
# Materialised as a list because scikit-learn documents `stop_words` as a
# list; presumably get_stop_words returns a set — verify. Order does not
# matter for filtering; sorting only makes the list deterministic.
STOP_WORDS_3 = sorted(get_stop_words('ru'))

In [40]:
# Fixed seed for the estimators that use randomness, so the chart is
# reproducible on a fresh run. Defined in this cell so it runs on its own.
SEED = 42

# Classifiers carried over into the stop-word experiment.
clfs = [
    {
        'name': 'MultinomialNB',
        'clf': MultinomialNB()
    },
    {
        'name': 'LinearSVC',
        'clf': LinearSVC(C=2, random_state=SEED)
    },
    {
        'name': 'GradientBoostingClassifier',
        'clf': GradientBoostingClassifier(random_state=SEED)
    },
]

# Same normalised term-frequency vectorizer throughout; only the stop-word
# list changes between entries.
vectorizers = [
    {
        'name': 'Исходный текст',
        'vect': TfidfVectorizer(max_features=2000, use_idf=False)
    },
    {
        'name': 'Стоп-слова nltk',
        'vect': TfidfVectorizer(max_features=2000, stop_words=STOP_WORDS_1, use_idf=False)
    },
    {
        'name': 'Стоп-слова nltk + доп.',
        'vect': TfidfVectorizer(max_features=2000, stop_words=STOP_WORDS_2, use_idf=False)
    },
    {
        'name': 'Стоп-слова many-stop-words',
        'vect': TfidfVectorizer(max_features=2000, stop_words=STOP_WORDS_3, use_idf=False)
    },
]

# This rebinds the module-level palette that test() reads. Compared with the
# earlier palette, '#FF9900' is dropped, which apparently keeps each of the
# three remaining classifiers on the colour it had in the previous chart.
colors = [
    '#109618', '#3366CC', '#DC3912', '#0099C6','#DD4477', '#AAAA11','#994499','#22AA99','#6633CC','#E67300','#8B0707','#329262','#5574A6','#3B3EAC'
]
chart_data = test(clfs, vectorizers)




In [41]:
# Serialise the stop-word experiment chart to a JSON string.
# ensure_ascii=False keeps the Cyrillic labels readable instead of emitting
# \uXXXX escapes.
json.dumps(chart_data, ensure_ascii=False)

'{"data": {"labels": ["Исходный текст", "Стоп-слова nltk", "Стоп-слова nltk + доп.", "Стоп-слова many-stop-words"], "datasets": [{"label": "MultinomialNB", "data": [0.83264405845051, 0.8276812792941826, 0.8268541494347946, 0.8213399503722084], "backgroundColor": "#109618"}, {"label": "LinearSVC", "data": [0.847808105872622, 0.8392610973256135, 0.8400882271850013, 0.8312655086848636], "backgroundColor": "#3366CC"}, {"label": "GradientBoostingClassifier", "data": [0.8354011579818031, 0.8199614006065619, 0.821891370278467, 0.8180314309346568], "backgroundColor": "#DC3912"}]}, "options": {"title": {"text": "Классификаторы тональностей", "display": true, "fontSize": 18}, "scales": {"yAxes": [{"scaleLabel": {"display": true, "labelString": "Accuracy"}}]}, "responsive": true}}'

In [7]:
# NOTE(review): `result` is only created as a local variable inside test(),
# so this cell raises NameError on a fresh kernel (see the recorded traceback
# below). It needs a per-polarity metrics table with 'Precision', 'Recall'
# and 'F1-score' columns to be exposed at module level before it can run.
# Intended display: in-cell bars for the three metric columns, each value
# formatted with "{:.2}".
result.style.bar(subset=['Precision', 'Recall', 'F1-score'], color='#5fba7d').format("{:.2}", subset=['Precision', 'Recall', 'F1-score'], )

NameError: name 'result' is not defined

In [None]:
# NOTE(review): depends on a module-level `result` table that no visible cell
# defines (test() keeps its own `result` local), so this cell cannot run on a
# fresh kernel as written.
# Mean of the 'F1-score' column of `result`, echoed as the cell output.
F1_macro = np.mean(result['F1-score'])
F1_macro