In [3]:
from sklearn.exceptions import UndefinedMetricWarning
import warnings
warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)


import numpy as np
import pandas as pd
import json

from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

from nltk.corpus import stopwords
from many_stop_words import get_stop_words



from tqdm import tqdm_notebook

from data.utils.preprocessing import Preprocessing, preprocess
from data.utils.processing import Processing


In [22]:
# Polarity labels; each one gets its own binary target further down.
POLARITIES = ['negative', 'neutral', 'positive']

# Preprocessing variant handed to preprocess() when the batches are built.
preprocessing = Preprocessing.PREPROCESSING_1

# polarity name -> (preprocessed texts, 0/1 labels); filled in by the loop below.
train_batch, test_batch = {}, {}

# Select rows with out_of_scope=False and polarities__len=1, then exclude the
# ones whose polarities equal ['conflict'] (presumably: in-scope sentences with
# exactly one, non-conflict label — verify against the model definition).
# NOTE(review): TrainSentence / TestSentence are not imported in the visible
# cells — confirm where they come from before a fresh-kernel run.
train_sentences = (
    TrainSentence.objects
    .filter(out_of_scope=False, polarities__len=1)
    .exclude(polarities=['conflict'])
)

test_sentences = (
    TestSentence.objects
    .filter(out_of_scope=False, polarities__len=1)
    .exclude(polarities=['conflict'])
)


def get_ys_polarity(sentences, target_polarity):
    """Build binary labels marking which sentences carry ``target_polarity``.

    Only the first entry of each sentence's ``polarities`` is compared, and the
    comparison ignores case. A sentence whose ``polarities`` is falsy (empty or
    ``None``) is labelled 0.

    Args:
        sentences: iterable of objects exposing a ``polarities`` attribute
            (a sequence of strings, or a falsy value).
        target_polarity: polarity name treated as the positive class.

    Returns:
        list[int]: one 0/1 label per sentence, in iteration order.
    """
    def matches_target(labels):
        # Short-circuit on a falsy label list so nothing is indexed or upper-cased.
        return bool(labels) and target_polarity.upper() == labels[0].upper()

    return [int(matches_target(item.polarities)) for item in sentences]

def get_sentence_batch(sentence_type, polarity):
    """Return preprocessed texts and binary labels for one polarity.

    Args:
        sentence_type: ``'train'`` selects ``train_sentences``; any other value
            selects ``test_sentences``.
        polarity: polarity name used as the positive class.

    Returns:
        tuple: ``(X_preproc, y)`` — the result of ``preprocess`` applied to the
        sentence texts with the module-level ``preprocessing`` setting, and the
        0/1 labels produced by ``get_ys_polarity``.
    """
    if sentence_type == 'train':
        source = train_sentences
    else:
        source = test_sentences

    texts = [item.text for item in source]
    features = preprocess(texts, preprocessing)
    return features, get_ys_polarity(source, polarity)


# Build one (texts, labels) pair per polarity for both splits.
# POLARITIES is iterated directly: the enumerate() index was never used, and
# wrapping enumerate() hid the length from tqdm so the bar had no total.
for polarity in tqdm_notebook(POLARITIES):
    train_batch[polarity] = get_sentence_batch('train', polarity)
    test_batch[polarity] = get_sentence_batch('test', polarity)





# Исследование: выбор алгоритма классификации и способа векторизации

In [23]:
# Palette for the chart datasets; test() picks an entry by classifier position.
colors = [
    '#109618', '#FF9900', '#3366CC', '#DC3912', '#0099C6',
    '#DD4477', '#AAAA11', '#994499', '#22AA99', '#6633CC',
    '#E67300', '#8B0707', '#329262', '#5574A6', '#3B3EAC',
]

In [24]:
def test(clfs, vectorizers):
    """Score every classifier/vectorizer pair and return the data as a chart config.

    For each classifier and each vectorizer, one binary model per entry of
    ``POLARITIES`` is fitted on ``train_batch[polarity]`` and evaluated on
    ``test_batch[polarity]``. The charted value for the pair is the sum of true
    positives over all polarities divided by the number of positive test labels
    over all polarities (the chart axis calls this 'Accuracy').

    The same estimator and vectorizer instances are refitted for every
    polarity; nothing fitted is kept after the call.

    Args:
        clfs: list of dicts with keys ``'name'`` (dataset label) and ``'clf'``
            (estimator exposing ``fit`` / ``predict``). Estimators named
            'GaussianNB' or 'QuadraticDiscriminantAnalysis' receive dense arrays.
        vectorizers: list of dicts with keys ``'name'`` (x-axis label) and
            ``'vect'`` (object exposing ``fit_transform`` / ``transform``).

    Returns:
        dict: ``{'data': {'labels': [...], 'datasets': [...]}, 'options': {...}}``
        with one dataset per classifier and one score per vectorizer. The layout
        resembles a Chart.js bar-chart config — confirm against the consumer.
    """
    # Estimators (matched by display name) that are fed dense arrays instead of
    # the sparse matrices the vectorizers return.
    dense_only = ('GaussianNB', 'QuadraticDiscriminantAnalysis')

    chart_data = {
        'data': {
            'labels': [vectorizer['name'] for vectorizer in vectorizers],
            'datasets': []
        },
        'options': {
            'title': {
                'text': "Классификаторы тональностей",
                'display': True,
                'fontSize': 18
            },
            'scales': {
                'yAxes': [{
                    'scaleLabel': {
                        'display': True,
                        'labelString': 'Accuracy'
                    }
                }]
            },
            'responsive': True
        }
    }
    # tqdm wraps the list itself (not enumerate) so the bar knows its total.
    for i, clf in enumerate(tqdm_notebook(clfs)):
        dataset = {
            'label': clf['name'],
            'data': [],
            # Wrap around instead of raising IndexError when there are more
            # classifiers than palette entries.
            'backgroundColor': colors[i % len(colors)]
        }
        needs_dense = clf['name'] in dense_only
        for vectorizer in vectorizers:
            tp_total = 0
            total = 0
            for polarity in POLARITIES:
                polarity_clf = clf['clf']
                cv = vectorizer['vect']

                X_train_raw, y_train = train_batch[polarity]
                X_train = cv.fit_transform(X_train_raw)
                if needs_dense:
                    X_train = X_train.toarray()
                polarity_clf.fit(X_train, y_train)

                X_test_raw, y_test = test_batch[polarity]
                X_test = cv.transform(X_test_raw)
                if needs_dense:
                    X_test = X_test.toarray()
                y_pred = polarity_clf.predict(X_test)

                # labels=[0, 1] pins the matrix to 2x2; without it a split in
                # which only one class occurs yields a 1x1 matrix and the
                # four-way unpacking fails.
                tn, fp, fn, tp = metrics.confusion_matrix(
                    y_true=y_test, y_pred=y_pred, labels=[0, 1]
                ).ravel()

                tp_total += int(tp)
                total += sum(y_test)

            # No positive test labels at all: report 0.0 rather than dividing by zero.
            score = tp_total / total if total else 0.0
            dataset['data'].append(score)
        chart_data['data']['datasets'].append(dataset)
    return chart_data

In [25]:
# Fixed seed so the estimators that use randomness give repeatable scores on
# Restart & Run All.
RANDOM_STATE = 42

# Candidate classifiers. 'name' is the chart label and is also what test()
# checks to decide whether the features must be converted to dense arrays.
clfs = [
    {
        'name': 'MultinomialNB',
        'clf': MultinomialNB()
    },
    {
        'name': 'GaussianNB',
        'clf': GaussianNB()
    },
    {
        'name': 'LinearSVC',
        'clf': LinearSVC(C=2, random_state=RANDOM_STATE)
    },
    {
        'name': 'GradientBoostingClassifier',
        'clf': GradientBoostingClassifier(random_state=RANDOM_STATE)
    },
    {
        'name': 'RandomForestClassifier',
        'clf': RandomForestClassifier(random_state=RANDOM_STATE)
    },
    {
        'name': 'AdaBoostClassifier',
        'clf': AdaBoostClassifier(random_state=RANDOM_STATE)
    },
    {
        'name': 'DecisionTreeClassifier',
        'clf': DecisionTreeClassifier(random_state=RANDOM_STATE)
    },
#     {
#         'name': 'MLPClassifier',
#         'clf': MLPClassifier(verbose=True, early_stopping=True)
#     }
]

# Text representations to compare; all are capped at 2000 features.
vectorizers = [
    {
        # Binary term-presence vectors.
        'name': 'Булевские вектора',
        'vect': CountVectorizer(max_features=2000, binary=True)
    },
    {
        # Raw term counts.
        'name': 'Частотные вектора',
        'vect': CountVectorizer(max_features=2000)
    },
    {
        # Term frequencies from TfidfVectorizer with IDF weighting switched off.
        'name': 'Норм. частотные вектора',
        'vect': TfidfVectorizer(max_features=2000, use_idf=False)
    },
    {
        # TF-IDF weights.
        'name': 'TF_IDF',
        'vect': TfidfVectorizer(max_features=2000, use_idf=True)
    }
]

chart_data = test(clfs, vectorizers)




In [26]:
# Serialise the chart config for copy-paste; ensure_ascii=False leaves the
# Cyrillic labels unescaped. (json is already imported in the first cell, so
# the repeated import was dropped.)
json.dumps(chart_data, ensure_ascii=False)

'{"data": {"labels": ["Булевские вектора", "Частотные вектора", "Норм. частотные вектора", "TF_IDF"], "datasets": [{"label": "MultinomialNB", "data": [0.7752675386444708, 0.7824019024970273, 0.6801426872770512, 0.6872770511296076], "backgroundColor": "#109618"}, {"label": "GaussianNB", "data": [0.5505350772889417, 0.5552913198573127, 0.5624256837098692, 0.5695600475624257], "backgroundColor": "#FF9900"}, {"label": "LinearSVC", "data": [0.7526753864447087, 0.7491082045184304, 0.7752675386444708, 0.7657550535077289], "backgroundColor": "#3366CC"}, {"label": "GradientBoostingClassifier", "data": [0.7086801426872771, 0.7086801426872771, 0.7027348394768134, 0.6944114149821641], "backgroundColor": "#DC3912"}, {"label": "RandomForestClassifier", "data": [0.6563614744351962, 0.6444708680142688, 0.6611177170035671, 0.6813317479191439], "backgroundColor": "#0099C6"}, {"label": "AdaBoostClassifier", "data": [0.7360285374554102, 0.7312722948870393, 0.7170035671819263, 0.7110582639714625], "backgro

# Исследование: удаление стоп-слов

In [27]:
## STOP_WORDS

# TEST 1: NLTK's Russian stop-word list.
STOP_WORDS_1 = stopwords.words('russian')

# TEST 2: the NLTK list plus a handful of extra tokens.
STOP_WORDS_2 = stopwords.words('russian') + [
    'что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на'
]

# TEST 3: the many-stop-words Russian list.
# Materialised as a sorted list because CountVectorizer documents `stop_words`
# as 'english', a list, or None, while get_stop_words may hand back a set
# (NOTE(review): confirm the return type). sorted() accepts either and gives a
# deterministic order.
STOP_WORDS_3 = sorted(get_stop_words('ru'))

In [28]:
# Fixed seed so the estimators that use randomness give repeatable scores on
# Restart & Run All.
RANDOM_STATE = 42

# Classifiers compared in the stop-word experiment.
# NOTE(review): this rebinds the `clfs`, `vectorizers`, `colors` and
# `chart_data` names used by the previous experiment, so the earlier values are
# gone after this cell runs.
clfs = [
    {
        'name': 'MultinomialNB',
        'clf': MultinomialNB()
    },
    {
        'name': 'LinearSVC',
        'clf': LinearSVC(C=2, random_state=RANDOM_STATE)
    },
    {
        'name': 'GradientBoostingClassifier',
        'clf': GradientBoostingClassifier(random_state=RANDOM_STATE)
    },
]

# Same count vectorizer throughout; only the stop-word list changes.
vectorizers = [
    {
        # Baseline: no stop-word removal.
        'name': 'Исходный текст',
        'vect': CountVectorizer(max_features=2000)
    },
    {
        'name': 'Стоп-слова nltk',
        'vect': CountVectorizer(max_features=2000, stop_words=STOP_WORDS_1)
    },
    {
        'name': 'Стоп-слова nltk + доп.',
        'vect': CountVectorizer(max_features=2000, stop_words=STOP_WORDS_2)
    },
    {
        'name': 'Стоп-слова many-stop-words',
        'vect': CountVectorizer(max_features=2000, stop_words=STOP_WORDS_3)
    },
]

# Palette read by test() through the module-level `colors` name; it is the
# earlier palette without '#FF9900'.
colors = [
    '#109618', '#3366CC', '#DC3912', '#0099C6', '#DD4477', '#AAAA11', '#994499',
    '#22AA99', '#6633CC', '#E67300', '#8B0707', '#329262', '#5574A6', '#3B3EAC'
]
chart_data = test(clfs, vectorizers)




In [29]:
# Serialise the stop-word experiment's chart config; ensure_ascii=False leaves
# the Cyrillic labels unescaped in the output string.
json.dumps(chart_data, ensure_ascii=False)

'{"data": {"labels": ["Исходный текст", "Стоп-слова nltk", "Стоп-слова nltk + доп.", "Стоп-слова many-stop-words"], "datasets": [{"label": "MultinomialNB", "data": [0.7824019024970273, 0.7621878715814506, 0.7645659928656362, 0.7502972651605232], "backgroundColor": "#109618"}, {"label": "LinearSVC", "data": [0.7491082045184304, 0.72294887039239, 0.7205707491082045, 0.7074910820451843], "backgroundColor": "#3366CC"}, {"label": "GradientBoostingClassifier", "data": [0.7051129607609988, 0.7110582639714625, 0.7074910820451843, 0.7098692033293698], "backgroundColor": "#DC3912"}]}, "options": {"title": {"text": "Классификаторы тональностей", "display": true, "fontSize": 18}, "scales": {"yAxes": [{"scaleLabel": {"display": true, "labelString": "Accuracy"}}]}, "responsive": true}}'

In [7]:
# Render the per-polarity metrics table with bars on the three score columns.
# NOTE(review): `result` is only ever assigned inside test() in the visible
# cells, so this cell raises NameError on a fresh kernel (see the recorded
# output) — TODO expose the table at module scope or drop this cell.
# NOTE(review): "{:.2}" formats to 2 significant digits; "{:.2f}" may have been
# intended — confirm.
result.style.bar(subset=['Precision', 'Recall', 'F1-score'], color='#5fba7d').format("{:.2}", subset=['Precision', 'Recall', 'F1-score'], )

NameError: name 'result' is not defined

In [None]:
# Unweighted mean of the 'F1-score' column of `result`.
# NOTE(review): `result` is not defined at module scope in the visible cells
# (it is local to test()), so this cell cannot run on a fresh kernel — TODO
# confirm where the table should come from.
F1_macro = np.mean(result['F1-score'])
F1_macro

In [36]:
# Stored train sessions used for the aggregate score (ids match the query below
# and the notes in the next cell):
# negative 4668
# neutral 4921
# positive 4952
# NOTE(review): TrainSession is not imported in the visible cells — confirm
# where it comes from before a fresh-kernel run.

tp_total = 0
total = 0

for session in TrainSession.objects.filter(id__in=[4668, 4921, 4952]):
    y_pred = session.y_pred
    y_test = session.batch.y_test
    # labels=[0, 1] pins the matrix to 2x2 so the four-way unpacking also works
    # when only one class occurs in a session (assumes 0/1 labels, as the sum
    # below already does).
    tn, fp, fn, tp = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1]).ravel()
    tp_total += tp
    # Number of positive test labels in this session.
    total += sum(y_test)

# True positives over positive test labels, pooled across the three sessions.
acc = tp_total / total
acc

0.8204518430439952

In [None]:
# negative 4668
# neutral 4921
# positive 4952

[[0.0],
 [0.0],
 [0.0],
 [1.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [1.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [1.0],
 [0.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [0.0],
 [1.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [1.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [1.0],
 [1.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [1.0],
 [0.0],
 [1.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [1.0],
 [1.0],
 [0.0],
 [1.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [1.0],
 [1.0],
 [1.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [1.0],
 [1.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [1.0],
 [0.0],
 [0.0],
 [1.0],
 [0.0],
 [0.0],
 [1.0],
 [1.0],
 [1.0],
